# Assignment A5-1 Car Rental Data Classification

## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

## Load data

### Create header

In [2]:
names = ['buying','maintenance','doors','persons','luggage_boot','safety', 'class']

In [3]:
df = pd.read_csv("/Users/pernillelorup/Desktop/Softwareudvikling/AI/MachineLearning/Week10/data/504b3eff-car.data", names=names)

## Get to know the data

In [4]:
print(df.shape)

(1728, 7)


In [5]:
list(df)

['buying',
 'maintenance',
 'doors',
 'persons',
 'luggage_boot',
 'safety',
 'class']

In [6]:
df.head()

Unnamed: 0,buying,maintenance,doors,persons,luggage_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   buying        1728 non-null   object
 1   maintenance   1728 non-null   object
 2   doors         1728 non-null   object
 3   persons       1728 non-null   object
 4   luggage_boot  1728 non-null   object
 5   safety        1728 non-null   object
 6   class         1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [8]:
print(df.describe())

buying maintenance doors persons luggage_boot safety  class
count    1728        1728  1728    1728         1728   1728   1728
unique      4           4     4       3            3      3      4
top       med         med     4    more          med    med  unacc
freq      432         432   432     576          576    576   1210


#### Check for null values in data

In [9]:
df.isnull().sum()

buying          0
maintenance     0
doors           0
persons         0
luggage_boot    0
safety          0
class           0
dtype: int64

In [10]:
# in case of null values - clean the data
np.nan_to_num(df)
df[df.notnull()]
df = df.dropna()

In [11]:
# See how many classes are included and how many records per class are distributed
print(df.groupby('class').size())

class
acc       384
good       69
unacc    1210
vgood      65
dtype: int64


## Preprocessing data

### Classification only work with numeric data, therefore label encoding is performed. encoding is perfomed below.


In [12]:
labels = {
    'buying': ['vhigh', 'high', 'med', 'low'], 
    'maintenance': ['vhigh', 'high', 'med', 'low'], 
    'doors': ['2', '3', '4', '5more'],
    'persons': ['2', '4', 'more'],
    'luggage_boot': ['small', 'med', 'big'],
    'safety': ['low', 'med', 'high'],
    'class': ['unacc', 'acc', 'good', 'vgood']
    }

In [13]:
label_encoders = {}
df_encoded = pd.DataFrame()
for column in df:
    if column in labels:
        label_encoders[column] = preprocessing.LabelEncoder()
        label_encoders[column].fit(labels[column])
        df_encoded[column] = label_encoders[column].transform(df[column])
    else:
        df_encoded[column] = df[column]

### Seperate features from labels

In [14]:
features = np.array(df_encoded.drop(['class'], 1))
label = np.array(df_encoded['class'])

## Training the data

### Splitting train and test data

In [15]:
features_train, features_test, label_train, label_test = model_selection.train_test_split(features, label, test_size=0.1)

## Decision Tree Classifier

In [16]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(features_train, label_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### Score of model based on test data

In [17]:
dt_score = decision_tree.score(features_test, label_test)
dt_score

0.9595375722543352

### Classification report

In [18]:
print(classification_report(label_test, decision_tree.predict(features_test)))

precision    recall  f1-score   support

           0       0.95      0.93      0.94        44
           1       0.78      0.88      0.82         8
           2       0.97      0.99      0.98       113
           3       1.00      0.75      0.86         8

    accuracy                           0.96       173
   macro avg       0.93      0.89      0.90       173
weighted avg       0.96      0.96      0.96       173



## Random Forest Classifier
#### Trying to optimize classifier by using random forest.


### Create random forest classifier

In [19]:
random_forest_classifier = RandomForestClassifier(n_estimators=100, max_depth=6)
random_forest_classifier.fit(features_train, label_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=6, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Train model

In [20]:
print(classification_report(label_test, random_forest_classifier.predict(features_test)))

precision    recall  f1-score   support

           0       0.66      0.66      0.66        44
           1       0.00      0.00      0.00         8
           2       0.88      0.98      0.93       113
           3       1.00      0.38      0.55         8

    accuracy                           0.83       173
   macro avg       0.64      0.50      0.53       173
weighted avg       0.79      0.83      0.80       173



### Score of model

In [21]:
random_forest_classifier.score(features_test, label_test)

0.8265895953757225

### See the importance of each feature

In [22]:
random_forest_classifier.feature_importances_

array([0.11103826, 0.10946778, 0.02397979, 0.34188355, 0.05740381,
       0.35622681])

### The third feature is the least important - check which feature it is and remove it

In [23]:
df_encoded.head()

Unnamed: 0,buying,maintenance,doors,persons,luggage_boot,safety,class
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2


In [24]:
features2 = np.array(df_encoded.drop(['class', 'doors'], 1))

### Retraining the classifier to see if there are any improvements

In [25]:
label2 = np.array(df_encoded['class'])

features_train2, features_test2, label_train2, label_test2 = model_selection.train_test_split(features2, label2, test_size=0.1)

random_forest_classifier2 = RandomForestClassifier(n_estimators=100, max_depth=6)
random_forest_classifier2.fit(features_train2, label_train2)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=6, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
print(classification_report(label_test2, random_forest_classifier2.predict(features_test2)))

precision    recall  f1-score   support

           0       0.69      0.87      0.77        38
           1       0.00      0.00      0.00         9
           2       0.96      0.95      0.95       122
           3       1.00      1.00      1.00         4

    accuracy                           0.88       173
   macro avg       0.66      0.70      0.68       173
weighted avg       0.85      0.88      0.86       173



### This did not do the trick - but if we add some extra parameters, it will improve

In [27]:
random_forest_classifier2 = RandomForestClassifier(n_estimators=150, max_depth=8, criterion='entropy', max_features=5)
random_forest_classifier2.fit(features_train2, label_train2)
print(classification_report(label_test2, random_forest_classifier2.predict(features_test2)))

precision    recall  f1-score   support

           0       0.82      0.87      0.85        38
           1       0.64      1.00      0.78         9
           2       1.00      0.93      0.97       122
           3       0.80      1.00      0.89         4

    accuracy                           0.92       173
   macro avg       0.82      0.95      0.87       173
weighted avg       0.94      0.92      0.93       173



In [28]:
rf_score = random_forest_classifier2.score(features_test2, label_test2)
rf_score

0.9248554913294798

In [29]:
print('Decision Tree Classifier: ', dt_score)
print('Random Forest Classifier: ', rf_score)

Decision Tree Classifier:  0.9595375722543352
Random Forest Classifier:  0.9248554913294798
