In [1]:
# Import Libraries

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GridSearchCV,cross_val_score
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the dataset as a DataFrame. The CSV has no header row, so pandas
# numbers the columns 0..6.
car = pd.read_csv('car_data.csv', header=None)

# Column 0 (buying price) is the target variable.
y = car.iloc[:, 0]

# The remaining columns are the predictors. Take an explicit copy so that the
# encoding done on X in later cells never aliases (or warns about) `car`.
X = car.iloc[:, 1:].copy()
X.columns = range(X.columns.size)  # relabel the predictors 0..5

car.head(5)

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Preview the first five rows of the target and of the predictor table.
for heading, preview in (
    ('Target variable (Buying price) Values :', y.head(5)),
    ('Predictor Values :', X.head(5)),
):
    print(heading)
    print(preview)

Target variable (Buying price) Values :
0    vhigh
1    vhigh
2    vhigh
3    vhigh
4    vhigh
Name: 0, dtype: object
Predictor Values :
       0  1  2      3     4      5
0  vhigh  2  2  small   low  unacc
1  vhigh  2  2  small   med  unacc
2  vhigh  2  2  small  high  unacc
3  vhigh  2  2    med   low  unacc
4  vhigh  2  2    med   med  unacc


In [4]:
# Class balance of the target. It is printed explicitly because a notebook only
# echoes a cell's last expression, so a bare `y.value_counts()` shows nothing.
print(y.value_counts())

# Level frequencies of every predictor column, in column order.
for col in X.columns:
    print(X[col].value_counts())


vhigh    432
med      432
low      432
high     432
Name: 0, dtype: int64
5more    432
4        432
3        432
2        432
Name: 1, dtype: int64
4       576
more    576
2       576
Name: 2, dtype: int64
big      576
med      576
small    576
Name: 3, dtype: int64
med     576
low     576
high    576
Name: 4, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: 5, dtype: int64


In [5]:
# Ordinal encoding for each predictor column of X, keyed by column position.
# NOTE(review): column 5 reaches 1.5 while every other column stays within
# [0, 1] -- confirm that the wider range is intended.
ordinal_levels = {
    0: {'low': 0, 'med': 1/3, 'high': 2/3, 'vhigh': 1},
    1: {'2': 0, '3': 1/3, '4': 2/3, '5more': 1},
    2: {'2': 0, '4': 0.5, 'more': 1},
    3: {'small': 0, 'med': 0.5, 'big': 1},
    4: {'low': 0, 'med': 0.5, 'high': 1},
    5: {'unacc': 0, 'acc': 0.5, 'good': 1, 'vgood': 1.5},
}

# Build the encoded frame and rebind X instead of calling
# `X.iloc[:, i].replace(..., inplace=True)`: an in-place replace on a selection
# mutates a temporary object and is not guaranteed to reach X.
# `replace` leaves already-numeric values untouched, so re-running this cell is
# harmless; the float cast keeps the dtype numeric regardless of pandas version.
X = pd.DataFrame(
    {
        col: X.iloc[:, col].replace(levels).astype(float)
        for col, levels in ordinal_levels.items()
    },
    index=X.index,
)

X.head(5)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.5,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.5,0.0,0.0
4,1.0,0.0,0.0,0.5,0.5,0.0


In [6]:
# Hold out 20% of the rows as a test set, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)

# One empty result slot per model; the model cells fill in 'mean' and 'std'.
f_measure_score = {
    model_name: {}
    for model_name in ('decision_tree', 'knn', 'logistic', 'NB', 'svm')
}

In [13]:
# Shared 5-fold stratified splitter, shuffled with a fixed seed; it is used both
# by the grid searches and by the outer cross-validation.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=30,
)
cv

StratifiedKFold(n_splits=5, random_state=30, shuffle=True)

#### Decision Tree

In [14]:
# Search space for the tree: entropy splits, depth 8-10, leaf size 1-3.
para_tree = {
    'criterion': ['entropy'],
    'max_depth': [8, 9, 10],
    'min_samples_leaf': [1, 2, 3],
    'random_state': [45],
}
d_tree = DecisionTreeClassifier()

# Tune on the training split, then predict the held-out test rows.
grid_tree = GridSearchCV(estimator=d_tree, param_grid=para_tree, scoring='f1_micro', cv=cv)
grid_tree.fit(X_train, y_train)
y_pred_tree = grid_tree.predict(X_test)

# Cross-validate the whole grid search on the full data set and keep the
# mean / standard deviation of the fold scores.
nested_score_tree = cross_val_score(grid_tree, X, y, cv=cv)
f_measure_score['decision_tree'].update(
    mean=nested_score_tree.mean(),
    std=nested_score_tree.std(),
)

In [15]:
# Per-class precision / recall / F1 of the tuned tree on the test split.
report_tree = classification_report(y_test, y_pred_tree)
print('precision,recall,f-measure\n', report_tree)

precision,recall,f-measure
              precision    recall  f1-score   support

       high       0.11      0.13      0.12        94
        low       0.16      0.15      0.16        87
        med       0.03      0.02      0.03        86
      vhigh       0.20      0.22      0.21        79

avg / total       0.12      0.13      0.13       346



In [None]:
# Show the parameter combination selected by the decision-tree grid search.
grid_tree.best_params_

#### KNN

In [None]:
# Search space for k-NN: 12-16 neighbours, uniform or distance weighting.
para_knn = {
    'n_neighbors': [12, 13, 14, 15, 16],
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsClassifier()

# Tune on the training split, then predict the held-out test rows.
grid_knn = GridSearchCV(estimator=knn, param_grid=para_knn, scoring='f1_micro', cv=cv)
grid_knn.fit(X_train, y_train)
y_pred_knn = grid_knn.predict(X_test)

# Cross-validate the whole grid search on the full data set and keep the
# mean / standard deviation of the fold scores.
nested_score_knn = cross_val_score(grid_knn, X, y, cv=cv)
f_measure_score['knn'].update(
    mean=nested_score_knn.mean(),
    std=nested_score_knn.std(),
)

In [None]:
# Per-class precision / recall / F1 of the tuned k-NN model on the test split.
report_knn = classification_report(y_test, y_pred_knn)
print('precision,recall,f-measure\n', report_knn)

In [None]:
# Show the parameter combination selected by the k-NN grid search.
grid_knn.best_params_

#### Logistic Regression

In [None]:
# Search space for logistic regression: inverse regularisation strength C.
para_log = {'C': [10, 100, 1000]}
logistic = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    multi_class='multinomial',
    random_state=45,
)

# Tune on the training split, then predict the held-out test rows.
grid_log = GridSearchCV(estimator=logistic, param_grid=para_log, scoring='f1_micro', cv=cv)
grid_log.fit(X_train, y_train)
y_pred_log = grid_log.predict(X_test)

# Cross-validate the whole grid search on the full data set and keep the
# mean / standard deviation of the fold scores.
nested_score_log = cross_val_score(grid_log, X, y, cv=cv)
f_measure_score['logistic'].update(
    mean=nested_score_log.mean(),
    std=nested_score_log.std(),
)

In [None]:
# Per-class precision / recall / F1 of the tuned logistic model on the test split.
report_log = classification_report(y_test, y_pred_log)
print('precision,recall,f-measure\n', report_log, '\n')

In [None]:
# Show the parameter combination selected by the logistic-regression grid search.
grid_log.best_params_

#### Naive Bayes

In [None]:
# Gaussian naive Bayes is fitted directly; no grid search is run for it.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Request micro-averaged F1 explicitly. Without `scoring`, cross_val_score falls
# back to the estimator's own default score, whereas the grid-searched models in
# this notebook are all scored with 'f1_micro'; naming the metric keeps every
# entry of f_measure_score on the same footing.
nested_score_nb = cross_val_score(nb, X=X, y=y, cv=cv, scoring='f1_micro')
f_measure_score['NB']['mean'] = np.mean(nested_score_nb)
f_measure_score['NB']['std'] = np.std(nested_score_nb)

In [None]:
# Per-class precision / recall / F1 of naive Bayes on the test split.
report_nb = classification_report(y_test, y_pred_nb)
print('precision,recall,f-measure\n', report_nb, '\n')

#### SVM

In [None]:
# Single-point "grid" for the SVM: RBF kernel with C=10 and gamma=5.
para_svm = {
    'kernel': ['rbf'],
    'C': [10],
    'gamma': [5],
}
svm = SVC(probability=True, random_state=45)

# Fit through GridSearchCV on the training split, then predict the test rows.
grid_svm = GridSearchCV(estimator=svm, param_grid=para_svm, scoring='f1_micro', cv=cv)
grid_svm.fit(X_train, y_train)
y_pred_svm = grid_svm.predict(X_test)

# Cross-validate the whole grid search on the full data set and keep the
# mean / standard deviation of the fold scores.
nested_score_svm = cross_val_score(grid_svm, X, y, cv=cv)
f_measure_score['svm'].update(
    mean=nested_score_svm.mean(),
    std=nested_score_svm.std(),
)

In [None]:
# Per-class precision / recall / F1 of the SVM on the test split.
report_svm = classification_report(y_test, y_pred_svm)
print('precision,recall,f-measure\n', report_svm, '\n')

In [None]:
# Show the parameter combination selected by the SVM grid search.
grid_svm.best_params_

#### Model Comparison

In [None]:
# One line per model: its name followed by the stored mean/std summary.
for model_name, summary in f_measure_score.items():
    print(model_name, ': ', summary)

In [None]:
# Score of the fitted SVM search object on the held-out test split.
# NOTE(review): GridSearchCV.score uses the search's `scoring` setting
# ('f1_micro' for grid_svm), not the estimator's plain accuracy -- confirm the
# label printed below is the one intended.
accuracy_svm = grid_svm.score(X=X_test, y=y_test)
print('accuracy of SVM: ', accuracy_svm)

In [None]:
# Predict the buying price for: Maintenance = High, Number of doors = 4,
# Lug Boot Size = Big, Safety = High, Class Value = Good.
#
# The values use the same ordinal encoding that was applied to X
# (maint high -> 2/3, doors 4 -> 2/3, lug_boot big -> 1, safety high -> 1,
# class good -> 1). The request does not state a "persons" value, so one query
# row is built for each encoded persons level (0, 0.5, 1).
# NOTE(review): confirm which persons level was actually meant.
query_rows = pd.DataFrame(
    [[2/3, 2/3, persons_level, 1, 1, 1] for persons_level in (0, 0.5, 1)],
    columns=X.columns,  # same column labels and order the model was fitted on
)
query_pred = grid_svm.predict(query_rows)

# Test-set predictions, recomputed exactly as before so the following cell
# that inspects y_pred_svm keeps working.
y_pred_svm = grid_svm.predict(X_test)

# Display the query rows next to the predicted buying price.
query_rows.assign(predicted_buying=query_pred)

In [None]:
# Peek at the first five SVM predictions.
y_pred_svm[:5]

In [None]:
# Descriptive names for the seven columns of the raw table
# (presumably in file order -- verify against the data source).
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# FIXME: unfinished draft of an encoded feature row. The original list literal
# had empty slots between its commas, which is a SyntaxError and stopped the
# whole cell from running. It is kept here as a comment until the missing
# values are filled in:
#   [2/3, <missing>, 0.5, <missing>]

In [None]:
# NOTE(review): this looks like a leftover variant of the encoding step applied
# to X earlier in the notebook. The column offsets differ from that step: here
# columns 0-1 get the price-level mapping, the remaining mappings are shifted
# one column to the right, and there is no mapping for the class labels --
# presumably written for a table that still contains the buying-price column.
# Confirm the intended input before relying on this cell.
# Each call runs an in-place replace on an .iloc selection instead of assigning
# the result back to X; whether that change reaches X depends on the pandas
# version -- TODO verify, or rewrite as an explicit assignment.
X.iloc[:,0:2].replace({'low':0,'med':1/3,'high':2/3, 'vhigh':1}, inplace = True)
X.iloc[:,2].replace({'2':0,'3':1/3,'4':2/3,'5more':1},inplace = True)
X.iloc[:,3].replace({'2':0,'4':0.5,'more':1},inplace = True)
X.iloc[:,4].replace({'small':0,'med':0.5,'big':1},inplace = True)
X.iloc[:,5].replace({'low':0,'med':0.5,'high':1},inplace = True)