In [117]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, cohen_kappa_score, classification_report, make_scorer 
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, CategoricalNB

5

Reading the data. Encoding the input and output variables with inputs as categoral.

In [110]:
data = pd.read_csv('../data/car.data', header=None)
X, Y = data.iloc[:, :-1], data.iloc[:, -1]
le = LabelEncoder()
Y = le.fit_transform(Y)
X = OneHotEncoder(sparse=False).fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state = 29, stratify = Y)
le.classes_

array(['acc', 'good', 'unacc', 'vgood'], dtype=object)

In [111]:
np.unique(Y, return_counts=True)

(array([0, 1, 2, 3]), array([ 384,   69, 1210,   65], dtype=int64))

Setting up the classifiers and gridSearch for inner loops

In [112]:
kappa_scorer = make_scorer(cohen_kappa_score)   

complexity_values = range(1,10)

clf1 = DecisionTreeClassifier(class_weight='balanced')
p_grid1 = [{'max_depth': complexity_values, 'criterion': ['gini', 'entropy']}]

clf2 = KNeighborsClassifier(metric='hamming')
p_grid2 = [{'n_neighbors': complexity_values}]

clf3 = LogisticRegression(class_weight='balanced', max_iter=10000)
p_grid3 = [{'C': np.power(10, range(0,10))}]

clf4 = SVC(class_weight='balanced')
p_grid4 = [{'kernel': ['rbf'], 'C': np.power(10, range(0,5)), 'gamma': np.power(10., range(-5,0))},
        {'kernel': ['linear'], 'C': np.power(10, range(0,5))}]

clf5 = CategoricalNB()
p_grid5 = [{'fit_prior': [True, False], 'alpha': np.arange(0.1,1,0.3)}]


inner_cv = StratifiedKFold(n_splits=4, shuffle=True)
outer_cv = StratifiedKFold(n_splits=4, shuffle=True)
grid_cv = []

# Kappa is chosen for scoring because of imbalanced multi-class dataset
for p_grid, est in zip((p_grid1, p_grid2, p_grid3, p_grid4, p_grid5), (clf1, clf2, clf3, clf4, clf5)):
    gs = GridSearchCV(estimator=est, param_grid=p_grid, scoring=kappa_scorer, cv=inner_cv)
    grid_cv.append(gs)


Running the nested-cv and getting the mean and standard deviation of the scores for each of the 5 models

In [113]:
nested_scores = []
for gs in grid_cv:
    nested_score = cross_val_score(gs, X=x_train, y=y_train, cv=outer_cv, scoring=kappa_scorer)
    nested_scores.append((nested_score.mean(), nested_score.std()))

nested_scores

[(0.873271781364809, 0.015165179326646407),
 (0.7937089846948181, 0.04676177058390272),
 (0.8492872127399249, 0.02047771197204182),
 (0.9904841147813505, 0.009516000647598992),
 (0.7253691310266324, 0.031789407271968516)]

From the nested-cv, the best model seems to be the 4th one - SVC<br>
Find the best hyper-parameters for the SVC model and retrain on the whole train set

In [114]:
hp_model = GridSearchCV(estimator=clf4, param_grid=p_grid4, scoring=kappa_scorer, cv=inner_cv)
hp_model.fit(x_train, y_train)


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=True),
             estimator=SVC(class_weight='balanced'),
             param_grid=[{'C': array([    1,    10,   100,  1000, 10000], dtype=int32),
                          'gamma': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01]),
                          'kernel': ['rbf']},
                         {'C': array([    1,    10,   100,  1000, 10000], dtype=int32),
                          'kernel': ['linear']}],
             scoring=make_scorer(cohen_kappa_score))

In [115]:
best_model = hp_model.best_estimator_
pred = best_model.predict(pd.DataFrame(x_test))
print(confusion_matrix(y_test, pred))
print(cohen_kappa_score(y_test, pred))
print(accuracy_score(y_test, pred))

[[ 77   0   0   0]
 [  0  14   0   0]
 [  0   0 242   0]
 [  1   0   0  12]]
0.9936854400116801
0.9971098265895953


In [119]:
print(classification_report(y_test, pred, target_names = ['acc', 'good', 'unacc', 'vgood']))

              precision    recall  f1-score   support

         acc       0.99      1.00      0.99        77
        good       1.00      1.00      1.00        14
       unacc       1.00      1.00      1.00       242
       vgood       1.00      0.92      0.96        13

    accuracy                           1.00       346
   macro avg       1.00      0.98      0.99       346
weighted avg       1.00      1.00      1.00       346



Looking at the f1-scores for all classes, the model is performing<br>
the worst on 'vgood' class with a score of 0.96. But the difference is not huge. <br>

Th input variables fit better as categorical becuase the difference <br>
in values is not the same at different values.  <br>
Treating inputs as numeric preserves the order but also <br>
 missplaces importance on values with large differences. <br> 
But we can test if treating them as numeric is better. <br>
Treating the inputs as numeric variables and repeating the process as above - 

In [None]:
X, Y = data.iloc[:, :-1], data.iloc[:, -1]
le = LabelEncoder()
Y = le.fit_transform(Y)
X[0].replace({'vhigh':4, 'high':3, 'med':2, 'low':1}, inplace=True)
X[1].replace({'vhigh':4, 'high':3, 'med':2, 'low':1}, inplace=True)
X[2].replace({'5more':5,'5':4,'4':3, '3':2, '2':1}, inplace=True)
X[3].replace({'more':3, '4':2, '2':1}, inplace=True)
X[4].replace({'big':3, 'med':2, 'small':1}, inplace=True)
X[5].replace({'high':3, 'med':2, 'low':1}, inplace=True)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state = 29, stratify = Y)

In [106]:
kappa_scorer = make_scorer(cohen_kappa_score)   

complexity_values = range(1,10)

clf1 = DecisionTreeClassifier(class_weight='balanced')
p_grid1 = [{'max_depth': complexity_values, 'criterion': ['gini', 'entropy']}]

clf2 = KNeighborsClassifier()
p_grid2 = [{'n_neighbors': complexity_values}]

clf3 = LogisticRegression(class_weight='balanced', max_iter=10000)
p_grid3 = [{'C': np.power(10, range(0,10))}]

clf4 = SVC(class_weight='balanced')
p_grid4 = [{'kernel': ['rbf'], 'C': np.power(10, range(0,5)), 'gamma': np.power(10., range(-5,0))},
        {'kernel': ['linear'], 'C': np.power(10, range(0,5))}]

clf5 = CategoricalNB()
p_grid5 = [{'fit_prior': [True, False], 'alpha': np.arange(0.1,1,0.3)}]


inner_cv = StratifiedKFold(n_splits=4, shuffle=True)
outer_cv = StratifiedKFold(n_splits=4, shuffle=True)
grid_cv = []

# Kappa is chosen for scoring because of imbalanced multi-class dataset
for p_grid, est in zip((p_grid1, p_grid2, p_grid3, p_grid4, p_grid5), (clf1, clf2, clf3, clf4, clf5)):
    gs = GridSearchCV(estimator=est, param_grid=p_grid, scoring=kappa_scorer, cv=inner_cv)
    grid_cv.append(gs)


In [107]:
nested_scores = []
for gs in grid_cv:
    nested_score = cross_val_score(gs, X=x_train, y=y_train, cv=outer_cv, scoring=kappa_scorer)
    nested_scores.append((nested_score.mean(), nested_score.std()))

nested_scores

[(0.9368419987387435, 0.012443109671196034),
 (0.811974157651929, 0.027782632356014948),
 (0.583570805323317, 0.02959426397700011),
 (0.9638221039403134, 0.0024986763767707435),
 (0.6268096426122767, 0.014399231709740709)]

In [108]:
hp_model = GridSearchCV(estimator=clf4, param_grid=p_grid4, scoring=kappa_scorer, cv=inner_cv)
hp_model.fit(x_train, y_train)


GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=True),
             estimator=SVC(class_weight='balanced'),
             param_grid=[{'C': array([    1,    10,   100,  1000, 10000], dtype=int32),
                          'gamma': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01]),
                          'kernel': ['rbf']},
                         {'C': array([    1,    10,   100,  1000, 10000], dtype=int32),
                          'kernel': ['linear']}],
             scoring=make_scorer(cohen_kappa_score))

In [109]:
best_model = hp_model.best_estimator_
pred = best_model.predict(pd.DataFrame(x_test))
print(confusion_matrix(y_test, pred))
print(cohen_kappa_score(y_test, pred))
print(accuracy_score(y_test, pred))

[[ 75   0   2   0]
 [  0  14   0   0]
 [  8   0 234   0]
 [  2   0   0  11]]
0.9254845656855707
0.9653179190751445


Therefore, treating the inputs as categorical values gives better model performance.