# Homework 4 - Multi-class classification
### Mihovil Mandic, Winter 2019

### Libraries and modules

In [65]:
import numpy as np
import pandas as pd
from sklearn import svm, preprocessing
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

### Loading the dataset

In [66]:
# Read the training and test CSV files from the working directory.
train_data = pd.read_csv("CS74_HW4_training_set.csv", sep=",")
test_data = pd.read_csv("CS74_HW4_test_set.csv", sep=",")

# Column 6 of the training frame is used as the target; columns 0-5 are the
# features, taken from a float-cast view of the frame.
y_train = train_data.iloc[:, 6]
X_train = train_data.astype(float).iloc[:, :6]

# Independent copy of the test frame; the predicted labels are attached to it
# later so that test_data itself is never modified.
new_test_data = test_data.copy()

### Data preprocessing

In [67]:
def feature_selection(X, y):
    """Print a univariate relevance score for every feature column.

    Fits ``SelectKBest(k='all')`` with its default scoring function on
    ``(X, y)`` and prints one ``Feature_<n>: <score>`` line per column,
    numbered from 1 in column order.

    Side effect: sets NumPy's global print precision to 3, which also changes
    how arrays printed later in the session are displayed.

    Parameters
    ----------
    X : feature matrix handed to ``SelectKBest.fit``.
    y : target labels handed to ``SelectKBest.fit``.

    Returns
    -------
    None
    """
    # k='all' keeps every feature; only the scores are of interest here.
    selector = SelectKBest(k='all').fit(X, y)

    # Global NumPy display option (not local to this function).
    np.set_printoptions(precision=3)

    print("Scores")
    for position, value in enumerate(selector.scores_, start=1):
        print("Feature_{0}: {1} ".format(position, value))
    return

In [68]:
# Print the per-feature scores computed on the training features and labels.
feature_selection(X_train, y_train)

Scores
Feature_1: 112.90771722985501 
Feature_2: 88.12072461283138 
Feature_3: 6.885545081048831 
Feature_4: 68.09609641702006 
Feature_5: 129.69918388243306 
Feature_6: 86.2169350649155 


Considering dropping Feature_3. Scores are based on the ANOVA F-values.

Update - Dropping that feature doesn't make a significant difference, so I will preserve it.

## Testing different classifiers

### Parameter tuning for SVM, including OVR / OVO

In [8]:
def svm_param_tuning(X, y, k_folds):
    """Grid-search SVC hyperparameters and return the best combination.

    Searches over C, gamma, kernel and decision_function_shape with
    ``GridSearchCV`` (accuracy scoring, all CPU cores) and prints the best
    mean cross-validated accuracy.

    Parameters
    ----------
    X : training features passed to ``GridSearchCV.fit``.
    y : training labels passed to ``GridSearchCV.fit``.
    k_folds : value forwarded as ``cv`` (number of cross-validation folds).

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search.
    """
    Cs = [1, 10, 30, 50, 100]
    gammas = [1e-6, 1e-5, 1e-3, 1e-1]
    kernels = ['rbf', 'linear']
    # NOTE(review): per the scikit-learn SVC docs, training is always
    # one-vs-one internally and this option only changes the shape of
    # decision_function, so it presumably cannot change accuracy while
    # doubling the grid -- confirm before keeping this axis.
    decisions = ['ovr', 'ovo']

    param_grid = {
        'C': Cs,
        'gamma': gammas,
        'kernel': kernels,
        'decision_function_shape': decisions,
    }
    # NOTE(review): cache_size is documented in MB, so this allows a ~25 GB
    # kernel cache per estimator -- confirm this is intended.
    model = svm.SVC(cache_size=25000)
    grid_search = GridSearchCV(model, param_grid, n_jobs=-1, cv=k_folds, scoring='accuracy')
    grid_search.fit(X, y)
    print('Best accuracy: ', grid_search.best_score_)

    return grid_search.best_params_

#print(svm_param_tuning(X_train, y_train, 3))

Best accuracy:  0.643
{'C': 30, 'gamma': 1e-06, 'kernel': 'rbf'}


### Random Forest

In [None]:
def random_forest_tuning(X, y, k_folds):
    """Grid-search RandomForestClassifier hyperparameters.

    Runs an exhaustive ``GridSearchCV`` (accuracy scoring, all CPU cores)
    and prints the best mean cross-validated accuracy.

    Parameters
    ----------
    X : training features passed to ``GridSearchCV.fit``.
    y : training labels passed to ``GridSearchCV.fit``.
    k_folds : value forwarded as ``cv`` (number of cross-validation folds).

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search.
    """
    # Number of trees in the forest: 100, 575, 1050, 1525, 2000
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=5)]
    # Number of features to consider at every split.
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was removed
    # in scikit-learn 1.3 and would make every fit fail there.
    # The literal 6 is an absolute feature count.
    max_features = ['sqrt', 6]
    # Maximum number of levels in a tree (None = unlimited)
    max_depth = [int(x) for x in np.linspace(10, 100, num=5)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Whether each tree is trained on a bootstrap sample
    bootstrap = [True, False]

    param_grid = {'bootstrap': bootstrap,
                  'n_estimators': n_estimators,
                  'max_features': max_features,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf}

    model = RandomForestClassifier()
    grid_search = GridSearchCV(model, param_grid, cv=k_folds, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X, y)
    print('Best accuracy: ', grid_search.best_score_)
    return grid_search.best_params_

#print(random_forest_tuning(X_train, y_train, 10))

Random forest worked a lot better than SVM; however, it underperformed Extra Trees.
The SVM and RF grid-search calls are commented out because of their long runtime, so that the entire notebook can be executed in a reasonable time.

### Extra trees - BEST

In [9]:
def extra_trees_tuning(X, y, k_folds):
    """Grid-search ExtraTreesClassifier hyperparameters.

    Runs an exhaustive ``GridSearchCV`` (accuracy scoring, all CPU cores)
    and prints the best mean cross-validated accuracy.

    Parameters
    ----------
    X : training features passed to ``GridSearchCV.fit``.
    y : training labels passed to ``GridSearchCV.fit``.
    k_folds : value forwarded as ``cv`` (number of cross-validation folds).

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search.
    """
    # Number of trees in the extra-trees ensemble
    n_estimators = [100, 180, 200, 250, 300, 800, 1500, 2000]
    # Number of features to consider at every split.
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was removed
    # in scikit-learn 1.3 and would make every fit fail there.
    max_features = ['sqrt']
    # Maximum number of levels in a tree (None = unlimited)
    #max_depth = [1, 10, 30, 50, 70, 100]
    max_depth = [52, 55]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 6, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Whether each tree is trained on a bootstrap sample
    bootstrap = [True, False]

    param_grid = {'bootstrap': bootstrap,
                  'n_estimators': n_estimators,
                  'max_features': max_features,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf}

    model = ExtraTreesClassifier()
    grid_search = GridSearchCV(model, param_grid, cv=k_folds, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X, y)
    print('Best accuracy: ', grid_search.best_score_)
    return grid_search.best_params_

# Runs the whole grid (576 parameter combinations x 10 folds), so this cell is slow.
print(extra_trees_tuning(X_train, y_train, 10))

Best accuracy:  0.7613333333333333
{'bootstrap': False, 'max_depth': 52, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}


This classifier seemed the best with default parameters when I was testing all possible classifiers.
Thus, I've decided to **optimize hyperparameters** for it.

Those ended up being
{'bootstrap': False, 'max_depth': 52, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}

With a mean accuracy of 0.7613 and std. dev +- 0.028.

### Training

In [97]:
def train(X, y):
    """Fit the final ExtraTreesClassifier and return it.

    Parameters
    ----------
    X : training features passed to ``fit``.
    y : training labels passed to ``fit``.

    Returns
    -------
    ExtraTreesClassifier
        The fitted classifier.
    """
    # NOTE(review): n_estimators / min_samples_split (200 / 6) differ from the
    # grid-search best recorded in this notebook (300 / 5) -- confirm intended.
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was removed
    # in scikit-learn 1.3.
    clf = ExtraTreesClassifier(n_estimators=200,
                               max_depth=52,
                               max_features='sqrt',
                               min_samples_split=6,
                               min_samples_leaf=1,
                               bootstrap=False,
                               n_jobs=-1)

    # Multi-class wrappers kept for reference; both lowered accuracy.
    #clf = OneVsRestClassifier(clf)
    #clf = OneVsOneClassifier(clf)

    clf.fit(X, y)
    return clf

# Fit on the whole training set; later cells reuse this module-level `clf`.
clf = train(X_train, y_train)

I've commented out OVR and OVO classifiers as they worsened accuracy (in both cases, OVO did even worse).

### 10-fold CV accuracy test

In [98]:
# 10-fold cross-validated accuracy of the chosen classifier on the training set.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='accuracy')
print(scores)
# The reported spread is two standard deviations of the fold scores.
print("Accuracy: %0.3f (+/- %0.3f)" % (np.mean(scores), 2 * np.std(scores)))

[0.767 0.801 0.744 0.757 0.737 0.781 0.781 0.727 0.764 0.743]
Accuracy: 0.760 (+/- 0.044)


With no feature selection or training data preprocessing, the mean accuracy is roughly **76%** after a 10-fold cross-validation.

In [116]:
def per_class_CV(train_data, estimator=None):
    """Print per-class precision/recall/F1 and accuracy on one hold-out split.

    Despite the name this is a single ``train_test_split`` (default split
    sizes, no fixed seed), not k-fold cross-validation.

    A clone of the estimator is fitted on the training part of the split, so
    the estimator passed in is left untouched. Fitting the module-level
    ``clf`` directly would replace its full-training-set fit with one trained
    on only part of the data, and the later prediction step reuses ``clf``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Columns 0-5 are used as features and column 6 as the class label.
    estimator : optional
        Estimator to evaluate; defaults to the module-level ``clf``.

    Returns
    -------
    None
    """
    base_estimator = clf if estimator is None else estimator

    feature_cols = train_data.columns[0:6]  # features (all)
    target_col = train_data.columns[6]      # target (class label - last column)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(train_data[feature_cols],
                                                          train_data[target_col])

    # Fresh, unfitted copy with the same hyperparameters.
    holdout_model = clone(base_estimator)
    holdout_model.fit(X_fit, y_fit)
    y_pred = holdout_model.predict(X_holdout)

    print(classification_report(y_holdout, y_pred))
    print('Accuracy is:', accuracy_score(y_holdout, y_pred))
    return

# Per-class breakdown on a random hold-out split of the training data.
per_class_CV(train_data)

              precision    recall  f1-score   support

           1       0.94      0.97      0.95       533
           2       0.74      0.77      0.76       368
           3       0.71      0.73      0.72       529
           4       0.65      0.61      0.63       459
           5       0.73      0.68      0.70       361

   micro avg       0.76      0.76      0.76      2250
   macro avg       0.75      0.75      0.75      2250
weighted avg       0.76      0.76      0.76      2250

Accuracy is: 0.7617777777777778


### Predicting

In [117]:
def predict(clf, test_data):
    """Return the labels ``clf`` predicts for ``test_data``.

    Thin wrapper around ``clf.predict``; ``clf`` must already be fitted.
    """
    predicted_labels = clf.predict(test_data)
    return predicted_labels

# Store the predictions on the copy so that test_data itself keeps its original columns.
new_test_data['Label'] = predict(clf, test_data)

### Looking at the predicted labels

In [121]:
# Relative class frequencies (normalize=True gives fractions, not raw counts)
# for the known training labels and for the labels predicted on the test set.
print("Training data - Label counts")
train_label_share = train_data['Label'].value_counts(normalize=True)
print(train_label_share, '\n')

print("Testing data - Label counts")
test_label_share = new_test_data['Label'].value_counts(normalize=True)
print(test_label_share)

Training data - Label counts
3    0.244444
1    0.222222
4    0.200000
5    0.166667
2    0.166667
Name: Label, dtype: float64 

Testing data - Label counts
3    0.262037
1    0.229630
2    0.178704
4    0.175926
5    0.153704
Name: Label, dtype: float64


The proportions differ slightly: classes 5 and 2 have an equal share of rows in the training data, but not among the labels predicted for the testing data.

### Saving into a new CSV

In [122]:
# Write the test rows plus the predicted 'Label' column; index=False leaves out the row index.
new_test_data.to_csv("CS74_HW4_test_set_labeled.csv", index=False)

### Appendix - Conclusion and other classifiers I've tested

In [None]:
# clf = OneVsOneClassifier(RandomForestClassifier(n_estimators=1600,
#                              max_depth = None,
#                              max_features='auto',
#                              min_samples_split=5,
#                              min_samples_leaf=1,
#                             bootstrap=True,
#                             n_jobs=-1))

# Best so far
# NOTE: this rebinds the module-level `clf`; re-run the Training cell before
# predicting again, otherwise predictions come from this unfitted wrapper.
# 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was removed in
# scikit-learn 1.3.
clf = OneVsRestClassifier(ExtraTreesClassifier(n_estimators=200,
                          max_depth=52,
                          max_features='sqrt',
                          min_samples_split=6,
                          min_samples_leaf=1,
                          bootstrap=False,
                          n_jobs=-1))

#clf = ExtraTreesClassifier(n_jobs=-1)

#clf = KNeighborsClassifier(n_neighbors=20)

#clf = BaggingClassifier(KNeighborsClassifier(algorithm='kd_tree'), max_samples=0.5, max_features=0.5)

#clf = RadiusNeighborsClassifier(radius=2000000)

# clf = GradientBoostingClassifier(n_estimators=1600,
#                                 max_depth=None,
#                                 min_samples_split=5,
#                                 min_samples_leaf=1,)

#clf = GradientBoostingClassifier(loss='exponential', learning_rate=0.5, n_estimators=1500)

# clf = GaussianProcessClassifier(multi_class="one_vs_rest",
#                                 n_jobs=-1,)

#clf = Perceptron(penalty='elasticnet', alpha=1e-7, max_iter=20000, tol=1e-6)

#clf = DecisionTreeClassifier()

# 10-fold cross-validated accuracy of whichever candidate `clf` is active above.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='accuracy')
print(scores)
# The reported spread is two standard deviations of the fold scores.
print("Accuracy: %0.3f (+/- %0.3f)" % (np.mean(scores), 2 * np.std(scores)))

In conclusion, I've selected Extra Trees as my classifier because it performed most consistently, at roughly 75% accuracy with a low standard deviation. OneVsRest and OneVsOne didn't make a large difference, except for SVM. However, using them significantly prolonged the runtime, so I wasn't able to run a full grid search and 10-fold CV while testing both OVR and OVO.