# Comparison of supervised machine learning classifiers



## Import modules

In [5]:
import pandas as pd # for data handling
from sklearn.model_selection import cross_val_score # for cross-validation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # evaluation metrics
import matplotlib.pyplot as plt # for plotting

# scikit-learn classifiers evaluated (change as desired)
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

## Get data

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')
! unzip '/content/gdrive/MyDrive/data.zip'

Mounted at /content/gdrive
Archive:  /content/gdrive/MyDrive/data.zip
   creating: data/
  inflating: __MACOSX/._data         
  inflating: data/test.csv           
  inflating: __MACOSX/data/._test.csv  
  inflating: data/new.csv            
  inflating: __MACOSX/data/._new.csv  
  inflating: data/train.csv          
  inflating: __MACOSX/data/._train.csv  


### Read data into *pandas* dataframes

In [8]:
# Load the training, test and unlabeled CSV files into pandas dataframes.
csv_paths = ['data/train.csv', 'data/test.csv', 'data/new.csv']
train, test, new = map(pd.read_csv, csv_paths)

# Report (rows, columns) for each dataframe; .shape fills both %d slots.
summaries = [
    ('Training data contains %d rows and %d columns.', train),
    ('Test data contains %d rows and %d columns.', test),
    ('Unlabeled data contains %d rows and %d columns.', new),
]
for template, frame in summaries:
    print(template % frame.shape)

print('First 3 rows in training data :')
train.head(3) # last expression -> rendered as the cell output

Training data contains 8000 rows and 11 columns.
Test data contains 2000 rows and 11 columns.
Unlabeled data contains 30 rows and 11 columns.
First 3 rows in training data :


Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,3.0,-3.514,0.417,-0.652,3.013,-0.88,-3.023,0.946,1.908,0.26,-0.065
1,2.0,-2.782,3.509,1.544,3.551,3.31,2.546,-4.98,1.942,3.296,-1.043
2,0.0,0.198,-2.976,0.476,3.329,0.915,-3.29,2.049,-0.525,-1.313,1.195


### Specify inputs and outputs







In [9]:
# The first column header of the training frame is skipped; every remaining header is a feature name.
features = train.columns[1:].tolist()
print("features:", features)

# Inputs: the same feature columns are selected from each dataframe.
X_train = train[features]
X_test = test[features]
X_new = new[features]

# Outputs: the 'y' column of the labeled dataframes.
y_train = train['y']
y_test = test['y']

print('Shapes :')
print(f'X_train: {X_train.shape}, X_test: {X_test.shape}, X_new: {X_new.shape}')
print(f'y_train: {y_train.shape}, y_test: {y_test.shape}')

features: ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
Shapes :
X_train: (8000, 10), X_test: (2000, 10), X_new: (30, 10)
y_train: (8000,), y_test: (2000,)



# Evaluate models using *k*-fold cross-validation

### GaussianNB

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

In [10]:
%%time
model = GaussianNB() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9344
CPU times: user 43.6 ms, sys: 0 ns, total: 43.6 ms
Wall time: 52.2 ms


### DecisionTreeClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [11]:
%%time
model = DecisionTreeClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9140
CPU times: user 845 ms, sys: 1.77 ms, total: 847 ms
Wall time: 859 ms


### RandomForestClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [12]:
%%time
model = RandomForestClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9681
CPU times: user 15.1 s, sys: 10.8 ms, total: 15.1 s
Wall time: 15.2 s


### ExtraTreesClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

In [13]:
%%time
model = ExtraTreesClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9735
CPU times: user 3.63 s, sys: 86.9 ms, total: 3.72 s
Wall time: 3.78 s


### KNeighborsClassifier

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [14]:
%%time
model = KNeighborsClassifier() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9774
CPU times: user 497 ms, sys: 898 µs, total: 498 ms
Wall time: 501 ms


### LogisticRegression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [15]:
%%time
model = LogisticRegression() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9646
CPU times: user 1.46 s, sys: 7.03 ms, total: 1.46 s
Wall time: 1.31 s


### SVC

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [16]:
%%time
model = SVC() # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9794
CPU times: user 1.81 s, sys: 5.8 ms, total: 1.81 s
Wall time: 1.82 s


### MLPClassifier
https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

In [17]:
%%time
model = MLPClassifier(max_iter=1000) # change hyperparameters as desired
score = cross_val_score(model, X_train, y_train, cv=2).mean() # mean cross-validation accuracy
print(f'Mean cross-validation accuracy = {score:0.4f}')

Mean cross-validation accuracy = 0.9728
CPU times: user 28.1 s, sys: 32.8 ms, total: 28.1 s
Wall time: 28.6 s


## Select a good model
Since both *Support Vector Classifier* and *K Nearest Neighbor Classifier* produced high mean cross-validation accuracy with default hyper-parameter values, we shall search for good hyper-parameter values for these models using cross-validation and choose a model with a good set of hyper-parameter values.

### SVC
Search for a good value of the penalty *C*.

In [18]:
%%time
for penalty in [1, 10, 20]: # values to try
    model = SVC(C=penalty)
    score = cross_val_score(model, X_train, y_train).mean() # mean cross-validation accuracy
    print(f'Mean cross-validation accuracy = {score:0.4f} for SVC with C = {penalty:0.1f}')

Mean cross-validation accuracy = 0.9794 for SVC with C = 1.0
Mean cross-validation accuracy = 0.9805 for SVC with C = 10.0
Mean cross-validation accuracy = 0.9795 for SVC with C = 20.0
CPU times: user 6.68 s, sys: 7.73 ms, total: 6.69 s
Wall time: 6.62 s


### KNeighborsClassifier
Search for a good value of the number of nearest neighbors *k*.

In [19]:
# Try k = 1 .. 14 neighbors and report the mean cross-validation accuracy for each.
neighbor_counts = range(1, 15)
for k in neighbor_counts:
    model = KNeighborsClassifier(n_neighbors=k)
    fold_scores = cross_val_score(model, X_train, y_train)  # one score per fold
    score = fold_scores.mean()
    print(f'Mean cross-validation accuracy = {score:0.4f} for KNeighborsClassifier with {k} neighbors')

Mean cross-validation accuracy = 0.9636 for KNeighborsClassifier with 1 neighbors
Mean cross-validation accuracy = 0.9601 for KNeighborsClassifier with 2 neighbors
Mean cross-validation accuracy = 0.9760 for KNeighborsClassifier with 3 neighbors
Mean cross-validation accuracy = 0.9765 for KNeighborsClassifier with 4 neighbors
Mean cross-validation accuracy = 0.9774 for KNeighborsClassifier with 5 neighbors
Mean cross-validation accuracy = 0.9770 for KNeighborsClassifier with 6 neighbors
Mean cross-validation accuracy = 0.9776 for KNeighborsClassifier with 7 neighbors
Mean cross-validation accuracy = 0.9769 for KNeighborsClassifier with 8 neighbors
Mean cross-validation accuracy = 0.9772 for KNeighborsClassifier with 9 neighbors
Mean cross-validation accuracy = 0.9768 for KNeighborsClassifier with 10 neighbors
Mean cross-validation accuracy = 0.9770 for KNeighborsClassifier with 11 neighbors
Mean cross-validation accuracy = 0.9768 for KNeighborsClassifier with 12 neighbors
Mean cross-va

# Decide on a model
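*SVC* with *C*=10 achieved the highest mean cross-validation accuracy among the settings shown above (0.9805). Therefore, we will choose *SVC* with *C*=10.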

In [20]:
# Instantiate the selected classifier and list every hyperparameter it carries.
chosen_model = SVC(C=10)
print('Selected model: ', chosen_model)
print('Parameters')
hyperparams = chosen_model.get_params()
for param in hyperparams:
    val = hyperparams[param]
    print(f'\t{param}: {val}')

Selected model:  SVC(C=10)
Parameters
	C: 10
	break_ties: False
	cache_size: 200
	class_weight: None
	coef0: 0.0
	decision_function_shape: ovr
	degree: 3
	gamma: scale
	kernel: rbf
	max_iter: -1
	probability: False
	random_state: None
	shrinking: True
	tol: 0.001
	verbose: False


# Train and test selected model

In [21]:
%%time
chosen_model = SVC(C=10)
chosen_model.fit(X_train, y_train) # train selected model on ALL training examples
predicted = chosen_model.predict(X_test) # predicted classes for test examples
acc = accuracy_score(y_test, predicted) # accuracy on test samples
print(f'Accuracy on test samples = {acc:0.4f}') # show test accuracy
print("Classification report on test samples:") # for precision, recall, F1-score
print(classification_report(y_test, predicted, digits=4)) # rounded to 4 decimal places

Accuracy on test samples = 0.9845
Classification report on test samples:
              precision    recall  f1-score   support

         0.0     0.9862    0.9881    0.9872       506
         1.0     0.9940    0.9803    0.9871       508
         2.0     0.9840    0.9821    0.9831       502
         3.0     0.9735    0.9876    0.9805       484

    accuracy                         0.9845      2000
   macro avg     0.9844    0.9845    0.9845      2000
weighted avg     0.9846    0.9845    0.9845      2000

CPU times: user 1.08 s, sys: 3.91 ms, total: 1.08 s
Wall time: 1.11 s


In [22]:
# Confusion matrix of test labels vs. predictions, wrapped in a dataframe,
# written to cm.csv and displayed as the cell output.
cm_values = confusion_matrix(y_test, predicted)
cm = pd.DataFrame(cm_values)
cm.to_csv('cm.csv')
cm

Unnamed: 0,0,1,2,3
0,500,1,1,4
1,1,498,5,4
2,2,2,493,5
3,4,0,2,478


# Predict class for unlabeled samples
We shall use our trained model to predict the output class for the unlabeled samples.

In [23]:
# Predict a class for every unlabeled sample with the trained model.
predicted_new = chosen_model.predict(X_new)

# Pair each sample identifier with its predicted class.
hw2q2_prediction = pd.DataFrame({
    'ID': new.ID,        # identifiers for unlabeled samples
    'y': predicted_new,  # predicted classes for unlabeled samples
})

# Save as a CSV file (without the row index) and display the results.
hw2q2_prediction.to_csv('prediction.csv', index=False)
hw2q2_prediction

Unnamed: 0,ID,y
0,ID_001,0.0
1,ID_002,0.0
2,ID_003,0.0
3,ID_004,0.0
4,ID_005,0.0
5,ID_006,0.0
6,ID_007,0.0
7,ID_008,0.0
8,ID_009,0.0
9,ID_010,0.0
