Reading data

In [None]:
#block 1: Download the CIFAR-100 "python version" archive (cifar-100-python.tar.gz)
# from www.cs.toronto.edu. This is a dataset tarball, not an installation package.
!wget https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz

In [None]:
 #block 2: Extract the downloaded tar archive; the loading code further down
 # reads its files from paths under cifar-100-python/.
 !tar -xf cifar-100-python.tar.gz

In [None]:
 #block 3: Load the CIFAR-100 train/test images, fine labels and label names
import numpy as np
import pandas as pd

def unpickle(file):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    file : str or path-like
        Path of the pickle file to read (opened in binary mode).

    Returns
    -------
    object
        The deserialised object. The file is decoded with
        ``encoding='bytes'``, so 8-bit strings pickled by Python 2 are
        returned as ``bytes`` (which is why callers index the result with
        keys such as ``b'data'``).
    """
    import pickle

    # SECURITY: pickle.load can execute arbitrary code during deserialisation;
    # only use this on archives obtained from a trusted source.
    with open(file, 'rb') as fo:
        # Deliberately not named `dict`, so the builtin type is not shadowed.
        payload = pickle.load(fo, encoding='bytes')
    return payload

# Deserialise each split once and reuse it for both the pixel data and the
# labels, instead of reading and unpickling the same file twice.
train_batch = unpickle("cifar-100-python/train")
test_batch = unpickle("cifar-100-python/test")

# Image data of each split, stored under the b'data' key.
train_data = np.array(train_batch[b'data'])
test_data = np.array(test_batch[b'data'])

# Fine-grained class label of each image, stored under b'fine_labels'.
train_label = np.array(train_batch[b'fine_labels'])
test_label = np.array(test_batch[b'fine_labels'])

# np.array() copied what is needed; drop the raw batches to free memory.
del train_batch, test_batch

# Names of the fine-grained classes, read from the meta file.
labels_names = unpickle("cifar-100-python/meta")[b'fine_label_names']

Pre-processing

In [None]:
#block 4 : Min-max style rescaling of the training and test data.
# Every value is divided by 255 (assumes raw values lie in 0..255 - TODO confirm).
train_data_mM, test_data_mM = (
    split / 255 for split in (train_data, test_data)
)

In [None]:
#block 5 : PCA preprocessing of training and test data
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Fit an unrestricted PCA first, only to inspect how the explained variance
# accumulates over the components.
pca = PCA().fit(train_data_mM)
cumsum = pca.explained_variance_ratio_.cumsum()

# Cumulative explained variance vs. number of dimensions (first 45 shown)
plt.figure(figsize=(6, 4))
plt.plot(cumsum, linewidth=3)
plt.xlim(0, 45)
plt.ylim(0, 1)
plt.xlabel("dimensions")
plt.ylabel("explained variance")
plt.grid(True)
plt.show()

# n_components = 0.80 keeps as many components as are needed to explain 80%
# of the variance (25 dimensions according to the author's original note).
pca = PCA(n_components=0.80).fit(train_data_mM)

# Project both splits onto the components learned from the training data
train_data_pca, test_data_pca = (
    pca.transform(split) for split in (train_data_mM, test_data_mM)
)

Models

In [None]:
#Block 6
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Hold out a stratified part of the (PCA-reduced) training data; the models
# below are fitted on X_train and evaluated on this held-out X_test.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_pca,
    train_label,
    stratify=train_label,
    random_state=42,
)

print("Size of train and train label set: {}, {}.".format(X_train.shape,y_train.shape))
print("Size of test and test label set: {}, {}.".format(X_test.shape,y_test.shape))

# Linear-kernel SVM: fit on the training split, predict the held-out split
lin_svm = SVC(kernel="linear", decision_function_shape="ovr")
y_pred_lin = lin_svm.fit(X_train, y_train).predict(X_test)

# Degree-2 polynomial-kernel SVM, same procedure
poly_svm = SVC(kernel="poly", degree=2, decision_function_shape="ovr")
y_pred_poly = poly_svm.fit(X_train, y_train).predict(X_test)

print("Accuracy on test set using linear svm: {:.3f}".format(accuracy_score(y_test,y_pred_lin)))
print("Accuracy on test set quadratic svm: {:.3f}".format(accuracy_score(y_test,y_pred_poly)))

# 1-nearest-neighbour classifier, same procedure
knn = KNeighborsClassifier(n_neighbors=1)
y_pred = knn.fit(X_train, y_train).predict(X_test)

print("Accuracy on test set: {:.2f}".format(accuracy_score(y_test,y_pred)))

In [None]:
#Block 7
### 10-fold cross-validated grid search for the SVM and KNN hyper-parameters
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


def run_grid_search(estimator, param_grid, fit_X, fit_y, eval_X, eval_y, cv=10):
    """Run a cross-validated grid search and print a summary report.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Unfitted model whose hyper-parameters are searched.
    param_grid : dict
        Mapping of parameter name to the list of values to try.
    fit_X, fit_y : array-like
        Data and labels the search is cross-validated and fitted on.
    eval_X, eval_y : array-like
        Data and labels used only for the reported "Test set score"; they
        take no part in selecting the parameters.
    cv : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    (GridSearchCV, pandas.DataFrame)
        The fitted search object and its ``cv_results_`` as a DataFrame.
    """
    print("Parameter grid:\n{}".format(param_grid))

    search = GridSearchCV(estimator, param_grid, cv=cv,
                          return_train_score=True, n_jobs=-1)
    search.fit(fit_X, fit_y)

    # Best parameters and scores
    print("Test set score: {:.2f}".format(search.score(eval_X, eval_y)))
    print("Best parameters: {}".format(search.best_params_))
    print("Best cross-validation score: {:.2f}".format(search.best_score_))
    print("Best estimator:\n{}".format(search.best_estimator_))

    # Mean cross-validation score of every parameter combination
    results_frame = pd.DataFrame(search.cv_results_)
    print(results_frame.params)
    print(results_frame.mean_test_score)
    return search, results_frame


### SVM with polynomial kernel: search over the degree
param_grid = {'degree': [2, 3, 4]}
svm = SVC(kernel='poly')
grid_search_svm, result_pretty = run_grid_search(
    svm, param_grid, train_data_pca, train_label, test_data_pca, test_label)
# Kept under its own name so the KNN search below does not overwrite it.
grid_search = grid_search_svm
resultSVM = grid_search_svm.cv_results_

### KNN: search over the number of neighbours and the Minkowski power p
param_grid = {'n_neighbors': [3, 5, 7], 'p': [1, 2]}
grid_search_knn, result_pretty = run_grid_search(
    KNeighborsClassifier(), param_grid,
    train_data_pca, train_label, test_data_pca, test_label)
# `grid_search` ends up bound to the KNN search, as it always has.
grid_search = grid_search_knn
resultKNN = grid_search_knn.cv_results_

In [None]:
#Block 8
### Refit both models on the full PCA-reduced training set with fixed
### hyper-parameters and report their metrics on the test set
from sklearn.metrics import precision_score, recall_score

### SVM: polynomial kernel with degree 2
poly_svm = SVC(kernel="poly", degree=2)
label_pred_poly = poly_svm.fit(train_data_pca, train_label).predict(test_data_pca)

### SVM accuracy
print("Accuracy on test set: {:.3f}".format(accuracy_score(test_label, label_pred_poly)))

### SVM precision, macro- and micro-averaged
print("Precision of macro average on test set: {:.3f}".format(precision_score(test_label, label_pred_poly, average='macro')))
print("Precision of micro average on test set: {:.3f}".format(precision_score(test_label, label_pred_poly, average='micro')))

### SVM recall, micro-averaged
print("Recall on test set: {:.3f}".format(recall_score(test_label, label_pred_poly, average='micro')))


### KNN: 7 neighbours, p=2
knn = KNeighborsClassifier(n_neighbors=7, p=2)
label_pred = knn.fit(train_data_pca, train_label).predict(test_data_pca)

### KNN accuracy
print("Accuracy on test set: {:.3f}".format(accuracy_score(test_label, label_pred)))

### KNN precision, macro- and micro-averaged
print("Precision of macro average on test set: {:.3f}".format(precision_score(test_label, label_pred, average='macro')))
print("Precision of micro average on test set: {:.3f}".format(precision_score(test_label, label_pred, average='micro')))