In [None]:
import numpy as np
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score

In [None]:
import pickle

In [None]:
# Global random seed; passed as random_state to the SVC estimators below.
seed = 100

### Load CIFAR-10 and CIFAR-100

In [None]:
# Colab only: mount Google Drive at /content/drive so the dataset files can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def unpickle(file):
    """Load a pickled CIFAR batch file and return the stored object.

    Args:
        file: Path of the pickle file to read.

    Returns:
        The unpickled object. ``encoding='bytes'`` is used, so 8-bit strings
        written by Python 2 are returned as ``bytes`` (e.g. the ``b'data'``
        key used throughout this notebook).
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only call this on dataset files obtained from a trusted source.
    with open(file, 'rb') as fo:
        return pickle.load(fo, encoding='bytes')

In [None]:
# Dataset root directory. The default is the author's Colab Drive folder; set
# the CIFAR_DATA_DIR environment variable to run the notebook somewhere else.
import os
colab = os.environ.get(
    'CIFAR_DATA_DIR',
    '/content/drive/Othercomputers/내 iMac/Meta_Learning/코스웍/2022 고급기계학습주제 (김광인)/과제/CW1/data',
)
# Files that provide the training-side samples.
PATH1 = os.path.join(colab, 'cifar-10-batches-py/data_batch_1')
PATH2 = os.path.join(colab, 'cifar-100-python/test')

In [None]:
# Raw batches that the training subsets are later sampled from.
# cifar10: b'data', b'labels'
# cifar100: b'data', b'fine_labels'
# NOTE(review): cifar100 is loaded from the CIFAR-100 'test' file (PATH2) — confirm this is intentional.
cifar10 = unpickle(PATH1)
cifar100 = unpickle(PATH2)

In [None]:
# Files that provide the held-out test samples: a different CIFAR-10 batch
# and the CIFAR-100 'train' file.
PATH1_t = os.path.join(colab, 'cifar-10-batches-py/data_batch_2')
PATH2_t = os.path.join(colab, 'cifar-100-python/train')

In [None]:
# Raw batches that the test subsets are later sampled from.
# cifar10: b'data', b'labels'
# cifar100: b'data', b'fine_labels'
# NOTE(review): test_cifar100 is loaded from the CIFAR-100 'train' file (PATH2_t) — confirm this is intentional.
test_cifar10 = unpickle(PATH1_t)
test_cifar100 = unpickle(PATH2_t)

--------------

### Normalise image data

In [None]:
def dataset_generator(data_dict, size, cifar_type, seed=100):
    """Draw a reproducible random subset of ``size`` samples from a CIFAR batch.

    Args:
        data_dict: Unpickled CIFAR batch. Must contain ``b'data'`` and the
            label key that belongs to ``cifar_type``.
        size: Number of samples to draw; must not exceed the number of labels.
        cifar_type: ``'cifar10'`` (labels under ``b'labels'``) or
            ``'cifar100'`` (labels under ``b'fine_labels'``).
        seed: Value forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        dict with the keys ``'data'`` and ``'labels'`` holding the subset.

    Raises:
        ValueError: If ``cifar_type`` is not a supported dataset name.
        AssertionError: If ``size`` is larger than the number of samples.
    """
    label_keys = {'cifar10': b'labels', 'cifar100': b'fine_labels'}
    if cifar_type not in label_keys:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError(
            f"cifar_type must be one of {sorted(label_keys)}, got {cifar_type!r}"
        )
    labels = label_keys[cifar_type]

    n_samples = len(data_dict[labels])
    assert size <= n_samples, f"size={size} exceeds the {n_samples} available samples"
    # NOTE(review): size == n_samples passes the check above, but
    # train_test_split is expected to reject a split with no remainder — confirm.

    # An integer test_size requests exactly `size` samples and avoids the
    # rounding risk of converting to a float ratio and back.
    _, _X, _, _y = train_test_split(data_dict[b'data'],
                                    data_dict[labels],
                                    test_size=size,
                                    random_state=seed)

    return {'data': _X, 'labels': _y}

In [None]:
# Scale integer pixel values to [0, 1]. The dtype guard makes the cell
# idempotent: once the data is floating point, re-running it does not divide
# by 255 a second time.
for _batch in (cifar10, cifar100):
    if np.issubdtype(_batch[b'data'].dtype, np.integer):
        _batch[b'data'] = _batch[b'data'] / 255

In [None]:
# Scale integer pixel values to [0, 1]. The dtype guard makes the cell
# idempotent: once the data is floating point, re-running it does not divide
# by 255 a second time.
for _batch in (test_cifar10, test_cifar100):
    if np.issubdtype(_batch[b'data'].dtype, np.integer):
        _batch[b'data'] = _batch[b'data'] / 255

In [None]:
# Sanity check: print the first image vector of every loaded batch after scaling.
for heading, batches in (('cifar10', (cifar10, test_cifar10)),
                         ('cifar100', (cifar100, test_cifar100))):
    print(heading)
    for batch in batches:
        print(batch[b'data'][0])

cifar10
[0.23137255 0.16862745 0.19607843 ... 0.54901961 0.32941176 0.28235294]
[0.1372549  0.10588235 0.09803922 ... 0.6627451  0.65882353 0.65882353]
cifar100
[0.78039216 0.76862745 0.76470588 ... 0.84313725 0.82745098 0.71764706]
[1.         1.         1.         ... 0.03921569 0.23137255 0.30980392]


In [None]:
# Fixed-size random subsets per dataset: 3000 samples for training /
# cross-validation and 300 for testing (dataset_generator's default seed is used).
cifar10_train = dataset_generator(cifar10, 3000, 'cifar10')
cifar10_test = dataset_generator(test_cifar10, 300, 'cifar10')
cifar100_train = dataset_generator(cifar100, 3000, 'cifar100')
cifar100_test = dataset_generator(test_cifar100, 300, 'cifar100')

---------------------

### Linear SVM

In [None]:
# Baseline linear-kernel SVM with C=1. It is not fitted here; it is handed to
# cross_validate in the following cells.
linear_clf = svm.SVC(kernel='linear', C=1, random_state=seed)
# linear_clf.fit(X_train, y_train)

5-fold cross validation<br>



*   linear
*   cifar 10, 100
*   2400 for training / 600 for validation in each fold

In [None]:
# cifar 10 - C=1
# 5-fold cross-validation of the baseline linear SVM on the CIFAR-10 subset;
# the per-fold timings and accuracies are collected in a DataFrame.
cifar10_fold_results = cross_validate(
    linear_clf,
    cifar10_train['data'],
    cifar10_train['labels'],
    scoring=['accuracy'],
    cv=5,
)
df = pd.DataFrame(cifar10_fold_results)

print(df)

    fit_time  score_time  test_accuracy
0  13.160133    5.043334       0.305000
1  11.154256    4.152277       0.298333
2  11.118886    4.164580       0.286667
3  11.032084    4.147673       0.316667
4  10.892714    4.163548       0.315000


In [None]:
# cifar 100 - C=1
# 5-fold cross-validation of the baseline linear SVM on the CIFAR-100 subset;
# the per-fold timings and accuracies are collected in a DataFrame.
cifar100_fold_results = cross_validate(
    linear_clf,
    cifar100_train['data'],
    cifar100_train['labels'],
    scoring=['accuracy'],
    cv=5,
)
df2 = pd.DataFrame(cifar100_fold_results)

print(df2)

    fit_time  score_time  test_accuracy
0  15.126377    4.832221       0.125000
1  14.850858    4.862410       0.123333
2  12.524088    5.427286       0.120000
3  12.429066    4.857528       0.118333
4  12.044189    5.009994       0.143333


### Hyperparameter Search

5-fold Cross-Validation

*   linear / nonlinear - rbf
*   cifar 10
*   3000 for cross-validation / 300 for test


In [None]:
# Search space: the linear kernel over C, the RBF kernel over C x gamma.
c_grid = [1, 10, 100, 1000]
param_grid = [
    dict(C=list(c_grid), kernel=['linear']),
    dict(C=list(c_grid), gamma=[0.001, 0.0001], kernel=['rbf']),
]

# 5-fold grid search on the CIFAR-10 training subset.
svc = svm.SVC()
clf = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5)
clf.fit(cifar10_train['data'], cifar10_train['labels'])

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}])

In [None]:
# Keep only the mean/std cross-validation score and the parameter setting of each grid point.
df_cifar10_cv = pd.DataFrame(clf.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
df_cifar10_cv

Unnamed: 0,mean_test_score,std_test_score,params
0,0.304333,0.011086,"{'C': 1, 'kernel': 'linear'}"
1,0.303667,0.010975,"{'C': 10, 'kernel': 'linear'}"
2,0.303667,0.010975,"{'C': 100, 'kernel': 'linear'}"
3,0.303667,0.010975,"{'C': 1000, 'kernel': 'linear'}"
4,0.366,0.017082,"{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}"
5,0.264333,0.013808,"{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}"
6,0.395333,0.012083,"{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}"
7,0.365,0.015019,"{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}"
8,0.381667,0.013416,"{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}"
9,0.375,0.013703,"{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}"


In [None]:
# Best parameter setting found by the CIFAR-10 grid search and its score.
print(f"The Best Parameters: {clf.best_params_}")
print(f"The Best Accuracy: {clf.best_score_}")

The Best Parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
The Best Accuracy: 0.3953333333333333


In [None]:
# Predict labels for the held-out CIFAR-10 test subset with the fitted grid-search object.
y_pred = clf.predict(cifar10_test['data'])

In [None]:
# Per-class precision / recall / F1 on the CIFAR-10 test subset.
print(metrics.classification_report(cifar10_test['labels'], y_pred))

              precision    recall  f1-score   support

           0       0.53      0.53      0.53        43
           1       0.32      0.28      0.30        25
           2       0.24      0.32      0.27        25
           3       0.30      0.33      0.31        24
           4       0.30      0.38      0.33        32
           5       0.39      0.27      0.32        26
           6       0.56      0.50      0.53        36
           7       0.33      0.23      0.27        30
           8       0.47      0.49      0.48        37
           9       0.40      0.45      0.43        22

    accuracy                           0.39       300
   macro avg       0.38      0.38      0.38       300
weighted avg       0.40      0.39      0.39       300



In [None]:
# save the model to disk
# The context manager closes the file handle even if pickling fails.
filename = 'cifar10_cv.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# y_pred = loaded_model.best_estimator_.predict(cifar10_test['data'])

5-fold Cross-Validation

*   linear / nonlinear - rbf
*   cifar 100
*   3000 for cross-validation / 300 for test

In [None]:
# Search space: the linear kernel over C, the RBF kernel over C x gamma.
c_grid = [1, 10, 100, 1000]
param_grid = [
    dict(C=list(c_grid), kernel=['linear']),
    dict(C=list(c_grid), gamma=[0.001, 0.0001], kernel=['rbf']),
]

# 5-fold grid search on the CIFAR-100 training subset.
svc2 = svm.SVC()
clf2 = GridSearchCV(estimator=svc2, param_grid=param_grid, cv=5)
clf2.fit(cifar100_train['data'], cifar100_train['labels'])

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}])

In [None]:
# Keep only the mean/std cross-validation score and the parameter setting of each grid point.
df_cifar100_cv = pd.DataFrame(clf2.cv_results_)[['mean_test_score', 'std_test_score', 'params']]
df_cifar100_cv

Unnamed: 0,mean_test_score,std_test_score,params
0,0.126,0.008981,"{'C': 1, 'kernel': 'linear'}"
1,0.126,0.008981,"{'C': 10, 'kernel': 'linear'}"
2,0.126,0.008981,"{'C': 100, 'kernel': 'linear'}"
3,0.126,0.008981,"{'C': 1000, 'kernel': 'linear'}"
4,0.09,0.009006,"{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}"
5,0.030667,0.0062,"{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}"
6,0.139667,0.006182,"{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}"
7,0.096,0.00786,"{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}"
8,0.133333,0.007226,"{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}"
9,0.125333,0.004643,"{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}"


In [None]:
# Best parameter setting found by the CIFAR-100 grid search and its score.
print(f"The Best Parameters: {clf2.best_params_}")
print(f"The Best Accuracy: {clf2.best_score_}")

The Best Parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
The Best Accuracy: 0.13966666666666666


In [None]:
# Predict labels for the held-out CIFAR-100 test subset with the fitted grid-search object.
y_pred2 = clf2.predict(cifar100_test['data'])

In [None]:
# With only 300 test images spread over 100 classes, some classes have no true
# or no predicted samples, so their precision/recall is undefined. Report those
# entries as 0 explicitly instead of triggering a warning for each of them.
print(metrics.classification_report(cifar100_test['labels'], y_pred2, zero_division=0))

              precision    recall  f1-score   support

           0       0.40      0.29      0.33         7
           1       0.33      0.33      0.33         3
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         5
           6       0.33      0.33      0.33         3
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         3
          15       0.33      0.50      0.40         2
          16       0.25      0.25      0.25         4
          17       0.25    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# save the model to disk
# The context manager closes the file handle even if pickling fails.
filename = 'cifar100_cv.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf2, model_file)

Train / Test validation

In [None]:
# Same search space as the grid searches: linear over C, RBF over C x gamma.
c_grid = [1, 10, 100, 1000]
param_grid = [
    dict(C=list(c_grid), kernel=['linear']),
    dict(C=list(c_grid), gamma=[0.001, 0.0001], kernel=['rbf']),
]

In [None]:
# C values of the linear-kernel grid entry; the manual train/test loops iterate over them.
C = param_grid[0]['C']

cifar10

In [None]:
# Train one linear SVM per C on the CIFAR-10 training subset and record its
# accuracy on the held-out test subset.
acc = []
for c in C:
    linear_clf = svm.SVC(kernel='linear', C=c, random_state=seed).fit(
        cifar10_train['data'], cifar10_train['labels'])
    y_pred = linear_clf.predict(cifar10_test['data'])
    a = accuracy_score(cifar10_test['labels'], y_pred)
    acc += [a]

1000


In [None]:
# Test accuracies of the linear SVMs, listed in the order of C.
print(acc)

[0.32, 0.31666666666666665, 0.31666666666666665, 0.31666666666666665]


In [None]:
# gamma values of the RBF grid entry.
Gamma = param_grid[1]['gamma']

In [None]:
# Train one RBF SVM per (gamma, C) pair on the CIFAR-10 training subset and
# record its test accuracy; gamma is the outer loop, C the inner loop.
# The estimator is stored under the name `linear_clf` although its kernel is RBF.
acc = []
for g in Gamma:
    for c in C:
        linear_clf = svm.SVC(kernel='rbf', C=c, gamma=g, random_state=seed).fit(
            cifar10_train['data'], cifar10_train['labels'])
        y_pred = linear_clf.predict(cifar10_test['data'])
        a = accuracy_score(cifar10_test['labels'], y_pred)
        acc += [a]
print(acc)

[0.39666666666666667, 0.3933333333333333, 0.4066666666666667, 0.4033333333333333, 0.29333333333333333, 0.3933333333333333, 0.37333333333333335, 0.38]


Best model is {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'} for cifar10

cifar 100

In [None]:
# Train one linear SVM per C on the CIFAR-100 training subset and record its
# accuracy on the held-out test subset.
acc100 = []
for c in C:
    linear_clf = svm.SVC(kernel='linear', C=c, random_state=seed).fit(
        cifar100_train['data'], cifar100_train['labels'])
    y_pred = linear_clf.predict(cifar100_test['data'])
    a = accuracy_score(cifar100_test['labels'], y_pred)
    acc100 += [a]
print(acc100)

[0.11666666666666667, 0.11666666666666667, 0.11666666666666667, 0.11666666666666667]


In [None]:
# Print the (gamma, C) pairs in nested-loop order: gamma outer, C inner.
for g in Gamma:
    for c in C:
        print(g, c)

0.001 1
0.001 10
0.001 100
0.001 1000
0.0001 1
0.0001 10
0.0001 100
0.0001 1000


In [None]:
# Train one RBF SVM per (gamma, C) pair on the CIFAR-100 training subset and
# record its test accuracy; gamma is the outer loop, C the inner loop.
# The estimator is stored under the name `linear_clf` although its kernel is RBF.
acc100 = []
for g in Gamma:
    for c in C:
        linear_clf = svm.SVC(kernel='rbf', C=c, gamma=g, random_state=seed).fit(
            cifar100_train['data'], cifar100_train['labels'])
        y_pred = linear_clf.predict(cifar100_test['data'])
        a = accuracy_score(cifar100_test['labels'], y_pred)
        acc100 += [a]
print(acc100)

[0.11, 0.14, 0.13333333333333333, 0.13666666666666666, 0.04, 0.09666666666666666, 0.14333333333333334, 0.11333333333333333]


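Best model by test accuracy is {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'} for cifar100 (0.1433); the cross-validation choice {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'} reaches 0.14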

### Check run time

In [None]:
# cifar 10 training time - cv best
# perf_counter is used for interval timing: it is monotonic and high resolution,
# unlike time.time, which follows the adjustable system clock.
import time
start = time.perf_counter()

clf = svm.SVC(kernel='rbf', C=10, gamma=0.001, random_state=seed)
clf.fit(cifar10_train['data'], cifar10_train['labels'])

end = time.perf_counter()
print("Run time [s]: ",end-start)

Run time [s]:  17.560858726501465


In [None]:
# cifar 10 inference time - cv best
# Time `clf`, the model fitted in the preceding cell. The original timed
# `linear_clf`, which at this point is the last model of the CIFAR-100 grid loop.
import time
start = time.perf_counter()

y_pred = clf.predict(cifar10_test['data'])

end = time.perf_counter()
print("Run time [s]: ",end-start)

Run time [s]:  4.858469009399414


In [None]:
# cifar 10 training time - train/test val best
import time
start = time.perf_counter()

clf = svm.SVC(kernel='rbf', C=100, gamma=0.001, random_state=seed)
clf.fit(cifar10_train['data'], cifar10_train['labels'])

end = time.perf_counter()
print("Run time [s]: ",end-start)

# cifar 10 inference time
# Time `clf`, the model fitted just above. The original timed `linear_clf`,
# which is the last model of the CIFAR-100 grid loop.
start = time.perf_counter()

y_pred = clf.predict(cifar10_test['data'])

end = time.perf_counter()
print("Run time [s]: ",end-start)

Run time [s]:  16.97023606300354
Run time [s]:  4.0259339809417725


In [None]:
# cifar 100 training time
# perf_counter is used for interval timing: it is monotonic and high resolution,
# unlike time.time, which follows the adjustable system clock.
import time
start = time.perf_counter()

clf = svm.SVC(kernel='rbf', C=10, gamma=0.001, random_state=seed)
clf.fit(cifar100_train['data'], cifar100_train['labels'])

end = time.perf_counter()
print("Run time [s]: ",end-start)

Run time [s]:  22.20487689971924


In [None]:
# cifar 100 inference time
# Time `clf`, the model fitted in the preceding cell. The original timed
# `linear_clf`, the last model of the grid loop (C=1000, gamma=0.0001).
import time
start = time.perf_counter()

y_pred = clf.predict(cifar100_test['data'])

end = time.perf_counter()
print("Run time [s]: ",end-start)

Run time [s]:  5.466325283050537


### Varying training set

In [None]:
# Smaller CIFAR-10 training subsets (1000 and 2000 samples) for the training-set-size comparison.
cifar10_1000 = dataset_generator(cifar10, 1000, 'cifar10')
cifar10_2000 = dataset_generator(cifar10, 2000, 'cifar10')

In [None]:
# RBF SVM (C=10, gamma=0.001) trained on 1000 CIFAR-10 samples; the cell
# displays its accuracy on the 300-sample test subset.
rbf_params = dict(kernel='rbf', C=10, gamma=0.001, random_state=seed)
clf = svm.SVC(**rbf_params).fit(cifar10_1000['data'], cifar10_1000['labels'])
y_pred = clf.predict(cifar10_test['data'])
accuracy_score(y_true=cifar10_test['labels'], y_pred=y_pred)

0.36666666666666664

In [None]:
# RBF SVM (C=10, gamma=0.001) trained on 2000 CIFAR-10 samples; the cell
# displays its accuracy on the 300-sample test subset.
rbf_params = dict(kernel='rbf', C=10, gamma=0.001, random_state=seed)
clf = svm.SVC(**rbf_params).fit(cifar10_2000['data'], cifar10_2000['labels'])
y_pred = clf.predict(cifar10_test['data'])
accuracy_score(y_true=cifar10_test['labels'], y_pred=y_pred)

0.3933333333333333

In [None]:
# Smaller CIFAR-100 training subsets (1000 and 2000 samples) for the training-set-size comparison.
cifar100_1000 = dataset_generator(cifar100, 1000, 'cifar100')
cifar100_2000 = dataset_generator(cifar100, 2000, 'cifar100')

In [None]:
# RBF SVM (C=10, gamma=0.001) trained on 1000 CIFAR-100 samples; the cell
# displays its accuracy on the 300-sample test subset.
rbf_params = dict(kernel='rbf', C=10, gamma=0.001, random_state=seed)
clf = svm.SVC(**rbf_params).fit(cifar100_1000['data'], cifar100_1000['labels'])
y_pred = clf.predict(cifar100_test['data'])
accuracy_score(y_true=cifar100_test['labels'], y_pred=y_pred)

0.10666666666666667

In [None]:
# RBF SVM (C=10, gamma=0.001) trained on 2000 CIFAR-100 samples; the cell
# displays its accuracy on the 300-sample test subset.
rbf_params = dict(kernel='rbf', C=10, gamma=0.001, random_state=seed)
clf = svm.SVC(**rbf_params).fit(cifar100_2000['data'], cifar100_2000['labels'])
y_pred = clf.predict(cifar100_test['data'])
accuracy_score(y_true=cifar100_test['labels'], y_pred=y_pred)

0.14