In [1]:
# import standard libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC # "Support vector classifier"
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

In [3]:
# Load the train and test data.
# NOTE(review): the absolute '/content' paths look Colab-specific — confirm
# before running this notebook in another environment.
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

# Separate into labels and data. The label column is selected by name rather
# than by position, so the split does not depend on the CSV's column order and
# a missing 'label' column fails loudly with a KeyError.
# (The distinct classes can be inspected with `y.unique()`.)
y = train_data['label']
X = train_data.drop(columns='label')

print(train_data.shape)
print(X.shape)
print(y.shape)

# NumPy versions of the raw frames, for cells that need plain arrays.
train_data_numpy = train_data.to_numpy()
test_data_numpy = test_data.to_numpy()

(32000, 785)
(32000, 784)
(32000,)


In [4]:
# Dimensionality reduction with PCA.
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 91%).
pca = PCA(0.91).fit(X)

# Project both the training features and the test set with the same fitted PCA.
X_PCA = pca.transform(X)
test_data_PCA = pca.transform(test_data)

print('Raw Data: ', X.shape)
print('After PCA: ', X_PCA.shape)
print(test_data_PCA.shape)

Raw Data:  (32000, 784)
After PCA:  (32000, 95)
(8000, 95)


In [5]:
# submission
# Train the final classifier on the full PCA-reduced training set. C and gamma
# match the best parameters reported by the grid-search cell further down.
model_all = SVC(C=1000, gamma=4.34E-7).fit(X_PCA, y)
test_data_result = model_all.predict(test_data_PCA)

# One zero-based id per prediction. The ids are sized from the predictions
# themselves so the two columns can never disagree in length, and the variable
# is not called `id` so the builtin id() stays usable in the rest of the kernel.
submission_ids = np.arange(len(test_data_result))
test_submission = pd.DataFrame({'id': submission_ids, 'label': test_data_result})

test_submission.to_csv('submission12.csv', index=False)

Below is code I used for testing. This code is not used in the final submission.

In [7]:
# Hold out 30% of the PCA-reduced training set for internal evaluation;
# the fixed random_state keeps the split reproducible across runs.
X_tr, X_test, y_tr, y_test = train_test_split(
    X_PCA,
    y,
    test_size=0.3,
    random_state=0,
)

In [9]:
# Parameter tuning.
# Only C and gamma are searched. The other SVC options that used to be listed
# in this grid (kernel, decision_function_shape, degree, cache_size,
# class_weight) are disabled, so SVC's defaults apply to them.
parameters = {
    'C': [1000],
    'gamma': [4.34e-7, 4.36e-7],
}

# Cross-validated grid search on the development split.
svc = SVC()
clf = GridSearchCV(svc, parameters).fit(X_tr, y_tr)

print("Best parameters set found on development set:")
print(clf.best_params_)

print("Grid scores on development set:")
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    # Report the mean CV score with a +/- two-standard-deviation band.
    print("%0.9f (+/-%0.09f) for %r" % (mean, 2 * std, params))

Best parameters set found on development set:
{'C': 1000, 'gamma': 4.34e-07}
Grid scores on development set:
0.980491071 (+/-0.002566091) for {'C': 1000, 'gamma': 4.34e-07}
0.980491071 (+/-0.002566091) for {'C': 1000, 'gamma': 4.36e-07}


In [13]:
# Internal testing
# Compare 5-fold cross-validated accuracy for different PCA variance targets.
model = SVC(C=1e10, gamma=4.32E-7)
for variance_kept in [0.9, 0.92]:
    # Use trial-local names: rebinding the notebook-wide `pca` / `X_PCA` here
    # would silently change what the split and submission cells see if they
    # are re-run after this cell.
    pca_trial = PCA(variance_kept).fit(X)
    X_trial = pca_trial.transform(X)
    print(variance_kept, X_trial.shape)
    scores = cross_val_score(model, X_trial, y, cv=5)
    print("%0.9f accuracy with a standard deviation of %0.9f" % (scores.mean(), scores.std()))

# Earlier experiments, disabled and kept for reference only.
# model = SVC(C=1e10, gamma=4.2E-7)
# scores = cross_val_score(model, X, y, cv=5)
# print("%0.9f accuracy with a standard deviation of %0.9f" % (scores.mean(), scores.std()))

# model = SVC(C=1e10, gamma=4.2E-7)
# model.fit(X_tr, y_tr)
# result = model.predict(X_test)
# print(accuracy_score(result, y_test))

# numpy_data = np.array(result)
# pd.DataFrame(numpy_data).to_csv('internal_test.csv', index=False)

0.9 (32000, 86)
0.984000000 accuracy with a standard deviation of 0.001707635
0.92 (32000, 105)
0.983843750 accuracy with a standard deviation of 0.001741609
