In [93]:
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

In [55]:
def read_data(filename, has_y):
    """Load a whitespace-separated integer matrix from a text file.

    The first line of the file (the list of words) is skipped. Every
    remaining non-blank line is split on whitespace, each token is parsed
    with ``int``, and the rows are stacked into a ``float64`` array.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    has_y : bool
        If true, the first column is returned separately as ``y`` and the
        remaining columns as ``X``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(X, y)`` when ``has_y`` is true.
    numpy.ndarray
        The full matrix when ``has_y`` is false.

    Raises
    ------
    ValueError
        If a token cannot be parsed as an int.
    """
    with open(filename, "r") as f:
        # Skip the first line, which is the list of words
        next(f, None)
        # split() with no argument splits on any run of whitespace, so a
        # trailing space or newline no longer yields an unparsable token;
        # blank lines (e.g. at the end of the file) are ignored.
        rows = [[int(tok) for tok in line.split()] for line in f if line.strip()]

    data = np.array(rows, dtype="float64")

    if has_y:
        # y is the first column while X is everything else
        X = data[:, 1:]
        y = data[:, 0]

        return X, y
    else:
        return data

# Fixed seed so the shuffle (and therefore the 10% subset and every score
# computed from it) is reproducible across kernel restarts.
SHUFFLE_SEED = 0

X_train, y_train = read_data("training_data.txt", has_y=True)
X_test = read_data("test_data.txt", has_y=False)

# Fail early if the two files disagree on the number of feature columns,
# rather than after a long model fit.
assert X_train.shape[1] == X_test.shape[1], "train/test feature counts differ"

# Shuffle the order of the training data just in case
X_train, y_train = sklearn.utils.shuffle(X_train, y_train, random_state=SHUFFLE_SEED)
# Make a smaller dataset with only 10% the size
X_train10 = X_train[:len(X_train)//10]
y_train10 = y_train[:len(y_train)//10]

In [47]:
# Preliminary trial on full dataset
# svc = SVC()
# svc.fit(X_train, y_train)
# svc.score(X_train, y_train)
# >>> 0.84655
# svc.support_.shape
# >>> (1285,)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [59]:
# Next, experiment with the workflow on the 10% subset:
# vary data normalisation, SVM parameters and regularisation,
# and check the variants with cross-validation.

# Baseline: default SVC on the unscaled subset. The printed number is the
# score on the same data the model was fitted on.
svc = SVC().fit(X_train10, y_train10)
print(svc.score(X_train10, y_train10))

0.786


In [78]:
# Standardise the 10% subset, then refit the same SVC on the scaled features.
scaler10 = preprocessing.StandardScaler()
X_train10_scaled = scaler10.fit_transform(X_train10)

# As before, the score is computed on the data used for fitting.
svc.fit(X_train10_scaled, y_train10)
print(svc.score(X_train10_scaled, y_train10))



0.98


In [135]:
# Scaling lives inside the pipeline so it is re-fitted on each CV training fold.
pipe = make_pipeline(preprocessing.StandardScaler(), SVC())

# 3 x 3 grid over the SVC's C and gamma, evaluated with 3-fold cross-validation.
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "svc__C": [2.0, 4.0, 6.0],
        "svc__gamma": [0.0002, 0.0004, 0.0006],
    },
    cv=3,
    n_jobs=4,
    return_train_score=True,
    verbose=1,
)

In [136]:
# Display the (not yet fitted) search object to confirm the pipeline steps and parameter grid
grid

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'svc__C': [2.0, 4.0, 6.0], 'svc__gamma': [0.0002, 0.0004, 0.0006]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [137]:
# Run the search on the full training set: 9 candidates x 3 folds = 27 fits.
# NOTE: expensive — the recorded run reports roughly an hour with n_jobs=4.
grid.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed: 62.5min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'svc__C': [2.0, 4.0, 6.0], 'svc__gamma': [0.0002, 0.0004, 0.0006]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [138]:
def grid_summary(grid):
    """Print the best parameters, best CV score and a per-candidate score table.

    Parameters
    ----------
    grid : object
        A fitted search object exposing ``best_params_``, ``best_score_`` and
        ``cv_results_`` (with ``'mean_test_score'``, ``'std_test_score'`` and
        ``'params'`` entries).

    Returns
    -------
    None
        Everything is written to standard output. Each table row shows the
        mean test score and twice its standard deviation across folds.
    """
    print("Best parameters set found on development set:\n")
    print(grid.best_params_)
    print("Best CV score: ", grid.best_score_)
    print()
    print("Grid scores on development set:\n")

    results = grid.cv_results_
    rows = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for avg, spread, candidate in rows:
        # The +/- figure is two standard deviations wide
        line = "%0.3f (+/-%0.03f) for %r" % (avg, spread * 2, candidate)
        print(line)
        
# Summarise the fitted search: best parameters plus mean test score (+/- 2 std) per candidate
grid_summary(grid)

Best parameters set found on development set:

{'svc__C': 4.0, 'svc__gamma': 0.0002}
Best CV score:  0.85125

Grid scores on development set:

0.850 (+/-0.002) for {'svc__C': 2.0, 'svc__gamma': 0.0002}
0.851 (+/-0.003) for {'svc__C': 2.0, 'svc__gamma': 0.0004}
0.851 (+/-0.006) for {'svc__C': 2.0, 'svc__gamma': 0.0006}
0.851 (+/-0.006) for {'svc__C': 4.0, 'svc__gamma': 0.0002}
0.850 (+/-0.006) for {'svc__C': 4.0, 'svc__gamma': 0.0004}
0.848 (+/-0.007) for {'svc__C': 4.0, 'svc__gamma': 0.0006}
0.850 (+/-0.009) for {'svc__C': 6.0, 'svc__gamma': 0.0002}
0.849 (+/-0.007) for {'svc__C': 6.0, 'svc__gamma': 0.0004}
0.845 (+/-0.007) for {'svc__C': 6.0, 'svc__gamma': 0.0006}


In [139]:
# Score of the refit best estimator on the training data itself.
# NOTE(review): this is measured on data the model was fitted on, so it is
# optimistic; grid.best_score_ is the cross-validated figure.
grid.score(X_train, y_train)



0.90955

In [140]:
# Inspect the raw per-candidate results (fit/score times, train and test scores, parameter values).
# NOTE(review): this dumps the whole dict; a tabular view would be easier to read.
grid.cv_results_

{'mean_fit_time': array([219.29951612, 215.95452372, 274.43245848, 189.61298402,
        220.95706391, 431.43186021, 186.47770079, 282.20888503,
        433.93683712]),
 'mean_score_time': array([90.66649564, 87.71958105, 92.1079669 , 78.15310884, 82.16637222,
        90.33819858, 76.05771438, 80.8949999 , 67.11564112]),
 'mean_test_score': array([0.8503 , 0.85095, 0.85055, 0.85125, 0.8498 , 0.84785, 0.84965,
        0.84855, 0.84505]),
 'mean_train_score': array([0.899025  , 0.92977502, 0.950275  , 0.91465001, 0.95005   ,
        0.97017499, 0.92455004, 0.96192497, 0.97932499]),
 'param_svc__C': masked_array(data=[2.0, 2.0, 2.0, 4.0, 4.0, 4.0, 6.0, 6.0, 6.0],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_svc__gamma': masked_array(data=[0.0002, 0.0004, 0.0006, 0.0002, 0.0004, 0.0006, 0.0002,
                    0.0004, 0.0006],
              mask=[False, False, False, F

In [141]:
# Predict labels for the test set with the fitted search object
# (refit=True, so the best parameter combination refit on X_train is used).
pred = grid.predict(X_test)



In [142]:
# Write the predictions as a two-column CSV; ids are 1-based row numbers.
with open("out2.txt", "w") as out_file:
    out_file.write("Id,Prediction\n")
    for row_id, label in enumerate(pred, start=1):
        out_file.write("{0},{1}\n".format(row_id, label))