In [78]:
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

In [79]:
def read_data(filename, has_y):
    """Load a whitespace-separated integer matrix from a text file.

    The first line of the file is a header (the list of words) and is
    skipped. Every remaining non-blank line is parsed as one row of integers.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    has_y : bool
        When true, the first column is treated as the label vector and the
        remaining columns as the feature matrix.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        Returned when ``has_y`` is true; both arrays have dtype float64.
    data : numpy.ndarray
        Returned when ``has_y`` is false; the full float64 matrix.

    Raises
    ------
    ValueError
        If a token on a data line cannot be parsed as an int.
    """
    with open(filename, "r") as f:
        # Skip the header line which is the list of words
        next(f, None)
        # split() with no argument splits on any run of whitespace, so tabs,
        # repeated spaces and trailing spaces do not yield empty tokens;
        # blank lines (e.g. a trailing newline at end of file) are ignored.
        rows = [[int(token) for token in line.split()]
                for line in f if line.strip()]

    data = np.array(rows, dtype="float64")

    if has_y:
        # y is the first column while X is everything else
        X = data[:, 1:]
        y = data[:, 0]

        return X, y
    else:
        return data
    
def grid_summary(grid):
    """Print a text report for a fitted grid-search object.

    Reads ``best_params_``, ``best_score_`` and ``cv_results_`` from ``grid``
    and prints the best parameter set, the best CV score, and then one line
    per candidate with its mean test score and twice its standard deviation.
    """
    print("Best parameters set found on development set:\n")
    print(grid.best_params_)
    print("Best CV score: ", grid.best_score_)
    print()
    print("Grid scores on development set:\n")

    results = grid.cv_results_
    per_candidate = zip(results['mean_test_score'],
                        results['std_test_score'],
                        results['params'])
    for avg_score, score_std, candidate in per_candidate:
        # The reported spread is two standard deviations
        report_line = "%0.3f (+/-%0.03f) for %r" % (avg_score, score_std * 2, candidate)
        print(report_line)

# Fixed seed so the shuffle below, and therefore the 10% subsample taken from
# it, is identical on every run of the notebook.
SEED = 0

X_train, y_train = read_data("training_data.txt", has_y=True)
X_test = read_data("test_data.txt", has_y=False)

# Shuffle the order of the training data just in case; X and y are shuffled
# together so each row keeps its label.
X_train, y_train = sklearn.utils.shuffle(X_train, y_train, random_state=SEED)
# Make a smaller dataset with only 10% the size (the first 10% of the
# shuffled rows), used for quick parameter searches.
subset_size = len(X_train) // 10
X_train10 = X_train[:subset_size]
y_train10 = y_train[:subset_size]

In [60]:
# Preliminary trial on full dataset
# svc = SVC()
# svc.fit(X_train, y_train)
# svc.score(X_train, y_train)
# >>> 0.84655
# svc.support_.shape
# >>> (1285,)

In [82]:
# Coarse search: standardize the features, then fit a sigmoid-kernel SVC,
# scanning C, gamma and coef0 with 3-fold cross-validation on 4 workers.
pipe = make_pipeline(
    preprocessing.StandardScaler(),
    SVC(kernel="sigmoid"),
)
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "svc__C": [0.1, 0.3, 1.0, 3.0],
        "svc__gamma": [0.0003, 0.001, 0.003, 0.01, 0.03],
        "svc__coef0": [0.0001, 0.0003, 0.001],
    },
    cv=3,
    n_jobs=4,
    verbose=1,
    return_train_score=True,
)

# grid = GridSearchCV(pipe, return_train_score=True, verbose=1,
#                     cv=3, 
#                     param_grid={"svc__C":[0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
#                                 "svc__degree":[2, 3, 4, 5],
#                                 "svc__gamma":[0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03],
#                                 "svc__coef0":[0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03]}, 
#                     n_jobs=4)

In [83]:
# As a test, fit only on the 10% subsample (X_train10 / y_train10) to check
# which parameter values are worth using, then print the per-candidate scores.
grid.fit(X_train10, y_train10)
grid_summary(grid)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:  4.4min finished


Best parameters set found on development set:

{'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.01}
Best CV score:  0.806

Grid scores on development set:

0.511 (+/-0.001) for {'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.0003}
0.574 (+/-0.006) for {'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.001}
0.762 (+/-0.038) for {'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.003}
0.806 (+/-0.006) for {'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.01}
0.786 (+/-0.001) for {'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.03}
0.511 (+/-0.001) for {'svc__C': 0.1, 'svc__coef0': 0.0003, 'svc__gamma': 0.0003}
0.574 (+/-0.006) for {'svc__C': 0.1, 'svc__coef0': 0.0003, 'svc__gamma': 0.001}
0.762 (+/-0.038) for {'svc__C': 0.1, 'svc__coef0': 0.0003, 'svc__gamma': 0.003}
0.806 (+/-0.012) for {'svc__C': 0.1, 'svc__coef0': 0.0003, 'svc__gamma': 0.01}
0.786 (+/-0.006) for {'svc__C': 0.1, 'svc__coef0': 0.0003, 'svc__gamma': 0.03}
0.511 (+/-0.001) for {'svc__C': 0.1, 'svc

In [86]:
# Narrower grid (two values each for C and gamma, three for coef0), this time
# scored with 5-fold cross-validation.
pipe = make_pipeline(
    preprocessing.StandardScaler(),
    SVC(kernel="sigmoid"),
)
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "svc__C": [0.1, 0.3],
        "svc__gamma": [0.003, 0.01],
        "svc__coef0": [0.0001, 0.0003, 0.001],
    },
    cv=5,
    n_jobs=4,
    verbose=1,
    return_train_score=True,
)
# Still a trial run: fit on the 10% subsample only, then report the scores.
grid.fit(X_train10, y_train10)
grid_summary(grid)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:  2.0min finished


Best parameters set found on development set:

{'svc__C': 0.1, 'svc__coef0': 0.001, 'svc__gamma': 0.01}
Best CV score:  0.8075

Grid scores on development set:

0.790 (+/-0.041) for {'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.003}
0.799 (+/-0.050) for {'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.01}
0.790 (+/-0.039) for {'svc__C': 0.1, 'svc__coef0': 0.0003, 'svc__gamma': 0.003}
0.801 (+/-0.047) for {'svc__C': 0.1, 'svc__coef0': 0.0003, 'svc__gamma': 0.01}
0.790 (+/-0.041) for {'svc__C': 0.1, 'svc__coef0': 0.001, 'svc__gamma': 0.003}
0.807 (+/-0.049) for {'svc__C': 0.1, 'svc__coef0': 0.001, 'svc__gamma': 0.01}
0.804 (+/-0.040) for {'svc__C': 0.3, 'svc__coef0': 0.0001, 'svc__gamma': 0.003}
0.786 (+/-0.049) for {'svc__C': 0.3, 'svc__coef0': 0.0001, 'svc__gamma': 0.01}
0.801 (+/-0.036) for {'svc__C': 0.3, 'svc__coef0': 0.0003, 'svc__gamma': 0.003}
0.791 (+/-0.050) for {'svc__C': 0.3, 'svc__coef0': 0.0003, 'svc__gamma': 0.01}
0.804 (+/-0.044) for {'svc__C': 0.3, 'svc__coe

In [87]:
# Same narrowed grid as the previous trial, scored with 4-fold
# cross-validation.
pipe = make_pipeline(
    preprocessing.StandardScaler(),
    SVC(kernel="sigmoid"),
)
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "svc__C": [0.1, 0.3],
        "svc__gamma": [0.003, 0.01],
        "svc__coef0": [0.0001, 0.0003, 0.001],
    },
    cv=4,
    n_jobs=4,
    verbose=1,
    return_train_score=True,
)
# Fit on the FULL training set (not the 10% subsample). This is slow: the
# recorded run below took about 99.5 minutes.
grid.fit(X_train, y_train)
grid_summary(grid)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=4)]: Done  48 out of  48 | elapsed: 99.5min finished


Best parameters set found on development set:

{'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.003}
Best CV score:  0.84615

Grid scores on development set:

0.846 (+/-0.006) for {'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.003}
0.827 (+/-0.010) for {'svc__C': 0.1, 'svc__coef0': 0.0001, 'svc__gamma': 0.01}
0.846 (+/-0.004) for {'svc__C': 0.1, 'svc__coef0': 0.0003, 'svc__gamma': 0.003}
0.829 (+/-0.007) for {'svc__C': 0.1, 'svc__coef0': 0.0003, 'svc__gamma': 0.01}
0.846 (+/-0.004) for {'svc__C': 0.1, 'svc__coef0': 0.001, 'svc__gamma': 0.003}
0.827 (+/-0.011) for {'svc__C': 0.1, 'svc__coef0': 0.001, 'svc__gamma': 0.01}
0.841 (+/-0.010) for {'svc__C': 0.3, 'svc__coef0': 0.0001, 'svc__gamma': 0.003}
0.809 (+/-0.017) for {'svc__C': 0.3, 'svc__coef0': 0.0001, 'svc__gamma': 0.01}
0.840 (+/-0.009) for {'svc__C': 0.3, 'svc__coef0': 0.0003, 'svc__gamma': 0.003}
0.809 (+/-0.013) for {'svc__C': 0.3, 'svc__coef0': 0.0003, 'svc__gamma': 0.01}
0.842 (+/-0.008) for {'svc__C': 0.3, 'svc__

In [88]:
# Score the fitted search on the same data it was trained on, so the number
# can be compared with the CV score reported above.
train_score = grid.score(X_train, y_train)
print(train_score)

0.82985


In [89]:
# Predict a label for every row of the test features with the fitted search;
# `pred` is written to the submission file in the next cell.
pred = grid.predict(X_test)

In [90]:
# Write the submission file: a header row, then one "id,label" line per
# prediction, with ids starting at 1 and labels cast to int.
with open("out5.txt", "w") as out_file:
    out_file.write("Id,Prediction\n")
    out_file.writelines(
        "{0},{1}\n".format(row_id, int(label))
        for row_id, label in enumerate(pred, start=1)
    )