In [58]:
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

In [59]:
def read_data(filename, has_y):
    """Load a whitespace-separated integer matrix from a text file.

    The first line of the file is a header (the list of words) and is
    discarded. Every remaining non-blank line is parsed as a row of integers.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    has_y : bool
        If True, the first column is treated as the label and split off from
        the feature columns.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        Returned when ``has_y`` is True: ``X`` holds every column but the
        first, ``y`` holds the first column. Both are float64.
    data : numpy.ndarray
        Returned when ``has_y`` is False: the full float64 matrix.

    Raises
    ------
    ValueError
        If a token on a data line cannot be parsed as an integer.
    """
    with open(filename, "r") as f:
        # Discard the header line; the default keeps an empty file from raising.
        next(f, None)
        # split() with no argument splits on any run of whitespace, so trailing
        # spaces, tabs or repeated spaces no longer yield empty tokens that
        # int() would reject. Blank lines (e.g. at end of file) are skipped.
        rows = [[int(token) for token in line.split()]
                for line in f if line.strip()]

    data = np.array(rows, dtype="float64")

    if has_y:
        # y is the first column while X is everything else
        X = data[:, 1:]
        y = data[:, 0]

        return X, y
    else:
        return data
    
def grid_summary(grid):
    """Print a plain-text report for a fitted parameter search.

    The report lists the best parameter set, the best cross-validation score,
    and then one line per candidate with its mean test score, twice the
    standard deviation of that score, and the candidate's parameters.

    Parameters
    ----------
    grid : object
        A fitted search exposing ``best_params_``, ``best_score_`` and
        ``cv_results_`` (with ``mean_test_score``, ``std_test_score`` and
        ``params`` entries).
    """
    print("Best parameters set found on development set:\n")
    print(grid.best_params_)
    print("Best CV score: ", grid.best_score_)
    print()
    print("Grid scores on development set:\n")

    results = grid.cv_results_
    row_template = "%0.3f (+/-%0.03f) for %r"
    candidates = zip(results['mean_test_score'],
                     results['std_test_score'],
                     results['params'])
    for avg, spread, candidate in candidates:
        # The bracketed figure is two standard deviations of the fold scores.
        print(row_template % (avg, spread * 2, candidate))

# Load the labelled training set and the unlabelled test set.
X_train, y_train = read_data("training_data.txt", has_y=True)
X_test = read_data("test_data.txt", has_y=False)

# Shuffle the order of the training data just in case the file is ordered.
# A fixed seed keeps the shuffle -- and therefore the 10% subset and every
# score derived from it -- identical across Restart & Run All.
# NOTE: outputs recorded before the seed was fixed will not match exactly.
SHUFFLE_SEED = 0
X_train, y_train = sklearn.utils.shuffle(X_train, y_train,
                                         random_state=SHUFFLE_SEED)

# Make a smaller dataset with only 10% the size, used for quick trial runs
# of the parameter search.
n_subset = len(X_train) // 10
X_train10 = X_train[:n_subset]
y_train10 = y_train[:n_subset]

In [60]:
# Preliminary trial on full dataset
# svc = SVC()
# svc.fit(X_train, y_train)
# svc.score(X_train, y_train)
# >>> 0.84655
# svc.support_.shape
# >>> (1285,)

In [65]:
# Standardise the features, then classify with a polynomial-kernel SVM.
pipe = make_pipeline(
    preprocessing.StandardScaler(),
    SVC(kernel="poly"),
)

# First, coarse search over all four polynomial-kernel parameters
# (3 * 4 * 3 * 3 = 108 candidates, 3-fold cross-validation each).
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "svc__C": [0.01, 0.1, 1.0],
        "svc__degree": [2, 3, 4, 5],
        "svc__gamma": [0.0001, 0.001, 0.01],
        "svc__coef0": [0.0001, 0.001, 0.01],
    },
    cv=3,
    n_jobs=4,
    verbose=1,
    return_train_score=True,
)

# grid = GridSearchCV(pipe, return_train_score=True, verbose=1,
#                     cv=3, 
#                     param_grid={"svc__C":[0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
#                                 "svc__degree":[2, 3, 4, 5],
#                                 "svc__gamma":[0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03],
#                                 "svc__coef0":[0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03]}, 
#                     n_jobs=4)

In [66]:
# Trial run: fit the search on the 10% subset only, to see which parameter
# values are worth exploring before spending time on the full training set.
grid.fit(X_train10, y_train10)
# Print the best parameters and the score of every candidate.
grid_summary(grid)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  6.5min
[Parallel(n_jobs=4)]: Done 324 out of 324 | elapsed: 11.0min finished


Best parameters set found on development set:

{'svc__C': 1.0, 'svc__coef0': 0.01, 'svc__degree': 2, 'svc__gamma': 0.01}
Best CV score:  0.677

Grid scores on development set:

0.502 (+/-0.001) for {'svc__C': 0.01, 'svc__coef0': 0.0001, 'svc__degree': 2, 'svc__gamma': 0.0001}
0.502 (+/-0.001) for {'svc__C': 0.01, 'svc__coef0': 0.0001, 'svc__degree': 2, 'svc__gamma': 0.001}
0.641 (+/-0.050) for {'svc__C': 0.01, 'svc__coef0': 0.0001, 'svc__degree': 2, 'svc__gamma': 0.01}
0.502 (+/-0.001) for {'svc__C': 0.01, 'svc__coef0': 0.0001, 'svc__degree': 3, 'svc__gamma': 0.0001}
0.502 (+/-0.001) for {'svc__C': 0.01, 'svc__coef0': 0.0001, 'svc__degree': 3, 'svc__gamma': 0.001}
0.530 (+/-0.019) for {'svc__C': 0.01, 'svc__coef0': 0.0001, 'svc__degree': 3, 'svc__gamma': 0.01}
0.502 (+/-0.001) for {'svc__C': 0.01, 'svc__coef0': 0.0001, 'svc__degree': 4, 'svc__gamma': 0.0001}
0.502 (+/-0.001) for {'svc__C': 0.01, 'svc__coef0': 0.0001, 'svc__degree': 4, 'svc__gamma': 0.001}
0.512 (+/-0.008) for {'svc__C'

In [67]:
# Second pass on the 10% subset. The first pass picked C=1.0 and gamma=0.01,
# both at the upper edge of their ranges, so this grid extends upward from
# those values. coef0 is no longer searched and falls back to the SVC default.
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "svc__C": [1.0, 3.0, 10.0],
        "svc__degree": [2, 3, 4],
        "svc__gamma": [0.01, 0.03, 0.1],
    },
    cv=3,
    n_jobs=4,
    verbose=1,
    return_train_score=True,
)
grid.fit(X_train10, y_train10)
grid_summary(grid)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  81 out of  81 | elapsed:  3.0min finished


Best parameters set found on development set:

{'svc__C': 1.0, 'svc__degree': 2, 'svc__gamma': 0.01}
Best CV score:  0.672

Grid scores on development set:

0.672 (+/-0.054) for {'svc__C': 1.0, 'svc__degree': 2, 'svc__gamma': 0.01}
0.664 (+/-0.031) for {'svc__C': 1.0, 'svc__degree': 2, 'svc__gamma': 0.03}
0.657 (+/-0.037) for {'svc__C': 1.0, 'svc__degree': 2, 'svc__gamma': 0.1}
0.594 (+/-0.088) for {'svc__C': 1.0, 'svc__degree': 3, 'svc__gamma': 0.01}
0.558 (+/-0.071) for {'svc__C': 1.0, 'svc__degree': 3, 'svc__gamma': 0.03}
0.558 (+/-0.071) for {'svc__C': 1.0, 'svc__degree': 3, 'svc__gamma': 0.1}
0.564 (+/-0.085) for {'svc__C': 1.0, 'svc__degree': 4, 'svc__gamma': 0.01}
0.514 (+/-0.006) for {'svc__C': 1.0, 'svc__degree': 4, 'svc__gamma': 0.03}
0.514 (+/-0.006) for {'svc__C': 1.0, 'svc__degree': 4, 'svc__gamma': 0.1}
0.666 (+/-0.044) for {'svc__C': 3.0, 'svc__degree': 2, 'svc__gamma': 0.01}
0.657 (+/-0.037) for {'svc__C': 3.0, 'svc__degree': 2, 'svc__gamma': 0.03}
0.657 (+/-0.037) for 

In [69]:
# Final search on the full training set, using a narrow grid centred on the
# values the 10%-subset trials preferred (C=1.0, degree=2, gamma=0.01).
# This is the slow cell: the recorded run took about 328 minutes with n_jobs=4.
grid = GridSearchCV(
    estimator=pipe,
    param_grid={
        "svc__C": [0.3, 1.0, 3.0],
        "svc__degree": [2, 3],
        "svc__gamma": [0.003, 0.01, 0.03],
    },
    cv=3,
    n_jobs=4,
    verbose=1,
    return_train_score=True,
)
grid.fit(X_train, y_train)
grid_summary(grid)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 251.3min
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed: 328.1min finished


Best parameters set found on development set:

{'svc__C': 1.0, 'svc__degree': 2, 'svc__gamma': 0.003}
Best CV score:  0.792

Grid scores on development set:

0.785 (+/-0.008) for {'svc__C': 0.3, 'svc__degree': 2, 'svc__gamma': 0.003}
0.786 (+/-0.009) for {'svc__C': 0.3, 'svc__degree': 2, 'svc__gamma': 0.01}
0.775 (+/-0.002) for {'svc__C': 0.3, 'svc__degree': 2, 'svc__gamma': 0.03}
0.662 (+/-0.009) for {'svc__C': 0.3, 'svc__degree': 3, 'svc__gamma': 0.003}
0.740 (+/-0.007) for {'svc__C': 0.3, 'svc__degree': 3, 'svc__gamma': 0.01}
0.752 (+/-0.008) for {'svc__C': 0.3, 'svc__degree': 3, 'svc__gamma': 0.03}
0.792 (+/-0.009) for {'svc__C': 1.0, 'svc__degree': 2, 'svc__gamma': 0.003}
0.779 (+/-0.002) for {'svc__C': 1.0, 'svc__degree': 2, 'svc__gamma': 0.01}
0.771 (+/-0.003) for {'svc__C': 1.0, 'svc__degree': 2, 'svc__gamma': 0.03}
0.703 (+/-0.003) for {'svc__C': 1.0, 'svc__degree': 3, 'svc__gamma': 0.003}
0.750 (+/-0.007) for {'svc__C': 1.0, 'svc__degree': 3, 'svc__gamma': 0.01}
0.747 (+/-0.0

In [70]:
# Score the fitted search on the same data it was trained on, to compare with
# the cross-validated score printed by grid_summary above.
# In the recorded run this printed 0.99085 against a best CV score of 0.792;
# a gap that large suggests the selected model overfits the training set.
print(grid.score(X_train, y_train))

0.99085


In [71]:
# Predict labels for the unlabelled test set with the fitted search.
# (With GridSearchCV's default refit=True this uses the best parameter set,
# refit on all of the data passed to fit.)
pred = grid.predict(X_test)

In [73]:
# Write the predictions as a two-column CSV: a 1-based row id and the
# predicted label cast to int.
with open("out4.txt", "w") as out_file:
    out_file.write("Id,Prediction\n")
    for row_id, label in enumerate(pred, start=1):
        out_file.write("{0},{1}\n".format(row_id, int(label)))