In [54]:
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

In [55]:
def read_data(filename, has_y):
    """Load a whitespace-separated file of integers into float64 array(s).

    The first line of the file is a header (the list of words) and is skipped.
    Every remaining non-blank line must contain the same number of integers.

    Args:
        filename: Path of the text file to read.
        has_y: If True, the first column is treated as the label column and
            is split off from the remaining feature columns.

    Returns:
        ``(X, y)`` when ``has_y`` is true, where ``X`` is every column but
        the first and ``y`` is the first column; otherwise the whole array.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    with open(filename, "r") as f:
        # Skip the header line; the default keeps an empty file from raising.
        next(f, None)
        # split() with no argument tolerates trailing/repeated spaces and tabs,
        # and blank lines (e.g. a trailing newline at EOF) are skipped so they
        # cannot produce an empty token or a ragged row.
        rows = [[int(token) for token in line.split()] for line in f if line.strip()]

    data = np.array(rows, dtype="float64")

    if has_y:
        # y is the first column while X is everything else
        return data[:, 1:], data[:, 0]
    return data
    
def grid_summary(grid):
    """Print the winning parameters and the per-candidate CV scores of a search.

    Reads ``best_params_``, ``best_score_`` and ``cv_results_`` from ``grid``
    and prints one line per candidate: the mean test score, twice the standard
    deviation of the test score, and the candidate's parameters.
    """
    print("Best parameters set found on development set:\n")
    print(grid.best_params_)
    print("Best CV score: ", grid.best_score_)
    print()
    print("Grid scores on development set:\n")

    results = grid.cv_results_
    per_candidate = zip(
        results['mean_test_score'],
        results['std_test_score'],
        results['params'],
    )
    for avg_score, score_std, candidate in per_candidate:
        # The spread is reported as two standard deviations.
        row = "%0.3f (+/-%0.03f) for %r" % (avg_score, score_std * 2, candidate)
        print(row)

# Fixed seed so the shuffle, and therefore the 10% subset and every score
# computed from it, is identical on each Restart & Run All.
RANDOM_SEED = 0

# Training file carries a label column; the test file does not.
X_train, y_train = read_data("training_data.txt", has_y=True)
X_test = read_data("test_data.txt", has_y=False)

# Shuffle the order of the training data just in case
X_train, y_train = sklearn.utils.shuffle(X_train, y_train, random_state=RANDOM_SEED)
# Make a smaller dataset with only 10% the size
subset_size = len(X_train) // 10
X_train10 = X_train[:subset_size]
y_train10 = y_train[:subset_size]

In [47]:
# Preliminary trial on full dataset
# svc = SVC()
# svc.fit(X_train, y_train)
# svc.score(X_train, y_train)
# >>> 0.84655
# svc.support_.shape
# >>> (1285,)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
# Standardize the features, then classify with a linear-kernel SVM.
pipe = make_pipeline(
    preprocessing.StandardScaler(),
    SVC(kernel="linear"),
)

# Earlier search over C and gamma, kept for reference:
# grid = GridSearchCV(pipe, return_train_score=True, verbose=1,
#                     cv=3,
#                     param_grid={"svc__C":[2.0, 4.0, 6.0], "svc__gamma":[0.0002, 0.0004, 0.0006]},
#                     n_jobs=4)

# Candidate values for the SVC step's C parameter ("svc" is the step name
# make_pipeline derives from the class name).
search_space = {"svc__C": [0.0001, 0.0003, 0.001, 0.003]}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=search_space,
    cv=5,
    n_jobs=4,
    verbose=1,
    return_train_score=True,
)

In [48]:
# Trial run on the 10% subset only, to check which parameter values to use.
# fit() returns the fitted search object, so it is summarized directly.
grid_summary(grid.fit(X_train10, y_train10))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:   49.0s finished


Best parameters set found on development set:

{'svc__C': 0.0003}
Best CV score:  0.797

Grid scores on development set:

0.782 (+/-0.044) for {'svc__C': 0.0001}
0.797 (+/-0.028) for {'svc__C': 0.0003}
0.794 (+/-0.031) for {'svc__C': 0.001}
0.775 (+/-0.035) for {'svc__C': 0.003}


In [49]:
# Score the model trained on the 10% subset: first on that subset itself,
# then on the full training set (which includes the subset).
for features, labels in ((X_train10, y_train10), (X_train, y_train)):
    print(grid.score(features, labels))

0.8955
0.81395


In [50]:
# Repeat the search over the restricted parameter range on the full training set.
# This refits the same `grid` object, replacing the results from the 10% run.
grid_summary(grid.fit(X_train, y_train))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 49.0min finished


Best parameters set found on development set:

{'svc__C': 0.0003}
Best CV score:  0.8487

Grid scores on development set:

0.842 (+/-0.007) for {'svc__C': 0.0001}
0.849 (+/-0.003) for {'svc__C': 0.0003}
0.849 (+/-0.008) for {'svc__C': 0.001}
0.845 (+/-0.007) for {'svc__C': 0.003}


In [51]:
# Training-set score, for comparison with the cross-validated score printed above
train_score = grid.score(X_train, y_train)
print(train_score)

0.8718


In [52]:
# Predict labels for the unlabelled test rows with the fitted search object.
# NOTE(review): GridSearchCV.predict relies on the refit best estimator
# (refit is left at its default in the grid definition) — confirm if that changes.
pred = grid.predict(X_test)

In [53]:
# Write the predictions as a two-column CSV: a header line, then one
# "id,prediction" row per test sample with ids starting at 1.
with open("out3.txt", "w") as out_file:
    out_file.write("Id,Prediction\n")
    out_file.writelines(
        "{0},{1}\n".format(row_id, label)
        for row_id, label in enumerate(pred, start=1)
    )