In [2]:
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [3]:
def read_data(filename, has_y):
    """Load a whitespace-separated integer matrix from a text file.

    The first line of the file (the list of words) is skipped. Every
    remaining non-blank line is parsed as one row of integers.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    has_y : bool
        If True, the first column is split off as the target vector.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        Returned when ``has_y`` is True: ``X`` is every column except the
        first, ``y`` is the first column.
    data : numpy.ndarray
        Returned when ``has_y`` is False: the whole matrix.

    All returned arrays have dtype float64.

    Raises
    ------
    ValueError
        If a token on a data line cannot be parsed by ``int``.
    """
    with open(filename, "r") as f:
        # Skip the first line, which is the list of words
        next(f, None)
        # split() with no argument splits on any run of whitespace and drops
        # the trailing newline, so doubled/trailing spaces or tabs no longer
        # produce empty tokens that int() rejects. Blank lines (e.g. a stray
        # newline at the end of the file) are skipped for the same reason.
        rows = [[int(token) for token in line.split()] for line in f if line.strip()]

    data = np.array(rows, dtype="float64")

    if has_y:
        # y is the first column while X is everything else
        return data[:, 1:], data[:, 0]
    return data

# Training file carries the label in its first column; the test file is features only.
X_train, y_train = read_data(filename="training_data.txt", has_y=True)
X_test = read_data(filename="test_data.txt", has_y=False)

In [4]:
# Scale the features first, then hand them to the support vector classifier.
pipe = make_pipeline(
    preprocessing.StandardScaler(),
    SVC(C=4.0, gamma=0.0002),
)

In [5]:
# 4-fold cross-validation of the whole pipeline, one worker per fold.
# Left as the last expression so the per-fold scores are displayed.
cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    cv=4,
    n_jobs=4,
    verbose=10,
)

[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  5.2min
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  5.2min remaining:  5.2min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  5.2min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  5.2min finished


array([0.85322935, 0.8438    , 0.8356    , 0.85437087])

In [6]:
# Train the pipeline on the full training set
pipe.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=4.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0002, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [8]:
# Score the fitted pipeline on the same data it was trained on
pipe.score(X=X_train, y=y_train)

0.9096

In [9]:
# Predict labels for the test features (X_test was already loaded by read_data)
pred = pipe.predict(X=X_test)

In [142]:
# Write to the output file
#
# NOTE: the export below is currently commented out, so running this cell
# writes nothing. When enabled it creates "out2.txt" with an "Id,Prediction"
# header followed by one "<1-based index>,<prediction>" row per element of
# `pred`.
# NOTE(review): read_data builds float64 arrays, so the predicted labels are
# presumably floats and would be written as e.g. "1.0" -- confirm the format
# expected by whatever consumes this file before re-enabling.

# with open("out2.txt", "w") as f:
#     f.write("Id,Prediction\n")
#     for i in range(len(pred)):
#         f.write("{0},{1}\n".format(i+1, pred[i]))