In [5]:
import csv
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt
# sklearn modules
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.svm import SVR
# Metric mean squared error, the lower the better
from sklearn.metrics import mean_squared_error

def read_csv(csv_file):
    '''Load ``<csv_file>.csv`` and return it as a float array and a DataFrame.

    Parameters
    ----------
    csv_file : str
        Path of the file *without* the ``.csv`` extension (it is appended
        here).

    Returns
    -------
    array : numpy.ndarray
        The frame's values cast to ``float64``. A file with a single data
        column is flattened to a 1-D array.
    df : pandas.DataFrame
        The parsed frame; the first row is used as header and the first
        column as index.
    '''
    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
    # read_csv with index_col=0 and parse_dates=True mirrors its defaults.
    df = pd.read_csv(csv_file + ".csv", header=0, sep=',', index_col=0,
                     parse_dates=True)
    array = np.asarray(df, dtype="float64")
    # A single-column file (e.g. a target vector) is returned as 1-D.
    if array.shape[1] == 1:
        array = array.ravel()
    return array, df

def publish_pred(y_pred, file_name, sample_file="data/reg_sample_submission.csv"):
    '''Write predictions to ``file_name`` using a sample submission as template.

    The template is read, its ``Output`` column is replaced by ``y_pred``
    and the result is written back out as CSV (index included).

    Parameters
    ----------
    y_pred : iterable
        One prediction per row of the template, in template order.
    file_name : str
        Destination CSV path.
    sample_file : str, optional
        Template CSV whose first column is the index. Defaults to the path
        that was previously hard-coded.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of template
        rows.
    '''
    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
    # read_csv with index_col=0 and parse_dates=True mirrors its defaults.
    submission = pd.read_csv(sample_file, header=0, sep=',', index_col=0,
                             parse_dates=True)
    predictions = list(y_pred)
    if len(predictions) != len(submission):
        raise ValueError(
            "got {} predictions for {} rows in {}".format(
                len(predictions), len(submission), sample_file))
    submission["Output"] = predictions
    submission.to_csv(file_name)

# Training inputs and targets. read_csv already returns float64 arrays, so
# no further casting is needed.
tr_in, df_in = read_csv("data/reg_train_in")
tr_out, df_out = read_csv("data/reg_train_out")
# Raw test inputs. Original note: the NaNs in this file still need handling.
te_in, df = read_csv("data/reg_test_in")

# Test matrix stored without header row or index column.
# (DataFrame.from_csv was removed in pandas 1.0; read_csv is its replacement.)
df = pd.read_csv("data/reg_test_gp.csv", header=None, sep=',', index_col=None)
te_gp = np.asarray(df, dtype="float64")

# Step 2, 3: Split and clean data

In [6]:
# Single shuffled hold-out split: 35% of the rows go to validation.
# The seed is fixed (it used to be random.randint(0, 20)) so that a
# Restart & Run All reproduces the same split and the same scores below.
cv = ShuffleSplit(n_splits=1, test_size=0.35, train_size=None, random_state=0)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(cv.split(tr_in))
print("TRAIN:", len(train_index), "TEST:", len(test_index))
X_tr, y_tr = tr_in[train_index], tr_out[train_index]
X_vl, y_vl = tr_in[test_index], tr_out[test_index]

('TRAIN:', 22230, 'TEST:', 11970)


In [7]:
# Standardise every feature matrix with a scaler fitted on the training
# split (X_tr) only.
tr_scale = preprocessing.StandardScaler()
tr_scale.fit(X_tr)

# Hold-out split used for model selection.
X_train, y_train = tr_scale.transform(X_tr), y_tr
X_val, y_val = tr_scale.transform(X_vl), y_vl

# Full labelled set and the test matrix, scaled with the same statistics.
X, y = tr_scale.transform(tr_in), tr_out
X_test = tr_scale.transform(te_gp)

# Cross-validation function

In [8]:
def cross_validation(tr_in, tr_out, regression_type, test_percent, folds,
                     n_components=4, random_state=None):
    '''Shuffle-split cross-validation of a regressor on PCA-reduced features.

    For every fold the features are standardised and projected with PCA
    (both fitted on that fold's training part only), a fresh copy of
    ``regression_type`` is fitted, and the mean squared error on the
    held-out part is printed and recorded.

    Parameters
    ----------
    tr_in : array-like, shape (n_samples, n_features)
        Input features. Earlier versions ignored this argument and read the
        notebook-level ``X`` instead.
    tr_out : array-like, shape (n_samples,)
        Targets. Earlier versions ignored this argument and read the
        notebook-level ``y`` instead.
    regression_type : estimator
        Unfitted scikit-learn style regressor. It is cloned for each fold
        and is not fitted in place.
    test_percent : float
        Fraction of the samples held out in each fold.
    folds : int
        Number of shuffle-split iterations; must be at least 1.
    n_components : int, optional
        Number of principal components to keep (default 4).
    random_state : int or None, optional
        Seed for the splitter. ``None`` keeps the previous behaviour of
        drawing a seed with ``random.randint(0, 20)``.

    Returns
    -------
    list
        ``[mean_mse, best_model, best_mse]``. ``best_model`` is the
        estimator fitted on the fold with the lowest MSE. It expects inputs
        that went through that fold's scaler and PCA, which are not
        returned.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1.
    '''
    from sklearn.base import clone

    if folds < 1:
        raise ValueError("folds must be at least 1, got {!r}".format(folds))
    if random_state is None:
        random_state = random.randint(0, 20)

    features = np.asarray(tr_in)
    targets = np.asarray(tr_out)
    cv = ShuffleSplit(n_splits=folds, test_size=test_percent,
                      train_size=None, random_state=random_state)

    scores = []
    best_model = None
    best_score = None
    for train_index, test_index in cv.split(features):
        # Fit the preprocessing on the training part only so that nothing
        # from the held-out rows influences the transformation.
        scaler = preprocessing.StandardScaler().fit(features[train_index])
        pca = PCA(n_components=n_components)
        Z = pca.fit_transform(scaler.transform(features[train_index]))
        Z_test = pca.transform(scaler.transform(features[test_index]))

        # A fresh clone per fold: refitting one shared object would make
        # every "best model" reference point at the last fold's fit.
        model = clone(regression_type)
        model.fit(Z, targets[train_index])
        score = mean_squared_error(targets[test_index], model.predict(Z_test))
        print(score)

        if best_score is None or score < best_score:
            best_model, best_score = model, score
        scores.append(score)

    return [np.mean(scores), best_model, best_score]

# Dimensionality reduction: 4-component PCA

In [11]:
# Learn a 4-component PCA on the scaled training split and apply the same
# projection to the training, validation and test matrices.
pca = PCA(n_components=4)
pca.fit(X_train)
# A single pre-formatted string prints identically on Python 2 and 3.
print("Variance taken by each of the principal components {}".format(
    pca.explained_variance_ratio_))
Z_train = pca.transform(X_train)
Z_val = pca.transform(X_val)
Z_test = pca.transform(X_test)

Variance taken by each of the principal components [ 0.76537487  0.07231771  0.07129095  0.07084865]


# Exp 2: SVR linear


In [12]:
# Linear-kernel SVR on the PCA-projected training split.
# SVR is already imported in the notebook's first cell, so it is not
# re-imported here.
regression = SVR(kernel='linear')
regression.fit(Z_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [13]:
# Validation error of the model fitted in the previous cell.
# `score` holds the MSE; the printed value is its square root (RMSE).
y_pred = regression.predict(Z_val)
score = mean_squared_error(y_val, y_pred)
print(np.sqrt(score))

0.988995821704


# Exp 3: SVR rbf

In [14]:
# RBF-kernel SVR with default hyper-parameters, evaluated on the validation
# split. SVR is already imported in the notebook's first cell.
regression = SVR(kernel='rbf')
regression.fit(Z_train, y_train)
y_pred = regression.predict(Z_val)
# `score` holds the MSE; the printed value is its square root (RMSE).
score = mean_squared_error(y_val, y_pred)
print(np.sqrt(score))

0.695174775612


### Watch out for overfitting!

In [15]:
# Sweep the RBF kernel coefficient and print the validation RMSE for each
# value, in the order listed.
# Every iteration overwrites `regression`, `y_pred_test`, `y_pred` and
# `score`, so after the loop they belong to the last gamma (2), not to the
# best-scoring one.
for gamma in [1, 1.25, 1.5, 2]:
    regression = SVR(kernel='rbf', C=1.0, gamma=gamma)
    regression.fit(Z_train, y_train)
    y_pred_test = regression.predict(Z_test)
    y_pred = regression.predict(Z_val)
    score = mean_squared_error(y_val, y_pred)
    print(np.sqrt(score))

0.713634017428
0.720923490235
0.727704824242
0.737429104611


In [27]:
# NOTE(review): `df2` is not defined in any cell visible here, and the
# execution counter jumps from In [15] to In [27], so this call depends on
# state from cells that are not shown. It will raise NameError on a fresh
# kernel unless `df2` is created first. TODO confirm which predictions
# `df2[0]` is meant to hold.
publish_pred(df2[0], "predictions/reg_pred_gp.csv")