In [1]:
import csv
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt
# sklearn modules
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.svm import SVR
# Metric mean squared error, the lower the better
from sklearn.metrics import mean_squared_error

def read_csv(csv_file):
    '''Returns numpy array and panda version of the file'''
    csv_file = csv_file +".csv"
    df = pd.DataFrame()
    df = df.from_csv(csv_file, header=0, sep=',', index_col=0)
#n=df.shape[0] # number of samples
#d=df.shape[1] # number of features
    array = np.asarray(df,dtype="|S6")
    if np.shape(array)[1] == 1:
        array = array.ravel()
    return array, df

def publish_pred(y_pred, file_name):
    df = pd.DataFrame()
    df = df.from_csv("reg_sample_submission.csv", header=0, sep=',', index_col=0)
    df["Output"] = list(y_pred)
    df.to_csv(file_name)

tr_in, df_in = read_csv("reg_train_in")
tr_in = np.asarray(tr_in,dtype="float64")
tr_out, df_out = read_csv("reg_train_out")
tr_out = np.asarray(tr_out,dtype="float64")
te_in, df = read_csv("reg_test_in") # Still have to deal with NaNs best

# Cleanning data

In [2]:
X = preprocessing.scale(tr_in)# Clean (mean 0, variance 1) data-set
y = tr_out

# Test-data with replacements

In [3]:
mean_tr = np.mean(tr_in,axis = 0)
std_tr = np.std(tr_in,axis = 0)
df = pd.DataFrame()
df = df.from_csv("reg_test_gp.csv", header=None, sep=',', index_col=None)
te_gp= np.asarray(df,dtype="float64")
X_test = np.divide(np.add(te_gp,-mean_tr),std_tr)

# Split data: train & test (5-fold cv)

In [4]:
cv = ShuffleSplit(n_splits=1,  test_size=0.35, random_state=random.randint(0,20), train_size=None) # 10_splits
for train_index, test_index in cv.split(X):
    print("TRAIN:", len(list(train_index)), "TEST:", len(test_index))

('TRAIN:', 22230, 'TEST:', 11970)


In [5]:
X_train , y_train = X[train_index], y[train_index]
Xs , ys = X[test_index], y[test_index]

# Dimensionality reduction: 4-comp PCA

In [6]:
pca = PCA(n_components=4)
pca.fit(X_train)
print "Variance taken by each of the principal components", pca.explained_variance_ratio_
Z_train = pca.transform(X_train)
Zs = pca.transform(Xs)

Variance taken by each of the principal components [ 0.76459237  0.07244221  0.07189891  0.07092882]


In [7]:
pca = PCA(n_components=4)
pca.fit(X)
print "Variance taken by each of the principal components", pca.explained_variance_ratio_
Z = pca.transform(X)
Z_test = pca.transform(X_test)

Variance taken by each of the principal components [ 0.76530528  0.07220808  0.07137663  0.07086174]


# Cross-validation function

In [8]:
def cross_validation(tr_in, tr_out, regression_type, test_percent, folds):
    cv = ShuffleSplit(n_splits=folds,  test_size=test_percent, random_state=random.randint(0,20), train_size=None)
    scores = [50]
    for train_index, test_index in cv.split(X):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        pca = PCA(n_components=4)
        pca.fit(X_train)
        Z = pca.transform(X_train)
        Z_test = pca.transform(X_test)
        regression_type.fit(Z, y_train)
        y_pred = regression_type.predict(Z_test)
        score = mean_squared_error(y_test, y_pred)
        print score
        if score < min(scores+[50]):
            regression = regression_type
            best_score = score
        scores.append(score)
    results = [np.mean(scores), regression, best_score]
    return results

In [9]:
#regression = SVR(kernel='linear')
#cross_validation(tr_in, tr_out, regression, 0.2, 5)

# Exp 1: lasso

# Exp 2: SVR linear


In [9]:
from sklearn.svm import SVR
regression = SVR(kernel='linear')
regression.fit(Z_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [62]:
y_pred = regression.predict(Zs)
score = mean_squared_error(ys, y_pred)
print score

0.976873892727


# Exp 3: SVR rbf

In [34]:
from sklearn.svm import SVR
regression = SVR(kernel='rbf')
regression.fit(Z_train, y_train)
y_pred = regression.predict(Zs)
score = mean_squared_error(y_test, y_pred)
print np.sqrt(score)

0.69081530806


In [10]:
regression = SVR(kernel='rbf')
regression.fit(Z, y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [12]:
y_pred_test = regression.predict(Z_test)
y_pred = regression.predict(Z)
score = mean_squared_error(y, y_pred)
print np.sqrt(score)
publish_pred(y_pred_test, "reg_pred.csv")


0.674454061824
