In [46]:
import csv
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt
# sklearn modules
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.svm import SVR
# Metric mean squared error, the lower the better
from sklearn.metrics import mean_squared_error

def read_csv(csv_file):
    '''Returns numpy array and panda version of the file'''
    csv_file = csv_file +".csv"
    df = pd.DataFrame()
    df = df.from_csv(csv_file, header=0, sep=',', index_col=0)
#n=df.shape[0] # number of samples
#d=df.shape[1] # number of features
    array = np.asarray(df,dtype="|S6")
    if np.shape(array)[1] == 1:
        array = array.ravel()
    return array, df

def publish_pred(y_pred, file_name):
    df = pd.DataFrame()
    df = df.from_csv("reg_sample_submission.csv", header=0, sep=',', index_col=0)
    df["Output"] = list(y_pred)
    df.to_csv(file_name)

tr_in, df_in = read_csv("reg_train_in")
tr_in = np.asarray(tr_in,dtype="float64")
tr_out, df_out = read_csv("reg_train_out")
tr_out = np.asarray(tr_out,dtype="float64")
te_in, df = read_csv("reg_test_in") # Still have to deal with NaNs best

# Cleanning data

In [2]:
tr_mean = tr_in.mean(axis=0)
tr_std = tr_in.std(axis=0)
tr_in_scaled = preprocessing.scale(tr_in)  # Clean (mean 0, variance 1) data-set

# Split data: train & test (5-fold cv)

In [3]:
cv = ShuffleSplit(n_splits=1,  test_size=0.20, random_state=random.randint(0,20), train_size=None) # 10_splits
for train_index, test_index in cv.split(tr_in_scaled):
    print("TRAIN:", len(list(train_index)), "TEST:", len(test_index))

('TRAIN:', 27360, 'TEST:', 6840)


In [4]:
X_train = tr_in_scaled[train_index]
y_train = tr_out[train_index]
X_test = tr_in_scaled[test_index]
y_test = tr_out[test_index]
np.shape(y_test)

(6840,)

# Dimensionality reduction: 4-comp PCA

In [5]:
pca = PCA(n_components=4)
pca.fit(X_train)
print "Variance taken by each of the principal components", pca.explained_variance_ratio_
X_tr_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

Variance taken by each of the principal components [ 0.76477684  0.07234427  0.07163393  0.07100756]


# Exp 1: lasso

# Exp 2: SVR linear


In [96]:
from sklearn.svm import SVR
regression = SVR(kernel='linear')
regression.fit(X_tr_pca, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [97]:
y_pred = regression.predict(X_test_pca)
score = mean_squared_error(y_test, y_pred)
print score

0.968108147417


# Exp 3: SVR rbf

In [69]:
from sklearn.svm import SVR
regression = SVR(kernel='rbf')
regression.fit(X_tr_pca, y_train)
y_pred = regression.predict(X_test_pca)
score = mean_squared_error(y_test, y_pred)
print score

0.468455379458


# Cross-validation function

In [94]:
def cross_validation(tr_in, tr_out, regression_type, test_percent, folds):
    cv = ShuffleSplit(n_splits=folds,  test_size=test_percent, random_state=random.randint(0,20), train_size=None)
    scores = [50]
    for train_index, test_index in cv.split(X):
        X_train = tr_in_scaled[train_index]
        y_train = tr_out[train_index]
        X_test = tr_in_scaled[test_index]
        y_test = tr_out[test_index]
        pca = PCA(n_components=4)
        pca.fit(X_train)
        X_tr_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        regression_type.fit(X_tr_pca, y_train)
        y_pred = regression_type.predict(X_test_pca)
        score = mean_squared_error(y_test, y_pred)
        print score
        if score < min(scores+[50]):
            regression = regression_type
            best_score = score
        scores.append(score)
    results = [np.mean(scores), regression, best_score]
    return results

In [81]:
regression = SVR(kernel='linear')
cross_validation(tr_in, tr_out, regression, 0.2, 5)

0.967134987146
0.983332714343
0.982830039698
0.980513834457
0.958039434238


In [29]:
mean_tr = np.mean(tr_in,axis = 0)
std_tr = np.std(tr_in,axis = 0)

In [40]:
df = pd.DataFrame()
df = df.from_csv("reg_test_gp.csv", header=None, sep=',', index_col=None)
te_gp= np.asarray(df,dtype="float64")
X_test = np.divide(np.add(te_gp,-mean_tr),std_tr)

pca = PCA(n_components=4)
pca.fit(tr_in)
Z = pca.transform(tr_in)
Z_test = pca.transform(X_test)



In [41]:
from sklearn.svm import SVR
regression = SVR(kernel='rbf')
regression.fit(Z, tr_out)
y_pred = regression.predict(Z_test)

publish_pred(y_pred, "reg_pre")

SyntaxError: invalid syntax (<ipython-input-45-edd6c92da89f>, line 1)

In [30]:
np.shape(mean_tr)

(14,)