In [1]:
import csv
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt
# sklearn modules
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.svm import SVR
# Metric mean squared error, the lower the better
from sklearn.metrics import mean_squared_error

def read_csv(csv_file):
    '''Returns numpy array and panda version of the file'''
    csv_file = csv_file +".csv"
    df = pd.DataFrame()
    df = df.from_csv(csv_file, header=0, sep=',', index_col=0)
#n=df.shape[0] # number of samples
#d=df.shape[1] # number of features
    array = np.asarray(df,dtype="|S6")
    if np.shape(array)[1] == 1:
        array = array.ravel()
    return array, df

def publish_pred(y_pred, file_name):
    df = pd.DataFrame()
    df = df.from_csv("reg_sample_submission.csv", header=0, sep=',', index_col=0)
    df["Output"] = list(y_pred)
    df.to_csv(file_name)

tr_in, df_in = read_csv("reg_train_in")
tr_in = np.asarray(tr_in,dtype="float64")
tr_out, df_out = read_csv("reg_train_out")
tr_out = np.asarray(tr_out,dtype="float64")
te_in, df = read_csv("reg_test_in") # Still have to deal with NaNs best


X = preprocessing.scale(tr_in)# Clean (mean 0, variance 1) data-set
y = tr_out

# Training and Testing sets for cv

In [7]:
cv = ShuffleSplit(n_splits=1,  test_size=0.35, random_state=random.randint(0,20), train_size=None) # 10_splits
for train_index, test_index in cv.split(X):
    print("TRAIN:", len(list(train_index)), "TEST:", len(test_index))

('TRAIN:', 22230, 'TEST:', 11970)


In [8]:
X_train , y_train = X[train_index], y[train_index]
Xs , ys = X[test_index], y[test_index]

In [2]:
mean_tr = np.mean(tr_in,axis = 0)
std_tr = np.std(tr_in,axis = 0)
df = pd.DataFrame()
df = df.from_csv("reg_test_gp.csv", header=None, sep=',', index_col=None)
te_gp= np.asarray(df,dtype="float64")
X_test = np.divide(np.add(te_gp,-mean_tr),std_tr)

# Dimensionality reduction: 4-comp PCA

Everything below is good, the PCA with two comp for 0,1,2, etc is not as good 1 comp for 1,2 etc and then 0 separatelly

corr = [0,1,2,3,4,5,7,8,9,11,12,13]
PCA only on related features, leave 'noisy' features separate 7,11
pca = PCA(n_components=2)
pca.fit(X[:,corr])
print "Variance taken by each of the principal components", pca.explained_variance_ratio_
X_pca = pca.transform(X[:,corr])
Z = np.concatenate((X_pca,X[:,[6,10]]),axis=1)
X_test_pca = pca.transform(X_test[:,corr])
Z_test = np.concatenate((X_test_pca,X_test[:,[6,10]]),axis=1)

In [107]:
def use_pca(X,list_of_features, n_comp):
    out = list(set(range(0,14))-set(list_of_features))
    out.sort()
    pca = PCA(n_components=n_comp)
    pca.fit(X[:,list_of_features])
    X_pca = pca.transform(X[:,list_of_features])
    Z = np.concatenate((X_pca,X[:,out]),axis=1)
    print pca.explained_variance_ratio_
    return Z

In [108]:
corr = [1,2,3,4,5,7,8,9,11,12,13]
Z = use_pca(X,corr,1)
Z_test = use_pca(X_test,corr,1)

[ 0.97402246]
[ 0.94915771]


# Adding noise to the Network 

noise_Z = np.random.randn(34200,1)
Z_noisy = np.concatenate((Z,noise_Z),axis=1)
noise_Z_test = np.random.randn(1800,1)
Z_test_noisy = np.concatenate((Z_test,noise_Z_test),axis=1)

In [49]:
def add_noise_feature(X):
    noise_X = np.random.randn(np.shape(X)[0],1)
    X_noisy = np.concatenate((X,noise_X),axis=1)
    return X_noisy

# Multilayer perceptron 


In [110]:
from sklearn.neural_network import MLPRegressor
regression = MLPRegressor(hidden_layer_sizes=(50,50,50,50), activation='relu', solver='sgd', alpha=1)
regression.fit(add_noise_feature(Z), y)
y_pred_test = regression.predict(add_noise_feature(Z_test))
y_pred = regression.predict(add_noise_feature(Z))
score = mean_squared_error(y, y_pred)
print np.sqrt(score)
# to improve 0.680718311155 without New PCA and add_noise
# with noise and everythin still 0.680920057225

0.680920057225


In [10]:
publish_pred(y_pred_test, "reg_pred.csv")

http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor

0.628975302474 hidden_layer_sizes=(50,50,50,50,50), activation='tanh', solver='sgd', alpha=0.1 vs 0.77 in Kaggle
0.683643321796 hidden_layer_sizes=(50,50,50,50), activation='tanh', solver='sgd', alpha=1
0.680718311155 hidden_layer_sizes=(50,50,50,50), activation='relu', solver='sgd', alpha=1)

# Extra feature with just gaussian noise : no improvement 
Good way to reduce overfitting of the training set

In [48]:
from sklearn.neural_network import MLPRegressor
regression = MLPRegressor(hidden_layer_sizes=(10,10,10,10,10,10,10), activation='relu', solver='sgd', alpha=0.1)
regression.fit(Z_noisy, y)
y_pred_test = regression.predict(Z_test_noisy)
y_pred = regression.predict(Z_noisy)
score = mean_squared_error(y, y_pred)
print np.sqrt(score)

0.68499305435


In [52]:
from sklearn.neural_network import MLPRegressor
regression = MLPRegressor(hidden_layer_sizes=(50,50,50,50), activation='relu', solver='sgd', alpha=0.1)
regression.fit(add_noise_feature(X), y)
y_pred_test = regression.predict(add_noise_feature(X_test))
y_pred = regression.predict(add_noise_feature(X))
score = mean_squared_error(y, y_pred)
print np.sqrt(score)

0.617552541041


publish_pred(y_pred_test, "reg_pred.csv")

0.6828975302474 (noisy) hidden_layer_sizes=(50,50,50,50,50), activation='tanh', solver='sgd', alpha=0.1 vs 0.77 in Kaggle