# Ranking train data on how similar to test data they are, and having a good validation dataset ( first suggested by https://github.com/zygmuntz)

In [9]:

"train a classifier to distinguish between train and test"
"save train examples in order of similarity to test (ascending)"

import numpy as np
import pandas as pd
from sklearn import cross_validation as CV
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score as AUC

train_file = 'numerai_training_data.csv'
test_file = 'numerai_tournament_data.csv'
output_file = 'train_sorted.csv'



train = pd.read_csv( train_file )
test = pd.read_csv( test_file )

test.drop( 't_id', axis = 1, inplace = True )
test['target'] = 0 # dummy for preserving column order when concatenating

train['is_test'] = 0
test['is_test'] = 1

orig_train = train.copy()
assert( np.all( orig_train.columns == test.columns ))

train = pd.concat(( orig_train, test ))
train.reset_index( inplace = True, drop = True )

x = train.drop( [ 'is_test', 'target' ], axis = 1 )
y = train.is_test


n_estimators = 200
clf = RF( n_estimators = n_estimators, n_jobs = -1 )

predictions = np.zeros( y.shape )

cv = CV.StratifiedKFold( y, n_folds = 5, shuffle = True, random_state = 10000 )

for f, ( train_i, test_i ) in enumerate( cv ):

    print("# fold {}".format( f + 1 ))

    x_train = x.iloc[train_i]
    x_test = x.iloc[test_i]
    y_train = y.iloc[train_i]
    y_test = y.iloc[test_i]
    
    clf.fit( x_train, y_train )	

    p = clf.predict_proba( x_test )[:,1]

    auc = AUC( y_test, p )
    print("# AUC: {:.2%}\n".format( auc ))
    predictions[ test_i ] = p


train['p'] = predictions

i = predictions.argsort()
train_sorted = train.iloc[i]


train_sorted = train_sorted.loc[ train_sorted.is_test == 0 ]
assert( train_sorted.target.sum() == orig_train.target.sum())

train_sorted.drop( 'is_test', axis = 1, inplace = True )
train_sorted.to_csv( output_file, index = False )

# fold 1
# AUC: 78.73%

# fold 2
# AUC: 78.53%

# fold 3
# AUC: 79.92%

# fold 4
# AUC: 78.84%

# fold 5
# AUC: 79.22%



# Using Gridsearch to choose the best C parameter of Logistic regression 

In [27]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import log_loss



train_file = 'train_sorted.csv'
test_file = 'numerai_tournament_data.csv'
lr_output_file = 'lr.csv'


train = pd.read_csv( train_file )
test = pd.read_csv( test_file )

x_train = train.drop( ['target','p'], axis = 1 )
y_train = train.target.values
x_test = test.drop( 't_id', axis = 1 )
val_size = len(test)

val = train.iloc[-val_size:]
y_val = val.target.values
x_val = val.drop( ['target','p'], axis = 1 )

pipe_lr = Pipeline([('lr', LR())])
params_lr= dict(lr__C=[.7+.1*k for k in range(10)])
grid_search = GridSearchCV(pipe_lr, param_grid=params_lr)
grid_search.fit( x_train, y_train )
y_lr=grid_search.predict_proba( x_test )[:,1]
auc = AUC( y_val, y_lr )
ll = log_loss( y_val, y_lr )
print("AUC: {:.2%}, log loss: {:.2%} \n".format( auc, ll ))
test['y_lr'] = y_lr
test.to_csv( lr_output_file, columns = ( 't_id', 'y_lr' ), header = ( 't_id', 'probability' ), index = None )

AUC: 50.32%, log loss: 69.43% 



# How adding polynomial features will modify the result?

In [28]:
from sklearn.preprocessing import PolynomialFeatures

poly_output_file = 'lr_poly.csv'
poly_output_file_iO = 'lr_poly_interactionOnly.csv'

lr_poly = make_pipeline( PolynomialFeatures(degree=2), LR()) 
lr_poly.fit( x_train, y_train )
lr_poly_iO = make_pipeline( PolynomialFeatures(degree=2,interaction_only=True), LR()) 
lr_poly_iO.fit( x_train, y_train )

y_poly = lr_poly.predict_proba( x_test )[:,1]
auc = AUC( y_val, y_poly )
ll = log_loss( y_val, y_poly )
print("AUC: {:.2%}, log loss: {:.2%} \n".format( auc, ll ))

y_poly_iO = lr_poly_iO.predict_proba( x_test )[:,1]
auc = AUC( y_val, y_poly_iO )
ll = log_loss( y_val, y_poly_iO )
print("AUC: {:.2%}, log loss: {:.2%} \n".format( auc, ll ))

test['y_poly'] = y_poly
test['y_poly_iO'] = y_poly_iO
test.to_csv( poly_output_file, columns = ( 't_id', 'y_poly' ), header = ( 't_id', 'probability' ), index = None )
test.to_csv( poly_output_file_iO, columns = ( 't_id', 'y_poly_iO' ), header = ( 't_id', 'probability' ), index = None )

AUC: 50.38%, log loss: 69.55% 

AUC: 50.34%, log loss: 69.55% 



# Ensemble learning classification methods

In [40]:
import numpy as np
np.random.seed(10)

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

n_estimator = 8
# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, Y_train, y_train_lr = train_test_split(x_train,
                                                            y_train,
                                                            test_size=0.5,random_state=int(2e+4))

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
    random_state=0)

rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, Y_train)
y_pred_rt = pipeline.predict_proba(x_test)[:, 1]
auc = AUC( y_val, y_pred_rt )
ll = log_loss( y_val, y_pred_rt )
print("AUC: {:.2%}, log loss: {:.2%} \n".format( auc, ll ))
test['y_rt'] = y_pred_rt
test.to_csv( 'y_rt.csv', columns = ( 't_id', 'y_rt' ), header = ( 't_id', 'probability' ), index = None )

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, Y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(x_test)))[:, 1]
auc = AUC( y_val, y_pred_rf_lm )
ll = log_loss( y_val, y_pred_rf_lm )
print("AUC: {:.2%}, log loss: {:.2%} \n".format( auc, ll ))
test['y_rf+lr'] = y_pred_rf_lm
test.to_csv( 'y_rf+lr.csv', columns = ( 't_id', 'y_rf+lr' ), header = ( 't_id', 'probability' ), index = None )


grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, Y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(grd_enc.transform(grd.apply(x_test)[:, :, 0]))[:, 1]
auc = AUC( y_val, y_pred_grd_lm )
ll = log_loss( y_val, y_pred_grd_lm )
print("AUC: {:.2%}, log loss: {:.2%} \n".format( auc, ll ))
test['y_gb+lr'] = y_pred_grd_lm
test.to_csv( 'y_gb+lr.csv', columns = ( 't_id', 'y_gb+lr' ), header = ( 't_id', 'probability' ), index = None )

# The gradient boosted model by itself
y_pred_grd = grd.predict_proba(x_test)[:, 1]
auc = AUC( y_val, y_pred_grd )
ll = log_loss( y_val, y_pred_grd )
print("AUC: {:.2%}, log loss: {:.2%} \n".format( auc, ll ))
test['y_gb'] = y_pred_grd
test.to_csv( 'y_gb.csv', columns = ( 't_id', 'y_gb' ), header = ( 't_id', 'probability' ), index = None )


# The random forest model by itself
y_pred_rf = rf.predict_proba(x_test)[:, 1]
auc = AUC( y_val, y_pred_rf )
ll = log_loss( y_val, y_pred_rf )
print("AUC: {:.2%}, log loss: {:.2%} \n".format( auc, ll ))
test['y_rf'] = y_pred_rf
test.to_csv( 'y_rf.csv', columns = ( 't_id', 'y_rf' ), header = ( 't_id', 'probability' ), index = None )

AUC: 49.79%, log loss: 69.47% 

AUC: 50.15%, log loss: 69.49% 

AUC: 50.16%, log loss: 69.47% 

AUC: 50.13%, log loss: 69.33% 

AUC: 49.99%, log loss: 69.37% 



# PCA dimension reduction

In [30]:
from sklearn import decomposition


pca = decomposition.PCA()
lr=LR()
pipe = Pipeline(steps=[('pca', pca), ('lr', lr)])

n_components = [7, 14, 21]
Cs = np.logspace(-4, 4, 3)

#Parameters of pipelines can be set using ‘__’ separated parameter names:

estimator = GridSearchCV(pipe,dict(pca__n_components=n_components,lr__C=Cs))
estimator.fit(x_train, y_train)
y_pca=estimator.predict_proba( x_test )[:,1]
auc = AUC( y_val, y_pca )
ll = log_loss( y_val, y_pca )
print("AUC: {:.2%}, log loss: {:.2%} \n".format( auc, ll ))
test['y_PCA'] = y_pca
test.to_csv( 'y_PCA.csv', columns = ( 't_id', 'y_PCA' ), header = ( 't_id', 'probability' ), index = None )

AUC: 50.22%, log loss: 69.42% 

