In [1]:
import pandas as pd
import numpy as np

import tubesml as tml

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb

from source import utility as ut
from source import transform as tr
from source.train import train_model

import warnings

pd.set_option('max_columns', 100)

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
subs = pd.read_csv('data/sample_submission.csv')

In [3]:
train_set, test_set = ut.make_test(df_train, 0.25, random_state=516, strat_feat='cat9')

train_set.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
217278,362397,A,A,A,A,B,D,A,E,G,F,0.992968,0.062644,0.137455,0.771707,0.494401,0.889793,0.477072,0.777355,0.894772,0.832945,0.528586,0.712178,0.876089,0.594609,6.405349
168076,280139,A,A,A,A,B,A,A,E,A,F,0.70634,0.297613,0.57449,0.685358,0.280016,0.980636,0.875379,0.650188,0.210627,0.543731,0.551951,0.821223,0.898238,0.422612,7.616449
85899,142986,B,B,A,C,B,D,A,E,G,M,0.735604,0.770009,0.350257,0.561776,0.266121,0.918674,0.828429,0.345399,0.901242,0.759109,0.726503,0.637654,0.901267,0.729128,7.678379
66230,110149,A,B,A,C,B,D,A,E,G,I,0.85576,0.617747,0.145037,0.21851,0.677832,0.520412,0.304368,0.67624,0.862597,0.153856,0.177909,0.848022,0.311321,0.766169,6.642889
202344,337370,A,A,A,C,B,B,A,E,C,F,0.39799,0.069838,0.452991,0.174816,0.775703,0.30572,0.349109,0.70281,0.304682,0.441488,0.240995,0.381703,0.302658,0.719237,8.427913


In [4]:
del train_set['id']
del test_set['id']

In [5]:
numeric_pipe = Pipeline([('fs', tml.DtypeSel('numeric'))])


cat_pipe = Pipeline([('fs', tml.DtypeSel('category')),
                     ('tarenc', tr.TargetEncoder(to_encode=[f'cat{c}' for c in range(3,10)])),
                     ('dummies', tml.Dummify(match_cols=True, drop_first=True))])


processing_pipe = tml.FeatureUnionDf(transformer_list=[('cat_pipe', cat_pipe),
                                                 ('num_pipe', numeric_pipe)])


full_pipe = Pipeline([('processing', processing_pipe), 
                      ('scaler', tml.DfScaler())])

In [6]:
tmp = train_set.copy()

full_pipe.fit_transform(tmp, train_set['target']).head()

Unnamed: 0,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat0_B,cat1_B,cat2_B,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
217278,-1.341853,-0.051596,0.332185,-0.155146,-0.315579,-2.154934,0.25267,-0.256535,-0.918681,-0.291209,2.287695,-1.884997,-1.533378,1.363426,0.195187,1.648689,-0.004339,1.571904,1.975767,1.502005,0.119316,0.792054,1.594923,0.405926,-1.185823
168076,-1.341853,-0.051596,-0.184932,-0.155146,-0.315579,0.384099,0.25267,-0.256535,-0.918681,-0.291209,0.888085,-0.886978,0.651933,1.001687,-0.87299,2.04111,2.067582,0.950252,-1.127138,0.083912,0.235358,1.264619,1.695274,-0.357946,0.179625
85899,0.777714,-0.051596,0.332185,-0.155146,-0.315579,-2.154934,0.237122,3.898103,1.088517,-0.291209,1.030983,1.119491,-0.469301,0.483966,-0.942224,1.773447,1.823353,-0.539708,2.005114,1.139966,1.102253,0.469091,1.708997,1.003351,0.249448
66230,0.777714,-0.051596,0.332185,-0.155146,-0.315579,-2.154934,-1.342507,-0.256535,1.088517,-0.291209,1.617708,0.472768,-1.495469,-0.954062,1.109139,0.053036,-0.902713,1.077606,1.829842,-1.827745,-1.622287,1.380757,-0.963831,1.16786,-0.918009
202344,0.777714,-0.051596,0.090663,-0.155146,-0.315579,-0.165315,0.25267,-0.256535,-0.918681,-0.291209,-0.617599,-1.854438,0.044401,-1.137108,1.596782,-0.874391,-0.669978,1.207492,-0.700559,-0.417412,-1.308976,-0.640117,-1.003081,0.959426,1.094507


In [7]:
del tmp

y = train_set['target'].copy()
del train_set['target']
y_test = test_set['target'].copy()
del test_set['target']

In [8]:
models = [('lasso', Lasso(alpha=0.01)), ('ridge', Ridge()), ('sgd', SGDRegressor()), 
          ('xgb', xgb.XGBRegressor(n_estimators=200, objective='reg:squarederror', n_jobs=5)), 
          ('lgb', lgb.LGBMRegressor(n_estimators=200, n_jobs=5))]

mod_name = []
rmse_train = []
rmse_test = []

folds = KFold(5, shuffle=True, random_state=541)

warnings.filterwarnings("ignore", 
                        message="The dummies in this set do not match the ones in the train set, we corrected the issue.")

for model in models:
    
    train = train_set.copy()
    test = test_set.copy()
    print(model[0])
    mod_name.append(model[0])
    
    pipe = [('processing', processing_pipe),
             ('scaler', tml.DfScaler())] + [model]
    
    model_pipe = Pipeline(pipe)
            
    inf_preds = tml.cv_score(data=train, target=y, cv=folds, estimator=model_pipe)
    
    model_pipe.fit(train, y)  # refit on full train set
    
    preds = model_pipe.predict(test)
    
    rmse_train.append(mean_squared_error(y, inf_preds))
    rmse_test.append(mean_squared_error(y_test, preds))
    
    print(f'\tTrain set RMSE: {round(np.sqrt(mean_squared_error(y, inf_preds)), 4)}')
    print(f'\tTest set RMSE: {round(np.sqrt(mean_squared_error(y_test, preds)), 4)}')
    
    print('_'*40)
    print('\n')
    
results = pd.DataFrame({'model_name': mod_name, 
                        'rmse_train': rmse_train, 'rmse_test': rmse_test})

results

lasso
	Train set RMSE: 0.8656
	Test set RMSE: 0.8849
________________________________________


ridge
	Train set RMSE: 0.8637
	Test set RMSE: 0.8839
________________________________________


sgd
	Train set RMSE: 0.8678
	Test set RMSE: 0.8846
________________________________________


xgb
	Train set RMSE: 0.854
	Test set RMSE: 0.8697
________________________________________


lgb
	Train set RMSE: 0.8456
	Test set RMSE: 0.8624
________________________________________




Unnamed: 0,model_name,rmse_train,rmse_test
0,lasso,0.749187,0.783071
1,ridge,0.745986,0.781301
2,sgd,0.753137,0.782587
3,xgb,0.729254,0.756377
4,lgb,0.715122,0.743775


In [8]:
mod = xgb.XGBRegressor(n_estimators=2000, objective='reg:squarederror', n_jobs=5)
folds = KFold(10, shuffle=True, random_state=541)

oof, pred, imp = train_model(train_set, test_set, y, full_pipe, mod, folds, early_stopping=100, verbose=False)

print(np.sqrt(mean_squared_error(y_pred=oof, y_true=y)))
print(np.sqrt(mean_squared_error(y_pred=pred, y_true=y_test)))

imp

0.8481473588595013
0.8454222119992677


Unnamed: 0_level_0,mean,std
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
cat1_B,0.23075,0.006698
cat2_B,0.221456,0.011769
cat6,0.064192,0.002688
cat0_B,0.054665,0.002437
cat9,0.032724,0.000987
cont8,0.030014,0.000784
cont13,0.027288,0.000646
cat5,0.027073,0.000829
cont11,0.026293,0.000651
cont0,0.025445,0.000604


In [9]:
mod = lgb.LGBMRegressor(n_estimators=2000, n_jobs=5)
folds = KFold(10, shuffle=True, random_state=541)

oof, pred, imp = train_model(train_set, test_set, y, full_pipe, mod, folds, early_stopping=100, verbose=False)

print(np.sqrt(mean_squared_error(y_pred=oof, y_true=y)))
print(np.sqrt(mean_squared_error(y_pred=pred, y_true=y_test)))
imp.head(15)

0.8451908031386214
0.8452139942274264


Unnamed: 0_level_0,mean,std
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
cont8,730.4,45.363334
cont0,611.2,45.118445
cont9,594.1,45.001358
cont6,526.8,45.859366
cont5,510.2,42.017868
cont11,499.0,43.984004
cont13,481.8,48.921656
cont1,464.9,46.19028
cont10,442.3,40.481988
cont3,407.1,46.318663


In [8]:
numeric_pipe = Pipeline([('fs', tml.DtypeSel('numeric'))])


cat_pipe = Pipeline([('fs', tml.DtypeSel('category')),
                     ('tarenc', tr.TargetEncoder(to_encode=[f'cat{c}' for c in range(3,10)])),
                     ('dummies', tml.Dummify(match_cols=True, drop_first=True))])


processing_pipe = tml.FeatureUnionDf(transformer_list=[('cat_pipe', cat_pipe),
                                                 ('num_pipe', numeric_pipe)])


full_pipe = Pipeline([('processing', processing_pipe), 
                      ('scaler', tml.DfScaler()), 
                      ('pca', tr.PCADf(n_components=0.9))])

In [9]:
mod = xgb.XGBRegressor(n_estimators=2000, objective='reg:squarederror', n_jobs=5)
folds = KFold(10, shuffle=True, random_state=541)

oof, pred, imp = train_model(train_set, test_set, y, full_pipe, mod, folds, early_stopping=100, verbose=False)

print(np.sqrt(mean_squared_error(y_pred=oof, y_true=y)))
print(np.sqrt(mean_squared_error(y_pred=pred, y_true=y_test)))

imp

0.8675225751364006
0.8662722779671723


Unnamed: 0_level_0,mean,std
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
pca_2,0.261035,0.003658
pca_1,0.047164,0.000578
pca_8,0.046357,0.000831
pca_9,0.045638,0.001325
pca_15,0.044974,0.000405
pca_18,0.043846,0.000501
pca_10,0.043258,0.000638
pca_17,0.042455,0.000441
pca_6,0.042118,0.000807
pca_5,0.041407,0.000591


In [10]:
mod = lgb.LGBMRegressor(n_estimators=2000, n_jobs=5)
folds = KFold(10, shuffle=True, random_state=541)

oof, pred, imp = train_model(train_set, test_set, y, full_pipe, mod, folds, early_stopping=100, verbose=False)

print(np.sqrt(mean_squared_error(y_pred=oof, y_true=y)))
print(np.sqrt(mean_squared_error(y_pred=pred, y_true=y_test)))
imp.head(15)

0.8661063919617858
0.8660830971420326


Unnamed: 0_level_0,mean,std
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
pca_2,217.2,16.096008
pca_1,200.9,23.961621
pca_15,194.7,25.679434
pca_0,191.9,25.759309
pca_18,165.0,24.991357
pca_3,153.3,17.333369
pca_10,152.1,22.073642
pca_17,150.9,22.948385
pca_8,148.8,18.594968
pca_4,144.4,18.815019


In [18]:
numeric_pipe = Pipeline([('fs', tml.DtypeSel('numeric'))])


cat_pipe = Pipeline([('fs', tml.DtypeSel('category')),
                     ('tarenc', tr.TargetEncoder(to_encode=[f'cat{c}' for c in range(3,10)])),
                     ('dummies', tml.Dummify(match_cols=True, drop_first=True))])


processing_pipe = tml.FeatureUnionDf(transformer_list=[('cat_pipe', cat_pipe),
                                                 ('num_pipe', numeric_pipe)])


full_pipe = Pipeline([('processing', processing_pipe), 
                      ('scaler', tml.DfScaler()), 
                      ('pca', tr.PCADf(n_components=0.95, compress=True))])

In [19]:
mod = xgb.XGBRegressor(n_estimators=2000, objective='reg:squarederror', n_jobs=5)
folds = KFold(10, shuffle=True, random_state=541)

oof, pred, imp = train_model(train_set, test_set, y, full_pipe, mod, folds, early_stopping=100, verbose=False)

print(np.sqrt(mean_squared_error(y_pred=oof, y_true=y)))
print(np.sqrt(mean_squared_error(y_pred=pred, y_true=y_test)))

imp

0.8557500308427787
0.8531082163660006


Unnamed: 0_level_0,mean,std
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
cat1_B,0.112212,0.002621
cat2_B,0.075384,0.001364
cont11,0.05628,0.000751
cont13,0.056132,0.000703
cat9,0.053214,0.000556
cont6,0.043474,0.000555
cat8,0.041205,0.000551
cont10,0.040057,0.000409
cont0,0.03957,0.000193
cont3,0.0392,0.000293


In [20]:
mod = lgb.LGBMRegressor(n_estimators=2000, n_jobs=5)
folds = KFold(10, shuffle=True, random_state=541)

oof, pred, imp = train_model(train_set, test_set, y, full_pipe, mod, folds, early_stopping=100, verbose=False)

print(np.sqrt(mean_squared_error(y_pred=oof, y_true=y)))
print(np.sqrt(mean_squared_error(y_pred=pred, y_true=y_test)))
imp.head(15)

0.852784202767206
0.8525180027022232


Unnamed: 0_level_0,mean,std
feat,Unnamed: 1_level_1,Unnamed: 2_level_1
cont6,378.4,21.486832
cont11,367.0,20.205121
cont1,319.9,21.955876
cont13,319.4,18.12181
cont10,313.5,18.250824
cont0,297.5,18.09662
cont3,280.5,22.522006
cont2,269.6,20.355604
cat5,249.3,20.779887
cat3,248.5,18.006343
