In [1]:
import pandas as pd
import numpy as np

import tubesml as tml

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb

from source import utility as ut
from source import transform as tr

import warnings

pd.set_option('max_columns', 100)

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
subs = pd.read_csv('data/sample_submission.csv')

In [3]:
train_set, test_set = ut.make_test(df_train, 0.25, random_state=516, strat_feat='cat9')

train_set.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
217278,362397,A,A,A,A,B,D,A,E,G,F,0.992968,0.062644,0.137455,0.771707,0.494401,0.889793,0.477072,0.777355,0.894772,0.832945,0.528586,0.712178,0.876089,0.594609,6.405349
168076,280139,A,A,A,A,B,A,A,E,A,F,0.70634,0.297613,0.57449,0.685358,0.280016,0.980636,0.875379,0.650188,0.210627,0.543731,0.551951,0.821223,0.898238,0.422612,7.616449
85899,142986,B,B,A,C,B,D,A,E,G,M,0.735604,0.770009,0.350257,0.561776,0.266121,0.918674,0.828429,0.345399,0.901242,0.759109,0.726503,0.637654,0.901267,0.729128,7.678379
66230,110149,A,B,A,C,B,D,A,E,G,I,0.85576,0.617747,0.145037,0.21851,0.677832,0.520412,0.304368,0.67624,0.862597,0.153856,0.177909,0.848022,0.311321,0.766169,6.642889
202344,337370,A,A,A,C,B,B,A,E,C,F,0.39799,0.069838,0.452991,0.174816,0.775703,0.30572,0.349109,0.70281,0.304682,0.441488,0.240995,0.381703,0.302658,0.719237,8.427913


In [4]:
del train_set['id']
del test_set['id']

In [6]:
numeric_pipe = Pipeline([('fs', tml.DtypeSel('numeric'))])


cat_pipe = Pipeline([('fs', tml.DtypeSel('category')),
                     ('simp', tr.CatSimp()),
                     ('dummies', tml.Dummify(match_cols=True))])


processing_pipe = tml.FeatureUnionDf(transformer_list=[('cat_pipe', cat_pipe),
                                                 ('num_pipe', numeric_pipe)])


full_pipe = Pipeline([('processing', processing_pipe), 
                      ('scaler', tml.DfScaler())])

In [7]:
tmp = train_set.copy()

full_pipe.fit_transform(tmp).head()

Unnamed: 0,cat0_A,cat0_B,cat1_A,cat1_B,cat2_A,cat2_B,cat3_A,cat3_B,cat3_C,cat3_D,cat4_A,cat4_B,cat4_C,cat5_A,cat5_B,cat5_C,cat5_D,cat6_A,cat6_B,cat7_B,cat7_D,cat7_E,cat7_G,cat8_A,cat8_C,cat8_D,cat8_E,cat8_G,cat9_A,cat9_B,cat9_F,cat9_G,cat9_H,cat9_I,cat9_J,cat9_K,cat9_L,cat9_M,cat9_N,cat9_O,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
217278,0.256535,-0.256535,0.918681,-0.918681,0.291209,-0.291209,1.369989,-0.045311,-1.258869,-0.196661,-0.079071,0.0942,-0.050881,-0.114408,-0.996903,-0.201547,1.106445,0.158618,-0.158618,-0.13998,-0.297148,0.347856,-0.08746,-0.37946,-0.822551,-0.112235,-0.682473,2.474598,-0.216297,-0.107617,1.340293,-0.189592,-0.299921,-0.447557,-0.155513,-0.274033,-0.406165,-0.184127,-0.117886,-0.144949,2.287695,-1.884997,-1.533378,1.363426,0.195187,1.648689,-0.004339,1.571904,1.975767,1.502005,0.119316,0.792054,1.594923,0.405926,-1.185823
168076,0.256535,-0.256535,0.918681,-0.918681,0.291209,-0.291209,1.369989,-0.045311,-1.258869,-0.196661,-0.079071,0.0942,-0.050881,8.740674,-0.996903,-0.201547,-0.903795,0.158618,-0.158618,-0.13998,-0.297148,0.347856,-0.08746,2.635321,-0.822551,-0.112235,-0.682473,-0.404106,-0.216297,-0.107617,1.340293,-0.189592,-0.299921,-0.447557,-0.155513,-0.274033,-0.406165,-0.184127,-0.117886,-0.144949,0.888085,-0.886978,0.651933,1.001687,-0.87299,2.04111,2.067582,0.950252,-1.127138,0.083912,0.235358,1.264619,1.695274,-0.357946,0.179625
85899,-3.898103,3.898103,-1.088517,1.088517,0.291209,-0.291209,-0.729933,-0.045311,0.794364,-0.196661,-0.079071,0.0942,-0.050881,-0.114408,-0.996903,-0.201547,1.106445,0.158618,-0.158618,-0.13998,-0.297148,0.347856,-0.08746,-0.37946,-0.822551,-0.112235,-0.682473,2.474598,-0.216297,-0.107617,-0.746105,-0.189592,-0.299921,-0.447557,-0.155513,-0.274033,-0.406165,5.431028,-0.117886,-0.144949,1.030983,1.119491,-0.469301,0.483966,-0.942224,1.773447,1.823353,-0.539708,2.005114,1.139966,1.102253,0.469091,1.708997,1.003351,0.249448
66230,0.256535,-0.256535,-1.088517,1.088517,0.291209,-0.291209,-0.729933,-0.045311,0.794364,-0.196661,-0.079071,0.0942,-0.050881,-0.114408,-0.996903,-0.201547,1.106445,0.158618,-0.158618,-0.13998,-0.297148,0.347856,-0.08746,-0.37946,-0.822551,-0.112235,-0.682473,2.474598,-0.216297,-0.107617,-0.746105,-0.189592,-0.299921,2.234352,-0.155513,-0.274033,-0.406165,-0.184127,-0.117886,-0.144949,1.617708,0.472768,-1.495469,-0.954062,1.109139,0.053036,-0.902713,1.077606,1.829842,-1.827745,-1.622287,1.380757,-0.963831,1.16786,-0.918009
202344,0.256535,-0.256535,0.918681,-0.918681,0.291209,-0.291209,-0.729933,-0.045311,0.794364,-0.196661,-0.079071,0.0942,-0.050881,-0.114408,1.003107,-0.201547,-0.903795,0.158618,-0.158618,-0.13998,-0.297148,0.347856,-0.08746,-0.37946,1.215731,-0.112235,-0.682473,-0.404106,-0.216297,-0.107617,1.340293,-0.189592,-0.299921,-0.447557,-0.155513,-0.274033,-0.406165,-0.184127,-0.117886,-0.144949,-0.617599,-1.854438,0.044401,-1.137108,1.596782,-0.874391,-0.669978,1.207492,-0.700559,-0.417412,-1.308976,-0.640117,-1.003081,0.959426,1.094507


In [8]:
del tmp

y = train_set['target'].copy()
del train_set['target']
y_test = test_set['target'].copy()
del test_set['target']

In [8]:
models = [('lasso', Lasso(alpha=0.01)), ('ridge', Ridge()), ('sgd', SGDRegressor()), 
          ('xgb', xgb.XGBRegressor(n_estimators=200, objective='reg:squarederror', n_jobs=5)), 
          ('lgb', lgb.LGBMRegressor(n_estimators=200, n_jobs=5))]

mod_name = []
rmse_train = []
rmse_test = []

folds = KFold(5, shuffle=True, random_state=541)

warnings.filterwarnings("ignore", 
                        message="The dummies in this set do not match the ones in the train set, we corrected the issue.")

for model in models:
    
    train = train_set.copy()
    test = test_set.copy()
    print(model[0])
    mod_name.append(model[0])
    
    pipe = [('processing', processing_pipe),
             ('scaler', tml.DfScaler())] + [model]
    
    model_pipe = Pipeline(pipe)
            
    inf_preds = tml.cv_score(data=train, target=y, cv=folds, estimator=model_pipe)
    
    model_pipe.fit(train, y)  # refit on full train set
    
    preds = model_pipe.predict(test)
    
    rmse_train.append(mean_squared_error(y, inf_preds))
    rmse_test.append(mean_squared_error(y_test, preds))
    
    print(f'\tTrain set RMSE: {round(np.sqrt(mean_squared_error(y, inf_preds)), 4)}')
    print(f'\tTest set RMSE: {round(np.sqrt(mean_squared_error(y_test, preds)), 4)}')
    
    print('_'*40)
    print('\n')
    
results = pd.DataFrame({'model_name': mod_name, 
                        'rmse_train': rmse_train, 'rmse_test': rmse_test})

results

lasso
	Train set RMSE: 0.8656
	Test set RMSE: 0.8849
________________________________________


ridge
	Train set RMSE: 0.8637
	Test set RMSE: 0.8839
________________________________________


sgd
	Train set RMSE: 223495306393.6919
	Test set RMSE: 0.8847
________________________________________


xgb
	Train set RMSE: 0.8541
	Test set RMSE: 0.8697
________________________________________


lgb
	Train set RMSE: 0.8457
	Test set RMSE: 0.8624
________________________________________




Unnamed: 0,model_name,rmse_train,rmse_test
0,lasso,0.7492038,0.783071
1,ridge,0.7459932,0.781301
2,sgd,4.995015e+22,0.782691
3,xgb,0.7294766,0.756377
4,lgb,0.7152755,0.743775


In [9]:
models = [('lasso', Lasso(alpha=0.01)), ('ridge', Ridge()), ('sgd', SGDRegressor()), 
          ('xgb', xgb.XGBRegressor(n_estimators=200, objective='reg:squarederror', n_jobs=5)), 
          ('lgb', lgb.LGBMRegressor(n_estimators=200, n_jobs=5))]

mod_name = []
rmse_train = []
rmse_test = []

folds = KFold(5, shuffle=True, random_state=541)

warnings.filterwarnings("ignore", 
                        message="The dummies in this set do not match the ones in the train set, we corrected the issue.")

for model in models:
    
    train = train_set.copy()
    test = test_set.copy()
    print(model[0])
    mod_name.append(model[0])
    
    pipe = [('processing', full_pipe)] + [model]
    
    model_pipe = Pipeline(pipe)
            
    inf_preds = tml.cv_score(data=train, target=y, cv=folds, estimator=model_pipe)
    
    model_pipe.fit(train, y)  # refit on full train set
    
    preds = model_pipe.predict(test)
    
    rmse_train.append(mean_squared_error(y, inf_preds))
    rmse_test.append(mean_squared_error(y_test, preds))
    
    print(f'\tTrain set RMSE: {round(np.sqrt(mean_squared_error(y, inf_preds)), 4)}')
    print(f'\tTest set RMSE: {round(np.sqrt(mean_squared_error(y_test, preds)), 4)}')
    
    print('_'*40)
    print('\n')
    
results = pd.DataFrame({'model_name': mod_name, 
                        'rmse_train': rmse_train, 'rmse_test': rmse_test})

results

lasso
	Train set RMSE: 0.8656
	Test set RMSE: 0.8849
________________________________________


ridge
	Train set RMSE: 0.8637
	Test set RMSE: 0.8839
________________________________________


sgd
	Train set RMSE: 0.8665
	Test set RMSE: 0.8851
________________________________________


xgb
	Train set RMSE: 0.854
	Test set RMSE: 0.8697
________________________________________


lgb
	Train set RMSE: 0.8456
	Test set RMSE: 0.8624
________________________________________




Unnamed: 0,model_name,rmse_train,rmse_test
0,lasso,0.749187,0.783071
1,ridge,0.745986,0.781301
2,sgd,0.750808,0.78348
3,xgb,0.729254,0.756377
4,lgb,0.715122,0.743775
