In [68]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn import impute
import itertools as it
import time as time
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from catboost import CatBoostRegressor

data = pd.read_csv('train.csv')

# Log transformation
y = np.log(data.y)
X = data.drop('y', axis=1)


# Drop columns with >5% NaN
inst = X.shape[0]
nan_col = [col for col in X if (X[col].isna().sum() / inst > 0.05)]
X = X.drop(nan_col, axis=1)


#Impute NaN using KNNImputer
imputer = impute.KNNImputer(n_neighbors=5)
X = imputer.fit_transform(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

X = pd.DataFrame(X)

In [69]:
# Delete 0 variance columns
X = pd.DataFrame(X)
zero_var = []

print(X.shape)

res=[]
for col in X:
     if len(X[col].unique()) == 1:
        res.append(col)
        X = X.drop(col, axis=1)

print('Columns dropped:', len(res))
# 11 columns with only 1 value

print(X.shape)

(5380, 727)
Columns dropped: 11
(5380, 716)


In [70]:
#Feature Selection
np.warnings.filterwarnings('ignore')

#Set random columns to assist search for non-important columns
from numpy import random
random.seed(seed=0)
for i in range(X.shape[1]):
  random = pd.Series(np.random.randn(X.shape[0]))
  X[f'random{i}'] = random

X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,random706,random707,random708,random709,random710,random711,random712,random713,random714,random715
0,-1.731729,0.970650,-0.607871,1.646275,0.000475,1.290638,0.292676,-0.579861,-0.612177,-0.278328,...,-1.350809,-0.444715,-0.617309,2.260477,0.817205,-0.921166,0.352251,1.115675,-0.144248,-0.924522
1,-1.731085,-0.823592,-0.024523,-1.275256,-0.298702,-0.712843,-1.443005,0.958220,-0.101120,0.742470,...,0.566180,-1.077653,0.953611,-1.257498,-0.159406,1.578460,-0.408644,-0.311205,0.174422,1.412041
2,-1.730441,-0.269384,-0.860578,-0.786297,-0.302801,-0.269982,-1.443005,-0.512824,-0.942933,-0.150716,...,-0.190203,-0.052280,0.271154,0.480227,1.471382,1.196333,-0.543250,-1.797030,0.880623,-0.944331
3,-1.729797,-0.639873,0.145935,-0.676281,-0.294604,-0.447105,-1.443005,0.219178,-0.021785,-0.282743,...,-0.240556,1.257769,-1.271715,0.150072,0.662781,0.762751,0.230514,0.342159,0.470614,0.135640
4,-1.729153,-0.299226,-0.891836,1.352899,-0.261818,-0.852600,-0.267221,-0.592556,-0.884625,-0.275005,...,1.782200,-0.230312,2.062528,-0.431987,1.347628,0.402258,-1.541744,0.961216,-0.405204,-1.544741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5375,1.729153,-0.811236,0.872754,0.387205,-0.282309,-1.070487,0.964553,-0.179294,0.701509,-0.288272,...,0.344991,-0.527333,0.500919,-1.021350,-1.252521,-0.202212,-1.444496,-1.273921,-1.163294,-0.405767
5376,1.729797,0.893357,0.620776,0.668357,-0.212638,0.061688,0.236687,-0.265555,0.571452,-0.273831,...,-0.654771,0.379921,-1.569201,1.439617,0.517010,-1.555055,-0.767590,0.369824,-0.762460,0.915443
5377,1.730441,-0.368885,0.746220,-0.065082,-0.257719,0.472798,0.516635,-0.374215,0.252689,-0.222228,...,-0.423307,1.751722,0.010549,0.480817,-1.149284,0.077780,0.147306,-1.903969,0.416462,-0.162161
5378,1.731085,-0.052157,-0.713373,1.695171,-0.257719,-0.892355,0.180697,-0.595953,-0.742034,-0.278881,...,-0.451310,0.955744,0.093256,-0.558246,0.329794,-1.201309,0.364055,-0.622458,-0.774547,-1.064202


In [71]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

X.columns = X.columns.astype(str)
rf = RandomForestRegressor().fit(X,y)

In [72]:
imp = rf.feature_importances_
importances = pd.DataFrame({'feature':X.columns, 'importance':imp})

random_imp = importances.loc[range(717, 1432), 'importance'].max()
random_imp

0.0009331991623544296

In [73]:
selected_predictors = importances[importances['importance'] > random_imp]['feature'].astype(str).tolist()

#Down to 62 predictors after importance selection
X = X.loc[:,selected_predictors]

In [28]:
# Tuned RandomForest
params = {'max_depth':[20, 25, 30],
          'max_leaf_nodes':[450, 500, 550],
          'bootstrap': [True],
         'max_features': [0.7, 0.8, 0.9, 1]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)

rf = GridSearchCV(RandomForestRegressor(random_state=1, n_jobs=-1, bootstrap=True, n_estimators=100), 
                                      param_grid =params, cv=cv, n_jobs=-1, verbose=1, scoring = 'neg_mean_squared_error')

rf.fit(X, y)

print('Best Parameters : ',rf.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters :  {'bootstrap': True, 'max_depth': 25, 'max_features': 0.8, 'max_leaf_nodes': 500}


In [74]:
tuned_rf = RandomForestRegressor(random_state=1, n_jobs=-1, bootstrap=True, n_estimators=100,
                                 max_depth= 25, max_features= 0.8, max_leaf_nodes= 500).fit(X,y)
np.sqrt(mean_squared_error(np.exp(y), np.exp(tuned_rf.predict(X))))


6.691244510862965

In [31]:
# Lasso
np.warnings.filterwarnings('ignore')

score = pd.DataFrame(columns=['a', 'rmse'])
for a in np.arange(0.001, 0.2, 0.001):
    lasso = Lasso(alpha=a)
    lasso.fit(X, y)
    
    score = score.append(pd.DataFrame({'alpha':a, 'rmse':-cross_val_score(lasso, X, y, cv=5, scoring='neg_mean_squared_error').mean()}, index=[0]))


In [32]:
score.reset_index(drop=True).loc[score['rmse'].argmin()]

a             NaN
rmse     0.505964
alpha       0.006
Name: 5, dtype: object

In [75]:
lasso = Lasso(alpha=0.06).fit(X,y)
#0.05 old one
np.sqrt(mean_squared_error(np.exp(y), np.exp(lasso.predict(X))))


10.682026010554948

In [34]:
# XGBoost
param_grid = {'max_depth': [4,6,8],
              'learning_rate': [0.01, 0.05, 0.1],
               'reg_lambda':[0, 1, 10],
                'n_estimators':[100, 500, 1000],
                'gamma': [0, 10, 100],
                'subsample': [0.5, 0.75, 1.0],
                'colsample_bytree': [0.5, 0.75, 1.0]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
optimal_params = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_distributions = param_grid,
                             n_iter=100,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                             scoring = 'neg_mean_squared_error')
optimal_params.fit(X,y)
print("Optimal parameter values =", optimal_params.best_params_)
print("Optimal cross validation R-squared = ",optimal_params.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Optimal parameter values = {'subsample': 0.5, 'reg_lambda': 1, 'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 1.0}
Optimal cross validation R-squared =  -0.4822806336503399


In [76]:
xg = xgb.XGBRegressor(random_state=1, subsample= 0.5, reg_lambda= 1, n_estimators= 500, 
                       max_depth= 4, learning_rate= 0.05, gamma= 0, colsample_bytree= 1.0).fit(X,y)
np.sqrt(mean_squared_error(np.exp(y), np.exp(xg.predict(X))))

6.093963975754468

In [36]:
# CatBoost

param_grid = {'max_depth': [4,6,8],
              'num_leaves': [20, 31, 40],
              'learning_rate': [0.01, 0.05, 0.1],
               'reg_lambda':[0, 10, 100],
                'n_estimators':[100, 500, 1000],
                'subsample': [0.5, 0.75, 1.0]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
optimal_params = RandomizedSearchCV(estimator=CatBoostRegressor(random_state=1, verbose=False),                                                       
                             param_distributions = param_grid, n_iter = 100,
                             verbose = 1,random_state = 1,
                             n_jobs=-1,
                             cv = cv)
optimal_params.fit(X,y)

# np.sqrt(mean_squared_error(np.exp(y), np.exp(model_cat.predict(X))))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [77]:
model_cat = CatBoostRegressor(random_state=1, verbose=False, subsample=0.75, reg_lambda=10, num_leaves=31,
                              n_estimators=1000, max_depth=4, learning_rate=0.1).fit(X,y)
np.sqrt(mean_squared_error(np.exp(y), np.exp(model_cat.predict(X))))


{'subsample': 0.75, 'reg_lambda': 10, 'num_leaves': 31, 'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.1}


6.142781416636383

In [78]:
# ensemble
# StackingRegressor using LinearRegression as the metamodel

from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

en = StackingRegressor(estimators = [('cat', model_cat), ('xg', xg), ('rf', tuned_rf)],
                     final_estimator=LinearRegression(),                                          
                    cv = KFold(n_splits = 5, shuffle = True, random_state=1))
en.fit(X,y)

In [79]:
en_train_pred = en.predict(X)
np.sqrt(mean_squared_error(np.exp(y), np.exp(en_train_pred)))

5.9999581459084155

In [80]:
# Test Predictions
data = pd.read_csv('test.csv')
id = data.id

data = data.drop(nan_col, axis=1)

imputer = impute.KNNImputer(n_neighbors=5)
data = imputer.fit_transform(data)

data = scaler.transform(data)

data = pd.DataFrame(data)

data.columns = data.columns.astype(int)
data = data.drop(res, axis=1)
data.columns = data.columns.astype(str)

data = data.loc[:,selected_predictors]


print(data.shape)

test_pred = np.exp(en.predict(data))

(4403, 77)


In [81]:
data = pd.read_csv('test.csv')
id = data.id

output = pd.DataFrame()
output['id'] = id
output['y'] = test_pred


output.loc[output['y']<0, 'y'] = 0
output.loc[output['y']>100, 'y'] = 100
output

Unnamed: 0,id,y
0,5380,4.248909
1,5381,6.772176
2,5382,2.177992
3,5383,6.470433
4,5384,3.652845
...,...,...
4398,9778,2.949133
4399,9779,7.990218
4400,9780,9.699207
4401,9781,8.879816


In [82]:
output = output.set_index('id')
output.to_csv('submission72.csv')

Importing packages

In [1]:
!git clone --single-branch --branch v0.2dev https://github.com/scikit-learn-contrib/py-earth.git
%cd py-earth
!python setup.py install --cythonize

Cloning into 'py-earth'...
remote: Enumerating objects: 3303, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 3303 (delta 21), reused 17 (delta 17), pack-reused 3278[K
Receiving objects: 100% (3303/3303), 13.03 MiB | 11.59 MiB/s, done.
Resolving deltas: 100% (2505/2505), done.
/content/py-earth
Compiling pyearth/_util.pyx because it changed.
Compiling pyearth/_basis.pyx because it changed.
Compiling pyearth/_record.pyx because it depends on ./pyearth/_util.pxd.
Compiling pyearth/_pruning.pyx because it depends on ./pyearth/_util.pxd.
Compiling pyearth/_forward.pyx because it changed.
Compiling pyearth/_knot_search.pyx because it depends on ./pyearth/_util.pxd.
Compiling pyearth/_qr.pyx because it depends on ./pyearth/_types.pxd.
[1/7] Cythonizing pyearth/_basis.pyx
  tree = Parsing.p_module(s, pxd, full_module_name)
[2/7] Cythonizing pyearth/_forward.pyx
  tree = Parsing.p_module(s, pxd, full_module_name)
[3/7] 

In [2]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2
