<a href="https://colab.research.google.com/github/matheus-asilva/meetup_looqbox/blob/master/ensemble_meetup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print('Catboost')
!pip install catboost
print('MLens')
!pip install mlens

Catboost
MLens


In [0]:
####################################################
### Importing Libraries
####################################################

###### Default Libraries ######
import pandas as pd
import numpy as np
import warnings
import gc

###### Machine Learning ######
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

###### Preprocessing ######
from sklearn.model_selection import train_test_split

###### Metrics ######
from sklearn.metrics import mean_absolute_error, mean_squared_error

warnings.simplefilter('ignore')

# 1 - Preprocessing

In [3]:
# Read the train / test CSV files directly from the meetup repository on GitHub.
train_url = 'https://raw.githubusercontent.com/matheus-asilva/meetup_looqbox/master/data/train.csv'
test_url  = 'https://raw.githubusercontent.com/matheus-asilva/meetup_looqbox/master/data/test.csv'

# 'Unnamed: 0' is dropped from the training frame only
# (presumably a row index written out with the CSV — TODO confirm).
df_train = pd.read_csv(train_url).drop(columns='Unnamed: 0')
df_test  = pd.read_csv(test_url)

print('Training set rows and columns:', df_train.shape)
print('Test set rows and columns:', df_test.shape)

Training set rows and columns: (13730, 166)
Test set rows and columns: (4576, 47)


In [4]:
# Align the training columns with the test set, then stack both frames so that
# the preprocessing below is applied to them together.
target = 'NU_NOTA_MT'
ID     = 'NU_INSCRICAO'

# Columns that exist only in the training set (other than the target) are removed.
test_columns = set(df_test.columns)
dif      = [c for c in df_train.columns if c != target and c not in test_columns]
df_train = df_train.drop(columns=dif)

# Columns holding a single value and no missing entries carry no information.
dropcols_train = [
    c for c in df_train.columns
    if df_train[c].nunique() == 1 and not df_train[c].isnull().any()
]

# The identifier is not a feature either, so it is dropped along with them.
cols_to_drop = dropcols_train + [ID]
print("Columns that'll be dropped:", cols_to_drop)

# Keep the target and both identifier arrays before they leave the frames.
y_train  = df_train[target].values
train_id = df_train[ID].values
test_id  = df_test[ID].values

# Training rows come first, test rows second; later cells rely on that order.
df_merge = pd.concat([df_train.drop(columns=target), df_test], axis=0).drop(columns=cols_to_drop)

Columns that'll be dropped: ['IN_CEGUEIRA', 'NU_INSCRICAO']


In [5]:
# Missing Data on Merge Dataset
total        = df_merge.isnull().sum().sort_values(ascending=False)
percent      = (df_merge.isnull().sum()/df_merge.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
TP_ENSINO,12544,0.68524
TP_DEPENDENCIA_ADM_ESC,12544,0.68524
Q027,9861,0.538676
TP_STATUS_REDACAO,4796,0.261991
NU_NOTA_COMP4,4796,0.261991
NU_NOTA_LC,4796,0.261991
NU_NOTA_COMP1,4796,0.261991
NU_NOTA_COMP2,4796,0.261991
NU_NOTA_COMP3,4796,0.261991
NU_NOTA_COMP5,4796,0.261991


In [6]:
# Filling Missing Values
# Each filled column is assigned back to the frame. The previous
# `df_merge[col].fillna(value, inplace=True)` form is chained in-place assignment:
# it is deprecated in pandas and does not modify `df_merge` at all once
# Copy-on-Write is enabled (and the warning is hidden by the global warnings filter).
# NOTE(review): 4 and 5 are used as "missing" codes — confirm they do not collide
# with real category codes of these columns.
fill_values = {
    'TP_ENSINO': 4,
    'TP_DEPENDENCIA_ADM_ESC': 5,
    'Q027': 'None',
}

for column, fill_value in fill_values.items():
    df_merge[column] = df_merge[column].fillna(fill_value)
    print(f"Imputing {column}: '{fill_value}' for missing values")

Imputing TP_ENSINO: '4' for missing values
Imputing TP_DEPENDENCIA_ADM_ESC: '5' for missing values
Imputing Q027: 'None' for missing values


In [0]:
# Restoring dataframes
# The merged frame holds the training rows first and the test rows after them, so
# both parts are recovered by position. Explicit copies are taken because new
# columns are assigned below; assigning onto a bare slice of `df_merge` is the
# SettingWithCopy pattern (silenced here only by the global warnings filter).
n_train  = len(df_train)
df_train = df_merge.iloc[:n_train].copy()
df_test  = df_merge.iloc[n_train:].copy()

# Re-attach the target and the identifiers that were set aside before merging.
df_train[target] = y_train.tolist()
df_train[ID]     = train_id
df_test[ID]      = test_id

# Store ID's of test set which we'll set the prediction result as NaN:
# test rows whose TP_STATUS_REDACAO is missing are not predicted by the models.
test_status_missing = df_test['TP_STATUS_REDACAO'].isnull()
NaNs_ID  = df_test.loc[test_status_missing, ID]
df_test  = df_test[~test_status_missing]                      # Removing those examples from test set
df_train = df_train[df_train['TP_STATUS_REDACAO'].notnull()]  # Removing those examples from train set

# Rebuild the target, the test identifiers and the merged frame from the filtered rows.
y_train  = df_train[target].values
test_id  = df_test[ID].values
df_merge = pd.concat([df_train.drop(target, axis=1), df_test], axis=0)
df_merge = df_merge.drop(ID, axis=1)

In [8]:
# Recompute the missing-value report on the filtered, re-merged frame.
null_mask    = df_merge.isnull()
null_counts  = null_mask.sum()      # number of missing entries per column
row_counts   = null_mask.count()    # number of rows per column (the mask itself has no NaN)

total        = null_counts.sort_values(ascending=False)
percent      = (null_counts / row_counts).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
NU_NOTA_CN,53,0.003923
NU_NOTA_CH,53,0.003923
Q047,0,0.0
TP_DEPENDENCIA_ADM_ESC,0,0.0
IN_IDOSO,0,0.0
IN_GESTANTE,0,0.0
IN_SABATISTA,0,0.0
IN_DISCALCULIA,0,0.0
IN_DISLEXIA,0,0.0
IN_SURDEZ,0,0.0


In [0]:
# Replace the remaining gaps in NU_NOTA_CN / NU_NOTA_CH with 0.
# The result is assigned back to the frame: `df_merge[col].fillna(0, inplace=True)`
# is chained in-place assignment, which is deprecated in pandas and leaves
# `df_merge` untouched once Copy-on-Write is enabled.
# NOTE(review): presumably a missing score means the exam was not taken — confirm
# that 0 is the intended stand-in.
for score_col in ('NU_NOTA_CN', 'NU_NOTA_CH'):
    df_merge[score_col] = df_merge[score_col].fillna(0)

In [0]:
###### Target Encoder ######
# Every object/bool column is replaced by an ordinal code: its categories are ranked
# by the mean target value observed in the training rows (lowest mean -> 0).
cat_features = [f for f in df_merge.columns if (df_merge[f].dtypes == object) | (df_merge[f].dtypes == bool)]

for cat in cat_features:
    # Category order is learned from the training rows only.
    ordered_labels = df_train.groupby(cat).agg({target: 'mean'}).sort_values(target).index
    ordinal_labels = {k: i for i, k in enumerate(ordered_labels, 0)}
    # NOTE(review): a category that never occurs in the training rows is mapped to NaN.
    df_merge[cat]  = df_merge[cat].map(ordinal_labels)

# NOTE(review): the ranking uses the target of *all* training rows, including the
# ones later held out for validation, so validation scores are optimistically biased.

# Split the encoded frame back into its train / test parts by position. Explicit
# copies are taken so that adding the target column does not write onto a slice of
# `df_merge` (SettingWithCopy).
n_train          = len(df_train)
df_train         = df_merge.iloc[:n_train].copy()
df_train[target] = y_train.tolist()
df_test          = df_merge.iloc[n_train:].copy()

In [0]:
# Hold-out configuration. `random_state` is the shared seed of the notebook.
size_test    = 0.33
random_state = 42

# Separate the features from the target column.
predictors   = df_train.drop(columns=target)
train_target = df_train[target]  # Just for code readability

# The split itself is seeded with twice the shared seed.
split_seed = int(random_state * 2)
X_train, X_val, y_train, y_val = train_test_split(
    predictors,
    train_target,
    test_size=size_test,
    random_state=split_seed,
)

In [0]:
# Baseline 1: LightGBM with default hyper-parameters and the shared seed.
# `fit` returns the estimator itself, so construction and training are chained.
lgb_model = LGBMRegressor(random_state=random_state).fit(X_train, y_train)

# Predictions on the hold-out rows, scored in a later cell.
lgb_pred = lgb_model.predict(X_val)

In [0]:
# Baseline 2: random forest with default hyper-parameters and the shared seed.
# `fit` returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(random_state=random_state).fit(X_train, y_train)

# Predictions on the hold-out rows, scored in a later cell.
rf_pred = rf_model.predict(X_val)

In [14]:
# Baseline 3: CatBoost with default hyper-parameters and the shared seed.
# `fit` returns the estimator itself, so construction and training are chained.
# Training prints one log line per boosting iteration.
cat_model = CatBoostRegressor(random_state=random_state).fit(X_train, y_train)

# Predictions on the hold-out rows, scored in a later cell.
cat_pred = cat_model.predict(X_val)

0:	learn: 478.8046333	total: 60.4ms	remaining: 1m
1:	learn: 465.1277574	total: 72.3ms	remaining: 36.1s
2:	learn: 451.8918434	total: 83.9ms	remaining: 27.9s
3:	learn: 438.9733598	total: 100ms	remaining: 25s
4:	learn: 426.4320375	total: 114ms	remaining: 22.6s
5:	learn: 414.2858757	total: 126ms	remaining: 20.8s
6:	learn: 402.5713824	total: 137ms	remaining: 19.4s
7:	learn: 391.2383772	total: 149ms	remaining: 18.4s
8:	learn: 380.1106636	total: 159ms	remaining: 17.5s
9:	learn: 369.4224001	total: 169ms	remaining: 16.7s
10:	learn: 359.0446318	total: 180ms	remaining: 16.1s
11:	learn: 349.0242712	total: 190ms	remaining: 15.6s
12:	learn: 339.3397852	total: 200ms	remaining: 15.2s
13:	learn: 329.9422380	total: 210ms	remaining: 14.8s
14:	learn: 320.7723703	total: 221ms	remaining: 14.5s
15:	learn: 311.8404846	total: 231ms	remaining: 14.2s
16:	learn: 303.2616366	total: 245ms	remaining: 14.2s
17:	learn: 294.8774279	total: 255ms	remaining: 13.9s
18:	learn: 286.8943177	total: 272ms	remaining: 14s
19:	lea

In [15]:
# Hold-out RMSE of each baseline model, printed in the order the models were trained.
baseline_reports = (
    ('LightGBM RMSE:', lgb_pred),
    ('Random Forest RMSE:', rf_pred),
    ('CatBoost Regressor RMSE:', cat_pred),
)

for report_label, val_pred in baseline_reports:
    print(report_label, np.sqrt(mean_squared_error(y_val, val_pred)))

LightGBM RMSE: 72.58666993312877
Random Forest RMSE: 74.84730821665337
CatBoost Regressor RMSE: 71.3844035395971


In [16]:
from mlens.ensemble import SuperLearner
from sklearn.linear_model import Ridge

[MLENS] backend: threading


In [0]:
def rmse(predictions, targets):
    """Return the root-mean-squared error between two equally sized value sets.

    Both inputs are converted to flat float arrays before being compared, so that:

    * a column vector ``(n, 1)`` and a flat vector ``(n,)`` are compared element
      by element instead of silently broadcasting to an ``(n, n)`` matrix;
    * pandas objects are compared by position rather than by index label;
    * plain Python sequences are accepted.

    Parameters
    ----------
    predictions : array-like
        Predicted values.
    targets : array-like
        Reference values, holding as many elements as ``predictions``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((predictions - targets) ** 2))``.
    """
    predictions = np.asarray(predictions, dtype=float).ravel()
    targets = np.asarray(targets, dtype=float).ravel()
    return np.sqrt(np.mean((predictions - targets) ** 2))

In [18]:
# Stacked ensemble: the three base regressors form the first layer and a Ridge
# regression is added as the meta learner on top of it.
ensemble = SuperLearner(scorer=rmse, random_state=random_state, verbose=2)
ensemble.add([lgb_model, rf_model, cat_model])
# `normalize=False` is no longer passed to Ridge: the parameter was removed in
# scikit-learn 1.2 (passing it raises TypeError there) and False was its default,
# so the meta learner is configured exactly as before on older versions.
ensemble.add_meta(Ridge(alpha=.6, copy_X=True, fit_intercept=True, max_iter=1000,
                        random_state=random_state, solver='auto', tol=0.01))

SuperLearner(array_check=None, backend=None, folds=2,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=7270, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=2, raise_on_ex...dae5950>)],
   n_jobs=-1, name='group-1', raise_on_exception=True, transformers=[])],
   verbose=1)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=42, sample_size=20,
       scorer=<function rmse at 0x7fcaddae5950>, shuffle=False, verbose=2)

In [0]:
# Train the stacked ensemble on the training split only; the validation rows
# (X_val / y_val) are kept aside for scoring.
ensemble.fit(X_train, y_train)


Fitting 2 layers
Processing layer-1             0:	learn: 478.8046333	total: 21ms	remaining: 21s
0:	learn: 478.0080055	total: 7.44ms	remaining: 7.43s
1:	learn: 465.1277574	total: 53ms	remaining: 26.5s
1:	learn: 464.4683323	total: 31.6ms	remaining: 15.8s
2:	learn: 451.8918434	total: 82.5ms	remaining: 27.4s
2:	learn: 451.3171127	total: 51.6ms	remaining: 17.2s
3:	learn: 438.5273299	total: 68.1ms	remaining: 17s
3:	learn: 438.9733598	total: 108ms	remaining: 27s
4:	learn: 426.2148129	total: 89.6ms	remaining: 17.8s
4:	learn: 426.4320375	total: 136ms	remaining: 27.1s
5:	learn: 414.1515062	total: 106ms	remaining: 17.6s
6:	learn: 402.4532591	total: 121ms	remaining: 17.2s
5:	learn: 414.2858757	total: 163ms	remaining: 27s
7:	learn: 391.1139126	total: 130ms	remaining: 16.2s
8:	learn: 380.1698824	total: 139ms	remaining: 15.3s
6:	learn: 402.5713824	total: 189ms	remaining: 26.8s
9:	learn: 369.4655141	total: 158ms	remaining: 15.6s
10:	learn: 359.2724735	total: 175ms	remaining: 15.7s
7:	learn: 391.2383

In [0]:
# Predictions of the fitted ensemble on the hold-out rows.
ensemble_preds = ensemble.predict(X_val)

In [0]:
# Hold-out RMSE of the stacked ensemble, computed the same way as for the baselines.
ensemble_rmse = np.sqrt(mean_squared_error(y_val, ensemble_preds))
print('Ensemble RMSE:', ensemble_rmse)

In [0]:
def blend_models_predict(X, weights=(0.65, 0.10, 0.10, 0.15)):
    """Blend the predictions of the ensemble and the three base models.

    The prediction is a weighted sum of the outputs of the module-level fitted
    models ``ensemble``, ``lgb_model``, ``rf_model`` and ``cat_model``.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature rows, passed unchanged to each model's ``predict``.
    weights : sequence of 4 floats, optional
        Weights applied to the ensemble, LightGBM, random-forest and CatBoost
        predictions, in that order. The default reproduces the original
        hard-coded blend (0.65 / 0.10 / 0.10 / 0.15).

    Returns
    -------
    array-like
        One blended prediction per row of ``X``.

    Raises
    ------
    ValueError
        If ``weights`` does not contain exactly four values.
    """
    w_ensemble, w_lgb, w_rf, w_cat = weights
    return (
        w_ensemble * ensemble.predict(X)
        + w_lgb * lgb_model.predict(X)
        + w_rf * rf_model.predict(X)
        + w_cat * cat_model.predict(X)
    )
  
# Score the weighted blend on the hold-out rows, using the same RMSE as the other models.
blend_pred = blend_models_predict(X_val)
blend_rmse = np.sqrt(mean_squared_error(y_val, blend_pred))

print('Blend Models RMSE score:', blend_rmse)