# Model Comparison

I created 2 classes, one for the dataset, one for the model.
These are the steps to run successfully the training, testing and prediction.

 1. Load datasets
 2. Apply transformations and feature engineering to the dataset (optional)
     1. Choose variables to be used for training the model (optional)
 4. Load model from SKLearn
 5. Run the simple test
 
 Below I created an example with the model that I had to test, Support Vector Machine.
 
 The shape of the dataset is the following:
 
1. `'Family_Case_ID'`
2. `'Severity'`
3. `'Birthday_year'`
4. `'Parents or siblings infected'`
5. `'Wife/Husband or children infected'`
7. `'Medical_Expenses_Family'`
8. `'Medical_Tent_A'`
9. `'Medical_Tent_B'`
10. `'Medical_Tent_C'`
11. `'Medical_Tent_D'`
12. `'Medical_Tent_E'`
13. `'Medical_Tent_F'`
14. `'Medical_Tent_G'`
15. `'Medical_Tent_T'`
16. `'Medical_Tent_n/a'`
17. `'City_Albuquerque'`
18. `'City_Santa Fe'`
19. `'City_Taos'`
20. `'Gender_M'`
21. `'family_size'`
22. `'Sev_by_city'`: Average severity in the city of the patient.
23. `'Sev_by_tent'`: Average severity in the medical tent of the patient.
24. `'Sev_by_gender'`: Average severity whithin the gender of the patient.
25. `'Sev_family'`: Average severity in the family of the patient.
26. `'spending_vs_severity'`: Medical Expenses Family / Patient's Severity
27. `'spending_family_member'`: Medical Expenses Family / Number of cases in the family
28. `'severity_against_avg_city'`: Patient's Severity / Sev_by_city
29. `'severity_against_avg_tent'`: Patient's Severity / Sev_by_tent
30. `'severity_against_avg_gender'`: Patient's Severity / Sev_by_gender
31. `'spending_family_severity'`: Patient's Severity / Sev_family


In [219]:
from dataset import Dataset
from model import Model
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split



## First model - Support Vector Machine - Alejandro

### Step 1: Load datasets

In [220]:
def remove_IQR_outliers(df, n = 1.5): 
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1

    mask = (df < (Q1 - n * IQR)) | (df > (Q3 + n * IQR))
    # exclude rotations 
    include_cols = ['Medical_Expenses_Family']
    for x in include_cols:
        mask.loc[:,~mask.columns.isin([x])] = False 
    df_out = df.copy()
    df_out[mask] = np.nan
    df_out.dropna(inplace = True)
    return df_out

In [221]:
dataset = Dataset()            # Loads the preprocessed dataset
train_set = dataset.train_data # Training set without labels (train.csv)
target = dataset.target        # Labels for training set     (train.csv[Deceased])
test_set = dataset.test_data   # Unlabeled test set          (test.csv)

train_set.describe()

Unnamed: 0,Family_Case_ID,Severity,Birthday_year,Parents or siblings infected,Wife/Husband or children infected,Medical_Expenses_Family,Sev_by_city,Sev_by_tent,Sev_by_gender,Sev_family,...,City_Santa Fe,City_Taos,Gender_M,family_size,spending_vs_severity,spending_family_member,severity_against_avg_city,severity_against_avg_tent,severity_against_avg_gender,spending_family_severity
count,898.0,898.0,898.0,898.0,898.0,898.0,898.0,898.0,898.0,898.0,...,898.0,898.0,898.0,898.0,898.0,898.0,898.0,898.0,898.0,898.0
mean,14286.119154,2.316258,1597.824053,0.380846,0.522272,892.749443,2.316258,2.316258,2.313653,2.316258,...,0.722717,0.089087,0.648107,1.826281,692.063103,550.403471,1.0,1.0,1.001201,430.066268
std,25443.036379,0.832842,792.720095,0.803941,1.099333,1385.91799,0.25518,0.615844,0.10605,0.825019,...,0.447907,0.285028,0.477827,1.369723,1428.606552,997.077121,0.359305,0.240359,0.35925,1021.963995
min,345.0,1.0,-1.0,0.0,0.0,0.0,1.893491,1.0,2.169811,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.344828,0.381107,0.418103,0.0
25%,8195.0,2.0,1966.0,0.0,0.0,221.0,2.354391,2.623932,2.169811,2.0,...,0.0,0.0,0.0,1.0,73.666667,203.0,0.849476,0.762215,0.836207,70.0
50%,13587.5,3.0,1988.0,0.0,0.0,405.0,2.354391,2.623932,2.391753,3.0,...,1.0,0.0,1.0,1.0,173.0,228.0,1.034483,1.143322,1.25431,81.0
75%,18891.0,3.0,1998.0,0.0,1.0,857.75,2.354391,2.623932,2.391753,3.0,...,1.0,0.0,1.0,2.0,573.0,553.75,1.274215,1.143322,1.25431,343.0
max,742836.0,3.0,2019.0,6.0,8.0,14345.0,2.9,3.0,2.391753,3.0,...,1.0,1.0,1.0,7.0,14345.0,14345.0,1.584375,2.898305,1.382609,14345.0


### Step 2: Apply transformations and select variables

In [222]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler


exclude_columns = [
    'Medical_Tent_T'
    
]

train_set = train_set.loc[:,~train_set.columns.isin(exclude_columns)]

In [223]:
#Scaling
scale_type = None
if scale_type == "RobustScaler":
    robust = RobustScaler().fit(train_set)
    train_set = robust.transform(train_set)
elif scale_type == "MinMaxScaler":
    minmax = MinMaxScaler().fit(train_set)
    train_set = minmax.transform(train_set)
elif scale_type == "StandardScaler":
    scaler = StandardScaler().fit(train_set)
    train_set = scaler.transform(train_set)

### Step 3: Load model from SKLearn

In [225]:

import optuna
def objective(trial):

    loss = trial.suggest_categorical('loss', ["deviance", "exponential"])
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.5)
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    subsample = trial.suggest_float('subsample', 0.1, 1)
    criterion = trial.suggest_categorical('criterion', ['friedman_mse', 'mse', 'mae'])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 100)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 100)
    max_depth = trial.suggest_int('max_depth', 2, 20) 
    min_impurity_decrease = trial.suggest_float('min_impurity_decrease', 0., 0.1)
    max_features = trial.suggest_float('max_features', 0.1, 1)
    max_leaf_nodes = trial.suggest_categorical('max_leaf_nodes', [10, 50, 100, None])
    warm_start = trial.suggest_categorical('warm_start', [True, False])
    n_iter_no_change = trial.suggest_categorical('n_iter_no_change', [5, 10, 15, 20, None])
  #  ccp_alpha = trial.suggest_float('ccp_alpha', 0, 0.1)
    

                                                                
    clf = GradientBoostingClassifier(loss=loss,
                      learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        subsample=subsample,
                        criterion=criterion,
                        min_samples_split=min_samples_split,
                        min_samples_leaf = min_samples_leaf,
                        max_depth=max_depth,
                        min_impurity_decrease = min_impurity_decrease,
                        max_features=max_features, 
                        max_leaf_nodes=max_leaf_nodes,
                        warm_start=warm_start,
                        n_iter_no_change=n_iter_no_change,
                   #     ccp_alpha=ccp_alpha,
                        validation_fraction = 0.2,
                        random_state=1234
                       )
    #print(clf)
    score = cross_val_score(clf, train_set, target, n_jobs=-1, cv=5, scoring="accuracy")
    # print(score)
    score = score.mean()
    return score

is_training = True
if is_training:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=1000, n_jobs=-1)

[I 2020-05-23 13:54:09,544] Finished trial#7 with value: 0.6158100558659219 with parameters: {'loss': 'exponential', 'learning_rate': 0.1105217297868891, 'n_estimators': 73, 'subsample': 0.17780715028025199, 'criterion': 'mse', 'min_samples_split': 43, 'min_samples_leaf': 71, 'max_depth': 11, 'min_impurity_decrease': 0.041761085537022485, 'max_features': 0.2564453769356009, 'max_leaf_nodes': None, 'warm_start': True, 'n_iter_no_change': 15}. Best is trial#7 with value: 0.6158100558659219.
[I 2020-05-23 13:54:10,053] Finished trial#0 with value: 0.7849844816883924 with parameters: {'loss': 'exponential', 'learning_rate': 0.23593766673103628, 'n_estimators': 157, 'subsample': 0.904242713245077, 'criterion': 'mse', 'min_samples_split': 6, 'min_samples_leaf': 79, 'max_depth': 4, 'min_impurity_decrease': 0.03075957971001253, 'max_features': 0.6179434494001546, 'max_leaf_nodes': 100, 'warm_start': True, 'n_iter_no_change': 10}. Best is trial#0 with value: 0.7849844816883924.
[I 2020-05-23 13

In [226]:
# top scores
opt_scores = study.trials_dataframe()
opt_scores.sort_values("value", ascending=False).head(5)

Unnamed: 0,number,value,datetime_start,datetime_complete,params_criterion,params_learning_rate,params_loss,params_max_depth,params_max_features,params_max_leaf_nodes,params_min_impurity_decrease,params_min_samples_leaf,params_min_samples_split,params_n_estimators,params_n_iter_no_change,params_subsample,params_warm_start,state
747,747,0.838504,2020-05-23 14:02:40.588490,2020-05-23 14:02:47.279945,friedman_mse,0.113191,deviance,15,0.795908,,0.080891,2,63,203,5.0,0.822361,True,COMPLETE
771,771,0.838498,2020-05-23 14:03:04.515537,2020-05-23 14:03:09.669207,friedman_mse,0.127058,deviance,13,0.719455,,0.082482,2,95,151,5.0,0.805853,True,COMPLETE
713,713,0.838473,2020-05-23 14:02:20.160607,2020-05-23 14:02:24.025870,friedman_mse,0.125503,deviance,14,0.65591,,0.094084,4,57,175,5.0,0.836917,True,COMPLETE
728,728,0.838461,2020-05-23 14:02:29.047503,2020-05-23 14:02:33.957376,friedman_mse,0.135789,deviance,15,0.637494,,0.093697,5,62,196,5.0,0.83169,True,COMPLETE
930,930,0.837381,2020-05-23 14:05:31.561949,2020-05-23 14:05:37.560861,friedman_mse,0.150961,deviance,9,0.747027,100.0,0.09365,14,96,173,10.0,0.8533,True,COMPLETE


In [174]:
import re
top_params = {}
for x in range(0, 5): 
    params = dict(opt_scores.sort_values("value", ascending=False).iloc[x,4:-1])
    params = {re.sub('params_', '', key): val for key, val in params.items()}
    top_params[x] = params
    
    

In [187]:
for x in range(0,5):
    print(top_params[x])

{'ccp_alpha': 0.0020121368398624364, 'criterion': 'friedman_mse', 'learning_rate': 0.20572877787819044, 'loss': 'exponential', 'max_depth': 6, 'max_features': 0.17656921450338955, 'max_leaf_nodes': 50.0, 'min_impurity_decrease': 0.0061954301244089805, 'min_samples_leaf': 7, 'min_samples_split': 3, 'n_estimators': 302, 'n_iter_no_change': nan, 'subsample': 0.963743813416333, 'warm_start': True}
{'ccp_alpha': 0.002020800985339463, 'criterion': 'friedman_mse', 'learning_rate': 0.1542088308919397, 'loss': 'exponential', 'max_depth': 6, 'max_features': 0.19903695658492065, 'max_leaf_nodes': 50.0, 'min_impurity_decrease': 0.049800610940764156, 'min_samples_leaf': 2, 'min_samples_split': 64, 'n_estimators': 254, 'n_iter_no_change': nan, 'subsample': 0.955404624588575, 'warm_start': True}
{'ccp_alpha': 6.287427032653941e-05, 'criterion': 'friedman_mse', 'learning_rate': 0.10471245666478986, 'loss': 'exponential', 'max_depth': 5, 'max_features': 0.6540767795374985, 'max_leaf_nodes': 50.0, 'min_

In [178]:
from sklearn.ensemble import VotingClassifier

In [215]:
gb1 = GradientBoostingClassifier(**{'ccp_alpha': 0.0020121368398624364, 
                                   'criterion': 'friedman_mse', 
                                   'learning_rate': 0.20572877787819044, 
                                   'loss': 'exponential', 'max_depth': 6, 
                                   'max_features': 0.17656921450338955, 
                                   'max_leaf_nodes': 50, 
                                   'min_impurity_decrease': 0.0061954301244089805, 
                                   'min_samples_leaf': 7, 'min_samples_split': 3, 
                                   'subsample': 0.963743813416333, 
                                   'warm_start': True}, random_state = 1234, validation_fraction = 0.2)
gb2 = GradientBoostingClassifier(**{'ccp_alpha': 0.002020800985339463, 
                                     'criterion': 'friedman_mse', 
                                     'learning_rate': 0.1542088308919397, 
                                     'loss': 'exponential', 
                                     'max_depth': 6, 
                                     'max_features': 0.19903695658492065,
                                     'max_leaf_nodes': 50, 
                                     'min_impurity_decrease': 0.049800610940764156, 
                                     'min_samples_leaf': 2, 
                                     'min_samples_split': 64, 
                                     'n_estimators': 254, 
                                     'subsample': 0.955404624588575, 
                                     'warm_start': True}, random_state = 1234, validation_fraction = 0.2)
gb3 = GradientBoostingClassifier(**{'ccp_alpha': 6.287427032653941e-05,
                                     'criterion': 'friedman_mse', 
                                     'learning_rate': 0.10471245666478986,
                                     'loss': 'exponential', 'max_depth': 5,
                                     'max_features': 0.6540767795374985,
                                     'max_leaf_nodes': 50,
                                     'min_impurity_decrease': 0.04931444639079484,
                                     'min_samples_leaf': 2, 
                                     'min_samples_split': 61,
                                     'n_estimators': 280, 
                                     'n_iter_no_change': 15, 
                                     'subsample': 0.9187871136612589, 
                                     'warm_start': True}, random_state = 1234, validation_fraction = 0.2)
gb4 = GradientBoostingClassifier(**{'ccp_alpha': 0.001715108495987478,
                                     'criterion': 'friedman_mse', 
                                     'learning_rate': 0.13193638859464804, 
                                     'loss': 'exponential', 'max_depth': 8, 
                                     'max_features': 0.2451162605347299,
                                     'max_leaf_nodes': 50,
                                     'min_impurity_decrease': 0.04887063973387433,
                                     'min_samples_leaf': 5, 
                                     'min_samples_split': 50,
                                     'n_estimators': 300, 
                                     'subsample': 0.8654418656799914, 
                                     'warm_start': True}, random_state = 1234, validation_fraction = 0.2)
gb5 = GradientBoostingClassifier(**{'ccp_alpha': 0.0020227602704888242,
                                     'criterion': 'friedman_mse', 
                                     'learning_rate': 0.1558669429393742,
                                     'loss': 'exponential', 'max_depth': 6,
                                     'max_features': 0.2240499917797328, 'max_leaf_nodes': 50,
                                     'min_impurity_decrease': 0.049564577308527605,
                                     'min_samples_leaf': 2, 'min_samples_split': 63,
                                     'n_estimators': 274,
                                     'subsample': 0.8480812470625199, 
                                     'warm_start': True}, random_state = 1234, validation_fraction = 0.2)
estimators = []
estimators.append(('gb1', gb1))
estimators.append(('gb2', gb2))
estimators.append(('gb3', gb3))
estimators.append(('gb4', gb4))
estimators.append(('gb5', gb5))

In [216]:
voting_model = VotingClassifier(estimators = estimators, voting = 'soft', n_jobs = -1)

In [217]:
cross_val_score(voting_model, train_set, target, n_jobs=-1, cv=5, scoring = 'accuracy').mean()

0.8362693978895095

### Step 4: Run model

In [218]:
model = Model(model     = voting_model,              # Initialized classifier model from SKLearn
            #  variables = selected_variables_SVC, # Subset of variables from data to be used for training
                                                  # If variables=None, then all variables in set are used
              
              train_set = train_set,              # Samples X for training and validating
              target    = target,                 # Samples Y for training and validating
              test_set  = test_set                # Unlabeled samples for creating prediction
              )                 

model.run_model(path="results/votingEns_results.csv")

Model - VotingClassifier(estimators=[('gb1',
                              GradientBoostingClassifier(ccp_alpha=0.0020121368398624364,
                                                         criterion='friedman_mse',
                                                         init=None,
                                                         learning_rate=0.20572877787819044,
                                                         loss='exponential',
                                                         max_depth=6,
                                                         max_features=0.17656921450338955,
                                                         max_leaf_nodes=50,
                                                         min_impurity_decrease=0.0061954301244089805,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=7,
                                               

**kaggle result: 0.82500**