# Step 5. Model the network or the machine learning algorithm

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as kr

In [18]:
import warnings
warnings.filterwarnings('ignore')

In [70]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression as Logistic
from sklearn.linear_model import RidgeClassifier as Ridge
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.ensemble import BaggingClassifier as Bagging
from sklearn.ensemble import ExtraTreesClassifier as ExtraTrees
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.ensemble import GradientBoostingClassifier as Gradient
from sklearn.neighbors import KNeighborsClassifier as KNeighbors
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [12]:
train = pd.read_csv("../input/mapped_train.csv")
print("Train shape: ", train.shape)

Train shape:  (881, 25)


## Try Machine Leaning

In [56]:
def get_accuracy(model, data, target, cv=10):
    model = eval(model)()
    folds = np.abs(cross_val_score(model, data, target, cv=cv, scoring="accuracy"))
    score = np.mean(folds)
    return folds, score

def test_model(model, data, target, cv=10):
    accuracy = get_accuracy(model, data, target, cv)
    print("Folds: ", str(accuracy[0]))
    print("Score: ", str(accuracy[1]))

def print_best(dictionary, items=5):
    keys = list(dictionary.keys())
    values = list(dictionary.values())
    for i in range(items):
        number = str(i+1)
        print("TOP #"+number+" MODEL: ", keys[i])
        print("Score: ", values[i])
        print("")

def test_models(models, data, target, cv=10, items=5):
    scores = {}
    for i in models:
        print("Computing", i, " ...")
        accuracy = get_accuracy(i, data, target, cv)
        scores[i] = accuracy[1]
    sorted_scores = {key: value for key, value in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
    print("\n", "-"*64, "\n")
    print_best(sorted_scores, items)
    

In [15]:
x_train = train.drop(["Survived"], axis=1)
y_train = train["Survived"]
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Fsize,N_Doctor,N_Man,N_Married,...,C_C,C_D,C_E,C_F,C_G,C_T,C_X,E_C,E_Q,E_S
0,3,0,22.0,1,0,7.25,2,0,1,0,...,0,0,0,0,0,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,2,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,3,1,26.0,0,0,7.925,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,1,1,35.0,1,0,53.1,2,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,3,0,35.0,0,0,8.05,1,0,1,0,...,0,0,0,0,0,0,1,0,0,1


In [16]:
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [57]:
models = ["Logistic", "Ridge", "SGD", "AdaBoost", "Bagging", "ExtraTrees", "RandomForest", "Gradient", "KNeighbors", "SVC", "MLPClassifier"]
test_models(models, x_train, y_train, cv=20)

Computing Logistic  ...
Computing Ridge  ...
Computing SGD  ...
Computing AdaBoost  ...
Computing Bagging  ...
Computing ExtraTrees  ...
Computing RandomForest  ...
Computing Gradient  ...
Computing KNeighbors  ...
Computing SVC  ...
Computing MLPClassifier  ...

 ---------------------------------------------------------------- 

TOP #1 MODEL:  Gradient
Score:  0.8185353535353534

TOP #2 MODEL:  AdaBoost
Score:  0.8150505050505051

TOP #3 MODEL:  Logistic
Score:  0.8104545454545453

TOP #4 MODEL:  RandomForest
Score:  0.8093434343434343

TOP #5 MODEL:  Bagging
Score:  0.8071969696969697



### Best in this case: GRADIENT

## Hyperparameter tunning

In [68]:
# General parameters 
kfold = 5
cpu = 4
scoring = "accuracy"
verbose = 1

In [69]:
# Gradient boosting tunning
gradient = Gradient()
gradient_grid = {'loss' : ["deviance"],
              'n_estimators' : [100,200,300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1] 
              }

gradient_model = GridSearchCV(gradient, param_grid = gradient_grid, cv=kfold, scoring=scoring, n_jobs= cpu, verbose = verbose)
gradient_model.fit(x_train, y_train)
best_gradient = gradient_model.best_estimator_

# Best score
gradient_model.best_score_

Fitting 5 folds for each of 72 candidates, totalling 360 fits


0.8172573189522343

In [72]:
# Adaboost
tree = DecisionTree()
adaboost = AdaBoost(tree, random_state=7)
adaboost_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "algorithm" : ["SAMME","SAMME.R"],
              "n_estimators" :[1,2],
              "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}

adaboost_model = GridSearchCV(adaboost, param_grid = adaboost_grid, cv=kfold, scoring=scoring, n_jobs= cpu, verbose = verbose)
adaboost_model.fit(x_train, y_train)
best_adaboost = adaboost_model.best_estimator_

adaboost_model.best_score_

Fitting 5 folds for each of 112 candidates, totalling 560 fits


0.7991268618387262

In [73]:
# Random forest Parameters tunning 
forest = RandomForest()
forest_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}

forest_model = GridSearchCV(forest, param_grid = forest_grid, cv=kfold, scoring=scoring, n_jobs= cpu, verbose = verbose)
forest_model.fit(x_train, y_train)
best_forest = forest_model.best_estimator_

forest_model.best_score_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


0.825199024139702

In [75]:
#ExtraTrees 
trees = ExtraTrees()
trees_grid = {"max_depth": [None],
            "max_features": [1, 3, 10],
            "min_samples_split": [2, 3, 10],
            "min_samples_leaf": [1, 3, 10],
            "bootstrap": [False],
            "n_estimators" :[100,300],
            "criterion": ["gini"]}

trees_model = GridSearchCV(trees, param_grid = trees_grid, cv=kfold, scoring=scoring, n_jobs= cpu, verbose = verbose)
trees_model.fit(x_train, y_train)
best_trees = trees_model.best_estimator_

trees_model.best_score_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


0.8149845916795069

In [76]:
#Logistic Regression
logistic = Logistic()
logistic_grid = {"penalty": ["l1", "l2", "elasticnet", "none"],
                "dual": [True, False],
                "tol": [1e-4, 1e-3, 1e-5],
                "C": [1, 1.25, 1.5, 2, 0.75, 0.5],
                "fit_intercept": [True, False]}

logistic_model = GridSearchCV(logistic, param_grid = logistic_grid, cv=kfold, scoring=scoring, n_jobs= cpu, verbose = verbose)
logistic_model.fit(x_train, y_train)
best_logistic = logistic_model.best_estimator_

logistic_model.best_score_

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


0.8059065228556754

### Best in this case: RANDOM FOREST

## Voting ensemble

In [78]:
from sklearn.ensemble import VotingClassifier

In [81]:
voted_model = VotingClassifier(estimators=[('gradient', best_gradient), ('adaboost', best_adaboost), ('forest', best_forest), 
            ('trees', best_trees),('logistic', best_logistic)], voting='soft', n_jobs=cpu)

votation = voted_model.fit(x_train, y_train)
voted_model.score(x_train, y_train)

0.9307604994324631

## Export models

In [82]:
import pickle

In [85]:
path = "../output/gradient.pkl"
with open(path, 'wb') as file:
    pickle.dump(gradient_model, file)

In [86]:
path = "../output/forest.pkl"
with open(path, 'wb') as file:
    pickle.dump(forest_model, file)

In [83]:
path = "../output/votation.pkl"
with open(path, 'wb') as file:
    pickle.dump(votation, file)