### Building Machine Learning Models
Now we will train several Machine Learning models and compare their results.

In [1]:
# Import important libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
plt.rcParams['figure.figsize'] = (12.0, 5.0)

In [2]:
# Read data
df = pd.read_csv('../data/train.csv', index_col=0)

In [3]:
# Import the preprocessed dataset from util
from ipynb.fs.full.utils import preprocess
df = preprocess(df)

In [4]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,fam_size,Embarked_1,...,Fare_3,Fare_4,Fare_5,Pclass_1,Pclass_2,Pclass_3,fam_size_1,fam_size_2,fam_size_3,fam_size_4
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,1,1,1,0,0,2,2,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1,1,0,2,1,0,4,0,2,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,3,0,1,0,0,0,2,1,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1,1,0,2,1,0,4,2,2,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0,3,1,2,0,0,1,2,1,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [5]:
dfm = df.drop(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'fam_size'], axis=1)
dfm.head(2)

Unnamed: 0_level_0,Survived,Embarked_1,Embarked_2,Embarked_3,Sex_1,Sex_2,Age_1,Age_2,Age_3,Age_4,...,Fare_3,Fare_4,Fare_5,Pclass_1,Pclass_2,Pclass_3,fam_size_1,fam_size_2,fam_size_3,fam_size_4
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
#Split dataset so that we can train the model with the training set and test the predictive power with the test dataset
X = dfm.drop("Survived", axis=1)
y = dfm["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

#Normalize the training variables using Standard Scaling to bring all of them to the same level of magnitude.
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

In [7]:
X_test.shape, X_train.shape, y_test.shape, y_train.shape

((223, 22), (668, 22), (223,), (668,))

In [8]:
#Create a function within many Machine Learning Models

logreg = LogisticRegression(random_state =42)
tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
RF = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 42)
sgd = linear_model.SGDClassifier(max_iter=5, random_state=42, tol=None)
knn = KNeighborsClassifier(n_neighbors = 3) 
gaussian = GaussianNB() 
perceptron = Perceptron(max_iter=20, random_state=42)
linear_svc = LinearSVC(random_state=42)

models = [('Logistic Regression',logreg),('Decision Tree',tree),('Random Forest',RF),('SGDClassifier',sgd),
          ('KNN',knn),('GaussianNB',gaussian),('Perceptron',perceptron),('LinearSVC',linear_svc)]

In [9]:
for i, model in models:
    #fit the model
    model.fit(X_train, y_train)
    accuracy = model.score(X_train, y_train)
    acc = "%s= %.2f"% (i, round(accuracy*100, 2))
    print('Train_Accuracy of', acc,'%')

Train_Accuracy of Logistic Regression= 83.23 %
Train_Accuracy of Decision Tree= 88.17 %
Train_Accuracy of Random Forest= 88.17 %
Train_Accuracy of SGDClassifier= 76.65 %
Train_Accuracy of KNN= 83.98 %
Train_Accuracy of GaussianNB= 76.35 %
Train_Accuracy of Perceptron= 72.75 %
Train_Accuracy of LinearSVC= 82.34 %




In [10]:
# Retrain Random Forest with parameters from grid search output in Modeling.ipynb

RF = RandomForestClassifier(criterion = "gini", 
                                       min_samples_leaf = 1, 
                                       min_samples_split = 16,   
                                       n_estimators=700)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)

acc_random_forest = round(RF.score(X_train, y_train) * 100, 2)
print(round(acc_random_forest,2,), "%")

85.33 %
