# ML CLASSIFICATION MODELS

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [20]:
train = pd.read_csv('titanic_train.csv')

In [21]:
#CLEAN UP DATA

#remove cabin field
train.drop('Cabin',axis=1,inplace=True)

#2 average age
def impute_age(cols):
    Age = cols[0]
    if pd.isnull(Age):
        return 29
    else:
        return Age  
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)

#3 remove remaining na
train.dropna(inplace=True)

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S


In [22]:
#REMOVE TEXT, MAKE CATEGORIES NUMERICAL

sex = pd.get_dummies(train['Sex'],drop_first=True)
embark = pd.get_dummies(train['Embarked'],drop_first=True)

train.drop(['Sex','Embarked','Name','Ticket'],axis=1,inplace=True)

train = pd.concat([train,sex,embark],axis=1)

train.drop(['PassengerId'], axis=1, inplace=True)

train.head(3)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train.drop('Survived',axis=1), 
                                                    train['Survived'], test_size=0.30, 
                                                    random_state=101)

# LOGISTIC REGRESSION


In [24]:
from sklearn.metrics import classification_report,confusion_matrix

from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       163
           1       0.81      0.65      0.72       104

    accuracy                           0.81       267
   macro avg       0.81      0.78      0.79       267
weighted avg       0.81      0.81      0.80       267





# KNN

In [25]:
from sklearn.neighbors import KNeighborsClassifier
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 5)
# Fit the classifier to the data
knn.fit(X_train,y_train)
# Run predictions
knnpredict = knn.predict(X_test)
# Review accuracy
print(classification_report(y_test,knnpredict))

              precision    recall  f1-score   support

           0       0.74      0.74      0.74       163
           1       0.59      0.59      0.59       104

    accuracy                           0.68       267
   macro avg       0.67      0.66      0.66       267
weighted avg       0.68      0.68      0.68       267



# SUPPORT VECTOR MACHINE

In [26]:
from sklearn import svm
sv = svm.SVC()
sv.fit(X_train, y_train)
svpredict = sv.predict(X_test)
print(classification_report(y_test,svpredict))

              precision    recall  f1-score   support

           0       0.70      0.80      0.75       163
           1       0.59      0.45      0.51       104

    accuracy                           0.67       267
   macro avg       0.65      0.63      0.63       267
weighted avg       0.66      0.67      0.66       267





# NAIVE BAYES

In [27]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnbpredict = gnb.predict(X_test)
print(classification_report(y_test,gnbpredict))

              precision    recall  f1-score   support

           0       0.81      0.88      0.85       163
           1       0.79      0.68      0.73       104

    accuracy                           0.81       267
   macro avg       0.80      0.78      0.79       267
weighted avg       0.80      0.81      0.80       267



# DECISION TREES

In [28]:
from sklearn import tree
dtree = tree.DecisionTreeClassifier()
dtree.fit(X_train, y_train)
dtreepredict = dtree.predict(X_test)
print(classification_report(y_test,dtreepredict))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80       163
           1       0.69      0.71      0.70       104

    accuracy                           0.76       267
   macro avg       0.75      0.75      0.75       267
weighted avg       0.76      0.76      0.76       267



# RANDOM FOREST

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)
rfpredict = rf.predict(X_test)
print(classification_report(y_test,rfpredict))


              precision    recall  f1-score   support

           0       0.84      0.86      0.85       163
           1       0.77      0.74      0.75       104

    accuracy                           0.81       267
   macro avg       0.80      0.80      0.80       267
weighted avg       0.81      0.81      0.81       267



In [30]:
print(confusion_matrix(y_test,rfpredict))

[[140  23]
 [ 27  77]]


# BOOSTED TREES

In [35]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=20, learning_rate=0.25, max_features=2, max_depth=2, random_state=0)
gbc.fit(X_train, y_train)
gbcpredict = gbc.predict(X_test)
print(classification_report(y_test,gbcpredict))


              precision    recall  f1-score   support

           0       0.82      0.94      0.88       163
           1       0.89      0.67      0.77       104

    accuracy                           0.84       267
   macro avg       0.85      0.81      0.82       267
weighted avg       0.85      0.84      0.83       267



In [34]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))

Learning rate:  0.05
Accuracy score (training): 0.828
Accuracy score (validation): 0.820
Learning rate:  0.075
Accuracy score (training): 0.825
Accuracy score (validation): 0.828
Learning rate:  0.1
Accuracy score (training): 0.825
Accuracy score (validation): 0.831
Learning rate:  0.25
Accuracy score (training): 0.825
Accuracy score (validation): 0.839
Learning rate:  0.5
Accuracy score (training): 0.844
Accuracy score (validation): 0.824
Learning rate:  0.75
Accuracy score (training): 0.849
Accuracy score (validation): 0.805
Learning rate:  1
Accuracy score (training): 0.860
Accuracy score (validation): 0.798
