In [62]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

In [63]:
import os

In [64]:
def new_titanic_data():
    sql_query = 'SELECT * FROM passengers'
    df = pd.read_sql(sql_query, get_connection('titanic_db'))
    df.to_csv('titanic_df.csv')
    return df

In [65]:
def get_titanic_data(cached=False):
    if cached or os.path.isfile('titanic_df.csv') == False:
        df = new_titanic_data()
    else:
        df = pd.read_csv('titanic_df.csv', index_col=0)
    return df

In [66]:
titanic = get_titanic_data()

In [6]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [67]:
def titanic_split(df):

    train_validate, test = train_test_split(df, test_size=.2,
                                        random_state=123,
                                        stratify=df.survived)
    train, validate = train_test_split(train_validate, test_size=.3,
                                        random_state=123,
                                        stratify=train_validate.survived)
    return train, validate, test

def impute_mean_age(train, validate, test):

    imputer = SimpleImputer(strategy = 'mean')

    train['age'] = imputer.fit_transform(train[['age']])

    validate['age'] = imputer.transform(validate[['age']])

    test['age'] = imputer.transform(test[['age']])

    return train, validate, test

def prep_titanic_data(cached=True):
#     df = get_titanic_data(cached)
    df = df[~df.embarked.isnull()]
    titanic_dummies = pd.get_dummies(df.embarked, drop_first=True)
    df = pd.concat([df, titanic_dummies], axis=1)
    df = df.drop(columns='deck')

    train, validate, test = titanic_split(df)

    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test

In [68]:
 titanic = titanic[~titanic.embarked.isnull()]

In [69]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [70]:
titanic_dummies = pd.get_dummies(titanic[['sex', 'embark_town', 'class']], drop_first=True)


In [71]:
titanic = pd.concat([titanic, titanic_dummies], axis=1)


In [72]:
titanic = titanic.drop(columns=['embark_town','class', 'sex', 'pclass'])


In [73]:
titanic.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,embarked,deck,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third
0,0,0,22.0,1,0,7.25,S,,0,1,0,1,0,1
1,1,1,38.0,1,0,71.2833,C,C,0,0,0,0,0,0
2,2,1,26.0,0,0,7.925,S,,1,0,0,1,0,1
3,3,1,35.0,1,0,53.1,S,C,0,0,0,1,0,0
4,4,0,35.0,0,0,8.05,S,,1,1,0,1,0,1


In [74]:
titanic = titanic.drop(columns='deck')


In [75]:
train, validate, test = titanic_split(titanic)


In [76]:
train.age.isnull().values.any()

True

In [77]:
train, validate, test = impute_mean_age(train, validate, test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [78]:
train.shape

(497, 13)

In [79]:
validate.shape

(214, 13)

In [80]:
test.shape

(178, 13)

In [81]:
test.head()

Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,embarked,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third
561,561,0,40.0,0,0,7.8958,S,1,1,0,1,0,1
328,328,1,31.0,1,1,20.525,S,0,0,0,1,0,1
643,643,1,29.916875,0,0,56.4958,S,1,1,0,1,0,1
498,498,0,25.0,1,2,151.55,S,0,0,0,1,0,0
875,875,1,15.0,0,0,7.225,C,1,0,0,0,0,1


In [82]:
train.survived.value_counts(normalize=True)


0    0.617706
1    0.382294
Name: survived, dtype: float64

In [83]:
logit = LogisticRegression()


In [84]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [85]:
logit = logit.fit(X_train, y_train)


ValueError: could not convert string to float: 'C'

In [27]:
X_train.columns


Index(['passenger_id',          'age',        'sibsp',        'parch',
               'fare',     'embarked',         'deck',        'alone',
                    2,              3],
      dtype='object')

In [29]:
logit.coef_


AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [None]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [None]:
y_pred_proba

In [None]:
logit.score(X_train, y_train)


In [None]:
print(confusion_matrix(y_train, y_pred))


In [None]:
print(classification_report(y_train, y_pred))


In [34]:
titanic2 = get_titanic_data()

In [35]:
titanic2.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [36]:
titanic2.age.isnull().values.any()

True

Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [37]:
titanic2 = titanic2.drop(columns=['sex', 'sibsp', 'parch', 'embarked','class', 'deck','embark_town', 'alone'])


In [38]:
titanic2.head()

Unnamed: 0,passenger_id,survived,pclass,age,fare
0,0,0,3,22.0,7.25
1,1,1,1,38.0,71.2833
2,2,1,3,26.0,7.925
3,3,1,1,35.0,53.1
4,4,0,3,35.0,8.05


In [39]:
titanic2 = titanic2.drop(columns='passenger_id')

In [40]:
titanic2.head()

Unnamed: 0,survived,pclass,age,fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [41]:
titanic2_dummies = pd.get_dummies(titanic2.pclass, drop_first=True)


In [42]:
titanic2 = pd.concat([titanic2, titanic2_dummies], axis=1)


In [43]:
titanic2.head()

Unnamed: 0,survived,pclass,age,fare,2,3
0,0,3,22.0,7.25,0,1
1,1,1,38.0,71.2833,0,0
2,1,3,26.0,7.925,0,1
3,1,1,35.0,53.1,0,0
4,0,3,35.0,8.05,0,1


In [44]:
train, validate, test = titanic_split(titanic2)


In [45]:
train.head()

Unnamed: 0,survived,pclass,age,fare,2,3
583,0,1,36.0,40.125,0,0
165,1,3,9.0,20.525,0,1
50,0,3,7.0,39.6875,0,1
259,1,2,50.0,26.0,1,0
306,1,1,,110.8833,0,0


In [46]:
train.isnull().values.any()

True

In [47]:
impute_mean_age(train, validate, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(     survived  pclass        age      fare  2  3
 583         0       1  36.000000   40.1250  0  0
 165         1       3   9.000000   20.5250  0  1
 50          0       3   7.000000   39.6875  0  1
 259         1       2  50.000000   26.0000  1  0
 306         1       1  29.678105  110.8833  0  0
 ..        ...     ...        ...       ... .. ..
 313         0       3  28.000000    7.8958  0  1
 636         0       3  32.000000    7.9250  0  1
 222         0       3  51.000000    8.0500  0  1
 485         0       3  29.678105   25.4667  0  1
 744         1       3  31.000000    7.9250  0  1
 
 [498 rows x 6 columns],
      survived  pclass        age      fare  2  3
 610         0       3  39.000000   31.2750  0  1
 424         0       3  18.000000   20.2125  0  1
 568         0       3  29.678105    7.2292  0  1
 334         1       1  29.678105  133.6500  0  0
 101         0       3  29.678105    7.8958  0  1
 ..        ...     ...        ...       ... .. ..
 176         0       3 

In [48]:
train.shape

(498, 6)

In [50]:
train.age.isnull().values.any()

False

In [51]:
train.survived.value_counts(normalize=True)


0    0.616466
1    0.383534
Name: survived, dtype: float64

In [52]:
logit = LogisticRegression()


In [53]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [54]:
logit = logit.fit(X_train, y_train)


In [55]:
X_train.columns


Index(['pclass', 'age', 'fare', 2, 3], dtype='object')

In [56]:
print(logit.coef_)

[[-0.67260505 -0.02902885  0.00259074  0.14850399 -0.41036784]]


In [57]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [58]:
y_pred_proba


array([[0.40808026, 0.59191974],
       [0.65716147, 0.34283853],
       [0.63250245, 0.36749755],
       [0.64455554, 0.35544446],
       [0.32327957, 0.67672043],
       [0.50494894, 0.49505106],
       [0.59660579, 0.40339421],
       [0.49917841, 0.50082159],
       [0.78313474, 0.21686526],
       [0.75471528, 0.24528472],
       [0.8292616 , 0.1707384 ],
       [0.58902943, 0.41097057],
       [0.5868048 , 0.4131952 ],
       [0.48350205, 0.51649795],
       [0.5891301 , 0.4108699 ],
       [0.84916005, 0.15083995],
       [0.78300271, 0.21699729],
       [0.78307788, 0.21692212],
       [0.54101014, 0.45898986],
       [0.7591043 , 0.2408957 ],
       [0.76808348, 0.23191652],
       [0.78312374, 0.21687626],
       [0.2549033 , 0.7450967 ],
       [0.30008336, 0.69991664],
       [0.48113939, 0.51886061],
       [0.74834275, 0.25165725],
       [0.38689257, 0.61310743],
       [0.51601041, 0.48398959],
       [0.37317122, 0.62682878],
       [0.4903032 , 0.5096968 ],
       [0.

In [59]:
logit.score(X_train, y_train)


0.6847389558232931

In [60]:
print(confusion_matrix(y_train, y_pred))


[[247  60]
 [ 97  94]]


In [61]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.72      0.80      0.76       307
           1       0.61      0.49      0.54       191

    accuracy                           0.68       498
   macro avg       0.66      0.65      0.65       498
weighted avg       0.68      0.68      0.68       498



68% Accuracy

In [86]:
titanic3 = get_titanic_data()

In [87]:
titanic3.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


##This time keep sex as a feature

In [88]:
titanic3 = titanic3.drop(columns=['sibsp', 'parch', 'embarked','class', 'deck','embark_town', 'alone', 'deck'])


In [89]:
titanic3.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,fare
0,0,0,3,male,22.0,7.25
1,1,1,1,female,38.0,71.2833
2,2,1,3,female,26.0,7.925
3,3,1,1,female,35.0,53.1
4,4,0,3,male,35.0,8.05


In [90]:
titanic3 = titanic3.drop(columns='passenger_id')

In [91]:
titanic3_dummies = pd.get_dummies(titanic3.sex, drop_first=True)


In [92]:
titanic3 = pd.concat([titanic3, titanic3_dummies], axis=1)


In [93]:
titanic3.head()

Unnamed: 0,survived,pclass,sex,age,fare,male
0,0,3,male,22.0,7.25,1
1,1,1,female,38.0,71.2833,0
2,1,3,female,26.0,7.925,0
3,1,1,female,35.0,53.1,0
4,0,3,male,35.0,8.05,1


In [94]:
titanic3 = titanic3.drop(columns='sex')

In [95]:
train, validate, test = titanic_split(titanic3)


In [96]:
train.shape

(498, 5)

In [97]:
impute_mean_age(train, validate, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(     survived  pclass        age      fare  male
 583         0       1  36.000000   40.1250     1
 165         1       3   9.000000   20.5250     1
 50          0       3   7.000000   39.6875     1
 259         1       2  50.000000   26.0000     0
 306         1       1  29.678105  110.8833     0
 ..        ...     ...        ...       ...   ...
 313         0       3  28.000000    7.8958     1
 636         0       3  32.000000    7.9250     1
 222         0       3  51.000000    8.0500     1
 485         0       3  29.678105   25.4667     0
 744         1       3  31.000000    7.9250     1
 
 [498 rows x 5 columns],
      survived  pclass        age      fare  male
 610         0       3  39.000000   31.2750     0
 424         0       3  18.000000   20.2125     1
 568         0       3  29.678105    7.2292     1
 334         1       1  29.678105  133.6500     0
 101         0       3  29.678105    7.8958     1
 ..        ...     ...        ...       ...   ...
 176         0       3 

In [98]:
train.survived.value_counts(normalize=True)


0    0.616466
1    0.383534
Name: survived, dtype: float64

In [99]:
train.head()

Unnamed: 0,survived,pclass,age,fare,male
583,0,1,36.0,40.125,1
165,1,3,9.0,20.525,1
50,0,3,7.0,39.6875,1
259,1,2,50.0,26.0,0
306,1,1,29.678105,110.8833,0


In [100]:
logit = LogisticRegression()


In [101]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [102]:
logit = logit.fit(X_train, y_train)


In [103]:
print(logit.coef_)


[[-1.21051288e+00 -2.97225157e-02 -2.02959641e-03 -2.71607026e+00]]


In [104]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [105]:
y_pred_proba

array([[0.55886457, 0.44113543],
       [0.86000036, 0.13999964],
       [0.85750744, 0.14249256],
       [0.29285566, 0.70714434],
       [0.07420599, 0.92579401],
       [0.774868  , 0.225132  ],
       [0.83575346, 0.16424654],
       [0.75935768, 0.24064232],
       [0.91712955, 0.08287045],
       [0.45346729, 0.54653271],
       [0.51242469, 0.48757531],
       [0.71551944, 0.28448056],
       [0.23825869, 0.76174131],
       [0.62136008, 0.37863992],
       [0.83184567, 0.16815433],
       [0.9458325 , 0.0541675 ],
       [0.91717581, 0.08282419],
       [0.42266621, 0.57733379],
       [0.79126547, 0.20873453],
       [0.38917297, 0.61082703],
       [0.40496169, 0.59503831],
       [0.91713341, 0.08286659],
       [0.04834486, 0.95165514],
       [0.0551017 , 0.9448983 ],
       [0.61692251, 0.38307749],
       [0.900767  , 0.099233  ],
       [0.12528539, 0.87471461],
       [0.11040975, 0.88959025],
       [0.50495793, 0.49504207],
       [0.75482481, 0.24517519],
       [0.

In [106]:
logit.score(X_train, y_train)


0.8132530120481928

In [107]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.87      0.85       307
           1       0.77      0.73      0.75       191

    accuracy                           0.81       498
   macro avg       0.80      0.80      0.80       498
weighted avg       0.81      0.81      0.81       498



In [108]:
titanic = get_titanic_data()

In [109]:
titanic.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [116]:
total = 342 + 549

In [117]:
549/total

0.6161616161616161

Baseline is 61.6%

In [147]:
titanic_4 = get_titanic_data()

In [148]:
titanic_4.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [150]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [149]:
train.head()

Unnamed: 0,fare,fare.1,sex_female,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
583,40.125,40.125,0,1,1,0,0,1,0,0
165,20.525,20.525,0,1,0,0,1,0,0,1
50,39.6875,39.6875,0,1,0,0,1,0,0,1
259,26.0,26.0,1,0,0,1,0,0,0,1
306,110.8833,110.8833,1,0,1,0,0,1,0,0


In [151]:
X_train.head()


Unnamed: 0,fare,fare.1,sex_female,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
583,40.125,40.125,0,1,1,0,0,1,0,0
165,20.525,20.525,0,1,0,0,1,0,0,1
50,39.6875,39.6875,0,1,0,0,1,0,0,1
259,26.0,26.0,1,0,0,1,0,0,0,1
306,110.8833,110.8833,1,0,1,0,0,1,0,0


In [152]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)


In [153]:
clf.fit(X_train, y_train)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [154]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([[0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1]])

In [155]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

[array([[0.69827586, 0.30172414],
        [0.69827586, 0.30172414],
        [0.69827586, 0.30172414],
        [0.07142857, 0.92857143],
        [0.01923077, 0.98076923],
        [0.69827586, 0.30172414],
        [0.69827586, 0.30172414],
        [0.8630137 , 0.1369863 ],
        [0.91666667, 0.08333333],
        [0.91666667, 0.08333333],
        [0.39130435, 0.60869565],
        [0.69827586, 0.30172414],
        [0.07142857, 0.92857143],
        [0.69827586, 0.30172414],
        [0.69827586, 0.30172414],
        [0.91666667, 0.08333333],
        [0.91666667, 0.08333333],
        [0.39130435, 0.60869565],
        [0.8630137 , 0.1369863 ],
        [0.39130435, 0.60869565],
        [0.39130435, 0.60869565],
        [0.91666667, 0.08333333],
        [0.01923077, 0.98076923],
        [0.01923077, 0.98076923],
        [0.69827586, 0.30172414],
        [0.91666667, 0.08333333],
        [0.01923077, 0.98076923],
        [0.07142857, 0.92857143],
        [0.69827586, 0.30172414],
        [0.863

In [156]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [157]:
confusion_matrix(y_train, y_pred)


ValueError: multilabel-indicator is not supported

In [159]:
y_train


Unnamed: 0,survived,survived.1
583,0,0
165,1,1
50,0,0
259,1,1
306,1,1
...,...,...
313,0,0
636,0,0
222,0,0
485,0,0


In [237]:
## Pulling in my data again and cleaning it up before I start the decision tree model

In [160]:
titanic = get_titanic_data()

In [161]:
titanic = titanic.drop(columns=['sex', 'sibsp', 'parch', 'embarked','class', 'deck','embark_town', 'alone'])


In [162]:
titanic = titanic.drop(columns='passenger_id')

In [163]:
titanic_dummies = pd.get_dummies(titanic.pclass, drop_first=True)


In [166]:
titanic = pd.concat([titanic, titanic_dummies], axis=1)


In [167]:
train, validate, test = titanic_split(titanic)


In [168]:
impute_mean_age(train, validate, test)

(     survived  pclass        age      fare  2  3
 583         0       1  36.000000   40.1250  0  0
 165         1       3   9.000000   20.5250  0  1
 50          0       3   7.000000   39.6875  0  1
 259         1       2  50.000000   26.0000  1  0
 306         1       1  29.678105  110.8833  0  0
 ..        ...     ...        ...       ... .. ..
 313         0       3  28.000000    7.8958  0  1
 636         0       3  32.000000    7.9250  0  1
 222         0       3  51.000000    8.0500  0  1
 485         0       3  29.678105   25.4667  0  1
 744         1       3  31.000000    7.9250  0  1
 
 [498 rows x 6 columns],
      survived  pclass        age      fare  2  3
 610         0       3  39.000000   31.2750  0  1
 424         0       3  18.000000   20.2125  0  1
 568         0       3  29.678105    7.2292  0  1
 334         1       1  29.678105  133.6500  0  0
 101         0       3  29.678105    7.8958  0  1
 ..        ...     ...        ...       ... .. ..
 176         0       3 

In [169]:
logit = LogisticRegression()


In [190]:
X_train = train.drop(columns=['survived'])
y_train = train[['survived']]

X_validate = validate.drop(columns=['survived'])
y_validate = validate[['survived']]

X_test = test.drop(columns=['survived'])
y_test = test[['survived']]

In [191]:
logit = logit.fit(X_train, y_train)


In [192]:
print(logit.coef_)


[[-0.67260505 -0.02902885  0.00259074  0.14850399 -0.41036784]]


In [193]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [None]:
##Creating my decision tree model

In [194]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)


In [238]:
##Fit the model

In [195]:
clf.fit(X_train, y_train)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [239]:
##Estimate Survival

In [196]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([1, 0, 0, 0, 1])

In [240]:
##Estimate Probability of Survival

In [197]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.3255814 , 0.6744186 ],
       [0.74660633, 0.25339367],
       [0.74660633, 0.25339367],
       [0.54545455, 0.45454545],
       [0.3255814 , 0.6744186 ],
       [0.3255814 , 0.6744186 ],
       [0.54545455, 0.45454545],
       [0.66666667, 0.33333333],
       [0.74660633, 0.25339367],
       [0.74660633, 0.25339367],
       [0.92857143, 0.07142857],
       [0.54545455, 0.45454545],
       [0.3255814 , 0.6744186 ],
       [0.54545455, 0.45454545],
       [0.3255814 , 0.6744186 ],
       [0.92857143, 0.07142857],
       [0.74660633, 0.25339367],
       [0.74660633, 0.25339367],
       [0.66666667, 0.33333333],
       [0.74660633, 0.25339367],
       [0.74660633, 0.25339367],
       [0.74660633, 0.25339367],
       [0.3255814 , 0.6744186 ],
       [0.3255814 , 0.6744186 ],
       [0.54545455, 0.45454545],
       [0.74660633, 0.25339367],
       [0.3255814 , 0.6744186 ],
       [0.54545455, 0.45454545],
       [0.3255814 , 0.6744186 ],
       [0.66666667, 0.33333333],
       [0.

In [241]:
##Compute the Accuracy

In [198]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.72


In [242]:
##Create a confusion matrix

In [199]:
confusion_matrix(y_train, y_pred)


array([[265,  42],
       [ 97,  94]])

In [203]:
y_train

Unnamed: 0,survived
583,0
165,1
50,0
259,1
306,1
...,...
313,0
636,0
222,0
485,0


In [201]:
labels = sorted(y_train.survived.unique())


In [205]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)


Unnamed: 0,0,1
0,265,42
1,97,94


True Positive Rate:
    
    TP / (TP + FN)

In [207]:
265 / (265 + 97)

0.7320441988950276

In [208]:
y_train.shape

(498, 1)

In [182]:
y_train.value_counts()


0    307
1    191
Name: survived, dtype: int64

In [243]:
##Create a classification report

In [183]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.69      0.49      0.57       191

    accuracy                           0.72       498
   macro avg       0.71      0.68      0.68       498
weighted avg       0.72      0.72      0.71       498



In [245]:
#Create the decision tree object

In [209]:
clf = DecisionTreeClassifier(max_depth=6, random_state=123)


In [246]:
##Fit the model

In [210]:
clf.fit(X_train, y_train)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [247]:
##Examine survival

In [211]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([1, 0, 0, 1, 1])

In [248]:
#Estimate probability of survival

In [212]:
y_pred_proba = clf.predict_proba(X_train)


In [249]:
#Compute the accuracy

In [213]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.79


In [250]:
#Create a confusion matrix

In [214]:
confusion_matrix(y_train, y_pred)


array([[251,  56],
       [ 49, 142]])

In [215]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)


Unnamed: 0,0,1
0,251,56
1,49,142


In [251]:
#Create a classification report

In [216]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.82      0.83       307
           1       0.72      0.74      0.73       191

    accuracy                           0.79       498
   macro avg       0.78      0.78      0.78       498
weighted avg       0.79      0.79      0.79       498



In [252]:
#Compute the accuracy run on the validate set

In [217]:
clf = DecisionTreeClassifier(max_depth=6, random_state=123)


In [218]:
clf.fit(X_validate, y_validate)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [219]:
y_pred = clf.predict(X_validate)
y_pred[0:5]

array([0, 0, 0, 1, 0])

In [220]:
y_pred_proba = clf.predict_proba(X_validate)


In [228]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.64


In [255]:
####When I raised the max-depth, it lowered the accuracy of the validate set.

In [225]:
clf.fit(X_test, y_test)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [226]:
y_pred = clf.predict(X_test)
y_pred[0:5]

array([0, 1, 1, 0, 1])

In [227]:
y_pred_proba = clf.predict_proba(X_test)


In [229]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 0.85


In [256]:
###The test set on the higher max-depth is higher than the train or validate

In [230]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)


In [231]:
clf.fit(X_train, y_train)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [232]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([1, 0, 0, 1, 1])

In [233]:
y_pred_proba = clf.predict_proba(X_train)


In [234]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.71


In [235]:
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)


Unnamed: 0,0,1
0,240,67
1,79,112


In [236]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.75      0.78      0.77       307
           1       0.63      0.59      0.61       191

    accuracy                           0.71       498
   macro avg       0.69      0.68      0.69       498
weighted avg       0.70      0.71      0.70       498



In [260]:
from sklearn.ensemble import RandomForestClassifier


Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [267]:
##Examine my data

In [273]:
X_train.head()

Unnamed: 0,pclass,age,fare,2,3
583,1,36.0,40.125,0,0
165,3,9.0,20.525,0,1
50,3,7.0,39.6875,0,1
259,2,50.0,26.0,1,0
306,1,29.678105,110.8833,0,0


In [265]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 583 to 744
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  498 non-null    int64  
 1   age     498 non-null    float64
 2   fare    498 non-null    float64
 3   2       498 non-null    uint8  
 4   3       498 non-null    uint8  
dtypes: float64(2), int64(1), uint8(2)
memory usage: 16.5 KB


In [275]:
y_train.head()

Unnamed: 0,survived
583,0
165,1
50,0
259,1
306,1


In [286]:
y_train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [288]:
1 - (191 / (191 + 307))

0.6164658634538153

In [268]:
##Create the random forest object

In [261]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [269]:
##Fit the model to the training data

In [262]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [263]:
RandomForestClassifier(max_depth=3, min_samples_leaf=3, random_state=123)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [270]:
##Print feature importances

In [264]:
print(rf.feature_importances_)


[0.21150977 0.21640522 0.33616296 0.0180178  0.21790425]


In [271]:
##Estimate whether the passenger will survive

In [276]:
y_pred = rf.predict(X_train)


In [278]:
y_pred

array([1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,

In [279]:
##Estimate the probability of a passenger surviving

In [280]:
y_pred_proba = rf.predict_proba(X_train)


In [281]:
y_pred_proba

array([[0.41393817, 0.58606183],
       [0.66364937, 0.33635063],
       [0.72862468, 0.27137532],
       [0.48404006, 0.51595994],
       [0.36832434, 0.63167566],
       [0.46118117, 0.53881883],
       [0.45370764, 0.54629236],
       [0.60659698, 0.39340302],
       [0.73856046, 0.26143954],
       [0.72826334, 0.27173666],
       [0.76178037, 0.23821963],
       [0.53392512, 0.46607488],
       [0.45210613, 0.54789387],
       [0.44963074, 0.55036926],
       [0.45300924, 0.54699076],
       [0.87405505, 0.12594495],
       [0.81775462, 0.18224538],
       [0.76216214, 0.23783786],
       [0.55618529, 0.44381471],
       [0.81529945, 0.18470055],
       [0.72513987, 0.27486013],
       [0.75610565, 0.24389435],
       [0.28794623, 0.71205377],
       [0.34757729, 0.65242271],
       [0.44891064, 0.55108936],
       [0.81758243, 0.18241757],
       [0.276245  , 0.723755  ],
       [0.45453566, 0.54546434],
       [0.45303174, 0.54696826],
       [0.55644884, 0.44355116],
       [0.

In [282]:
##Compute the Accuracy

In [283]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.73


In [284]:
##Create a Confusion Matrix

In [289]:
print(confusion_matrix(y_train, y_pred))


[[251  56]
 [ 78 113]]


In [290]:
##Create a Classification Report

In [291]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.76      0.82      0.79       307
           1       0.67      0.59      0.63       191

    accuracy                           0.73       498
   macro avg       0.72      0.70      0.71       498
weighted avg       0.73      0.73      0.73       498



In [293]:
## Find Accuracy of Validate Set

In [292]:
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))


Accuracy of random forest classifier on validate set: 0.71


In [294]:
## Accuracy of train set is close to validate set

In [295]:
## Find Accuracy of Test Set

In [296]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on test set: 0.73


In [297]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [298]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [300]:
###Setting hyperparameters with max-depth of 20 and min-samples-leaf of 1

In [299]:
RandomForestClassifier(max_depth=20, min_samples_leaf=1, random_state=123)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [301]:
print(rf.feature_importances_)


[0.21150977 0.21640522 0.33616296 0.0180178  0.21790425]


In [302]:
y_pred = rf.predict(X_train)


In [303]:
y_pred_proba = rf.predict_proba(X_train)


In [304]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.73


In [305]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.76      0.82      0.79       307
           1       0.67      0.59      0.63       191

    accuracy                           0.73       498
   macro avg       0.72      0.70      0.71       498
weighted avg       0.73      0.73      0.73       498



In [306]:
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on validate set: 0.71


In [307]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on test set: 0.73


In [308]:
### Accuracy is comparable across train, validate, test

Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [309]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

In [310]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [311]:
print(rf.feature_importances_)

[0.21439708 0.20500168 0.34326661 0.01746984 0.21986479]


In [312]:
y_pred = rf.predict(X_train)


In [313]:
y_pred_proba = rf.predict_proba(X_train)


In [314]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.73


In [315]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on test set: 0.72


In [316]:
##Accuracy is very similar again

In [317]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

In [318]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [319]:
print(rf.feature_importances_)

[0.04882404 0.42643042 0.46889124 0.00708409 0.0487702 ]


In [320]:
y_pred = rf.predict(X_train)


In [321]:
y_pred_proba = rf.predict_proba(X_train)


In [322]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [323]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on test set: 0.67


In [324]:
###The lower min leaf sample and larger max depth creates a higher accuracy on the training model and lower accuracy on the test data compared to the min leaf sample of 5 and max depth of 3, which are more similar in their accuray

In [325]:
####### KNN ####

Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [326]:
X_train.head()

Unnamed: 0,pclass,age,fare,2,3
583,1,36.0,40.125,0,0
165,3,9.0,20.525,0,1
50,3,7.0,39.6875,0,1
259,2,50.0,26.0,1,0
306,1,29.678105,110.8833,0,0


In [329]:
from sklearn.neighbors import KNeighborsClassifier


In [330]:
##Create KNN object

In [331]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')


In [332]:
##Fit the model to the training data

In [333]:
knn.fit(X_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [334]:
##Estimate whether the passenger will survive

In [335]:
y_pred = knn.predict(X_train)


In [336]:
##Estimate the probability of survival

In [337]:
y_pred_proba = knn.predict_proba(X_train)


In [338]:
#Compute the Accuracy

In [339]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.76


In [340]:
##Create a Confusion Matrix

In [341]:
print(confusion_matrix(y_train, y_pred))


[[257  50]
 [ 68 123]]


In [342]:
#Create Classification Report

In [343]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.79      0.84      0.81       307
           1       0.71      0.64      0.68       191

    accuracy                           0.76       498
   macro avg       0.75      0.74      0.74       498
weighted avg       0.76      0.76      0.76       498



In [344]:
##Run it on validate and test data

In [345]:
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on validate set: 0.67


In [346]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of KNN classifier on test set: 0.64


Run through steps 2-4 setting k to 10

In [347]:
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')


In [348]:
knn.fit(X_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [349]:
y_pred = knn.predict(X_train)


In [350]:
y_pred_proba = knn.predict_proba(X_train)


In [351]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.72


In [352]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.72      0.89      0.80       307
           1       0.72      0.45      0.55       191

    accuracy                           0.72       498
   macro avg       0.72      0.67      0.68       498
weighted avg       0.72      0.72      0.71       498



In [353]:
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on validate set: 0.72


In [354]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of KNN classifier on test set: 0.69


In [355]:
###Models were more consistent when raising the k to 10

Run through setps 2-4 setting k to 20


In [356]:
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')


In [357]:
knn.fit(X_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                     weights='uniform')

In [358]:
y_pred = knn.predict(X_train)


In [359]:
y_pred_proba = knn.predict_proba(X_train)


In [360]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.71


In [361]:
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on validate set: 0.71


In [362]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of KNN classifier on test set: 0.66


In [363]:
### Slightly less accurate than when k was set to 10