In [1]:
# DS Libraries
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sb

# knn/submodules from scikit learn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.impute import SimpleImputer

# Data Acquisition
from pydataset import data

import acquire_cl as acq
import prepare_functionscl as prep
import warnings
warnings.filterwarnings('ignore')

## Acquire & Prep

- Use the `titanic` dataset from pydata

In [2]:
rose = acq.get_titanic_data()

In [3]:
rose.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
rose.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  deck          203 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 97.5+ KB


In [5]:
jack = prep.prep_titanic(rose)

In [6]:
jack.head()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,22.0,1,0,7.25,0,0,1,0,0,1
1,1,1,1,38.0,1,0,71.2833,0,1,0,1,0,0
2,2,1,3,26.0,0,0,7.925,1,1,0,0,0,1
3,3,1,1,35.0,1,0,53.1,0,1,0,0,0,1
4,4,0,3,35.0,0,0,8.05,1,0,1,0,0,1


In [7]:
jack.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   passenger_id             891 non-null    int64  
 1   survived                 891 non-null    int64  
 2   pclass                   891 non-null    int64  
 3   age                      714 non-null    float64
 4   sibsp                    891 non-null    int64  
 5   parch                    891 non-null    int64  
 6   fare                     891 non-null    float64
 7   alone                    891 non-null    int64  
 8   sex_female               891 non-null    uint8  
 9   sex_male                 891 non-null    uint8  
 10  embark_town_Cherbourg    891 non-null    uint8  
 11  embark_town_Queenstown   891 non-null    uint8  
 12  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(2), int64(6), uint8(5)
memory usage: 67.0 KB


In [8]:
jack.describe()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,445.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.602694,0.352413,0.647587,0.188552,0.08642,0.725028
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.489615,0.47799,0.47799,0.391372,0.281141,0.446751
min,0.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,222.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0,0.0,0.0,0.0,0.0,0.0
50%,445.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0,0.0,1.0,0.0,0.0,1.0
75%,667.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0,1.0,1.0,0.0,0.0,1.0
max,890.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,1.0,1.0,1.0


## Split

In [11]:
import prepare_functionscl as prep

In [24]:
train, val, test = prep.split_titanic(jack)

In [25]:
train.shape, val.shape, test.shape

((498, 13), (214, 13), (179, 13))

In [26]:
train.columns

Index(['passenger_id', 'survived', 'pclass', 'age', 'sibsp', 'parch', 'fare',
       'alone', 'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')

In [27]:
X_train = train.drop(columns=['survived'])
y_train = train['survived']

X_val = val.drop(columns=['survived'])
y_val = val['survived']

X_test = test.drop(columns=['survived'])
y_test = test['survived']

print(X_train.columns)
print(X_val.columns)
print(X_test.columns)

Index(['passenger_id', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')
Index(['passenger_id', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')
Index(['passenger_id', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')


In [28]:
train, val, test = prep.titanic_age_imputer(train, val, test)

In [29]:
print(X_train.columns)
print(X_val.columns)
print(X_test.columns)

Index(['passenger_id', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')
Index(['passenger_id', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')
Index(['passenger_id', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')


In [30]:
# Find mode of target to establish baseline

print(train['survived'].value_counts())

# or
# print(y_train.mode())

0    307
1    191
Name: survived, dtype: int64


In [31]:
base_pred = (train.survived == 0)
base_acc = base_pred.mean()
print(f'Baseline Accuracy:{base_acc:.2%}')

Baseline Accuracy:61.65%


**Exercise 1**

Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [32]:
knn = KNeighborsClassifier()

In [40]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [34]:
y_pred= knn.predict(X_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

**Note**

This keeps happening anytime I include 'age' as a part of my df. I've tried several times to fix it to no avail, so I'm dropping the column from my df.

In [35]:
df = jack.drop(columns='age')

In [36]:
train, val, test = prep.split_titanic(df)

In [37]:
train.shape, val.shape, test.shape

((498, 12), (214, 12), (179, 12))

In [38]:
X_train = train.drop(columns=['survived'])
y_train = train['survived']

X_val = val.drop(columns=['survived'])
y_val = val['survived']

X_test = test.drop(columns=['survived'])
y_test = test['survived']

print(X_train.columns)
print(X_val.columns)
print(X_test.columns)

Index(['passenger_id', 'pclass', 'sibsp', 'parch', 'fare', 'alone',
       'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')
Index(['passenger_id', 'pclass', 'sibsp', 'parch', 'fare', 'alone',
       'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')
Index(['passenger_id', 'pclass', 'sibsp', 'parch', 'fare', 'alone',
       'sex_female', 'sex_male', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')


In [89]:
knn = KNeighborsClassifier() 
knn.fit(X_train, y_train)
y_pred= knn.predict(X_train)

In [90]:
y_pred

array([1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

**Exercise 2**

Evaluate your results using the model score, confusion matrix, and classification report.


In [91]:
# Model accuracy for train dataset
train_acc= knn.score(X_train, y_train)

In [111]:
print(f'''Train Accuracy Score: {knn.score(X_train, y_train):.2%}''')

Train Accuracy Score: 74.10%


In [93]:
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.745856,0.727941,0.740964,0.736899,0.738985
recall,0.879479,0.518325,0.740964,0.698902,0.740964
f1-score,0.807175,0.605505,0.740964,0.70634,0.729827
support,307.0,191.0,0.740964,498.0,498.0


In [95]:
mr_anderson = confusion_matrix(y_train, y_pred)
mr_anderson

array([[270,  37],
       [ 92,  99]])

**Exercise 3**

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [96]:
TN, FP, FN, TP = mr_anderson[1,1], mr_anderson[0,1], mr_anderson[1,0], mr_anderson[0,0]

All = TN + FP+ FN+ TP

accuracy= (TP + TN) / All
True_pos_rate = recall= TP/ (TP + FN)
precision = TP/ (TP + FP)
support_pos = TP + FN
support_neg = FP +TN
classrep_knn1= pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
print(f'''Accuracy score: {accuracy:.2%},
          Recall: {recall:.2%},
          Precision: {precision:.2%},
          Support Positive: {support_pos},
          Support Negative: {support_neg}''')
print(classrep_knn1)

Accuracy score: 74.10%,
          Recall: 74.59%,
          Precision: 87.95%,
          Support Positive: 362,
          Support Negative: 136
                    0           1  accuracy   macro avg  weighted avg
precision    0.745856    0.727941  0.740964    0.736899      0.738985
recall       0.879479    0.518325  0.740964    0.698902      0.740964
f1-score     0.807175    0.605505  0.740964    0.706340      0.729827
support    307.000000  191.000000  0.740964  498.000000    498.000000


**Exercise 4**


Run through steps 1-3 setting k to 10


In [97]:
knn2 = KNeighborsClassifier(n_neighbors=10) 
knn2.fit(X_train, y_train)
y_pred2= knn.predict(X_train)

In [98]:
y_pred2

array([1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

In [99]:
print(f'''Train Accuracy Score: {knn2.score(X_train, y_train):.2%}''')

Train Accuracy Score: 70.68%


In [100]:
zion = confusion_matrix(y_train, y_pred)
classrep_knn2= pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

In [101]:
TN, FP, FN, TP = zion[1,1], zion[0,1], zion[1,0], zion[0,0]
All = TN + FP+ FN+ TP
accuracy= (TP + TN) / All
True_pos_rate = recall= TP/ (TP + FN)
precision = TP/ (TP + FP)
support_pos = TP + FN
support_neg = FP +TN
classrep_knn2= pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
print(f'''Accuracy score: {accuracy:.2%},
          Recall: {recall:.2%},
          Precision: {precision:.2%},
          Support Positive: {support_pos},
          Support Negative: {support_neg}''')
print(classrep_knn2)

Accuracy score: 74.10%,
          Recall: 74.59%,
          Precision: 87.95%,
          Support Positive: 362,
          Support Negative: 136
                    0           1  accuracy   macro avg  weighted avg
precision    0.745856    0.727941  0.740964    0.736899      0.738985
recall       0.879479    0.518325  0.740964    0.698902      0.740964
f1-score     0.807175    0.605505  0.740964    0.706340      0.729827
support    307.000000  191.000000  0.740964  498.000000    498.000000


**Exercise 5**


Run through steps 1-3 setting k to 20

In [102]:
knn3 = KNeighborsClassifier(n_neighbors=20) 
knn3.fit(X_train, y_train)
y_pred3= knn.predict(X_train)

In [103]:
y_pred3

array([1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

In [104]:
print(f'''Train Accuracy Score: {knn3.score(X_train, y_train):.2%}''')

Train Accuracy Score: 67.67%


In [105]:
deja_vu = confusion_matrix(y_train, y_pred)
classrep_knn3= pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

In [106]:
TN, FP, FN, TP = deja_vu[1,1], deja_vu[0,1], deja_vu[1,0], deja_vu[0,0]
All = TN + FP+ FN+ TP
accuracy= (TP + TN) / All
True_pos_rate = recall= TP/ (TP + FN)
precision = TP/ (TP + FP)
support_pos = TP + FN
support_neg = FP +TN
classrep_knn3= pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))
print(f'''Accuracy score: {accuracy:.2%},
          Recall: {recall:.2%},
          Precision: {precision:.2%},
          Support Positive: {support_pos},
          Support Negative: {support_neg}''')
print(classrep_knn3)

Accuracy score: 74.10%,
          Recall: 74.59%,
          Precision: 87.95%,
          Support Positive: 362,
          Support Negative: 136
                    0           1  accuracy   macro avg  weighted avg
precision    0.745856    0.727941  0.740964    0.736899      0.738985
recall       0.879479    0.518325  0.740964    0.698902      0.740964
f1-score     0.807175    0.605505  0.740964    0.706340      0.729827
support    307.000000  191.000000  0.740964  498.000000    498.000000


**Exercise 6**


What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

- The first model with 5 neighbors has a better score than the models with 10 or 20 neighbors. 

Scores according to neighbors:
    
    
    5 neighbors:  74.1%
    10 neighbors: 70.68%
    20 neighbors: 67.67%



**Exercise 7**

Which model performs best on our out-of-sample data from validate?

In [127]:
for i in range(5,21):
# creating KNN classifier with number of neighbors=i
    knn_classifier = KNeighborsClassifier(n_neighbors = i)

# fitting the KNN classifier with training data
    knn_classifier.fit(X_train, y_train)

# predicting churn outcome for test data
    y_pred = knn_classifier.predict(X_val)

# model score/Val accuracy
    model_score = knn_classifier.score(X_val, y_val)

# confusion matrix
    confusion_mat = confusion_matrix(y_val, y_pred)

# getting the classification report
    classification_rep = classification_report(y_val, y_pred)
    
    print(f'Accuracy score KNN on validate data with {i} neighbors: {model_score:.2%}')
    print("\nClassification Report:\n", classification_rep)
    print("\nConfusion Matrix:\n", confusion_mat)



Accuracy score KNN on validate data with 5 neighbors: 57.01%

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.69      0.66       132
           1       0.43      0.38      0.40        82

    accuracy                           0.57       214
   macro avg       0.54      0.53      0.53       214
weighted avg       0.56      0.57      0.56       214


Confusion Matrix:
 [[91 41]
 [51 31]]
Accuracy score KNN on validate data with 6 neighbors: 59.35%

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.79      0.71       132
           1       0.45      0.28      0.35        82

    accuracy                           0.59       214
   macro avg       0.54      0.53      0.53       214
weighted avg       0.57      0.59      0.57       214


Confusion Matrix:
 [[104  28]
 [ 59  23]]
Accuracy score KNN on validate data with 7 neighbors: 59.35%

Classification Report:
       

- 5 neighbors: 57.01%
- 10 neighbors: 59.81%
- 20 neighbors: 65.42%

Best model is 20 neighbors, but all scores are below my baseline accuracy