In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import acquire
import prepare

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [2]:
# Acquire the raw Titanic data and run it through the project's prep step in
# one expression, so `titanic` is only ever bound to the prepared frame.
titanic = prepare.prep_titantic(acquire.get_titanic_data())

In [3]:
# Target column handed to the project's splitting helper.
# NOTE(review): presumably used for stratification — confirm in prepare.train_val_test.
col = "survived"
(train, validate, test) = prepare.train_val_test(titanic, col)

In [4]:
# Feature matrices: every split loses the target plus the 'sex' and
# 'embark_town' columns.
# NOTE(review): presumably those two are raw string columns that were already
# encoded during prep — confirm against prepare.
X_train, X_validate, X_test = (
    split.drop(columns=['survived', 'sex', 'embark_town'])
    for split in (train, validate, test)
)

# Target vectors: the 'survived' column of each split.
y_train, y_validate, y_test = train.survived, validate.survived, test.survived

In [5]:
# Display the most frequent class in the training labels.
y_train.mode()

0    0
Name: survived, dtype: int64

In [6]:
# Baseline model: always predict the most frequent class in the training labels.
baseline = y_train.mode()
# Compare against the computed mode instead of a hard-coded 0 so the baseline
# accuracy stays correct if the majority class ever changes.
# `mode()` returns a Series, so take its first entry.
baseline_acc = (y_train == baseline.iloc[0]).mean()

In [7]:
# Report the baseline class from the computed mode rather than a literal, so the
# label cannot drift from the value the accuracy was measured against.
print(f'Baseline: {baseline.iloc[0]}')
print(f'Baseline accuracy: {baseline_acc:.2%}')

Baseline: 0
Baseline accuracy: 61.64%


In [8]:
# Unfitted K-Nearest Neighbors classifier that votes over the 5 closest points.
knn = KNeighborsClassifier(n_neighbors=5)

# Bare expression so the notebook renders the estimator's repr.
knn

In [9]:
# Fit the classifier on the training features and labels.
knn.fit(X_train, y_train)

In [13]:
# In-sample predictions: predict on the same rows the model was fit on.
train_preds = knn.predict(X_train)

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [10]:
# Model score on the training sample.
knn.score(X_train, y_train)

0.8314606741573034

In [14]:
# Confusion matrix of actual training labels vs. the in-sample predictions.
confusion_matrix(y_train, train_preds)

array([[339,  45],
       [ 60, 179]])

In [15]:
# Plain-text report, labelled with the raw class values.
print(classification_report(y_train, train_preds))

# Same report as a dict with readable class names, shown as a DataFrame.
report = classification_report(
    y_train,
    train_preds,
    target_names=('died', 'survived'),
    output_dict=True,
)
pd.DataFrame(report)

              precision    recall  f1-score   support

           0       0.85      0.88      0.87       384
           1       0.80      0.75      0.77       239

    accuracy                           0.83       623
   macro avg       0.82      0.82      0.82       623
weighted avg       0.83      0.83      0.83       623



Unnamed: 0,died,survived,accuracy,macro avg,weighted avg
precision,0.849624,0.799107,0.831461,0.824366,0.830244
recall,0.882812,0.748954,0.831461,0.815883,0.831461
f1-score,0.8659,0.773218,0.831461,0.819559,0.830345
support,384.0,239.0,0.831461,623.0,623.0


In [11]:
# Model score on the validate split (out-of-sample check).
knn.score(X_validate, y_validate)

0.842443729903537

## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [16]:
# Flatten the 2x2 confusion matrix into its four cell counts.
tn, fp, fn, tp = confusion_matrix(y_train, train_preds).ravel()

# Total number of observations (the denominator used for accuracy).
acc_all = sum((tn, tp, fn, fp))

# Show the counts and the total together.
tn, fp, fn, tp, acc_all

(339, 45, 60, 179, 623)

In [17]:
# Derive the binary-classification metrics by hand from the confusion-matrix
# counts (tn, fp, fn, tp) and the total `acc_all`, with class 1 as the positive class.

# Share of all observations that were predicted correctly.
accuracy = (tp + tn)/acc_all
print(f"Accuracy: {accuracy:.3%}")

# Of the actual positives, the share predicted positive.
true_positive_rate = tp/(tp+fn)
print(f"True Positive Rate: {true_positive_rate:.3%}")

# Of the actual negatives, the share wrongly predicted positive.
false_positive_rate = fp/(fp+tn)
print(f"False Positive Rate: {false_positive_rate:.3%}")

# Of the actual negatives, the share predicted negative.
true_negative_rate = tn/(tn+fp)
print(f"True Negative Rate: {true_negative_rate:.3%}")

# Of the actual positives, the share wrongly predicted negative.
false_negative_rate = fn/(fn+tp)
print(f"False Negative Rate: {false_negative_rate:.3%}")

# Of the predicted positives, the share that are actually positive.
precision = tp/(tp+fp)
print(f"Precision: {precision:.3%}")

# Recall is the same quantity as the true positive rate.
recall = tp/(tp+fn)
print(f"Recall: {recall:.3%}")

# Harmonic mean of precision and recall.
f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score:.3%}")

# Support is the number of actual observations per class. tp + fn is the count
# of actual positives, so it belongs to class 1; fp + tn is the count of actual
# negatives, so it belongs to class 0. The labels were previously swapped.
support_pos = tp + fn
print(f"Support (1): {support_pos}")

support_neg = fp + tn
print(f"Support (0): {support_neg}")

Accuracy: 83.146%
True Positive Rate: 74.895%
False Positive Rate: 11.719%
True Negative Rate: 88.281%
False Negative Rate: 25.105%
Precision: 79.911%
Recall: 74.895%
F1 Score: 77.322%
Support (0): 239
Support (1): 384


## 4. Run through steps 1-3 setting k to 10

## 5. Run through steps 1-3 setting k to 20

In [18]:
# Values of k to compare, with parallel lists collecting the accuracy on the
# training and validate splits for each k.
num_neighbors = [10, 20]
train_acc, val_acc = [], []

for k in num_neighbors:
    # Build and fit a fresh classifier for this k. `fit` returns the estimator
    # itself, so `knn` still ends up bound to the most recently fitted model.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Record in-sample and out-of-sample accuracy for this k.
    train_acc.append(knn.score(X_train, y_train))
    val_acc.append(knn.score(X_validate, y_validate))

In [19]:
# One row per k, with its training and validate accuracy side by side.
knn_metrics = pd.DataFrame(
    dict(neighbors=num_neighbors, train_acc=train_acc, val_acc=val_acc)
)

knn_metrics

Unnamed: 0,neighbors,train_acc,val_acc
0,10,0.778491,0.784566
1,20,0.739968,0.752412


## 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

## 7. Which model performs best on our out-of-sample data from validate?