# KNN Notebook

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier


# Read the CSV Files and Perform Basic Data Cleaning

In [None]:
# Load the three Titanic CSV files from the local data/ directory.
# The three reads are independent of each other, so their order does not matter.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
gender_df = pd.read_csv("data/gender_submission.csv")

In [None]:
# Display the training DataFrame for a visual check of its columns and values
train_df

In [None]:
# Preview the first rows of the test DataFrame
test_df.head()

In [None]:
# Display the gender_submission DataFrame (merged onto test_df by PassengerId further down)
gender_df

In [None]:
##### Use the following fields in the model 
# Pclass, Sex, Age, SibSp, Parch, Fare

##### Test and training variable sources (csv files)
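# X_train - train.csv - feature columns only (Survived is split off into y_train)
# X_test - test.csv - the same feature columns
# y_train - target - the Survived column from train.csv
# y_test - the Survived column from gender_submission.csv (joined to test.csv on PassengerId)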

# Establish the training sets

In [None]:
# Keep the six model features together with the 'Survived' label so that a
# later row-wise null drop removes features and labels in lockstep.
X_train = train_df.loc[
    :, ['Pclass', 'Sex', 'Survived', 'Age', 'SibSp', 'Parch', 'Fare']
]

In [None]:
# Discard every row that has a missing value in any of the selected columns
X_train = X_train.dropna(axis=0, how='any')

In [None]:
# Pull the label column out of the cleaned frame to use as the training target
y_train = X_train.loc[:, 'Survived']

In [None]:
# Convert the label Series into a NumPy column vector of shape (n_samples, 1)
y_train = y_train.to_numpy().reshape(-1, 1)

In [None]:
# Check the dimensions of the reshaped training target
y_train.shape

In [None]:
# Keep only the feature columns; 'Survived' has already been copied into y_train
X_train = X_train.loc[:, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]

In [None]:
# Check the dimensions of the training feature frame
X_train.shape

# Clean data for the test sets

In [None]:
# Join each test row to its row in gender_df, matching on PassengerId
merged_test_df = pd.merge(test_df, gender_df, on='PassengerId', how='inner')

In [None]:
# Display the merged test DataFrame to verify the join
merged_test_df  

In [None]:
# Reduce the merged frame to the same feature + label columns used for training
merged_test_df = merged_test_df.loc[
    :, ['Pclass', 'Sex', 'Survived', 'Age', 'SibSp', 'Parch', 'Fare']
]

In [None]:
# Display the merged test DataFrame after the column selection
merged_test_df

In [None]:
# Discard every test row that has a missing value in any of the selected columns
merged_test_df = merged_test_df.dropna(axis=0, how='any')

# Establish X and y test sets

In [None]:
# Feature columns of the cleaned test frame (everything except 'Survived')
X_test = merged_test_df.loc[
    :, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
]

In [None]:
# Check the dimensions of the test feature frame
X_test.shape

In [None]:
# Label column of the cleaned test frame, used as the test target
y_test = merged_test_df.loc[:, 'Survived']

In [None]:
# Convert the label Series into a NumPy column vector of shape (n_samples, 1)
y_test = y_test.to_numpy().reshape(-1, 1)

In [None]:
# Confirm y_test is now a single-column 2-D array
y_test.shape

# Encode the gender data

In [None]:
# One-hot encode the 'Sex' column of the test features.
# The encoding runs on an explicit copy so it never touches the frame that
# X_test was sliced from.
X_test = pd.get_dummies(X_test.copy(), columns=["Sex"])
X_test.shape

In [None]:
# Preview the encoded test features and their column order
X_test.head()

In [None]:
# One-hot encode the 'Sex' column of the training features.
# The encoding runs on an explicit copy so it never touches the frame that
# X_train was sliced from.
X_train = pd.get_dummies(X_train.copy(), columns=["Sex"])
X_train.head()

# Scale the data

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit one scaler on the training features and one on the training labels.
# Both are fitted on training data only; the test data is transformed later.
X_scaler = StandardScaler().fit(X_train)
# NOTE(review): the model cells visible in this notebook fit and score on the
# raw y_train / y_test, so the label scaler looks unused — confirm before removing.
y_scaler = StandardScaler().fit(y_train)

In [None]:
# Apply the fitted scalers: features go through X_scaler, labels through y_scaler.
# Training split first, then the test split.
X_train_scaled = X_scaler.transform(X_train)
y_train_scaled = y_scaler.transform(y_train)
X_test_scaled = X_scaler.transform(X_test)
y_test_scaled = y_scaler.transform(y_test)

# Create, fit and validate the K Nearest Neighbor model

In [None]:
# Sweep several k values and compare train/test accuracy for each one.
# Note: only odd k is used so a majority vote between the two classes cannot tie.
k_values = range(1, 20, 2)

# KNeighborsClassifier expects 1-D labels; passing the (n, 1) column vectors
# raises a DataConversionWarning on every fit, so flatten them once up front.
y_train_flat = y_train.ravel()
y_test_flat = y_test.ravel()

train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train_flat)
    train_score = knn.score(X_train_scaled, y_train_flat)
    test_score = knn.score(X_test_scaled, y_test_flat)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one set of axes, so label each curve and use a neutral
# y-axis title instead of one that mentions only the test score.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# Fit the final model with the chosen number of neighbors.
# NOTE(review): the sweep above only evaluates odd k (1, 3, ..., 19), so k=6 was
# never among the values tested — confirm this choice against the printed scores.
chosen_k = 6
knn = KNeighborsClassifier(n_neighbors=chosen_k)
# Flatten the (n, 1) label arrays: the classifier expects 1-D targets and
# otherwise emits a DataConversionWarning.
knn.fit(X_train_scaled, y_train.ravel())
print(f'k={chosen_k} Test Acc: {knn.score(X_test_scaled, y_test.ravel()):.3f}')

In [None]:
# Validate the model on the same scaled feature space it was fitted on.
# Scoring the unscaled X_train / X_test frames would measure neighbor distances
# in a different feature space and report a misleading accuracy.
print(f'Training Data Score: {knn.score(X_train_scaled, y_train)}')
print(f'Testing Data Score: {knn.score(X_test_scaled, y_test)}')

# Make predictions

In [None]:
# Build a single hypothetical passenger to classify
import numpy as np

# Passenger attributes (Pclass is 1, 2, or 3; the two Sex_* values are 0/1 flags)
Pclass, Age, SibSp, Parch, Fare = 1, 20, 1, 0, 75
Sex_female, Sex_male = 1, 0

# One row with seven values, in the order listed here.
# NOTE(review): this order should match the encoded training columns — confirm against X_train.head()
new_person = np.array(
    [[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male]]
)
new_person

In [None]:
# Predict whether or not the new person survives.
# The model was fitted on standardized features, so the raw values must pass
# through the same fitted scaler first. Wrapping them in a DataFrame with the
# training column names keeps them aligned with the columns X_scaler was fitted on.
new_person_scaled = X_scaler.transform(
    pd.DataFrame(new_person, columns=X_train.columns)
)
predictions = knn.predict(new_person_scaled)

# predictions holds one label for the single input row
if predictions[0] == 0:
    print("This person would probably perish on the Titanic")
else:
    print("This person would probably survive on the Titanic")

In [None]:
### Prediction number 2
# Passenger attributes (Pclass is 1, 2, or 3; the two Sex_* values are 0/1 flags)
Pclass, Age, SibSp, Parch, Fare = 3, 45, 1, 2, 20
Sex_female, Sex_male = 0, 1

# One row with seven values, in the order listed here.
# NOTE(review): this order should match the encoded training columns — confirm against X_train.head()
new_person2 = np.array(
    [[Pclass, Age, SibSp, Parch, Fare, Sex_female, Sex_male]]
)
new_person2

In [None]:
# Predict whether or not the second new person survives.
# The model was fitted on standardized features, so the raw values must pass
# through the same fitted scaler first. Wrapping them in a DataFrame with the
# training column names keeps them aligned with the columns X_scaler was fitted on.
new_person2_scaled = X_scaler.transform(
    pd.DataFrame(new_person2, columns=X_train.columns)
)
predictions_2 = knn.predict(new_person2_scaled)

# predictions_2 holds one label for the single input row
if predictions_2[0] == 0:
    print("This person would probably perish on the Titanic")
else:
    print("This person would probably survive on the Titanic")