In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 1 INTROTOCLASSIFICATION/KNN/INTROTOCLASSIFICATION KNN 2 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [None]:
# =================================================-
#### Slide 3: Loading packages  ####

import pickle
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import metrics

In [None]:
# =================================================-
#### Slide 4: Load data into Python  ####

# Build the CSV location under the data directory and read it into a DataFrame.
# NOTE(review): `data_dir` is not defined in the cells shown here; it must exist
# before this cell runs.
stroke_csv = f"{data_dir}/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(stroke_csv)
# Preview the first rows to confirm the load.
print(df.head())

In [None]:
# =================================================-
#### Slide 5: Subset data  ####

# Columns kept for the analysis, including the `stroke` target and the row `id`.
columns_to_keep = [
    "age",
    "avg_glucose_level",
    "heart_disease",
    "ever_married",
    "hypertension",
    "Residence_type",
    "gender",
    "smoking_status",
    "work_type",
    "stroke",
    "id",
]
df = df[columns_to_keep]
print(df.head())

In [None]:
# =================================================-
#### Slide 6: Convert target to binary  ####

# Binarise the target around its mean: values above the mean become 1,
# everything else becomes 0.
threshold = df["stroke"].mean()
df["stroke"] = (df["stroke"] > threshold).astype(int)
# Inspect the recoded (binary) target.
print(df["stroke"])

In [None]:
# =================================================-
#### Slide 9: The data at first glance  ####

# Print, in order: the first 3 rows, the column data types,
# and the number of observations per target class.
for overview in (df.head(3), df.dtypes, df["stroke"].value_counts()):
    print(overview)

In [None]:
# =================================================-
#### Slide 10: Data prep: check for NAs  ####

# Count the missing values in every column.
na_counts = df.isna().sum()
print(na_counts)

In [None]:
# =================================================-
#### Slide 11: Data prep: check for NAs  ####

# Express the missing values of each column as a percentage of all rows.
percent_missing = df.isna().sum().mul(100).div(len(df))
print(percent_missing)

In [None]:
# =================================================-
#### Slide 12: Data prep: check for NAs  ####

# Drop every column in which 50% or more of the values are NaN.
perc = 50.0
# dropna's `thresh` is the minimum number of non-NaN values a column must have
# to be kept, so convert the allowed NaN percentage into that count.
required_share = (100 - perc) / 100
min_count = int(required_share * len(df) + 1)
df = df.dropna(axis="columns", thresh=min_count)
print(df.shape)


# Function to impute NA in both numeric and categorical columns
def fillna(df):
    """Impute missing values in numeric and categorical columns.

    Numeric columns are filled with their column mean first. Whatever is still
    missing afterwards (the non-numeric columns) is filled with the column mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        New frame with the missing values filled in.
    """
    # numeric_only=True restricts the mean to numeric columns; without it,
    # pandas >= 2.0 raises a TypeError as soon as the frame has string columns.
    df = df.fillna(df.mean(numeric_only=True))
    # The first row of mode() holds the most frequent value of every column.
    modes = df.mode()
    # mode() has no rows for an empty frame, so there is nothing to fill with.
    if not modes.empty:
        df = df.fillna(modes.iloc[0])
    return df


# Replace df with its imputed version (fillna returns a new frame).
df = fillna(df)

In [None]:
# =================================================-
#### Slide 13: Data prep: ready for kNN  ####

# Report the dtype of the target before recoding it.
print(df["stroke"].dtypes)
# Identify the two unique classes, sorted in ascending order.
unique_values = sorted(df["stroke"].unique())
# Recode the target as boolean: False for the first class, True otherwise.
df["stroke"] = df["stroke"].ne(unique_values[0]).to_numpy()
# Split the data into X and y; the target and `id` are both removed from X.
columns_to_drop_from_X = ["stroke", "id"]
X = df.drop(columns=columns_to_drop_from_X)
y = df["stroke"].to_numpy(copy=True)

In [None]:
# =================================================-
#### Slide 14: Data prep: numeric variables  ####

# Data types before encoding.
print(X.dtypes)
# Columns to one-hot encode.
categorical_columns = [
    "heart_disease",
    "ever_married",
    "hypertension",
    "Residence_type",
    "gender",
    "smoking_status",
    "work_type",
]
# Encode as float dummies; drop_first removes the first level of each variable.
X = pd.get_dummies(X, columns=categorical_columns, dtype=float, drop_first=True)
# Data types after encoding.
print(X.dtypes)

In [None]:
# =================================================-
#### Slide 20: Train and test: small scale before n-fold  ####

# Set the seed.
np.random.seed(1)

# Split into train (70%) and test (30%).
# random_state pins the shuffle to the same seed value, so the split stays
# reproducible even when this statement is re-run without the seed line above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [None]:
# =================================================-
#### Slide 22: Data prep: scaling variables  ####

# Scale X.
# Fit the scaler on the training data only and reuse it for the test data, so
# both sets are standardised with the same mean / standard deviation and no
# information from the test set is used for preprocessing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Preview the first two scaled rows of each set.
print(X_train[0:2])
print(X_test[0:2])


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################

In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 1 INTROTOCLASSIFICATION/KNN/INTROTOCLASSIFICATION KNN 3 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [None]:
# =================================================-
#### Slide 5: kNN: build model  ####

# Number of neighbours used by the baseline classifier.
default = 5
# Build the kNN classifier with that k.
kNN = KNeighborsClassifier(n_neighbors=default)
# Train it on the training split.
kNN.fit(X=X_train, y=y_train)

In [None]:
# =================================================-
#### Slide 6: kNN: predict on a test set  ####

# Predict the class of every test observation and preview the first five.
predictions = kNN.predict(X=X_test)
print(predictions[:5])

In [None]:
# =================================================-
#### Slide 7: kNN: predict on test  ####

# Pair each actual label (first column) with its prediction (second column).
actual_v_predicted = np.stack((y_test, predictions), axis=1)
print(actual_v_predicted[:5])

In [None]:
# =================================================-
#### Slide 18: Confusion matrix in python  ####

# Confusion matrix for kNN.
# The functions are reached through the imported `metrics` module; the bare
# names `confusion_matrix` / `accuracy_score` are never imported in this notebook.
cm_kNN = metrics.confusion_matrix(y_test, predictions)
print(cm_kNN)
# Accuracy on the test set, rounded to 4 decimals.
print(round(metrics.accuracy_score(y_test, predictions), 4))

In [None]:
# =================================================-
#### Slide 19: Confusion matrix: visualize  ####

# Draw the confusion matrix as a heat map.
plt.imshow(cm_kNN, interpolation="nearest", cmap=plt.cm.Wistia)
plt.title("Confusion Matrix - Test Data")
plt.xlabel("Predicted label")
plt.ylabel("True label")
# Put the class names on both axes.
classNames = ["Negative", "Positive"]
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)
# Annotate every cell with its role and count
# (rows follow the true label, columns the predicted label).
s = [["TN", "FP"], ["FN", "TP"]]
for i, row_labels in enumerate(s):
    for j, cell_label in enumerate(row_labels):
        plt.text(j, i, cell_label + " = " + str(cm_kNN[i][j]))
plt.show()

In [None]:
# =================================================-
#### Slide 20: Evaluation of kNN with k neighbors  ####

# Create a dictionary with accuracy values for our kNN model with k.
# accuracy_score is reached through the imported `metrics` module; the bare
# name is never imported in this notebook.
model_final_dict = {
    "metrics": ["accuracy"],
    "values": [round(metrics.accuracy_score(y_test, predictions), 4)],
    "model": ["kNN_k"],
}
# One-row results table; later cells add further model scores to it.
model_final = pd.DataFrame(data=model_final_dict)
print(model_final)

In [None]:
# =================================================-
#### Slide 24: Plot ROC and calculate AUC  ####

# Score every test observation with the predicted probability of the positive
# class (second column of predict_proba, i.e. the larger label in kNN.classes_).
# Hard 0/1 predictions would give the ROC curve a single operating point only.
positive_scores = kNN.predict_proba(X_test)[:, 1]
# Store FPR, TPR, and threshold as variables.
fpr, tpr, threshold = metrics.roc_curve(y_test, positive_scores)
# Store the AUC.
roc_auc = metrics.auc(fpr, tpr)
plt.title("Receiver Operating Characteristic")
plt.plot(fpr, tpr, "b", label="AUC = %0.2f" % roc_auc)
plt.legend(loc="lower right")
# Diagonal reference line: a classifier that guesses at random.
plt.plot([0, 1], [0, 1], "r--")
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################

In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 1 INTROTOCLASSIFICATION/KNN/INTROTOCLASSIFICATION KNN 4 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [None]:
# =================================================-
#### Slide 6: Cross-validation pipeline for optimal accuracy  ####

# Chain the scaler and the kNN estimator so scaling is applied inside the pipeline.
cv_pipeline = Pipeline(
    steps=[
        ("transformer", StandardScaler()),
        ("estimator", kNN),
    ]
)

In [None]:
# =================================================-
#### Slide 7: Cross-validation for optimal accuracy  ####

# Score the pipeline with 5-fold cross-validation on the full data set.
cv_scores = cross_val_score(estimator=cv_pipeline, X=X, y=y, cv=5)

In [None]:
# =================================================-
#### Slide 8: Cross-validation for optimal accuracy  ####

# Average the per-fold scores, then report the individual folds and the average.
mean = np.mean(cv_scores)
print(cv_scores)
print("cv_scores mean:{}".format(mean))
print("Optimal cv score is:", round(mean, 4))


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################

In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 1 INTROTOCLASSIFICATION/KNN/INTROTOCLASSIFICATION KNN 5 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [None]:
# =================================================-
#### Slide 5: Finding optimal k - GridSearchCV  ####

# Candidate values of k to search: 1 through 30.
k_range = [*range(1, 31)]

# Parameter grid: maps each parameter name to the list of values to try.
# Only one parameter (n_neighbors) is searched here.
param_grid = {"n_neighbors": k_range}
print(param_grid)

# Grid search over k for the original kNN model,
# scored by accuracy with 10-fold cross-validation.
grid = GridSearchCV(estimator=kNN, param_grid=param_grid, cv=10, scoring="accuracy")

In [None]:
# =================================================-
#### Slide 6: Finding optimal k - GridSearchCV  ####

# Chain the scaler and the grid search: X is standardised, then searched over k.
grid_search_pipeline = Pipeline(
    steps=[
        ("transformer", StandardScaler()),
        ("estimator", grid),
    ]
)

# Fit the pipeline on the full data set; later cells read the results from `grid`.
grid_search_pipeline.fit(X=X, y=y)

In [None]:
# =================================================-
#### Slide 7: Finding optimal k - view results  ####

# View the mean cross-validated test score stored for each searched k.
mean_test_scores = grid.cv_results_["mean_test_score"]
print(mean_test_scores)

In [None]:
# =================================================-
#### Slide 8: Finding optimal k  ####

# Copy the mean test scores out of grid.cv_results_ into a plain Python list.
grid_mean_scores = list(grid.cv_results_["mean_test_score"])
print(grid_mean_scores)

In [None]:
# =================================================-
#### Slide 9: Finding optimal k - plot  ####

# Plot the mean cross-validated accuracy against k.
plt.plot(k_range, grid_mean_scores)
plt.xlabel("Value of K for kNN")
plt.ylabel("Cross-Validated Accuracy")
plt.show()

In [None]:
# =================================================-
#### Slide 10: Define and examine the optimized model   ####

# Single best score achieved across all searched values of k.
grid_score = grid.best_score_
print(grid_score)

# Parameters (k) that produced that score.
print(grid.best_params_)

# Model object fit with those best parameters;
# printing it also shows parameters that were not specified explicitly.
print(grid.best_estimator_)

In [None]:
# =================================================-
#### Slide 11: Add GridSearchCV score to the final scores  ####

# Add the GridSearchCV score as a new row of the results table.
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job.
grid_search_row = pd.DataFrame(
    [
        {
            "metrics": "accuracy",
            "values": round(grid_score, 4),
            "model": "kNN_GridSearchCV",
        }
    ]
)
model_final = pd.concat([model_final, grid_search_row], ignore_index=True)
print(model_final)

In [None]:
# =================================================-
#### Slide 12: Optimal model and final thoughts  ####

# Build the optimised model with the best k found by the grid search and train
# it on the training split only. grid.best_estimator_ was fitted on all of X,
# which includes the test rows, so scoring it on X_test would overstate accuracy.
kNN_best = KNeighborsClassifier(**grid.best_params_).fit(X_train, y_train)

# Check accuracy of our model on the test data (computed once, then reported).
kNN_champ = kNN_best.score(X_test, y_test)
print(kNN_champ)

In [None]:
# =================================================-
#### Slide 14: Final Model   ####

# Save the final model's score.
# NOTE(review): this rebinds model_final from the results DataFrame to a plain
# dict, so only the optimised model's entry is pickled - confirm that is intended.
model_final = {
    "metrics": "accuracy",
    "values": round(kNN_champ, 4),
    "model": "kNN_optimized",
}
print(model_final)
# The context manager closes the file even if pickling fails.
with open(str(data_dir) + "/" + "model_final.sav", "wb") as model_file:
    pickle.dump(model_final, model_file)

In [None]:
# =================================================-
#### Slide 18: Exercise  ####


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################