In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 2 LOGISTICREGRESSION/LOGISTICREGRESSION/LOGISTICREGRESSION LOGISTICREGRESSION 2 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 3: Loading packages  ####

import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# Helper packages.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
# Scikit-learn package for logistic regression.
from sklearn import linear_model
# Model set up and tuning packages from scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Scikit-learn packages for evaluating model performance.
from sklearn import metrics
# Scikit-learn package for data preprocessing.
from sklearn import preprocessing




In [None]:
#=================================================-
#### Slide 6: Loading data into Python  ####

# NOTE(review): `data_dir` is not assigned in any cell visible in this notebook, so on
# a fresh kernel this cell would raise NameError. Fall back to a `data` folder under
# the current working directory -- that location is an assumption; adjust as needed.
if 'data_dir' not in globals():
    data_dir = Path.cwd() / 'data'

# Read the stroke dataset and preview the first rows.
df = pd.read_csv(Path(data_dir) / 'healthcare-dataset-stroke-data.csv')
print(df.head())




In [None]:
#=================================================-
#### Slide 7: Subset data  ####

# Columns kept for modelling: the predictors, the target (`stroke`) and the row `id`.
subset_columns = ['age', 'avg_glucose_level', 'heart_disease', 'ever_married',
                  'hypertension', 'Residence_type', 'gender', 'smoking_status',
                  'work_type', 'stroke', 'id']
# `.copy()` makes df_subset an independent frame, so the column assignments performed
# in later cells do not write into a slice of `df` (SettingWithCopyWarning).
df_subset = df[subset_columns].copy()
print(df_subset.head())




In [None]:
#=================================================-
#### Slide 8: Convert target to binary  ####

# Target is binary
print(df_subset['stroke'].head())
# Identify the two unique classes.
unique_values = sorted(df_subset['stroke'].unique())
# The first (smallest) class becomes False, every other value becomes True.
# `assign` returns a new frame instead of writing into df_subset in place, which
# avoids SettingWithCopyWarning because df_subset was selected out of `df`.
df_subset = df_subset.assign(
    stroke=np.where(df_subset['stroke'] == unique_values[0], False, True)
)




In [None]:
#=================================================-
#### Slide 11: Data prep: target variable  ####

# Show the dtype of the target before the conversion.
print(df_subset['stroke'].dtypes)
# Identify the two unique classes.
unique_values = sorted(df_subset['stroke'].unique())
# The first (smallest) class becomes False, every other value becomes True.
# `assign` returns a new frame instead of writing into df_subset in place, which
# avoids SettingWithCopyWarning because df_subset was selected out of `df`.
df_subset = df_subset.assign(
    stroke=np.where(df_subset['stroke'] == unique_values[0], False, True)
)
# Check class again.
print(df_subset['stroke'].dtypes)




In [None]:
#=================================================-
#### Slide 12: Data prep: check for NAs  ####

# Check for NAs: count and percentage of missing values per column.
# Number of missing values in each column.
na_counts = df_subset.isnull().sum()
print(na_counts)
# The same counts expressed as a percentage of the number of rows.
percent_missing = na_counts * 100 / len(df_subset)
print(percent_missing)




In [None]:
#=================================================-
#### Slide 13: Data prep: check for NAs (cont'd)  ####

# Drop every column in which 50% or more of the values are NaN.
perc = 50.0
# `thresh` is the minimum number of non-NaN values a column must have to be kept.
keep_fraction = (100 - perc) / 100
min_count = int(keep_fraction * df_subset.shape[0] + 1)
df_subset = df_subset.dropna(axis=1, thresh=min_count)
print(df_subset.shape)




In [None]:
#=================================================-
#### Slide 14: Data prep: check for NAs (cont'd)  ####

# Function to impute NA in both numeric and categorical columns
def fillna(df):
    """Impute missing values in both numeric and categorical columns.

    Numeric columns are filled with their column mean first; whatever is still
    missing afterwards (the non-numeric columns) is filled with the column mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain NaN values. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the missing values filled where a mean or mode exists.
    """
    # Nothing to impute; also avoids indexing into an empty mode table below.
    if df.empty:
        return df.copy()
    # Fill numeric columns with mean value. `numeric_only=True` restricts the mean
    # to numeric columns; without it pandas >= 2.0 raises TypeError as soon as the
    # frame contains a string column.
    df = df.fillna(df.mean(numeric_only=True))
    # Fill categorical columns with mode value (first row of the mode table).
    modes = df.mode()
    # The mode table has no rows when every column is entirely NaN.
    if not modes.empty:
        df = df.fillna(modes.iloc[0])
    return df
  
# Impute the remaining missing values with the `fillna` helper defined above.
df_subset = df_subset.pipe(fillna)




In [None]:
#=================================================-
#### Slide 15: Data prep: split data   ####

# Separate the predictors (X) from the target (y).
# Neither the target itself nor the `id` column is kept in X.
columns_to_drop_from_X = ['stroke', 'id']
X = df_subset.drop(columns=columns_to_drop_from_X)
y = np.array(df_subset['stroke'])




In [None]:
#=================================================-
#### Slide 19: Data prep: convert categorical data columns to dummies  ####

# Column types before encoding.
print(X.dtypes)
# Columns to expand into dummy (indicator) columns.
categorical_columns = ['heart_disease', 'ever_married', 'hypertension',
                       'Residence_type', 'gender', 'smoking_status', 'work_type']
# Encode as float indicators; `drop_first=True` omits the first level of each column.
X = pd.get_dummies(X,
                   columns=categorical_columns,
                   drop_first=True,
                   dtype=float)
# Column types after encoding.
print(X.dtypes)




In [None]:
#=================================================-
#### Slide 20: Split into train and test set  ####

# Set the seed for any later code that draws from NumPy's global random state.
np.random.seed(1)

# Split data into train and test sets, use a 70 train - 30 test split.
# The seed is also passed explicitly so the split does not depend on the state of
# the global random generator at the moment this statement is executed (for
# example when the line is run on its own or re-run).
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = .3,
                                                    random_state = 1)
                                                    




In [None]:
#=================================================-
#### Slide 23: Scale the features (cont'd)  ####

# Create a min-max scaler and fit it on the training features only.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)

# Apply the fitted scaler to the training and the test features.
X_train_scaled, X_test_scaled = (scaler.transform(features)
                                 for features in (X_train, X_test))




In [None]:
#=================================================-
#### Slide 27: Logistic regression: build  ####

# Instantiate an (unfitted) logistic regression estimator with default settings
# and print it.
logistic_regression_model = linear_model.LogisticRegression()
print(logistic_regression_model)




In [None]:
#=================================================-
#### Slide 29: Logistic regression: fit (cont'd)  ####

# Train the estimator on the scaled training features and the training labels.
logistic_regression_model.fit(X=X_train_scaled, y=y_train)




In [None]:
#=================================================-
#### Slide 30: Logistic regression: predict  ####

# Predict class labels for the scaled test features and show the first 20 of them.
predicted_values = logistic_regression_model.predict(X=X_test_scaled)
first_predictions = predicted_values[:20]
print(first_predictions)




In [None]:
#=================================================-
#### Slide 37: Confusion matrix and accuracy  ####

# Confusion matrix of the test labels against the predicted labels.
conf_matrix_test = metrics.confusion_matrix(y_true=y_test, y_pred=predicted_values)
print(conf_matrix_test)
# Accuracy of the predictions on the test set.
test_accuracy_score = metrics.accuracy_score(y_true=y_test, y_pred=predicted_values)
print("Accuracy on test data: ", test_accuracy_score)




In [None]:
#=================================================-
#### Slide 38: Classification report  ####

# Create a list of target names to interpret class assignments.
# classification_report pairs `target_names` with the class labels in sorted order,
# while `unique()` returns values in order of appearance, so sort before naming.
target_names = [str(label) for label in sorted(df_subset['stroke'].unique())]
# Build the report for the test predictions (the name printed below was previously
# never assigned, which raised NameError).
class_report = metrics.classification_report(y_test,
                                             predicted_values,
                                             target_names = target_names)
print(class_report)




In [None]:
#=================================================-
#### Slide 42: Save accuracy score  ####

# Record the test accuracy (rounded to 4 decimals) of the baseline logistic model.
model_final = dict(metrics="accuracy",
                   values=round(test_accuracy_score, 4),
                   model='logistic')
print(model_final)




In [None]:
#=================================================-
#### Slide 43: Getting probabilities instead of class labels  ####

# Class probabilities for the test set (one column per class); show the first 5 rows.
test_probabilities = logistic_regression_model.predict_proba(X=X_test_scaled)
print(test_probabilities[:5])
# Keep only the second column of the probability matrix; show the first 5 values.
test_predictions = test_probabilities[:, 1]
print(test_predictions[:5])




In [None]:
#=================================================-
#### Slide 44: Computing FPR, TPR, and threshold  ####

# Compute false positive rates, true positive rates and the matching thresholds
# from the test labels and the predicted probabilities.
fpr, tpr, threshold = metrics.roc_curve(y_true=y_test, y_score=test_predictions)
# Show the first 5 entries of each array.
print("False positive: ", fpr[:5])
print("True positive: ", tpr[:5])
print("Threshold: ", threshold[:5])




In [None]:
#=================================================-
#### Slide 45: Computing AUC  ####

# Area under the ROC curve, computed from the FPR (x) and TPR (y) values.
auc = metrics.auc(x=fpr, y=tpr)
print("Area under the ROC curve: ", auc)




In [None]:
#=================================================-
#### Slide 46: Putting it all together: ROC plot  ####

# Make an ROC curve plot. The figure is drawn once; the same plotting code used to
# be pasted twice in this cell and produced two identical figures.
fig, ax = plt.subplots()
ax.set_title('Receiver Operator Characteristic')
# ROC curve of the model, with its AUC in the legend.
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % auc)
# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
ax.plot([0, 1], [0, 1], 'r--')
ax.legend(loc = 'lower right')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################


In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## 2 LOGISTICREGRESSION/LOGISTICREGRESSION/LOGISTICREGRESSION LOGISTICREGRESSION 3 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs




In [None]:
#=================================================-
#### Slide 3: Accuracy on train vs. accuracy on test  ####

# Score the fitted baseline model on the scaled training data.
trained_accuracy_score = logistic_regression_model.score(X=X_train_scaled, y=y_train)
print("Accuracy on train data: " , trained_accuracy_score)




In [None]:
#=================================================-
#### Slide 15: Prepare parameters for optimization  ####

# Candidate regularization penalties.
penalty = ['l1', 'l2']
# Candidate regularization constants: 10 values spaced evenly on a log scale
# from 10**0 to 10**10.
C = np.logspace(start=0, stop=10, num=10)
print("Regularization constant: ", C)
# Grid of hyperparameter options handed to the grid search.
hyperparameters = {'C': C, 'penalty': penalty}
print(hyperparameters)




In [None]:
#=================================================-
#### Slide 16: Set up cross-validation logistic function  ####

# Grid search with 10-fold cross-validation over the hyperparameter grid above.
clf = GridSearchCV(
    estimator=linear_model.LogisticRegression(solver='liblinear'),  # model to tune
    param_grid=hyperparameters,                                     # options to try
    cv=10,                                                          # 10-fold cv
    verbose=0,                                                      # no messages
)
# Run the search on the scaled training data; `fit` returns the fitted search object.
best_model = clf.fit(X_train_scaled, y_train)
best_model




In [None]:
#=================================================-
#### Slide 17: Check best parameters found by CV  ####

# Read the winning penalty and regularization constant off the best estimator.
best_estimator_params = best_model.best_estimator_.get_params()
penalty = best_estimator_params['penalty']
constant = best_estimator_params['C']
print('Best penalty: ', penalty)
print('Best C: ', constant)




In [None]:
#=================================================-
#### Slide 18: Predict using the best model parameters  ####

# Class predictions of the tuned model on the scaled test features.
best_predicted_values = best_model.predict(X=X_test_scaled)
print(best_predicted_values)
# Accuracy of the tuned model on the test set.
best_accuracy_score = metrics.accuracy_score(y_true=y_test,
                                             y_pred=best_predicted_values)
print("Accuracy on test data (best model): ", best_accuracy_score)




In [None]:
#=================================================-
#### Slide 19: Accuracy on train vs. accuracy on test  ####

# Score the tuned model on the scaled training data.
# This overwrites the training accuracy stored earlier for the baseline model.
trained_accuracy_score = best_model.score(X=X_train_scaled, y=y_train)
print("Accuracy on train data: " , trained_accuracy_score)




In [None]:
#=================================================-
#### Slide 20: Assessing the tuned model  ####

# Compute confusion matrix for best model.
best_confusion_matrix = metrics.confusion_matrix(y_test, best_predicted_values)
print(best_confusion_matrix)
# Create a list of target names to interpret class assignments.
# The names are derived from the target classes in sorted order, which is the order
# classification_report pairs them with the labels (the former hard-coded
# 'Low value' / 'High value' names did not describe the stroke target).
target_names = [str(label) for label in sorted(df_subset['stroke'].unique())]
# Build the report for the tuned model's test predictions (the name printed below
# was previously never assigned, which raised NameError).
best_class_report = metrics.classification_report(y_test,
                                                  best_predicted_values,
                                                  target_names = target_names)
print(best_class_report)




In [None]:
#=================================================-
#### Slide 21: Save accuracy score  ####

# Record the test accuracy (rounded to 4 decimals) of the tuned logistic model.
# This replaces the dictionary stored under the same name for the baseline model.
model_final = dict(metrics="accuracy",
                   values=round(best_accuracy_score, 4),
                   model='logistic_tuned')
print(model_final)




In [None]:
#=================================================-
#### Slide 22: Get metrics for ROC curve  ####

# Class probabilities from the tuned model (one column per class); show the first 5 rows.
best_test_probabilities = best_model.predict_proba(X=X_test_scaled)
print(best_test_probabilities[:5])
# Keep only the second column of the probability matrix; show the first 5 values.
best_test_predictions = best_test_probabilities[:, 1]
print(best_test_predictions[:5])




In [None]:
#=================================================-
#### Slide 23: Get metrics for ROC curve (cont'd)  ####

# FPR, TPR and thresholds for the tuned model's predicted probabilities.
best_fpr, best_tpr, best_threshold = metrics.roc_curve(y_true=y_test,
                                                       y_score=best_test_predictions)
# Area under the tuned model's ROC curve.
best_auc = metrics.auc(x=best_fpr, y=best_tpr)
print(best_auc)




In [None]:
#=================================================-
#### Slide 24: Plot ROC curve for both models  ####

# Make an ROC curve plot comparing the baseline and the tuned model. The figure is
# drawn once; the cell used to contain two pasted copies of the same plot that
# differed only in the legend text of the tuned model.
fig, ax = plt.subplots()
ax.set_title('Receiver Operator Characteristic')
# Baseline model.
ax.plot(fpr, tpr, 'blue',
        label = 'AUC = %0.2f' % auc)
# Tuned model found by the grid search.
ax.plot(best_fpr, best_tpr, 'black',
        label = 'AUC (best) = %0.2f' % best_auc)
# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
ax.plot([0, 1], [0, 1], 'r--')
ax.legend(loc = 'lower right')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()




In [None]:
#=================================================-
#### Slide 26: Exercise  ####




#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################
