# Project Downfalls

- Dataset too small
- Dataset imbalanced (95% non stroke - 5% stroke) and trying to predict stroke...
- Try to increase Recall but lose precision more than desired

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Data getting, cleaning, and exploring
import wrangle as w
import explore as ex

# Python without these is hard
import pandas as pd
import numpy as np
from scipy import stats

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer

# Regression Modeling
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

# Classification Modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.tree import export_graphviz

In [2]:
df = w.wrangle_stroke()

In [3]:
train, validate, test = w.train_validate_test_split(df, 'stroke', 42)

In [4]:
X_train = train.drop(columns=['stroke'])
y_train = train.stroke

X_validate = validate.drop(columns=['stroke'])
y_validate = validate.stroke

X_test = test.drop(columns=['stroke'])
y_test = test.stroke

In [5]:
cat_cols = X_train.loc[:,df.dtypes == "object"].columns
num_cols = X_train.loc[:,df.dtypes != "object"].columns

In [6]:
num_cols

Index(['age', 'hypertension', 'heart_disease', 'ever_married',
       'avg_glucose_level', 'bmi', 'rural_residence', 'urban_residence',
       'is_female', 'is_male', 'current_smoker', 'age_bin', 'gluc_bin'],
      dtype='object')

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])

cat_pipeline = Pipeline([
        ('one_hot', OneHotEncoder(handle_unknown='ignore'))
    ])

full_pipeline = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ])
    

X_train = full_pipeline.fit_transform(X_train, y_train)
X_test = full_pipeline.fit_transform(X_test)
X_validate = full_pipeline.fit_transform(X_validate)



# imbalanced-learn 
    pip install imbalanced-learn
    
    a python package offering a number of re-sampling techniques commonly used in datasets showing strong between-class imbalance. 
    It is compatible with scikit-learn and is part of scikit-learn-contrib projects.

## Oversampling and undersampling 
    techniques used to adjust the class distribution of a data set (i.e. the ratio between the different classes/categories represented)

In [9]:
# We can see only 5% of data show patients who had a stroke. It is a clear inbalance which will not allow
# model to learn properly. To avoid that I will try a couple of methods(undersampling and oversampling)
# to eliminate the problem.
# Let's check wich method works the best with RandomForestClassifier

from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, SVMSMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler, AllKNN, NeighbourhoodCleaningRule

equalizers = [
    SMOTE(),
    BorderlineSMOTE(),
    ADASYN(),
    SVMSMOTE(),
    NearMiss(),
    RandomUnderSampler(),
    AllKNN(),
    NeighbourhoodCleaningRule()
]

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

def train_and_evaluate(model, train, train_y, test, test_y, eq=None, train_model=True, threashold=0.5):
    if train_model:
        model.fit(train, train_y)
    
    results = model.predict_proba(test)
    
    proba = results[:,1]
    results = (results[:,1] > threashold).astype(int)
    
    print('/'*80)
    print(model)
    if eq != None:
        print(eq)
    print()
    print('confusion_matrix')
    print(confusion_matrix(test_y, results))
    print('roc_auc')
    print(roc_auc_score(test_y, proba))
    print(classification_report(test_y, results))
    
    return proba

In [11]:
for eq in equalizers:
    model = RandomForestClassifier(random_state=1234)
    X_train_eq, y_train_eq = eq.fit_resample(X_train, y_train.ravel())
    train_and_evaluate(model, X_train_eq, y_train_eq, X_test, y_test, eq)

////////////////////////////////////////////////////////////////////////////////
RandomForestClassifier(random_state=1234)
SMOTE()

confusion_matrix
[[698 274]
 [ 31  19]]
roc_auc
0.6505246913580247
              precision    recall  f1-score   support

           0       0.96      0.72      0.82       972
           1       0.06      0.38      0.11        50

    accuracy                           0.70      1022
   macro avg       0.51      0.55      0.47      1022
weighted avg       0.91      0.70      0.79      1022

////////////////////////////////////////////////////////////////////////////////
RandomForestClassifier(random_state=1234)
BorderlineSMOTE()

confusion_matrix
[[839 133]
 [ 38  12]]
roc_auc
0.7185288065843621
              precision    recall  f1-score   support

           0       0.96      0.86      0.91       972
           1       0.08      0.24      0.12        50

    accuracy                           0.83      1022
   macro avg       0.52      0.55      0.52    

In [12]:
# as we can see randomundersampler seems to be working the best(it maximize the recall for stoke) 

eq = RandomUnderSampler()
X_train, y_train = eq.fit_resample(X_train, y_train.ravel())
X_train.shape

(278, 21)

## The small size of the dataset creates an issue once we run our Random Under Sampler
### Data set dropped from 5k to 278

In [13]:
# Let's quickly go through couple models and pick 2~3 the best of them to try improve the results with 
# various hyperparameters. We are going to try to maximize roc_auc score

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC


from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_validate

np.random.seed(1234)

In [14]:
models = [
    (AdaBoostClassifier(), 'AdaBoost'),
    (RandomForestClassifier(), 'RandomForest'),
    (ExtraTreesClassifier(), 'ExtraTreesClassifier'),
    (LogisticRegression(), 'LogisticRegression'),
    (KNeighborsClassifier(), 'KNeighbors'),
    (SVC(probability=True), 'SVC'),
]

def print_scores(scores, model_name):
    print(model_name)
    print()
    print(scores)
    print("mean: {}".format(scores.mean()))
    print("std: {}".format(scores.std()))
    print()
    print()

In [15]:
for model, name in models:
    train_and_evaluate(model, X_train, y_train, X_test, y_test)

////////////////////////////////////////////////////////////////////////////////
AdaBoostClassifier()

confusion_matrix
[[680 292]
 [ 12  38]]
roc_auc
0.8116666666666666
              precision    recall  f1-score   support

           0       0.98      0.70      0.82       972
           1       0.12      0.76      0.20        50

    accuracy                           0.70      1022
   macro avg       0.55      0.73      0.51      1022
weighted avg       0.94      0.70      0.79      1022

////////////////////////////////////////////////////////////////////////////////
RandomForestClassifier()

confusion_matrix
[[684 288]
 [ 11  39]]
roc_auc
0.8225925925925927
              precision    recall  f1-score   support

           0       0.98      0.70      0.82       972
           1       0.12      0.78      0.21        50

    accuracy                           0.71      1022
   macro avg       0.55      0.74      0.51      1022
weighted avg       0.94      0.71      0.79      1022

//