## Stroke Predictions - Machine Learning

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf


In [2]:
#import from API

import requests
import json

data_url = 'https://team-3-project-4.onrender.com/api/v1.0/brain_stroke_data'

response_API = requests.get(data_url)

x = response_API.json()
stroke_df = pd.DataFrame(x)
stroke_df.head()

Unnamed: 0,Residence_type,age,avg_glucose_level,bmi,ever_married,gender,heart_disease,hypertension,id,smoking_status,stroke,work_type
0,Urban,67,228.69,36.6,Yes,Male,1,0,9046,formerly smoked,1,Private
1,Rural,80,105.92,32.5,Yes,Male,1,0,31112,never smoked,1,Private
2,Urban,49,171.23,34.4,Yes,Female,0,0,60182,smokes,1,Private
3,Rural,79,174.12,24.0,Yes,Female,0,1,1665,never smoked,1,Self-employed
4,Urban,81,186.21,29.0,Yes,Male,0,0,56669,formerly smoked,1,Private


In [3]:
#drop columns we arent using in our test/train

stroke_df.drop(["Residence_type", "ever_married", "id", "work_type"], axis=1, inplace=True)


In [4]:
stroke_df

Unnamed: 0,age,avg_glucose_level,bmi,gender,heart_disease,hypertension,smoking_status,stroke
0,67,228.69,36.6,Male,1,0,formerly smoked,1
1,80,105.92,32.5,Male,1,0,never smoked,1
2,49,171.23,34.4,Female,0,0,smokes,1
3,79,174.12,24.0,Female,0,1,never smoked,1
4,81,186.21,29.0,Male,0,0,formerly smoked,1
...,...,...,...,...,...,...,...,...
4904,13,103.08,18.6,Female,0,0,Unknown,0
4905,81,125.20,40.0,Female,0,0,never smoked,0
4906,35,82.99,30.6,Female,0,0,never smoked,0
4907,51,166.29,25.6,Male,0,0,formerly smoked,0


In [5]:
stroke_df.head()

Unnamed: 0,age,avg_glucose_level,bmi,gender,heart_disease,hypertension,smoking_status,stroke
0,67,228.69,36.6,Male,1,0,formerly smoked,1
1,80,105.92,32.5,Male,1,0,never smoked,1
2,49,171.23,34.4,Female,0,0,smokes,1
3,79,174.12,24.0,Female,0,1,never smoked,1
4,81,186.21,29.0,Male,0,0,formerly smoked,1


In [6]:
# Find null values (if any)
for column in stroke_df.columns:
    print(f"Column {column} has {stroke_df[column].isnull().sum()} null values")

# Find duplicate entries (if any)
print(f"Duplicate entries: {stroke_df.duplicated().sum()}")

Column age has 0 null values
Column avg_glucose_level has 0 null values
Column bmi has 0 null values
Column gender has 0 null values
Column heart_disease has 0 null values
Column hypertension has 0 null values
Column smoking_status has 0 null values
Column stroke has 0 null values
Duplicate entries: 0


In [7]:
# Generate our categorical variable lists
stroke_cat = stroke_df.dtypes[stroke_df.dtypes == "object"].index.tolist()
stroke_df[stroke_cat].nunique()

gender            3
smoking_status    4
dtype: int64

In [8]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(stroke_df[stroke_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(stroke_cat)
encode_df.head()



Unnamed: 0,gender_Female,gender_Male,gender_Other,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [9]:
# Merge one-hot encoded features and drop the originals
stroke_df = stroke_df.merge(encode_df,left_index=True, right_index=True)
stroke_df = stroke_df.drop(stroke_cat,1)

#change column name
stroke_df.rename(columns = {'smoking_status_formerly smoked' : 'smoking_status_formerly',
                            'smoking_status_never smoked' : 'smoking_status_never'}, inplace = True)

stroke_df.head()

Unnamed: 0,age,avg_glucose_level,bmi,heart_disease,hypertension,stroke,gender_Female,gender_Male,gender_Other,smoking_status_Unknown,smoking_status_formerly,smoking_status_never,smoking_status_smokes
0,67,228.69,36.6,1,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,80,105.92,32.5,1,0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,49,171.23,34.4,0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,79,174.12,24.0,0,1,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,81,186.21,29.0,0,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [10]:
# Split our preprocessed data into our features and target arrays
y = stroke_df["stroke"]
X = stroke_df.drop(["stroke"],1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [11]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Train a Logistic Regression model and print the model score

# Create the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

# Fit the model to the training data. 
model.fit(X_train, y_train)

print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")


print(f'Actual:\t\t{list(y_test[:10])}')
print(f'Predicted:\t{list(model.predict(X_test[:10]))}')

Training Data Score: 0.9603368649823417
Testing Data Score: 0.9486970684039088
Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [13]:
X_train.shape

(3681, 12)

In [14]:
X_test.shape

(1228, 12)

In [15]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = model.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[1165,    0],
       [  63,    0]], dtype=int64)

In [16]:
#The accuracy of the Logistic Regression model on the test data is TP + TN / (TP + FP + TN + FN)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn)
print(f"Logistic Regression Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.9486970684039088


In [17]:
# Train a Random Forest Classifier model and print the model score
# Import a Random Forests classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.9462540716612378


In [18]:
#The accuracy of the Random Forest Classifier on the test data is TP + TN / (TP + FP + TN + FN)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn)
print(f"Random Forest Classifier Accuracy: {accuracy}")

Random Forest Classifier Accuracy: 0.9462540716612378


## Logistic Regression is more accurate

In [19]:
# Save the model via pickle

import pickle
pickle.dump(model, open('model.pkl', 'wb'))

In [20]:
from plotly import tools
import plotly as py
import plotly.graph_objs as go


In [21]:
p1 = go.Scatter(x=X_test['bmi'],
                y=X_test['age'],
                mode='markers',
                #color=y
                marker=dict(color=y),
                )


layout = go.Layout(xaxis=dict(ticks='', showticklabels=True,
                              zeroline=False, title = 'BMI'),
                   yaxis=dict(ticks='', showticklabels=True,
                              zeroline=False, title = 'AGE'),
                   showlegend=True, hovermode='closest')

fig = go.Figure(data=[p1], layout=layout)

py.offline.iplot(fig)

In [22]:
p2 = go.Scatter(x=X_test['avg_glucose_level'],
                y=X_test['bmi'],
                mode='markers',
                #color=y
                marker=dict(color=y)
               )


layout = go.Layout(xaxis=dict(ticks='', showticklabels=True,
                              zeroline=False, title = 'Ave Glucose Level'),
                   yaxis=dict(ticks='', showticklabels=True,
                              zeroline=False, title = 'BMI'),
                   showlegend=True, hovermode='closest')

fig2 = go.Figure(data=[p2], layout=layout)

py.offline.iplot(fig2)

In [23]:
# import plotly.express as px
# stroke_df_order = stroke_df.sort_values(by=["stroke","ever_married_Yes"])
# fig = px.bar(stroke_df_order, x="ever_married_Yes", y="stroke", color="ever_married_Yes",
#              labels={"ever_married_Yes":"Marital Status", "stroke":"Stroke"},
#              title="Stroke Relation to Marital Status", barmode = 'stack').update_layout(title_font_size=16, title_x=0.5)
# fig.update_layout(xaxis=dict(tickvals=[0,1], ticktext=['Not Married', 'Married']))
# #fig.update_traces(showlegend=False)
# fig.show()