In [69]:
import pandas as pd
import matplotlib as plt
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [70]:
# for displaying all feature from dataset:
pd.pandas.set_option('display.max_columns', None)

In [71]:
df = pd.read_csv("Resources/Prepped_Stroke_Data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,age,gender,nhiss,severity_level,mrs,systolic,distolic,glucose,paralysis,smoking,bmi,cholestrol,tos,risk
0,75,1,4,1,1,140,89,190,1,1,25,205,1,1
1,68,1,1,1,1,133,116,174,1,0,35,206,1,1
2,88,0,1,1,1,124,118,79,2,0,35,239,1,1
3,58,0,4,1,0,126,105,198,0,0,25,222,1,1
4,50,1,3,1,2,140,106,87,0,0,39,222,1,2


In [72]:
df.columns

Index(['age', 'gender', 'nhiss', 'severity_level', 'mrs', 'systolic',
       'distolic', 'glucose', 'paralysis', 'smoking', 'bmi', 'cholestrol',
       'tos', 'risk'],
      dtype='object')

In [73]:
# Set features. This will also be used as your x values.
# selected_features = df[['names', 'of', 'selected', 'features', 'here']]

selected_features = df[ ['age', 'gender', 'nhiss', 'severity_level', 'mrs', 'systolic',
       'distolic', 'glucose', 'smoking', 'bmi', 'cholestrol',
       ]]


In [74]:
selected_features

Unnamed: 0,age,gender,nhiss,severity_level,mrs,systolic,distolic,glucose,smoking,bmi,cholestrol
0,75,1,4,1,1,140,89,190,1,25,205
1,68,1,1,1,1,133,116,174,0,35,206
2,88,0,1,1,1,124,118,79,0,35,239
3,58,0,4,1,0,126,105,198,0,25,222
4,50,1,3,1,2,140,106,87,0,39,222
...,...,...,...,...,...,...,...,...,...,...,...
4545,67,0,13,2,6,180,83,198,2,39,219
4546,67,1,20,3,3,180,88,188,2,22,187
4547,65,0,6,2,4,126,111,227,3,23,233
4548,64,0,19,3,5,126,82,262,2,28,244


In [75]:
df.corr()

Unnamed: 0,age,gender,nhiss,severity_level,mrs,systolic,distolic,glucose,paralysis,smoking,bmi,cholestrol,tos,risk
age,1.0,-0.033676,0.058495,0.033389,0.041504,0.439812,0.476497,-0.050585,0.089203,0.320757,0.015409,0.47112,-0.013581,0.122307
gender,-0.033676,1.0,-0.171225,-0.147354,-0.098114,-0.078162,-0.073362,-0.044245,-0.041195,-0.056915,-0.022629,-0.086824,-0.095736,-0.092974
nhiss,0.058495,-0.171225,1.0,0.930975,0.695623,0.369223,0.254689,0.437065,0.195058,0.101446,0.145126,0.223306,0.534138,0.55764
severity_level,0.033389,-0.147354,0.930975,1.0,0.767575,0.38864,0.239997,0.518909,0.201266,0.086293,0.140277,0.187086,0.581724,0.607909
mrs,0.041504,-0.098114,0.695623,0.767575,1.0,0.377404,0.218693,0.544069,0.19038,0.044584,0.167718,0.164646,0.521503,0.556399
systolic,0.439812,-0.078162,0.369223,0.38864,0.377404,1.0,0.590822,0.299251,0.190716,0.226261,0.118996,0.531017,0.284736,0.412328
distolic,0.476497,-0.073362,0.254689,0.239997,0.218693,0.590822,1.0,0.1516,0.179014,0.279731,0.111874,0.541236,0.184149,0.288969
glucose,-0.050585,-0.044245,0.437065,0.518909,0.544069,0.299251,0.1516,1.0,0.1646,-0.047011,0.120565,0.10489,0.368605,0.502262
paralysis,0.089203,-0.041195,0.195058,0.201266,0.19038,0.190716,0.179014,0.1646,1.0,0.059543,0.107851,0.157792,0.116854,0.258843
smoking,0.320757,-0.056915,0.101446,0.086293,0.044584,0.226261,0.279731,-0.047011,0.059543,1.0,0.006805,0.280354,0.063322,0.177899


In [76]:
# y = df["severity_level"].map({'CANDIDATE':0.0, 'CONFIRMED':1.0, 'FALSE POSITIVE':0.0}).values.reshape(-1, 1)

X = df.drop(["nhiss","severity_level","mrs"],axis=1)
y = df["severity_level"]
X


Unnamed: 0,age,gender,systolic,distolic,glucose,paralysis,smoking,bmi,cholestrol,tos,risk
0,75,1,140,89,190,1,1,25,205,1,1
1,68,1,133,116,174,1,0,35,206,1,1
2,88,0,124,118,79,2,0,35,239,1,1
3,58,0,126,105,198,0,0,25,222,1,1
4,50,1,140,106,87,0,0,39,222,1,2
...,...,...,...,...,...,...,...,...,...,...,...
4545,67,0,180,83,198,1,2,39,219,3,2
4546,67,1,180,88,188,0,2,22,187,1,2
4547,65,0,126,111,227,0,3,23,233,3,2
4548,64,0,126,82,262,2,2,28,244,1,2


In [77]:

y_adjusted = pd.get_dummies(y.replace(1,"mild").replace(2,"mild_to_moderate").replace(3,"severe").replace(4,"very_severe"),drop_first=True)
y_adjusted


Unnamed: 0,mild_to_moderate,severe,very_severe
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
4545,1,0,0
4546,0,1,0
4547,1,0,0
4548,0,1,0


In [78]:
print("X_train length: ", len(X_train))
print("X_test length: ", len(X_test))
print("y_train length: ", len(y_train))
print("y_test length: ", len(y_test))
X_train.head()
X_train.shape

X_train length:  3412
X_test length:  1138
y_train length:  3412
y_test length:  1138


(3412, 11)

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y_adjusted, random_state=78)

# Create scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [93]:
# Note SVM needs sclaed data, but random forest doesn't, but scaling also doesn't hurt.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=78)
classifier


RandomForestClassifier(random_state=78)

In [95]:
model2 = classifier.fit(X_train_scaled, y_train)

In [96]:

print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.7926186291739895


In [97]:

# Create the GridSearchCV model for Logistic Regression
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [50, 75, 100, 125, 150],
              'max_features':[None, 'auto', 'sqrt']
              }
grid = GridSearchCV(model2, param_grid, verbose=3)


In [98]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END max_features=None, n_estimators=50;, score=0.758 total time=   0.3s
[CV 2/5] END max_features=None, n_estimators=50;, score=0.757 total time=   0.3s
[CV 3/5] END max_features=None, n_estimators=50;, score=0.730 total time=   0.3s
[CV 4/5] END max_features=None, n_estimators=50;, score=0.729 total time=   0.3s
[CV 5/5] END max_features=None, n_estimators=50;, score=0.758 total time=   0.3s
[CV 1/5] END max_features=None, n_estimators=75;, score=0.758 total time=   0.5s
[CV 2/5] END max_features=None, n_estimators=75;, score=0.769 total time=   0.5s
[CV 3/5] END max_features=None, n_estimators=75;, score=0.742 total time=   0.5s
[CV 4/5] END max_features=None, n_estimators=75;, score=0.732 total time=   0.5s
[CV 5/5] END max_features=None, n_estimators=75;, score=0.768 total time=   0.5s
[CV 1/5] END max_features=None, n_estimators=100;, score=0.754 total time=   0.8s
[CV 2/5] END max_features=None, n_estimators=10

GridSearchCV(estimator=RandomForestClassifier(random_state=78),
             param_grid={'max_features': [None, 'auto', 'sqrt'],
                         'n_estimators': [50, 75, 100, 125, 150]},
             verbose=3)

In [99]:
print(grid.best_params_)
print(grid.best_score_)

{'max_features': 'auto', 'n_estimators': 125}
0.7623113485013076


In [100]:

# Train the model with GridSearch
grid.score(X_test_scaled, y_test)


0.7943760984182777

In [101]:
grid


GridSearchCV(estimator=RandomForestClassifier(random_state=78),
             param_grid={'max_features': [None, 'auto', 'sqrt'],
                         'n_estimators': [50, 75, 100, 125, 150]},
             verbose=3)

In [108]:

len(X_test_scaled[0])
X_test_scaled[0].shape
X_test_scaled_point = X_test_scaled[0].reshape(1,11)
X_test_scaled_point


array([[-0.08313482, -0.73796363, -0.32468587,  0.09033823,  0.71268165,
        -0.38116224,  1.20970099,  1.3327225 , -0.62277886,  1.14956421,
         0.16212225]])

In [109]:
# Make a prediction!
X_test_scaled_point = X_test_scaled[0].reshape(1,11)
grid.predict(X_test_scaled_point)


array([[0, 0, 1]], dtype=uint8)

In [110]:
feature_names = X_train.columns.tolist()
selected_features = sorted(zip(grid.estimator.feature_importances_, feature_names), reverse=True)
# grid.estimator.feature_importances_
ranked_features = pd.DataFrame(selected_features, columns=['Score', 'Feature'])
ranked_features = ranked_features.set_index('Feature')
ranked_features

Unnamed: 0_level_0,Score
Feature,Unnamed: 1_level_1
glucose,0.202625
tos,0.132774
cholestrol,0.119709
systolic,0.110111
age,0.106937
distolic,0.100576
bmi,0.084091
risk,0.044345
paralysis,0.042959
smoking,0.035855


In [111]:
classifier_tuned = RandomForestClassifier(random_state=99, max_features='auto', n_estimators=125)
model3 = classifier_tuned.fit(X_train_scaled, y_train)
print(f"Training Data Score: {model3.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model3.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.789103690685413
