In [29]:
import numpy as np # linear algebra
import pandas as pd # data processing
import plotly.express as px #visualization
from plotly.subplots import make_subplots #visualization
from matplotlib import pyplot as plt #visualization
import seaborn as sns #visualization
from sklearn.model_selection import train_test_split #split data
from sklearn.metrics import confusion_matrix #confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve #metrics
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score, classification_report #metrics
from sklearn.model_selection import cross_val_score #cross validation
from sklearn.model_selection import KFold #cross validation by kfold
from sklearn.model_selection import GridSearchCV #find best parameter
from sklearn.ensemble import RandomForestClassifier #ML model
from sklearn.linear_model import LogisticRegression #ML model
from xgboost import XGBClassifier #ML model
from sklearn.tree import DecisionTreeClassifier #ML model
from imblearn.over_sampling import SMOTE #oversample data
from sklearn import preprocessing #label encoding
import warnings
warnings.filterwarnings('ignore')

In [30]:
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [31]:
#Borramos las columnas que no son necesarias
data.drop(data[data.gender == 'Other'].index, inplace=True)
data.drop(data[data.work_type == 'Never_worked'].index, inplace=True)
data.drop('id', axis=1, inplace=True)

#Columnas que convertiremos a categoria
cols = ['gender', 'ever_married', 'work_type','Residence_type','smoking_status']
#Convertimos las columnas a tipo category
data[cols] = data[cols].astype('category')
#Convertimos la columna 'edad' a int
data['age'] = data['age'].astype('int')

In [59]:
#chequeamos si hay valores nulos
data.info()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,67,0,1,1,1,228.69,36.6,1,0,1,0,0,0,1,0,0
1,0,61,0,0,1,0,202.21,28.90964,1,0,0,1,0,0,0,1,0
2,1,80,0,1,1,0,105.92,32.5,1,0,1,0,0,0,0,1,0
3,0,49,0,0,1,1,171.23,34.4,1,0,1,0,0,0,0,0,1
4,0,79,1,0,1,0,174.12,24.0,1,0,0,1,0,0,0,1,0


In [None]:
#rellenamos la columna 'bmi'
data['bmi'].fillna((data['bmi'].mean()), inplace=True)
data.isnull().mean()

In [34]:
#convertimos las variables categoricas a numericas
columns_obj = ["gender", "ever_married" ,"Residence_type"]
encoding = preprocessing.LabelEncoder()
for col in columns_obj:
    data[col]=  encoding.fit_transform(data[col])
#convert in 0 and 1 the rest of columns
data = pd.get_dummies(data)
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,67,0,1,1,1,228.69,36.6,1,0,1,0,0,0,1,0,0
1,0,61,0,0,1,0,202.21,28.90964,1,0,0,1,0,0,0,1,0
2,1,80,0,1,1,0,105.92,32.5,1,0,1,0,0,0,0,1,0
3,0,49,0,0,1,1,171.23,34.4,1,0,1,0,0,0,0,0,1
4,0,79,1,0,1,0,174.12,24.0,1,0,0,1,0,0,0,1,0


In [35]:
#Definimos variable dependiente e independientes
X = data.drop(columns = ['stroke'])
#target
y = data['stroke']

In [36]:
#balanceamos el dataset
smote = SMOTE(random_state=42)
X , y = smote.fit_resample(X,y)

data.stroke.value_counts(normalize=True)
y.value_counts(normalize=True)

1    0.5
0    0.5
Name: stroke, dtype: float64

In [37]:
#Separamos los datos de entrenamiento y prueba.
#Utilizaremos el 80% para entrenamiento y el 20% restante para test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(7740, 16) (1936, 16) (7740,) (1936,)


In [38]:
X_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
9384,0,65,0,0,1,1,90.16502,31.106426,0,0,0,0,0,1,0,0
9006,1,78,0,0,1,1,93.558836,29.240975,0,0,1,0,0,0,0,0
8296,0,58,0,0,1,0,198.911971,36.692833,0,0,0,0,0,0,0,0
5894,0,63,0,0,1,0,61.156513,26.317026,0,0,0,0,0,0,0,0
9252,1,62,0,0,1,0,196.838248,36.47704,0,1,0,0,0,1,0,0


In [39]:
#redondeamos decimales
X_train = round(X_train, 2)
X_test = round(X_test, 2)


In [40]:
X_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
9384,0,65,0,0,1,1,90.17,31.11,0,0,0,0,0,1,0,0
9006,1,78,0,0,1,1,93.56,29.24,0,0,1,0,0,0,0,0
8296,0,58,0,0,1,0,198.91,36.69,0,0,0,0,0,0,0,0
5894,0,63,0,0,1,0,61.16,26.32,0,0,0,0,0,0,0,0
9252,1,62,0,0,1,0,196.84,36.48,0,1,0,0,0,1,0,0


In [41]:
#function to eliminate outliers base in Zscore
def Zscore_outlier(df):
    out=[]
    m = np.mean(df)
    sd = np.std(df)
    for i in df:
        z = (i-m)/sd
        if np.abs(z) > 3:
            out.append(i)
    out.sort()
    print(out)
Zscore_outlier(X_train['bmi'])
Zscore_outlier(X_train['avg_glucose_level'])

[49.3, 49.3, 49.3, 49.4, 49.7, 49.8, 49.8, 49.8, 50.1, 50.1, 50.2, 50.2, 50.2, 50.3, 50.5, 50.6, 50.6, 50.8, 50.9, 51.0, 51.25, 51.5, 51.61, 51.7, 51.76, 51.8, 51.89, 51.9, 52.3, 52.47, 52.5, 52.8, 52.8, 52.8, 52.9, 53.4, 53.4, 53.5, 53.8, 53.8, 54.0, 54.1, 54.2, 54.28, 54.3, 54.52, 54.6, 54.6, 54.7, 54.7, 54.7, 55.0, 55.1, 55.18, 55.2, 55.38, 55.7, 55.7, 55.7, 55.7, 55.9, 55.9, 56.0, 56.1, 56.6, 56.6, 57.2, 57.5, 57.7, 57.9, 58.1, 59.7, 60.2, 60.9, 60.9, 61.2, 63.3, 64.4, 64.8, 66.8, 71.9, 78.0, 92.0, 97.6]
[]


In [42]:
#change outliers value to approximate maximum
X_train.loc[X_train.bmi >= 53.4, 'bmi'] = 49

In [56]:
X_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
9384,0,65,0,0,1,1,90.17,31.11,0,0,0,0,0,1,0,0
9006,1,78,0,0,1,1,93.56,29.24,0,0,1,0,0,0,0,0
8296,0,58,0,0,1,0,198.91,36.69,0,0,0,0,0,0,0,0
5894,0,63,0,0,1,0,61.16,26.32,0,0,0,0,0,0,0,0
9252,1,62,0,0,1,0,196.84,36.48,0,1,0,0,0,1,0,0


In [49]:
xgb = XGBClassifier(objective='binary:logistic')

#PARAMETROS XGBOOST
params = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05],
    'min_child_weight':[4,5,6],
     'gamma': [0]
}

#Inicializamos en grid search
grid= GridSearchCV(xgb, params, scoring = 'roc_auc' ,cv=5)
#entrenamos el gridsearch
grid.fit(X_train, y_train)
#Imprimimos el modelo
grid.best_estimator_

In [50]:
print(grid.best_params_)

{'gamma': 0, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 4, 'n_estimators': 140}


In [51]:
print(grid.best_score_)

0.9921070282517872


In [44]:
best_xgb = XGBClassifier(objective='binary:logistic',learning_rate = 0.1, max_depth = 9, n_estimators =140, gamma=0, min_child_weight=4)
best_xgb.fit(X_train,y_train)

In [46]:
p = best_xgb.score(X_train,y_train)
print('train acc: %5f' % p)

y_pred = best_xgb.predict(X_test)

print('test acc: %5f' % best_xgb.score(X_test,y_test))

train acc: 0.991731
test acc: 0.957645


In [47]:
scores = cross_val_score(best_xgb, X_train, y_train, cv=5, scoring="neg_mean_squared_error")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(),round(scores.std()*100, 2)))

-0.04 accuracy with a standard deviation of 0.27


In [52]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       985
           1       0.96      0.95      0.96       951

    accuracy                           0.96      1936
   macro avg       0.96      0.96      0.96      1936
weighted avg       0.96      0.96      0.96      1936

[[950  35]
 [ 47 904]]


In [55]:
import pickle

In [57]:
filename = 'model.pkl'
pickle.dump(best_xgb, open(filename, "wb"))

In [60]:
model_loaded = pickle.load(open(filename, 'rb'))

In [62]:
with open('categories_ohe.pickle', 'wb') as handle:
	pickle.dump(X.columns, handle, protocol=pickle.HIGHEST_PROTOCOL)