In [89]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

import sklearn.metrics as skmetrics

import category_encoders as ce

In [90]:
stroke_df = pd.read_csv('healthcare-dataset-stroke-data.xls')
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [91]:
stroke_df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [92]:
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [93]:
X = stroke_df.drop('stroke', axis=1) 
y = stroke_df.stroke

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [94]:
cat_bin_cols = [col for col in X_train.columns if X_train[col].dtype == 'object' and X_train[col].nunique() == 2]
cat_cols = [col for col in X_train.columns if X_train[col].dtype == 'object' and X_train[col].nunique() > 2]
num_cols = [col for col in X_train.columns if X_train[col].dtype in ['float64', 'int64']]

In [95]:
cat_bin_pipelines = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('binary_encoder', ce.BinaryEncoder()),
])

cat_pipelines = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('oh', ce.OneHotEncoder()),
])

num_pipelines = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
])

In [96]:
column_transform = ColumnTransformer([
    ('cat_bin_transform', cat_bin_pipelines, cat_bin_cols),
    ('cat_transform', cat_pipelines, cat_cols),
    ('num_transform', num_pipelines, num_cols),
])


In [97]:
# logo após fazer os pipelines / preprocessamento, transformamos os nossos dataframe

X_train_transformed = column_transform.fit_transform(X_train)
X_test_transformed = column_transform.transform(X_test)

In [98]:
linear_model = LinearRegression()
linear_model.fit(X_train_transformed, y_train) 
linear_pred = linear_model.predict(X_test_transformed)

In [99]:
forest_model = RandomForestClassifier()
forest_model.fit(X_train_transformed, y_train)
forest_pred = forest_model.predict(X_test_transformed)

In [104]:
forest_accuracy = skmetrics.accuracy_score(y_test, forest_pred)

In [102]:
linear_pred = skmetrics.mean_absolute_error(y_test, linear_pred)

In [105]:
print(f'Forest Accuracy: {forest_accuracy}')

Forest Accuracy: 0.9585289514866979


In [106]:
print(f'Linear MAE: {linear_pred}')

Linear MAE: 0.08807388611767233
