Dự đoán tỷ lệ đột quỵ sử dụng Random Forest với pipelines

In [38]:
# import thư viện
import pandas as pd # load data
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score, classification_report, precision_score

## The Data

In [3]:
# loading the data
stroke = pd.read_csv("healthcare-dataset-stroke-data.csv")
stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [17]:
# kiểm tra missing data và dropping item
stroke.isna().sum()
# dropna
stroke = stroke.dropna()
# bỏ cột id
stroke = stroke.drop('id', axis=1)
stroke.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [18]:
stroke.dtypes
# theo nguồn dữ liệu, hầu hết các biến đều mang tính phân loại

gender                 int32
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int32
work_type              int32
Residence_type         int32
avg_glucose_level    float64
bmi                  float64
smoking_status         int32
stroke                 int64
dtype: object

In [19]:
### Mã hóa nhãn dữ liệu:
le = LabelEncoder()
# các cột cần mã hóa
cols = ['gender','hypertension','heart_disease','ever_married','work_type','Residence_type','smoking_status','stroke']
# quá trình mã hóa nhãn
stroke[cols] = stroke[cols].apply(lambda col: le.fit_transform(col))
stroke.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [20]:
# Để chia tỷ lệ dữ liệu trước khi đưa vào mô hình RF, chúng ta sử dụng 1 pipeline
# Defining pipeline
# scaler và rf classifier
# pipeline sẽ chia tỷ lệ dữ liệu trước khi đưa vào classifier
pipe = Pipeline([('scaler', StandardScaler()),('rf', RandomForestClassifier())])
pipe

In [22]:
# Phân chia dữ liệu:

# Xác định X và y
X = stroke.loc[:, 'gender':'smoking_status']
y = stroke.stroke
# chia dữ liệu
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [23]:
# Mô hình Random forest sử dụng pipeline
pipe.fit(X_train, y_train)

In [33]:
acc = pipe.score(X_test, y_test)
print('Accuracy:', acc)

Accuracy: 0.9517990495587237


In [40]:
precision = precision_score(y_test, pipe.predict(X_test))
print('Precision:', precision)

Precision: 1.0


In [36]:
cm = confusion_matrix(y_test, pipe.predict(X_test))
print('Confusion matrix:\n', cm)

Confusion matrix:
 [[1401    0]
 [  71    1]]


In [30]:
cr= classification_report(y_test, pipe.predict(X_test))
print('Classification report:\n', cr)

Classification report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98      1401
           1       1.00      0.01      0.03        72

    accuracy                           0.95      1473
   macro avg       0.98      0.51      0.50      1473
weighted avg       0.95      0.95      0.93      1473

