In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Đọc dữ liệu từ file CSV hoặc nguồn dữ liệu khác
# Giả sử df là DataFrame chứa dữ liệu của bạn
df = pd.read_csv('patient_survival.csv')

# Lựa chọn các đặc trưng quan trọng
selected_features = ['age', 'bmi', 'elective_surgery', 'icu_admit_source', 'apache_2_diagnosis', 'apache_3j_diagnosis',
                     'apache_post_operative', 'heart_rate_apache', 'resprate_apache', 'map_apache', 'temp_apache',
                     'd1_glucose_max', 'd1_potassium_max', 'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure',
                     'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']

# Lựa chọn cột dự đoán
target_column = 'hospital_death'

# Xóa cột không mong muốn và xử lý giá trị thiếu
df.drop(columns=["Unnamed: 83"], axis=1, inplace=True)
df.dropna(inplace=True)

# Đổi các giá trị trong dataset thành số
le = LabelEncoder()
df['icu_admit_source'] = le.fit_transform(df['icu_admit_source'])
# Thêm các bước xử lý cho các cột dữ liệu kiểu categorical khác nếu cần

# Tách dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(df[selected_features], df[target_column], test_size=0.2, random_state=42)

# Tiêu chuẩn hóa dữ liệu
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Xây dựng mô hình Logistic Regression
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Dự đoán trên tập kiểm tra
y_pred = model.predict(X_test_scaled)

# Đánh giá mô hình trên tập kiểm tra
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# In kết quả
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')
df.head()


Accuracy: 0.9142882234126636
Confusion Matrix:
[[10358    25]
 [  951    53]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     10383
           1       0.68      0.05      0.10      1004

    accuracy                           0.91     11387
   macro avg       0.80      0.53      0.53     11387
weighted avg       0.90      0.91      0.88     11387



Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,66154,25312,118,68.0,22.73,0,Caucasian,M,180.3,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular,0
1,114252,59342,81,77.0,27.42,0,Caucasian,F,160.0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
5,33181,74489,83,67.0,27.56,0,Caucasian,M,190.5,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic,0
10,105427,125898,77,72.0,28.257052,1,Hispanic,F,154.9,2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Respiratory,Respiratory,0
17,22471,112115,118,46.0,25.845717,0,Hispanic,M,167.6,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory,0
