# Xây dựng mô hình

### 1. Import các thư viện cần thiết

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import pickle


### 2. Tải dữ liệu đã chia

In [None]:
# Load the pre-processed training splits (original, SMOTE and ADASYN variants).
#
# pd.read_csv always returns a DataFrame, so a label file would arrive as a
# 2-D column vector; scikit-learn estimators expect a 1-D target and emit a
# DataConversionWarning otherwise. squeeze("columns") turns a single-column
# frame into a Series and leaves anything wider untouched.
# NOTE(review): assumes each y_train_*.csv holds exactly one label column —
# confirm against the notebook that wrote the splits.
print("Loading splits data...")
X_train_imbalance = pd.read_csv('../data/splits/X_train_imbalance.csv')
y_train_imbalance = pd.read_csv('../data/splits/y_train_imbalance.csv').squeeze("columns")
X_train_smote = pd.read_csv('../data/splits/X_train_smote.csv')
y_train_smote = pd.read_csv('../data/splits/y_train_smote.csv').squeeze("columns")
X_train_adasyn = pd.read_csv('../data/splits/X_train_adasyn.csv')
y_train_adasyn = pd.read_csv('../data/splits/y_train_adasyn.csv').squeeze("columns")


### 3. Định nghĩa hàm huấn luyện và lưu mô hình

In [None]:
def train_and_save_model(X_train, y_train, model, model_name, output_dir="../models/"):
    """Fit ``model``, print training-set metrics, and pickle the fitted model.

    Args:
        X_train: Training features, passed to ``model.fit`` and
            ``model.predict``.
        y_train: Training labels, passed to ``model.fit`` and to the metric
            functions.
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place, so the caller's object is modified.
        model_name: Label printed in the report; also the file stem of the
            pickle (``<model_name>.pkl``).
        output_dir: Directory that receives the pickle. It is created when
            missing, and a trailing path separator is optional.

    Returns:
        None. Results are printed and the model is written to disk.

    Raises:
        OSError: If the output directory cannot be created or the pickle
            file cannot be written.
    """
    # Local import so this cell does not depend on the import cell being edited.
    import os

    # Resolve the destination before training: a bad output location should
    # fail immediately rather than after the model has been fitted.
    # An empty output_dir means "current directory"; os.makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # os.path.join inserts a separator only when output_dir lacks one, so
    # "../models" and "../models/" both resolve to the same file.
    model_path = os.path.join(output_dir, model_name + ".pkl")

    # Train the model and predict on the same data it was fitted on.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)

    # Preliminary check only: these metrics are computed on the training set,
    # not on held-out data.
    print(f"Model: {model_name}")
    print(f"Accuracy on Training Set: {accuracy_score(y_train, y_pred):.4f}")
    print(classification_report(y_train, y_pred))

    # Persist the fitted model.
    with open(model_path, "wb") as file:
        pickle.dump(model, file)
    print(f"Model saved at {model_path}\n")


### 4. Huấn luyện trên dữ liệu Imbalance

In [None]:
# Build the four classifiers once; later cells reuse these same objects.
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

# Fit each classifier on the original (imbalanced) split and save it,
# in the order: Logistic Regression, Decision Tree, Random Forest, XGBoost.
for estimator, saved_name in (
    (logistic_model, "logistic_imbalance"),
    (dt_model, "decision_tree_imbalance"),
    (rf_model, "random_forest_imbalance"),
    (xgb_model, "xgboost_imbalance"),
):
    train_and_save_model(X_train_imbalance, y_train_imbalance, estimator, saved_name)


### 5. Huấn luyện trên dữ liệu SMOTE

In [None]:
# Fit each classifier on the SMOTE-resampled split and save it, in the order:
# Logistic Regression, Decision Tree, Random Forest, XGBoost.
# NOTE: the estimator objects from the previous cell are passed again and
# fitted in place, so after this cell the in-memory models hold the SMOTE fit.
# Each run is pickled under its own file name.
for estimator, saved_name in (
    (logistic_model, "logistic_smote"),
    (dt_model, "decision_tree_smote"),
    (rf_model, "random_forest_smote"),
    (xgb_model, "xgboost_smote"),
):
    train_and_save_model(X_train_smote, y_train_smote, estimator, saved_name)


### 6. Huấn luyện trên dữ liệu ADASYN

In [None]:
# Fit each classifier on the ADASYN-resampled split and save it, in the order:
# Logistic Regression, Decision Tree, Random Forest, XGBoost.
# NOTE: the same estimator objects are passed again and fitted in place, so
# after this cell the in-memory models hold the ADASYN fit. Each run is
# pickled under its own file name.
for estimator, saved_name in (
    (logistic_model, "logistic_adasyn"),
    (dt_model, "decision_tree_adasyn"),
    (rf_model, "random_forest_adasyn"),
    (xgb_model, "xgboost_adasyn"),
):
    train_and_save_model(X_train_adasyn, y_train_adasyn, estimator, saved_name)
