In [1]:
import pandas as pd
import numpy as np
import pyodbc
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from tqdm import tqdm



In [2]:
server = '192.168.1.212'
database = 'master'
username = 'test'
password = 'tester2024'

mssql_conn_str = f'DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password};'

In [3]:
msql_query = f"""
        select *
        from ProjectNew..FullCompanyInfo
        """
print(msql_query)


        select *
        from ProjectNew..FullCompanyInfo
        


In [4]:
try:
    mssql_conn = pyodbc.connect(mssql_conn_str)
    print("Kết nối cơ sở dữ liệu thành công")
except pyodbc.Error as e:
    print(f"Lỗi khi kết nối cơ sở dữ liệu: {e}")

data = pd.read_sql_query(msql_query, mssql_conn)

mssql_conn.close()

Kết nối cơ sở dữ liệu thành công


  data = pd.read_sql_query(msql_query, mssql_conn)


In [5]:
def processing_data(df1, n):
    df = df1.copy()

    # One-hot encoding cho cột CompanyType
    df = pd.get_dummies(df, columns=['CompanyType'], prefix='Type')
        
    if 'CompanyId' in df.columns:
        df.drop(columns=['CompanyId'], inplace=True)

    # Duplicate các dòng dữ liệu có status = 1 n lần
    if n > 1:
        df_status_1 = df[df['status'] == 1]
        df = pd.concat([df] + [df_status_1] * (n - 1), ignore_index=True)

    # Normalize các cột còn lại với giá trị từ 0 đến 1
    scaler = MinMaxScaler()
    df[df.columns] = scaler.fit_transform(df[df.columns])
    
    # Thay thế tất cả các giá trị NaN trong df thành 0
    df.fillna(0, inplace=True)
    
    return df

In [6]:
processed_data = processing_data(data, 1)

In [7]:
processed_data

Unnamed: 0,CompanyAge,FDI,CapitalAmount,NumberOfLabors,Region,FS11,FS12,FS13,FS14,FS15,...,FS20,FS21,FS22,FS23,FS24,Status,Type_LLC1,Type_LLC2,Type_PE,Type_SC
0,0.176471,0.0,0.000292,0.000061,1.0,0.066675,0.000000,0.135581,0.023858,0.061939,...,0.071832,0.077137,0.033333,0.182198,0.217620,1.0,0.0,0.0,0.0,1.0
1,0.382353,0.0,0.000097,0.000076,0.0,0.066634,0.000000,0.135614,0.023858,0.061955,...,0.071798,0.077136,0.033332,0.182198,0.217620,0.0,0.0,0.0,0.0,1.0
2,0.147059,0.0,0.000015,0.000046,1.0,0.066630,0.000000,0.135581,0.023858,0.061939,...,0.071794,0.077136,0.033332,0.182198,0.217620,0.0,1.0,0.0,0.0,0.0
3,0.147059,0.0,0.000048,0.000015,1.0,0.066635,0.000000,0.135629,0.023858,0.061945,...,0.071799,0.077144,0.033334,0.182198,0.217620,0.0,1.0,0.0,0.0,0.0
4,0.088235,0.0,0.000002,0.000153,1.0,0.066627,0.000000,0.135581,0.023858,0.061939,...,0.071792,0.077136,0.033332,0.182198,0.217620,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235649,0.235294,0.0,0.000049,0.000153,1.0,0.066638,0.000000,0.000000,0.000000,0.000000,...,0.071801,0.000000,0.000000,0.182198,0.217620,0.0,1.0,0.0,0.0,0.0
235650,0.117647,0.0,0.000005,0.000015,1.0,0.066627,0.000000,0.135581,0.023858,0.061939,...,0.071792,0.077136,0.033332,0.182198,0.217620,0.0,0.0,1.0,0.0,0.0
235651,0.205882,0.0,0.000243,0.000153,1.0,0.066642,0.096372,0.135581,0.023917,0.061944,...,0.071841,0.077144,0.033339,0.182202,0.217624,0.0,1.0,0.0,0.0,0.0
235652,0.088235,0.0,0.000024,0.000015,1.0,0.066632,0.096354,0.135581,0.023858,0.061939,...,0.071796,0.077136,0.033332,0.182198,0.217620,0.0,1.0,0.0,0.0,0.0


In [8]:
from sklearn.linear_model import LogisticRegression

def train_logistic_regression_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo mô hình Logistic Regression
    model = LogisticRegression()

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Huấn luyện mô hình trên tập train
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred))

In [12]:
train_logistic_regression_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7946786616027668
Training on fold 2...
Accuracy for fold 2: 0.7954424900808386
Training on fold 3...
Accuracy for fold 3: 0.7956334472003564
Training on fold 4...
Accuracy for fold 4: 0.7951242282149753
Training on fold 5...
Accuracy for fold 5: 0.795374496074687

Final Report:
Mean Accuracy: 0.7952506646347248
Standard Deviation of Accuracy: 0.00032918911180646125

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.80      1.00      0.89     37504
         1.0       0.44      0.01      0.01      9626

    accuracy                           0.80     47130
   macro avg       0.62      0.50      0.45     47130
weighted avg       0.72      0.80      0.71     47130


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.80      1.00      0.89     37504
         1.0       0.44      0.01      0.01      9626

    accuracy              

In [10]:
import xgboost as xgb

def train_xgboost_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình XGBoost trên tập train
        model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [13]:
train_xgboost_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7998981562029238
Training on fold 2...
Accuracy for fold 2: 0.8013197258704462
Training on fold 3...
Accuracy for fold 3: 0.802189641637139
Training on fold 4...
Accuracy for fold 4: 0.8019774670598969
Training on fold 5...
Accuracy for fold 5: 0.8018883938043709

Final Report:
Mean Accuracy: 0.8014546769149554
Standard Deviation of Accuracy: 0.0008297664165371048

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.81      0.98      0.89     37504
         1.0       0.60      0.09      0.15      9626

    accuracy                           0.80     47130
   macro avg       0.70      0.54      0.52     47130
weighted avg       0.77      0.80      0.74     47130


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.81      0.98      0.89     37504
         1.0       0.60      0.09      0.15      9626

    accuracy               

Kết quả chạy mô hình XGBoost cho thấy độ chính xác (accuracy) tổng thể là 0.80, nhưng có sự chênh lệch lớn về hiệu suất giữa hai lớp. 

Status 0.0 có recall rất thấp (0.15), cho thấy mô hình gặp khó khăn trong việc dự đoán đúng các mẫu thuộc lớp này.

In [14]:
import lightgbm as lgb

def train_lightgbm_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình LightGBM trên tập train
        model = lgb.LGBMClassifier()
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [15]:
train_lightgbm_kfold(processed_data)

Training on fold 1...
[LightGBM] [Info] Number of positive: 38498, number of negative: 150025
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4126
[LightGBM] [Info] Number of data points in the train set: 188523, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.204209 -> initscore=-1.360196
[LightGBM] [Info] Start training from score -1.360196
Accuracy for fold 1: 0.799919373660648
Training on fold 2...
[LightGBM] [Info] Number of positive: 38549, number of negative: 149974
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4126
[LightGBM] [Info] Number of data points in the train set: 188523, number of used features

Kết quả chạy mô hình LightGBM cho thấy độ chính xác (accuracy) tổng thể là 0.80, nhưng có sự chênh lệch lớn về hiệu suất giữa hai lớp. 

Status 0.0 có recall rất thấp (0.07), cho thấy mô hình gặp khó khăn trong việc dự đoán đúng các mẫu thuộc lớp này.

In [16]:
import catboost as cb

def train_catboost_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']
    
    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình CatBoost trên tập train
        model = cb.CatBoostClassifier(verbose=0)
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [17]:
train_catboost_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.8012136385818251
Training on fold 2...
Accuracy for fold 2: 0.8018289448558273
Training on fold 3...


In [None]:
# train_hist_gradient_boosting_kfold
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier


def train_hist_gradient_boosting_kfold(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình HistGradientBoostingClassifier trên tập train
        model = HistGradientBoostingClassifier()
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))




In [None]:
train_hist_gradient_boosting_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7995841466519562
Training on fold 2...
Accuracy for fold 2: 0.8006025630145124
Training on fold 3...
Accuracy for fold 3: 0.800899601120258
Training on fold 4...
Accuracy for fold 4: 0.8022150555885598
Training on fold 5...
Accuracy for fold 5: 0.8030129429238277
Training on fold 6...
Accuracy for fold 6: 0.8019944833439423
Training on fold 7...
Accuracy for fold 7: 0.8028007638446849
Training on fold 8...
Accuracy for fold 8: 0.801060895395714
Training on fold 9...
Accuracy for fold 9: 0.7998302567366857
Training on fold 10...
Accuracy for fold 10: 0.8017398684489709

Final Report:
Mean Accuracy: 0.8013740577069111
Standard Deviation of Accuracy: 0.0011159276334650715

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.62      0.07      0.12      4796
         1.0       0.81      0.99      0.89     18769

    accuracy                           0.80     23565
   macro avg       0.71 

In [None]:
# train_random_forest_kfold
from sklearn.ensemble import RandomForestClassifier

def train_random_forest_kfold(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình RandomForestClassifier trên tập train
        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_random_forest_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7945769328693881
Training on fold 2...
Accuracy for fold 2: 0.7942798947636426
Training on fold 3...
Accuracy for fold 3: 0.7941950267334296
Training on fold 4...
Accuracy for fold 4: 0.7950012730204532
Training on fold 5...
Accuracy for fold 5: 0.7957564184171441
Training on fold 6...
Accuracy for fold 6: 0.7966051347337153
Training on fold 7...
Accuracy for fold 7: 0.7957139826013155
Training on fold 8...
Accuracy for fold 8: 0.7949501379164015
Training on fold 9...
Accuracy for fold 9: 0.7963080840229153
Training on fold 10...
Accuracy for fold 10: 0.7952896244430299

Final Report:
Mean Accuracy: 0.7952676509521435
Standard Deviation of Accuracy: 0.0007800641290058936

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.49      0.15      0.23      4796
         1.0       0.82      0.96      0.88     18769

    accuracy                           0.80     23565
   macro avg       0.6

In [None]:
# train_smote_RandomForest_kfold
from imblearn.over_sampling import SMOTE

def train_smote_RandomForest_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        
        # Khởi tạo và huấn luyện mô hình RandomForestClassifier trên tập train đã được resample
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train_res, y_train_res)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))

In [None]:
train_smote_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7158618348468132
Training on fold 2...
Accuracy for fold 2: 0.7198506322668251
Training on fold 3...
Accuracy for fold 3: 0.7122549435627599
Training on fold 4...
Accuracy for fold 4: 0.7188746499193753
Training on fold 5...
Accuracy for fold 5: 0.720560152768937
Training on fold 6...
Accuracy for fold 6: 0.7200084871631657
Training on fold 7...
Accuracy for fold 7: 0.7197114364523658
Training on fold 8...
Accuracy for fold 8: 0.7151708041587099
Training on fold 9...
Accuracy for fold 9: 0.7169106726076809
Training on fold 10...
Accuracy for fold 10: 0.7196265648207086

Final Report:
Mean Accuracy: 0.7178830178567341
Standard Deviation of Accuracy: 0.002588137201672347

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.35      0.45      0.40      4796
         1.0       0.85      0.79      0.82     18769

    accuracy                           0.72     23565
   macro avg       0.60 

In [None]:
# train_random_forest_class_weight_kfold

# Imbalanced Learning Techniques - Class Weight Adjustment: Điều chỉnh trọng số lớp để mô hình tập trung hơn vào lớp thiểu số.
from sklearn.ensemble import RandomForestClassifier

def train_random_forest_class_weight_kfold(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình RandomForestClassifier với class_weight='balanced'
        model = RandomForestClassifier(class_weight='balanced')
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_random_forest_class_weight_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7674615972163286
Training on fold 2...
Accuracy for fold 2: 0.7678435033522872
Training on fold 3...
Accuracy for fold 3: 0.762496817448867
Training on fold 4...
Accuracy for fold 4: 0.7661037087329203
Training on fold 5...
Accuracy for fold 5: 0.7672395501803522
Training on fold 6...
Accuracy for fold 6: 0.7691915977084659
Training on fold 7...
Accuracy for fold 7: 0.7662635264162954
Training on fold 8...
Accuracy for fold 8: 0.7648631444939529
Training on fold 9...
Accuracy for fold 9: 0.766390833863781
Training on fold 10...
Accuracy for fold 10: 0.768512624655209

Final Report:
Mean Accuracy: 0.7666366904068459
Standard Deviation of Accuracy: 0.001825233375266761

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.41      0.29      0.34      4796
         1.0       0.83      0.89      0.86     18769

    accuracy                           0.77     23565
   macro avg       0.62   

In [None]:
# train_random_forest_grid_search

# Hyperparameter Tuning - Grid Search

from sklearn.model_selection import KFold, GridSearchCV

def train_random_forest_grid_search(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Thiết lập các tham số cho Grid Search
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Khởi tạo Grid Search với RandomForestClassifier
    grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=kf, scoring='accuracy', n_jobs=-1, verbose=2)

    # Huấn luyện Grid Search trên toàn bộ dữ liệu
    grid_search.fit(X, y)

    # Lấy mô hình tốt nhất từ Grid Search
    best_model = grid_search.best_estimator_

    # In ra các tham số tốt nhất
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation accuracy: ", grid_search.best_score_)

    # Dự đoán và đánh giá mô hình tốt nhất trên toàn bộ tập dữ liệu
    y_pred = best_model.predict(X)
    print("Classification Report for best model:")
    print(classification_report(y, y_pred, zero_division=0))

In [None]:
train_random_forest_grid_search(processed_data)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
Best parameters found:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation accuracy:  0.8023924971187096
Classification Report for best model:
              precision    recall  f1-score   support

         0.0       0.92      0.15      0.26     48177
         1.0       0.82      1.00      0.90    187477

    accuracy                           0.82    235654
   macro avg       0.87      0.58      0.58    235654
weighted avg       0.84      0.82      0.77    235654



In [None]:
# train_random_forest_random_search

# Hyperparameter Tuning - Random Search

from sklearn.model_selection import KFold, RandomizedSearchCV
from scipy.stats import randint

def train_random_forest_random_search(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Thiết lập các tham số cho Random Search
    param_dist = {
        'n_estimators': randint(50, 200),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': randint(2, 11),
        'min_samples_leaf': randint(1, 5)
    }

    # Khởi tạo Random Search với RandomForestClassifier
    random_search = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=param_dist, n_iter=100, cv=kf, scoring='accuracy', n_jobs=-1, verbose=2, random_state=42)

    # Huấn luyện Random Search trên toàn bộ dữ liệu
    random_search.fit(X, y)

    # Lấy mô hình tốt nhất từ Random Search
    best_model = random_search.best_estimator_

    # In ra các tham số tốt nhất
    print("Best parameters found: ", random_search.best_params_)
    print("Best cross-validation accuracy: ", random_search.best_score_)

    # Dự đoán và đánh giá mô hình tốt nhất trên toàn bộ tập dữ liệu
    y_pred = best_model.predict(X)
    print("Classification Report for best model:")
    print(classification_report(y, y_pred, zero_division=0))

# Giả sử processed_data là DataFrame của bạn
# processed_data = pd.read_csv('your_processed_data.csv')

# Gọi hàm với số lần fold là 5
# train_random_forest_random_search(processed_data, k=5)


In [None]:
train_random_forest_random_search(processed_data)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best parameters found:  {'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 165}
Best cross-validation accuracy:  0.8025410248150482
Classification Report for best model:
              precision    recall  f1-score   support

         0.0       0.93      0.18      0.31     48177
         1.0       0.83      1.00      0.90    187477

    accuracy                           0.83    235654
   macro avg       0.88      0.59      0.60    235654
weighted avg       0.85      0.83      0.78    235654



In [None]:
# train_stacking_kfold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

def train_stacking_kfold(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo các mô hình cơ bản
        estimators = [
            ('rf', RandomForestClassifier()),
            ('gb', GradientBoostingClassifier())
        ]
        
        # Khởi tạo và huấn luyện mô hình StackingClassifier trên tập train
        model = StackingClassifier(
            estimators=estimators,
            final_estimator=LogisticRegression()
        )
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))

# Giả sử processed_data là DataFrame của bạn
# processed_data = pd.read_csv('your_processed_data.csv')

# Gọi hàm với số lần fold là 5
# train_stacking_kfold(processed_data, k=5)


In [None]:
train_stacking_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.8011542052108971
Training on fold 2...
Accuracy for fold 2: 0.7992446745311041
Training on fold 3...
Accuracy for fold 3: 0.7997114486972757
Training on fold 4...
Accuracy for fold 4: 0.8003055249087668
Training on fold 5...
Accuracy for fold 5: 0.8009335879482283
Training on fold 6...
Accuracy for fold 6: 0.8014003819223424
Training on fold 7...
Accuracy for fold 7: 0.8009760237640569
Training on fold 8...
Accuracy for fold 8: 0.8025885847655421
Training on fold 9...
Accuracy for fold 9: 0.8000424358158286
Training on fold 10...
Accuracy for fold 10: 0.8011882028431997

Final Report:
Mean Accuracy: 0.8007545070407242
Standard Deviation of Accuracy: 0.000910284629604158

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.57      0.09      0.16      4796
         1.0       0.81      0.98      0.89     18769

    accuracy                           0.80     23565
   macro avg       0.69

In [None]:
# train_deep_learning_kfold
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ProgbarLogger

def build_model(input_shape):
    model = Sequential()
    model.add(Dense(64, input_dim=input_shape, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_deep_learning_kfold(processed_data, k=5, epochs=50, batch_size=32):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                
        # Xây dựng mô hình
        model = build_model(X_train.shape[1])
        
        # Đào tạo mô hình với tpdm (ProgbarLogger)
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, 
                  validation_data=(X_test, y_test), callbacks=[ProgbarLogger()])
        
        # Dự đoán trên tập test
        y_pred = (model.predict(X_test) > 0.5).astype("int32")
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))



In [None]:
train_deep_learning_kfold(processed_data)

Training on fold 1...
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 807us/step - accuracy: 0.7963 - loss: 0.4978 - val_accuracy: 0.7958 - val_loss: 0.4918
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 735us/step - accuracy: 0.7946 - loss: 0.4936 - val_accuracy: 0.7960 - val_loss: 0.4901
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 713us/step - accuracy: 0.7952 - loss: 0.4914 - val_accuracy: 0.7963 - val_loss: 0.4959
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 715us/step - accuracy: 0.7954 - loss: 0.4911 - val_accuracy: 0.7967 - val_loss: 0.4890
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 707us/step - accuracy: 0.7954 - loss: 0.4910 - val_accuracy: 0.7967 - val_loss: 0.4885
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 731us/step - accuracy: 0.7959 - loss: 0.4889 - val_accuracy: 0.7966 - val_loss: 0.4886
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 694us/step - accuracy: 0.7960 - loss: 0.4977 - val_accuracy: 0.7934 - val_loss: 0.4944
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 687us/step - accuracy: 0.7968 - loss: 0.4910 - val_accuracy: 0.7933 - val_loss: 0.4934
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 684us/step - accuracy: 0.7934 - loss: 0.4941 - val_accuracy: 0.7937 - val_loss: 0.4922
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 715us/step - accuracy: 0.7954 - loss: 0.4912 - val_accuracy: 0.7938 - val_loss: 0.4924
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 723us/step - accuracy: 0.7953 - loss: 0.4905 - val_accuracy: 0.7939 - val_loss: 0.4949
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 702us/step - accuracy: 0.7951 - loss: 0.4907 - val_accuracy: 0.7937 - val_loss: 0.4919
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 717us/step - accuracy: 0.7952 - loss: 0.4985 - val_accuracy: 0.7951 - val_loss: 0.4919
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 714us/step - accuracy: 0.7955 - loss: 0.4932 - val_accuracy: 0.7953 - val_loss: 0.4895
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 702us/step - accuracy: 0.7960 - loss: 0.4917 - val_accuracy: 0.7952 - val_loss: 0.4918
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 707us/step - accuracy: 0.7946 - loss: 0.4922 - val_accuracy: 0.7953 - val_loss: 0.4900
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 705us/step - accuracy: 0.7954 - loss: 0.4902 - val_accuracy: 0.7952 - val_loss: 0.4893
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 707us/step - accuracy: 0.7963 - loss: 0.4893 - val_accuracy: 0.7954 - val_loss: 0.4892
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 729us/step - accuracy: 0.7894 - loss: 0.5010 - val_accuracy: 0.7963 - val_loss: 0.4945
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 713us/step - accuracy: 0.7949 - loss: 0.4937 - val_accuracy: 0.7961 - val_loss: 0.4923
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 719us/step - accuracy: 0.7963 - loss: 0.4894 - val_accuracy: 0.7957 - val_loss: 0.4920
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 720us/step - accuracy: 0.7972 - loss: 0.4881 - val_accuracy: 0.7960 - val_loss: 0.4906
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 713us/step - accuracy: 0.7959 - loss: 0.4902 - val_accuracy: 0.7958 - val_loss: 0.4900
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 723us/step - accuracy: 0.7947 - loss: 0.4912 - val_accuracy: 0.7962 - val_loss: 0.4902
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 723us/step - accuracy: 0.7949 - loss: 0.4990 - val_accuracy: 0.7964 - val_loss: 0.4923
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 712us/step - accuracy: 0.7947 - loss: 0.4930 - val_accuracy: 0.7966 - val_loss: 0.4902
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 716us/step - accuracy: 0.7943 - loss: 0.4929 - val_accuracy: 0.7956 - val_loss: 0.4892
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 717us/step - accuracy: 0.7968 - loss: 0.4893 - val_accuracy: 0.7957 - val_loss: 0.4898
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 711us/step - accuracy: 0.7953 - loss: 0.4908 - val_accuracy: 0.7962 - val_loss: 0.4907
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 720us/step - accuracy: 0.7956 - loss: 0.4898 - val_accuracy: 0.7962 - val_loss: 0.4893
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 733us/step - accuracy: 0.7938 - loss: 0.4979 - val_accuracy: 0.7961 - val_loss: 0.4939
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 699us/step - accuracy: 0.7936 - loss: 0.4948 - val_accuracy: 0.7961 - val_loss: 0.4896
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 712us/step - accuracy: 0.7960 - loss: 0.4907 - val_accuracy: 0.7967 - val_loss: 0.4904
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 707us/step - accuracy: 0.7959 - loss: 0.4903 - val_accuracy: 0.7969 - val_loss: 0.4884
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 710us/step - accuracy: 0.7944 - loss: 0.4915 - val_accuracy: 0.7968 - val_loss: 0.4890
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 719us/step - accuracy: 0.7962 - loss: 0.4886 - val_accuracy: 0.7969 - val_loss: 0.4896
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 759us/step - accuracy: 0.7960 - loss: 0.4980 - val_accuracy: 0.7964 - val_loss: 0.4907
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 711us/step - accuracy: 0.7962 - loss: 0.4910 - val_accuracy: 0.7964 - val_loss: 0.4895
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 711us/step - accuracy: 0.7946 - loss: 0.4927 - val_accuracy: 0.7965 - val_loss: 0.4881
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 715us/step - accuracy: 0.7965 - loss: 0.4894 - val_accuracy: 0.7965 - val_loss: 0.4877
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 719us/step - accuracy: 0.7955 - loss: 0.4905 - val_accuracy: 0.7966 - val_loss: 0.4905
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 718us/step - accuracy: 0.7954 - loss: 0.4910 - val_accuracy: 0.7965 - val_loss: 0.4876
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 736us/step - accuracy: 0.7946 - loss: 0.4991 - val_accuracy: 0.7946 - val_loss: 0.4942
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 717us/step - accuracy: 0.7962 - loss: 0.4915 - val_accuracy: 0.7946 - val_loss: 0.4939
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 715us/step - accuracy: 0.7956 - loss: 0.4905 - val_accuracy: 0.7945 - val_loss: 0.4921
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 726us/step - accuracy: 0.7951 - loss: 0.4908 - val_accuracy: 0.7947 - val_loss: 0.4921
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 719us/step - accuracy: 0.7943 - loss: 0.4921 - val_accuracy: 0.7947 - val_loss: 0.4915
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 729us/step - accuracy: 0.7974 - loss: 0.4874 - val_accuracy: 0.7948 - val_loss: 0.4941
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 720us/step - accuracy: 0.7943 - loss: 0.4991 - val_accuracy: 0.7950 - val_loss: 0.4928
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 717us/step - accuracy: 0.7952 - loss: 0.4921 - val_accuracy: 0.7950 - val_loss: 0.4922
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 714us/step - accuracy: 0.7962 - loss: 0.4905 - val_accuracy: 0.7953 - val_loss: 0.4918
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 719us/step - accuracy: 0.7958 - loss: 0.4901 - val_accuracy: 0.7954 - val_loss: 0.4910
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 712us/step - accuracy: 0.7972 - loss: 0.4882 - val_accuracy: 0.7955 - val_loss: 0.4909
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 720us/step - accuracy: 0.7947 - loss: 0.4909 - val_accuracy: 0.7957 - val_loss: 0.4905
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 721us/step - accuracy: 0.7953 - loss: 0.4975 - val_accuracy: 0.7965 - val_loss: 0.4907
Epoch 2/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 708us/step - accuracy: 0.7967 - loss: 0.4904 - val_accuracy: 0.7965 - val_loss: 0.4914
Epoch 3/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 710us/step - accuracy: 0.7948 - loss: 0.4925 - val_accuracy: 0.7965 - val_loss: 0.4898
Epoch 4/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 711us/step - accuracy: 0.7957 - loss: 0.4903 - val_accuracy: 0.7966 - val_loss: 0.4887
Epoch 5/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 708us/step - accuracy: 0.7968 - loss: 0.4887 - val_accuracy: 0.7967 - val_loss: 0.4891
Epoch 6/50
[1m6628/6628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 712us/step - accuracy: 0.7959 - loss: 0.4897 - val_accuracy: 0.7966 - val_loss: 0.4888
Epoch 7/50
[1m

In [None]:
# train_smote_deep_learning_kfold
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ProgbarLogger
from imblearn.over_sampling import SMOTE

def build_model(input_shape):
    model = Sequential()
    model.add(Dense(64, input_dim=input_shape, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_smote_deep_learning_kfold(processed_data, k=5, epochs=50, batch_size=32):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)        

        # Xây dựng mô hình
        model = build_model(X_train.shape[1])
        
        # Đào tạo mô hình với ProgbarLogger
        model.fit(X_train_res, y_train_res, epochs=epochs, batch_size=batch_size, verbose=1, 
                  validation_data=(X_test, y_test), callbacks=[ProgbarLogger()])
        
        # Dự đoán trên tập test
        y_pred = (model.predict(X_test) > 0.5).astype("int32")
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))



In [None]:
train_smote_deep_learning_kfold(processed_data, 5)

Training on fold 1...
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9377/9377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 766us/step - accuracy: 0.5639 - loss: 0.6739 - val_accuracy: 0.5474 - val_loss: 0.6661
Epoch 2/50
[1m9377/9377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 757us/step - accuracy: 0.5789 - loss: 0.6676 - val_accuracy: 0.5469 - val_loss: 0.6537
Epoch 3/50
[1m9377/9377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 728us/step - accuracy: 0.5836 - loss: 0.6656 - val_accuracy: 0.5138 - val_loss: 0.6946
Epoch 4/50
[1m9377/9377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 734us/step - accuracy: 0.5823 - loss: 0.6656 - val_accuracy: 0.5450 - val_loss: 0.6586
Epoch 5/50
[1m9377/9377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 764us/step - accuracy: 0.5837 - loss: 0.6647 - val_accuracy: 0.5706 - val_loss: 0.6506
Epoch 6/50
[1m9377/9377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 713us/step - accuracy: 0.5836 - loss: 0.6646 - val_accuracy: 0.5656 - val_loss: 0.6573
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 785us/step - accuracy: 0.5614 - loss: 0.6739 - val_accuracy: 0.4208 - val_loss: 0.6911
Epoch 2/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 730us/step - accuracy: 0.5804 - loss: 0.6669 - val_accuracy: 0.4982 - val_loss: 0.6676
Epoch 3/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 778us/step - accuracy: 0.5805 - loss: 0.6664 - val_accuracy: 0.4511 - val_loss: 0.7081
Epoch 4/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 745us/step - accuracy: 0.5813 - loss: 0.6654 - val_accuracy: 0.4983 - val_loss: 0.6848
Epoch 5/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 784us/step - accuracy: 0.5838 - loss: 0.6649 - val_accuracy: 0.4909 - val_loss: 0.6864
Epoch 6/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 743us/step - accuracy: 0.5833 - loss: 0.6652 - val_accuracy: 0.5499 - val_loss: 0.6619
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9372/9372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 734us/step - accuracy: 0.5621 - loss: 0.6746 - val_accuracy: 0.5095 - val_loss: 0.6719
Epoch 2/50
[1m9372/9372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 729us/step - accuracy: 0.5793 - loss: 0.6681 - val_accuracy: 0.5286 - val_loss: 0.6732
Epoch 3/50
[1m9372/9372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 719us/step - accuracy: 0.5827 - loss: 0.6651 - val_accuracy: 0.4948 - val_loss: 0.6867
Epoch 4/50
[1m9372/9372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 729us/step - accuracy: 0.5850 - loss: 0.6647 - val_accuracy: 0.4915 - val_loss: 0.6805
Epoch 5/50
[1m9372/9372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 736us/step - accuracy: 0.5855 - loss: 0.6641 - val_accuracy: 0.5303 - val_loss: 0.6712
Epoch 6/50
[1m9372/9372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 799us/step - accuracy: 0.5833 - loss: 0.6639 - val_accuracy: 0.5385 - val_loss: 0.6682
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 707us/step - accuracy: 0.5643 - loss: 0.6736 - val_accuracy: 0.5811 - val_loss: 0.6392
Epoch 2/50
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 695us/step - accuracy: 0.5776 - loss: 0.6682 - val_accuracy: 0.5401 - val_loss: 0.6694
Epoch 3/50
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 700us/step - accuracy: 0.5823 - loss: 0.6665 - val_accuracy: 0.5360 - val_loss: 0.6671
Epoch 4/50
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 690us/step - accuracy: 0.5840 - loss: 0.6656 - val_accuracy: 0.5323 - val_loss: 0.6635
Epoch 5/50
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 701us/step - accuracy: 0.5836 - loss: 0.6650 - val_accuracy: 0.5063 - val_loss: 0.6697
Epoch 6/50
[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 691us/step - accuracy: 0.5859 - loss: 0.6647 - val_accuracy: 0.5823 - val_loss: 0.6639
Epoch 7/50
[1m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 731us/step - accuracy: 0.5654 - loss: 0.6739 - val_accuracy: 0.5522 - val_loss: 0.6521
Epoch 2/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 714us/step - accuracy: 0.5796 - loss: 0.6675 - val_accuracy: 0.4929 - val_loss: 0.6611
Epoch 3/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 713us/step - accuracy: 0.5858 - loss: 0.6654 - val_accuracy: 0.5123 - val_loss: 0.6616
Epoch 4/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 725us/step - accuracy: 0.5848 - loss: 0.6651 - val_accuracy: 0.5599 - val_loss: 0.6386
Epoch 5/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 711us/step - accuracy: 0.5847 - loss: 0.6645 - val_accuracy: 0.4676 - val_loss: 0.6985
Epoch 6/50
[1m9374/9374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 715us/step - accuracy: 0.5841 - loss: 0.6644 - val_accuracy: 0.4844 - val_loss: 0.7016
Epoch 7/50
[1m

In [None]:
# train_smote_passive_aggressive_kfold
from sklearn.linear_model import PassiveAggressiveClassifier
from imblearn.over_sampling import SMOTE

def train_smote_passive_aggressive_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in tqdm(kf.split(X), total=k, desc="K-Fold Progress"):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)        

        # Khởi tạo và huấn luyện mô hình PassiveAggressiveClassifier trên tập train
        model = PassiveAggressiveClassifier()
        model.fit(X_train_res, y_train_res)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_smote_passive_aggressive_kfold(processed_data)

K-Fold Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Training on fold 1...


K-Fold Progress:  20%|██        | 1/5 [00:02<00:09,  2.35s/it]

Accuracy for fold 1: 0.6419129659884153
Training on fold 2...


K-Fold Progress:  40%|████      | 2/5 [00:04<00:06,  2.33s/it]

Accuracy for fold 2: 0.5962529969659035
Training on fold 3...


K-Fold Progress:  60%|██████    | 3/5 [00:06<00:04,  2.25s/it]

Accuracy for fold 3: 0.4462031359402516
Training on fold 4...


K-Fold Progress:  80%|████████  | 4/5 [00:09<00:02,  2.27s/it]

Accuracy for fold 4: 0.7415925823767796
Training on fold 5...


K-Fold Progress: 100%|██████████| 5/5 [00:11<00:00,  2.26s/it]

Accuracy for fold 5: 0.46959473795883727

Final Report:
Mean Accuracy: 0.5791112838460375
Standard Deviation of Accuracy: 0.1098148547179676

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.19      0.50      0.28      9626
         1.0       0.78      0.46      0.58     37504

    accuracy                           0.47     47130
   macro avg       0.49      0.48      0.43     47130
weighted avg       0.66      0.47      0.52     47130


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.19      0.50      0.28      9626
         1.0       0.78      0.46      0.58     37504

    accuracy                           0.47     47130
   macro avg       0.49      0.48      0.43     47130
weighted avg       0.66      0.47      0.52     47130


Classification Report for fold 3:





              precision    recall  f1-score   support

         0.0       0.19      0.50      0.28      9626
         1.0       0.78      0.46      0.58     37504

    accuracy                           0.47     47130
   macro avg       0.49      0.48      0.43     47130
weighted avg       0.66      0.47      0.52     47130


Classification Report for fold 4:
              precision    recall  f1-score   support

         0.0       0.19      0.50      0.28      9626
         1.0       0.78      0.46      0.58     37504

    accuracy                           0.47     47130
   macro avg       0.49      0.48      0.43     47130
weighted avg       0.66      0.47      0.52     47130


Classification Report for fold 5:
              precision    recall  f1-score   support

         0.0       0.19      0.50      0.28      9626
         1.0       0.78      0.46      0.58     37504

    accuracy                           0.47     47130
   macro avg       0.49      0.48      0.43     47130
weig

In [None]:
# train_smote_ridge_classifier_kfold
from sklearn.linear_model import RidgeClassifier
from imblearn.over_sampling import SMOTE

def train_smote_ridge_classifier_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in tqdm(kf.split(X), total=k, desc="K-Fold Progress"):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        
        # Khởi tạo và huấn luyện mô hình RidgeClassifier trên tập train đã áp dụng SMOTE
        model = RidgeClassifier()
        model.fit(X_train_res, y_train_res)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_smote_ridge_classifier_kfold(processed_data)

K-Fold Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Training on fold 1...


K-Fold Progress:  20%|██        | 1/5 [00:01<00:07,  1.85s/it]

Accuracy for fold 1: 0.5333856697290531
Training on fold 2...


K-Fold Progress:  40%|████      | 2/5 [00:03<00:05,  1.98s/it]

Accuracy for fold 2: 0.5286753941142772
Training on fold 3...


K-Fold Progress:  60%|██████    | 3/5 [00:05<00:03,  1.94s/it]

Accuracy for fold 3: 0.5340858458339521
Training on fold 4...


K-Fold Progress:  80%|████████  | 4/5 [00:07<00:01,  1.88s/it]

Accuracy for fold 4: 0.5239651184995013
Training on fold 5...


K-Fold Progress: 100%|██████████| 5/5 [00:09<00:00,  1.90s/it]

Accuracy for fold 5: 0.5352217271377042

Final Report:
Mean Accuracy: 0.5310667510628976
Standard Deviation of Accuracy: 0.00419303985958335

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.25      0.63      0.35      9626
         1.0       0.84      0.51      0.64     37504

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weighted avg       0.72      0.54      0.58     47130


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.25      0.63      0.35      9626
         1.0       0.84      0.51      0.64     37504

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weighted avg       0.72      0.54      0.58     47130


Classification Report for fold 3:





              precision    recall  f1-score   support

         0.0       0.25      0.63      0.35      9626
         1.0       0.84      0.51      0.64     37504

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weighted avg       0.72      0.54      0.58     47130


Classification Report for fold 4:
              precision    recall  f1-score   support

         0.0       0.25      0.63      0.35      9626
         1.0       0.84      0.51      0.64     37504

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weighted avg       0.72      0.54      0.58     47130


Classification Report for fold 5:
              precision    recall  f1-score   support

         0.0       0.25      0.63      0.35      9626
         1.0       0.84      0.51      0.64     37504

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weig

In [None]:
# train_smote_random_forest_class_weight_kfold
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

def train_smote_random_forest_class_weight_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        
        # Khởi tạo và huấn luyện mô hình RandomForestClassifier trên tập train đã được resample
        model = RandomForestClassifier(class_weight='balanced',random_state=42)
        model.fit(X_train_res, y_train_res)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))

In [None]:
train_smote_random_forest_class_weight_kfold (processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7158618348468132
Training on fold 2...
Accuracy for fold 2: 0.7198506322668251
Training on fold 3...
Accuracy for fold 3: 0.7122549435627599
Training on fold 4...
Accuracy for fold 4: 0.7188746499193753
Training on fold 5...
Accuracy for fold 5: 0.720560152768937
Training on fold 6...
Accuracy for fold 6: 0.7200084871631657
Training on fold 7...
Accuracy for fold 7: 0.7197114364523658
Training on fold 8...
Accuracy for fold 8: 0.7151708041587099
Training on fold 9...
Accuracy for fold 9: 0.7169106726076809
Training on fold 10...
Accuracy for fold 10: 0.7196265648207086

Final Report:
Mean Accuracy: 0.7178830178567341
Standard Deviation of Accuracy: 0.002588137201672347

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.35      0.45      0.40      4796
         1.0       0.85      0.79      0.82     18769

    accuracy                           0.72     23565
   macro avg       0.60 

In [None]:
# train_bayesian_glm_kfold
from sklearn.linear_model import BayesianRidge
def train_bayesian_glm_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình Bayesian GLM (BayesianRidge) trên tập train
        model = BayesianRidge()
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        y_pred = (y_pred > 0.5).astype(int)  # Chuyển đổi thành nhãn nhị phân
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_bayesian_glm_kfold (processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7953831791564118
Training on fold 2...
Accuracy for fold 2: 0.7938555546125774
Training on fold 3...
Accuracy for fold 3: 0.7952134430959857
Training on fold 4...
Accuracy for fold 4: 0.7959348213527964
Training on fold 5...
Accuracy for fold 5: 0.7956291109696584
Training on fold 6...
Accuracy for fold 6: 0.7954169318905155
Training on fold 7...
Accuracy for fold 7: 0.7961807765754296
Training on fold 8...
Accuracy for fold 8: 0.7944833439422873
Training on fold 9...
Accuracy for fold 9: 0.7947379588372586
Training on fold 10...
Accuracy for fold 10: 0.7959685974962869

Final Report:
Mean Accuracy: 0.7952803717929207
Standard Deviation of Accuracy: 0.0006948074051293184

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.43      0.01      0.02      4796
         1.0       0.80      1.00      0.89     18769

    accuracy                           0.80     23565
   macro avg       0.6

In [None]:
# train_svm_kfold
from sklearn.svm import SVC

def train_svm_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình SVM trên tập train
        model = SVC(probability=True)
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))



In [None]:
train_svm_kfold(processed_data)

Training on fold 1...


In [None]:
# train_automl_kfold with tpot
from tpot import TPOTClassifier

def train_automl_kfold(processed_data, k=5, generations=50, population_size=50):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình TPOTClassifier trên tập train
        model = TPOTClassifier(generations=generations, population_size=population_size, verbosity=2, random_state=42)
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_automl_kfold(processed_data)

Training on fold 1...


Optimization Progress:   0%|          | 0/2550 [00:00<?, ?pipeline/s]