In [5]:
import pandas as pd
import numpy as np
import pyodbc
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from tqdm import tqdm



In [6]:
server = '192.168.1.212'
database = 'master'
username = 'test'
password = 'tester2024'

mssql_conn_str = f'DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password};'

In [7]:
msql_query = f"""
        select cp.CompanyId, CompanyAge, CompanyType, FDI, CapitalAmount, NumberOfLabors, Region, Status,
                [2015_11], [2015_12], [2015_13], [2015_14], [2015_15], [2015_16], [2015_17], [2015_18], [2015_19], [2015_20], [2015_21], [2015_22], [2015_23], [2015_24],
                [2016_11], [2016_12], [2016_13], [2016_14], [2016_15], [2016_16], [2016_17], [2016_18], [2016_19], [2016_20], [2016_21], [2016_22], [2016_23], [2016_24],
                [2017_11], [2017_12], [2017_13], [2017_14], [2017_15], [2017_16], [2017_17], [2017_18], [2017_19], [2017_20], [2017_21], [2017_22], [2017_23], [2017_24],
                [2018_11], [2018_12], [2018_13], [2018_14], [2018_15], [2018_16], [2018_17], [2018_18], [2018_19], [2018_20], [2018_21], [2018_22], [2018_23], [2018_24],
                [2019_11], [2019_12], [2019_13], [2019_14], [2019_15], [2019_16], [2019_17], [2019_18], [2019_19], [2019_20], [2019_21], [2019_22], [2019_23], [2019_24],
                [2020_11], [2020_12], [2020_13], [2020_14], [2020_15], [2020_16], [2020_17], [2020_18], [2020_19], [2020_20], [2020_21], [2020_22], [2020_23], [2020_24],
                [2021_11], [2021_12], [2021_13], [2021_14], [2021_15], [2021_16], [2021_17], [2021_18], [2021_19], [2021_20], [2021_21], [2021_22], [2021_23], [2021_24],
                [2022_11], [2022_12], [2022_13], [2022_14], [2022_15], [2022_16], [2022_17], [2022_18], [2022_19], [2022_20], [2022_21], [2022_22], [2022_23], [2022_24]
        from ProjectNew..CompanyProfile cp
        join ProjectNew..FinancialValue2 fv on cp.CompanyId = fv.CompanyId
        where CompanyAge > 9
        """
print(msql_query)


        select cp.CompanyId, CompanyAge, CompanyType, FDI, CapitalAmount, NumberOfLabors, Region, Status,
                [2015_11], [2015_12], [2015_13], [2015_14], [2015_15], [2015_16], [2015_17], [2015_18], [2015_19], [2015_20], [2015_21], [2015_22], [2015_23], [2015_24],
                [2016_11], [2016_12], [2016_13], [2016_14], [2016_15], [2016_16], [2016_17], [2016_18], [2016_19], [2016_20], [2016_21], [2016_22], [2016_23], [2016_24],
                [2017_11], [2017_12], [2017_13], [2017_14], [2017_15], [2017_16], [2017_17], [2017_18], [2017_19], [2017_20], [2017_21], [2017_22], [2017_23], [2017_24],
                [2018_11], [2018_12], [2018_13], [2018_14], [2018_15], [2018_16], [2018_17], [2018_18], [2018_19], [2018_20], [2018_21], [2018_22], [2018_23], [2018_24],
                [2019_11], [2019_12], [2019_13], [2019_14], [2019_15], [2019_16], [2019_17], [2019_18], [2019_19], [2019_20], [2019_21], [2019_22], [2019_23], [2019_24],
                [2020_11], [2020_12], [2020

In [8]:
try:
    mssql_conn = pyodbc.connect(mssql_conn_str)
    print("Kết nối cơ sở dữ liệu thành công")
except pyodbc.Error as e:
    print(f"Lỗi khi kết nối cơ sở dữ liệu: {e}")

data = pd.read_sql_query(msql_query, mssql_conn)

mssql_conn.close()

Kết nối cơ sở dữ liệu thành công


  data = pd.read_sql_query(msql_query, mssql_conn)


In [9]:
def processing_data(df1, n):
    df = df1.copy()

    # One-hot encoding cho cột CompanyType
    df = pd.get_dummies(df, columns=['CompanyType'], prefix='Type')
        
    if 'CompanyId' in df.columns:
        df.drop(columns=['CompanyId'], inplace=True)

    # Duplicate các dòng dữ liệu có status = 1 n lần
    if n > 1:
        df_status_1 = df[df['status'] == 1]
        df = pd.concat([df] + [df_status_1] * (n - 1), ignore_index=True)

    # Normalize các cột còn lại với giá trị từ 0 đến 1
    scaler = MinMaxScaler()
    df[df.columns] = scaler.fit_transform(df[df.columns])
    
    # Thay thế tất cả các giá trị NaN trong df thành 0
    df.fillna(0, inplace=True)
    
    return df

In [10]:
processed_data = processing_data(data, 1)

In [11]:
processed_data

Unnamed: 0,CompanyAge,FDI,CapitalAmount,NumberOfLabors,Region,Status,2015_11,2015_12,2015_13,2015_14,...,2022_19,2022_20,2022_21,2022_22,2022_23,2022_24,Type_LLC1,Type_LLC2,Type_PE,Type_SC
0,0.28,0.0,0.000097,0.000076,1.0,0.0,0.000467,0.004616,0.000714,0.000517,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.0
1,0.08,0.0,0.000009,0.000031,1.0,0.0,0.000254,0.004240,0.000714,0.000000,...,0.236214,0.000006,0.003024,0.196133,0.332620,0.344626,0.0,1.0,0.0,0.0
2,0.28,0.0,0.000001,0.000138,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.0,0.0
3,0.08,0.0,0.000029,0.000076,1.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.0
4,0.44,0.0,0.000031,0.000000,1.0,1.0,0.000290,0.004277,0.000811,0.000562,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58896,0.24,0.0,0.000097,0.000153,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.236269,0.000047,0.003032,0.196137,0.332621,0.344626,0.0,1.0,0.0,0.0
58897,0.36,0.0,0.000005,0.000015,1.0,0.0,0.000375,0.004282,0.001035,0.000513,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.0,0.0
58898,0.20,0.0,0.000039,0.000046,1.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.236236,0.000023,0.003109,0.196147,0.332621,0.344627,0.0,1.0,0.0,0.0
58899,0.44,0.0,0.000243,0.004584,0.0,0.0,0.001674,0.005229,0.002323,0.000644,...,0.236548,0.000600,0.003298,0.196325,0.332678,0.344673,0.0,1.0,0.0,0.0


In [12]:
from sklearn.linear_model import LogisticRegression

def train_logistic_regression_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo mô hình Logistic Regression
    model = LogisticRegression()

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Huấn luyện mô hình trên tập train
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred))

In [13]:
train_logistic_regression_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.8768355827179357
Training on fold 2...
Accuracy for fold 2: 0.8771646859083192
Training on fold 3...
Accuracy for fold 3: 0.8739388794567062
Training on fold 4...
Accuracy for fold 4: 0.8733446519524618
Training on fold 5...
Accuracy for fold 5: 0.8737691001697793

Final Report:
Mean Accuracy: 0.8750105800410404
Standard Deviation of Accuracy: 0.0016392643269249711

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.93     10298
         1.0       0.48      0.04      0.08      1482

    accuracy                           0.87     11780
   macro avg       0.68      0.52      0.51     11780
weighted avg       0.83      0.87      0.82     11780


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.88      0.99      0.93     10298
         1.0       0.48      0.04      0.08      1482

    accuracy              

In [14]:
import xgboost as xgb

def train_xgboost_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình XGBoost trên tập train
        model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [15]:
train_xgboost_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.8801459977930566
Training on fold 2...
Accuracy for fold 2: 0.881578947368421
Training on fold 3...
Accuracy for fold 3: 0.8809847198641766
Training on fold 4...
Accuracy for fold 4: 0.881578947368421
Training on fold 5...
Accuracy for fold 5: 0.882088285229202

Final Report:
Mean Accuracy: 0.8812753795246554
Standard Deviation of Accuracy: 0.0006640946754673247

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.90      0.97      0.94     10298
         1.0       0.57      0.25      0.34      1482

    accuracy                           0.88     11780
   macro avg       0.74      0.61      0.64     11780
weighted avg       0.86      0.88      0.86     11780


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.90      0.97      0.94     10298
         1.0       0.57      0.25      0.34      1482

    accuracy                 

In [16]:
import lightgbm as lgb

def train_lightgbm_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình LightGBM trên tập train
        model = lgb.LGBMClassifier()
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [17]:
train_lightgbm_kfold(processed_data)

Training on fold 1...
[LightGBM] [Info] Number of positive: 5842, number of negative: 41278
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29107
[LightGBM] [Info] Number of data points in the train set: 47120, number of used features: 121
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.123981 -> initscore=-1.955256
[LightGBM] [Info] Start training from score -1.955256
Accuracy for fold 1: 0.8845598845598845
Training on fold 2...
[LightGBM] [Info] Number of positive: 5857, number of negative: 41264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29107
[LightGBM] [Info] Number of data points in the train set: 47121, number of used features: 121
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.124297 -> 

In [18]:
import catboost as cb

def train_catboost_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']
    
    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình CatBoost trên tập train
        model = cb.CatBoostClassifier(verbose=0)
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [19]:
train_catboost_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.8863424157541805
Training on fold 2...
Accuracy for fold 2: 0.884125636672326
Training on fold 3...
Accuracy for fold 3: 0.883955857385399
Training on fold 4...
Accuracy for fold 4: 0.8847198641765704
Training on fold 5...
Accuracy for fold 5: 0.8863327674023769

Final Report:
Mean Accuracy: 0.8850953082781705
Standard Deviation of Accuracy: 0.0010455765657709426

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.90      0.98      0.94     10298
         1.0       0.62      0.25      0.36      1482

    accuracy                           0.89     11780
   macro avg       0.76      0.61      0.65     11780
weighted avg       0.87      0.89      0.86     11780


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.90      0.98      0.94     10298
         1.0       0.62      0.25      0.36      1482

    accuracy                

In [20]:
# train_hist_gradient_boosting_kfold
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier


def train_hist_gradient_boosting_kfold(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình HistGradientBoostingClassifier trên tập train
        model = HistGradientBoostingClassifier()
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))




In [21]:
train_hist_gradient_boosting_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.8829471182412358
Training on fold 2...
Accuracy for fold 2: 0.883955857385399
Training on fold 3...
Accuracy for fold 3: 0.8833616298811545
Training on fold 4...
Accuracy for fold 4: 0.881578947368421
Training on fold 5...
Accuracy for fold 5: 0.879881154499151

Final Report:
Mean Accuracy: 0.8823449414750723
Standard Deviation of Accuracy: 0.001459336525757947

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.89      0.99      0.94     10298
         1.0       0.62      0.12      0.20      1482

    accuracy                           0.88     11780
   macro avg       0.75      0.55      0.57     11780
weighted avg       0.85      0.88      0.84     11780


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.89      0.99      0.94     10298
         1.0       0.62      0.12      0.20      1482

    accuracy                  

In [22]:
# train_random_forest_kfold
from sklearn.ensemble import RandomForestClassifier

def train_random_forest_kfold(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình RandomForestClassifier trên tập train
        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [23]:
train_random_forest_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.8812494694847636
Training on fold 2...
Accuracy for fold 2: 0.8804753820033956
Training on fold 3...
Accuracy for fold 3: 0.8804753820033956
Training on fold 4...
Accuracy for fold 4: 0.8786926994906621
Training on fold 5...
Accuracy for fold 5: 0.8828522920203735

Final Report:
Mean Accuracy: 0.8807490450005181
Standard Deviation of Accuracy: 0.001345541706440657

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.90      0.98      0.94     10298
         1.0       0.59      0.22      0.32      1482

    accuracy                           0.88     11780
   macro avg       0.74      0.60      0.63     11780
weighted avg       0.86      0.88      0.86     11780


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.90      0.98      0.94     10298
         1.0       0.59      0.22      0.32      1482

    accuracy               

In [24]:
# train_smote_RandomForest_kfold
from imblearn.over_sampling import SMOTE

def train_smote_RandomForest_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        
        # Khởi tạo và huấn luyện mô hình RandomForestClassifier trên tập train đã được resample
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train_res, y_train_res)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))

In [25]:
train_smote_RandomForest_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.8702996350055173
Training on fold 2...
Accuracy for fold 2: 0.8695246179966044
Training on fold 3...
Accuracy for fold 3: 0.8699490662139219
Training on fold 4...
Accuracy for fold 4: 0.8678268251273344
Training on fold 5...
Accuracy for fold 5: 0.866044142614601

Final Report:
Mean Accuracy: 0.8687288573915957
Standard Deviation of Accuracy: 0.0015881336507073622

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.91      0.94      0.92     10298
         1.0       0.46      0.37      0.41      1482

    accuracy                           0.87     11780
   macro avg       0.69      0.65      0.67     11780
weighted avg       0.86      0.87      0.86     11780


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.91      0.94      0.92     10298
         1.0       0.46      0.37      0.41      1482

    accuracy               

In [26]:
# train_random_forest_class_weight_kfold

# Imbalanced Learning Techniques - Class Weight Adjustment: Điều chỉnh trọng số lớp để mô hình tập trung hơn vào lớp thiểu số.
from sklearn.ensemble import RandomForestClassifier

def train_random_forest_class_weight_kfold(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình RandomForestClassifier với class_weight='balanced'
        model = RandomForestClassifier(class_weight='balanced')
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [27]:
train_random_forest_class_weight_kfold(processed_data)

Training on fold 1...
Accuracy for fold 1: 0.8759018759018758
Training on fold 2...
Accuracy for fold 2: 0.8801358234295416
Training on fold 3...
Accuracy for fold 3: 0.8788624787775892
Training on fold 4...
Accuracy for fold 4: 0.8771646859083192
Training on fold 5...
Accuracy for fold 5: 0.8774193548387097

Final Report:
Mean Accuracy: 0.877896843771207
Standard Deviation of Accuracy: 0.0014621510518750185

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.89      0.98      0.93     10298
         1.0       0.55      0.15      0.24      1482

    accuracy                           0.88     11780
   macro avg       0.72      0.57      0.59     11780
weighted avg       0.85      0.88      0.85     11780


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.89      0.98      0.93     10298
         1.0       0.55      0.15      0.24      1482

    accuracy               

In [28]:
# train_random_forest_grid_search

# Hyperparameter Tuning - Grid Search

from sklearn.model_selection import KFold, GridSearchCV

def train_random_forest_grid_search(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Thiết lập các tham số cho Grid Search
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Khởi tạo Grid Search với RandomForestClassifier
    grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=kf, scoring='accuracy', n_jobs=-1, verbose=2)

    # Huấn luyện Grid Search trên toàn bộ dữ liệu
    grid_search.fit(X, y)

    # Lấy mô hình tốt nhất từ Grid Search
    best_model = grid_search.best_estimator_

    # In ra các tham số tốt nhất
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation accuracy: ", grid_search.best_score_)

    # Dự đoán và đánh giá mô hình tốt nhất trên toàn bộ tập dữ liệu
    y_pred = best_model.predict(X)
    print("Classification Report for best model:")
    print(classification_report(y, y_pred, zero_division=0))

In [29]:
train_random_forest_grid_search(processed_data)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

In [None]:
# train_random_forest_random_search

# Hyperparameter Tuning - Random Search

from sklearn.model_selection import KFold, RandomizedSearchCV
from scipy.stats import randint

def train_random_forest_random_search(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Thiết lập các tham số cho Random Search
    param_dist = {
        'n_estimators': randint(50, 200),
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': randint(2, 11),
        'min_samples_leaf': randint(1, 5)
    }

    # Khởi tạo Random Search với RandomForestClassifier
    random_search = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=param_dist, n_iter=100, cv=kf, scoring='accuracy', n_jobs=-1, verbose=2, random_state=42)

    # Huấn luyện Random Search trên toàn bộ dữ liệu
    random_search.fit(X, y)

    # Lấy mô hình tốt nhất từ Random Search
    best_model = random_search.best_estimator_

    # In ra các tham số tốt nhất
    print("Best parameters found: ", random_search.best_params_)
    print("Best cross-validation accuracy: ", random_search.best_score_)

    # Dự đoán và đánh giá mô hình tốt nhất trên toàn bộ tập dữ liệu
    y_pred = best_model.predict(X)
    print("Classification Report for best model:")
    print(classification_report(y, y_pred, zero_division=0))


In [None]:
train_random_forest_random_search(processed_data)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best parameters found:  {'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 165}
Best cross-validation accuracy:  0.8025410248150482
Classification Report for best model:
              precision    recall  f1-score   support

         0.0       0.93      0.18      0.31     48177
         1.0       0.83      1.00      0.90    187477

    accuracy                           0.83    235654
   macro avg       0.88      0.59      0.60    235654
weighted avg       0.85      0.83      0.78    235654



In [None]:
# train_stacking_kfold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

def train_stacking_kfold(processed_data, k=5):
    # Sao chép DataFrame và bỏ qua cảnh báo UndefinedMetricWarning
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    # Chia dữ liệu thành biến độc lập (X) và biến phụ thuộc (y)
    X = df.drop('Status', axis=1)
    y = df['Status']

    # Khởi tạo KFold
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in tqdm(kf.split(X), total=k, desc="K-Fold Progress"):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo các mô hình cơ bản
        estimators = [
            ('rf', RandomForestClassifier()),
            ('gb', GradientBoostingClassifier())
        ]
        
        # Khởi tạo và huấn luyện mô hình StackingClassifier trên tập train
        model = StackingClassifier(
            estimators=estimators,
            final_estimator=LogisticRegression()
        )
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_stacking_kfold(processed_data)

K-Fold Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Training on fold 1...


K-Fold Progress:   0%|          | 0/5 [01:42<?, ?it/s]


KeyboardInterrupt: 

In [30]:
# train_smote_deep_learning_kfold
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ProgbarLogger
from imblearn.over_sampling import SMOTE

def build_model(input_shape):
    model = Sequential()
    model.add(Dense(64, input_dim=input_shape, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_smote_deep_learning_kfold(processed_data, k=5, epochs=50, batch_size=32):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)        

        # Xây dựng mô hình
        model = build_model(X_train.shape[1])
        
        # Đào tạo mô hình với ProgbarLogger
        model.fit(X_train_res, y_train_res, epochs=epochs, batch_size=batch_size, verbose=1, 
                  validation_data=(X_test, y_test), callbacks=[ProgbarLogger()])
        
        # Dự đoán trên tập test
        y_pred = (model.predict(X_test) > 0.5).astype("int32")
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))



In [31]:
train_smote_deep_learning_kfold(processed_data)

Training on fold 1...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m2580/2580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 949us/step - accuracy: 0.7307 - loss: 0.5550 - val_accuracy: 0.8077 - val_loss: 0.4823
Epoch 2/50
[1m2580/2580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 903us/step - accuracy: 0.7486 - loss: 0.5218 - val_accuracy: 0.7702 - val_loss: 0.5370
Epoch 3/50
[1m2580/2580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 862us/step - accuracy: 0.7481 - loss: 0.5178 - val_accuracy: 0.8035 - val_loss: 0.5049
Epoch 4/50
[1m2580/2580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 857us/step - accuracy: 0.7464 - loss: 0.5159 - val_accuracy: 0.7073 - val_loss: 0.5918
Epoch 5/50
[1m2580/2580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 873us/step - accuracy: 0.7464 - loss: 0.5155 - val_accuracy: 0.7913 - val_loss: 0.5198
Epoch 6/50
[1m2580/2580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 807us/step - accuracy: 0.7504 - loss: 0.5091 - val_accuracy: 0.7558 - val_loss: 0.5336
Epoc

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2579/2579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 890us/step - accuracy: 0.7324 - loss: 0.5553 - val_accuracy: 0.7912 - val_loss: 0.5058
Epoch 2/50
[1m2579/2579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 820us/step - accuracy: 0.7467 - loss: 0.5265 - val_accuracy: 0.7766 - val_loss: 0.5110
Epoch 3/50
[1m2579/2579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 845us/step - accuracy: 0.7506 - loss: 0.5180 - val_accuracy: 0.7554 - val_loss: 0.5361
Epoch 4/50
[1m2579/2579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 844us/step - accuracy: 0.7491 - loss: 0.5197 - val_accuracy: 0.6829 - val_loss: 0.6210
Epoch 5/50
[1m2579/2579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 821us/step - accuracy: 0.7530 - loss: 0.5128 - val_accuracy: 0.7882 - val_loss: 0.5044
Epoch 6/50
[1m2579/2579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 842us/step - accuracy: 0.7509 - loss: 0.5135 - val_accuracy: 0.7482 - val_loss: 0.5462
Epoch 7/50
[1m

In [None]:
# train_smote_passive_aggressive_kfold
from sklearn.linear_model import PassiveAggressiveClassifier
from imblearn.over_sampling import SMOTE

def train_smote_passive_aggressive_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in tqdm(kf.split(X), total=k, desc="K-Fold Progress"):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)        

        # Khởi tạo và huấn luyện mô hình PassiveAggressiveClassifier trên tập train
        model = PassiveAggressiveClassifier()
        model.fit(X_train_res, y_train_res)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_smote_passive_aggressive_kfold(processed_data)

K-Fold Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Training on fold 1...


K-Fold Progress:  20%|██        | 1/5 [00:02<00:09,  2.37s/it]

Accuracy for fold 1: 0.6528399567163863
Training on fold 2...


K-Fold Progress:  40%|████      | 2/5 [00:05<00:07,  2.55s/it]

Accuracy for fold 2: 0.34204663597207785
Training on fold 3...


K-Fold Progress:  60%|██████    | 3/5 [00:07<00:05,  2.58s/it]

Accuracy for fold 3: 0.7936602236320044
Training on fold 4...


K-Fold Progress:  80%|████████  | 4/5 [00:10<00:02,  2.52s/it]

Accuracy for fold 4: 0.4894867496976512
Training on fold 5...


K-Fold Progress: 100%|██████████| 5/5 [00:12<00:00,  2.49s/it]

Accuracy for fold 5: 0.2532569488648419

Final Report:
Mean Accuracy: 0.5062581029765922
Standard Deviation of Accuracy: 0.19765752551900564

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.92      0.07      0.13     37504
         1.0       0.21      0.98      0.35      9626

    accuracy                           0.25     47130
   macro avg       0.57      0.52      0.24     47130
weighted avg       0.78      0.25      0.17     47130


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.92      0.07      0.13     37504
         1.0       0.21      0.98      0.35      9626

    accuracy                           0.25     47130
   macro avg       0.57      0.52      0.24     47130
weighted avg       0.78      0.25      0.17     47130


Classification Report for fold 3:





              precision    recall  f1-score   support

         0.0       0.92      0.07      0.13     37504
         1.0       0.21      0.98      0.35      9626

    accuracy                           0.25     47130
   macro avg       0.57      0.52      0.24     47130
weighted avg       0.78      0.25      0.17     47130


Classification Report for fold 4:
              precision    recall  f1-score   support

         0.0       0.92      0.07      0.13     37504
         1.0       0.21      0.98      0.35      9626

    accuracy                           0.25     47130
   macro avg       0.57      0.52      0.24     47130
weighted avg       0.78      0.25      0.17     47130


Classification Report for fold 5:
              precision    recall  f1-score   support

         0.0       0.92      0.07      0.13     37504
         1.0       0.21      0.98      0.35      9626

    accuracy                           0.25     47130
   macro avg       0.57      0.52      0.24     47130
weig

In [None]:
# train_smote_ridge_classifier_kfold
from sklearn.linear_model import RidgeClassifier
from imblearn.over_sampling import SMOTE

def train_smote_ridge_classifier_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in tqdm(kf.split(X), total=k, desc="K-Fold Progress"):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        
        # Khởi tạo và huấn luyện mô hình RidgeClassifier trên tập train đã áp dụng SMOTE
        model = RidgeClassifier()
        model.fit(X_train_res, y_train_res)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_smote_ridge_classifier_kfold(processed_data)

K-Fold Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Training on fold 1...


K-Fold Progress:  20%|██        | 1/5 [00:02<00:08,  2.01s/it]

Accuracy for fold 1: 0.5333856697290531
Training on fold 2...


K-Fold Progress:  40%|████      | 2/5 [00:04<00:06,  2.03s/it]

Accuracy for fold 2: 0.5286753941142772
Training on fold 3...


K-Fold Progress:  60%|██████    | 3/5 [00:06<00:04,  2.14s/it]

Accuracy for fold 3: 0.5340858458339521
Training on fold 4...


K-Fold Progress:  80%|████████  | 4/5 [00:08<00:02,  2.13s/it]

Accuracy for fold 4: 0.5239651184995013
Training on fold 5...


K-Fold Progress: 100%|██████████| 5/5 [00:10<00:00,  2.11s/it]

Accuracy for fold 5: 0.5352217271377042

Final Report:
Mean Accuracy: 0.5310667510628976
Standard Deviation of Accuracy: 0.00419303985958335

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.84      0.51      0.64     37504
         1.0       0.25      0.63      0.35      9626

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weighted avg       0.72      0.54      0.58     47130


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.84      0.51      0.64     37504
         1.0       0.25      0.63      0.35      9626

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weighted avg       0.72      0.54      0.58     47130


Classification Report for fold 3:





              precision    recall  f1-score   support

         0.0       0.84      0.51      0.64     37504
         1.0       0.25      0.63      0.35      9626

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weighted avg       0.72      0.54      0.58     47130


Classification Report for fold 4:
              precision    recall  f1-score   support

         0.0       0.84      0.51      0.64     37504
         1.0       0.25      0.63      0.35      9626

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weighted avg       0.72      0.54      0.58     47130


Classification Report for fold 5:
              precision    recall  f1-score   support

         0.0       0.84      0.51      0.64     37504
         1.0       0.25      0.63      0.35      9626

    accuracy                           0.54     47130
   macro avg       0.54      0.57      0.50     47130
weig

In [None]:
# train_smote_random_forest_class_weight_kfold
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

def train_smote_random_forest_class_weight_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Áp dụng SMOTE cho tập train
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
        
        # Khởi tạo và huấn luyện mô hình RandomForestClassifier trên tập train đã được resample
        model = RandomForestClassifier(class_weight='balanced',random_state=42)
        model.fit(X_train_res, y_train_res)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))

In [None]:
train_smote_random_forest_class_weight_kfold (processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7178502471833825
Training on fold 2...
Accuracy for fold 2: 0.7186352931191785
Training on fold 3...
Accuracy for fold 3: 0.7219664339818803
Training on fold 4...
Accuracy for fold 4: 0.717998769387452
Training on fold 5...
Accuracy for fold 5: 0.7200721408869085

Final Report:
Mean Accuracy: 0.7193045769117604
Standard Deviation of Accuracy: 0.0015454958144291086

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.85      0.79      0.82     37504
         1.0       0.35      0.45      0.40      9626

    accuracy                           0.72     47130
   macro avg       0.60      0.62      0.61     47130
weighted avg       0.75      0.72      0.73     47130


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.85      0.79      0.82     37504
         1.0       0.35      0.45      0.40      9626

    accuracy               

In [None]:
# train_bayesian_glm_kfold
from sklearn.linear_model import BayesianRidge
def train_bayesian_glm_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình Bayesian GLM (BayesianRidge) trên tập train
        model = BayesianRidge()
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        y_pred = (y_pred > 0.5).astype(int)  # Chuyển đổi thành nhãn nhị phân
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
train_bayesian_glm_kfold (processed_data)

Training on fold 1...
Accuracy for fold 1: 0.7945725743141456
Training on fold 2...
Accuracy for fold 2: 0.7955061424540112
Training on fold 3...
Accuracy for fold 3: 0.7956546646580807
Training on fold 4...
Accuracy for fold 4: 0.7953364027922174
Training on fold 5...
Accuracy for fold 5: 0.7954593677063442

Final Report:
Mean Accuracy: 0.7953058303849597
Standard Deviation of Accuracy: 0.00038052151624859664

Classification Report for fold 1:
              precision    recall  f1-score   support

         0.0       0.80      1.00      0.89     37504
         1.0       0.46      0.01      0.02      9626

    accuracy                           0.80     47130
   macro avg       0.63      0.50      0.45     47130
weighted avg       0.73      0.80      0.71     47130


Classification Report for fold 2:
              precision    recall  f1-score   support

         0.0       0.80      1.00      0.89     37504
         1.0       0.46      0.01      0.02      9626

    accuracy             

In [None]:
# train_svm_kfold
from sklearn.svm import SVC

def train_svm_kfold(processed_data, k=5):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình SVM trên tập train
        model = SVC(probability=True)
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))



In [None]:
# train_svm_kfold(processed_data)

In [None]:
# train_automl_kfold with tpot
from tpot import TPOTClassifier

def train_automl_kfold(processed_data, k=5, generations=50, population_size=50):
    df = processed_data.copy()
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
    
    X = df.drop('Status', axis=1)
    y = df['Status']

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    reports = []

    fold = 1
    for train_index, test_index in kf.split(X):
        print(f"Training on fold {fold}...")
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Khởi tạo và huấn luyện mô hình TPOTClassifier trên tập train
        model = TPOTClassifier(generations=generations, population_size=population_size, verbosity=2, random_state=42)
        model.fit(X_train, y_train)
        
        # Dự đoán trên tập test
        y_pred = model.predict(X_test)
        
        # Đánh giá mô hình
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        reports.append(report)

        print(f"Accuracy for fold {fold}: {accuracy}")
        fold += 1

    # Tính toán và in ra báo cáo cuối cùng
    print("\nFinal Report:")
    print(f"Mean Accuracy: {np.mean(accuracies)}")
    print(f"Standard Deviation of Accuracy: {np.std(accuracies)}")
    
    # In ra classification report cho từng fold
    for i, report in enumerate(reports):
        print(f"\nClassification Report for fold {i+1}:")
        print(classification_report(y.iloc[test_index], y_pred, zero_division=0))


In [None]:
# train_automl_kfold(processed_data)