In [1]:
# !pip install torch==1.13.1

In [19]:
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
from tqdm import tqdm

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, round, when, countDistinct as _round
from pyspark.sql.functions import sum as _sum
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array


In [20]:
spark = SparkSession.builder \
    .master("spark://spark-master:7077") \
    .appName("FeatureExtraction") \
    .getOrCreate()

In [5]:
final_schema = StructType([
    StructField("initial_list_status_w", IntegerType(), nullable=False),
    StructField("last_pymnt_amnt", DoubleType(), nullable=True),
    StructField("out_prncp", DoubleType(), nullable=True),
    StructField("loan_amnt", DoubleType(), nullable=True),
    StructField("int_rate", DoubleType(), nullable=True),
    StructField("is_60_months", IntegerType(), nullable=False),
    StructField("purpose_debt_consolidation", IntegerType(), nullable=False),
    StructField("purpose_credit_card", IntegerType(), nullable=False),
    StructField("purpose_home_improvement", IntegerType(), nullable=False),
    StructField("purpose_major_purchase", IntegerType(), nullable=False),
    StructField("purpose_small_business", IntegerType(), nullable=False),
    StructField("purpose_car", IntegerType(), nullable=False),
    StructField("purpose_medical", IntegerType(), nullable=False),
    StructField("purpose_other", IntegerType(), nullable=False),
    StructField("total_pymnt", DoubleType(), nullable=True),
    StructField("total_rec_int", DoubleType(), nullable=True),
    StructField("total_rec_late_fee", DoubleType(), nullable=True),
    StructField("recoveries", DoubleType(), nullable=True),
    StructField("delinq_2yrs", DoubleType(), nullable=True),
    StructField("inq_last_6mths", DoubleType(), nullable=True),
    StructField("mths_since_last_delinq", DoubleType(), nullable=True),
    StructField("mths_since_last_record", DoubleType(), nullable=True),
    StructField("open_acc", DoubleType(), nullable=True),
    StructField("pub_rec", DoubleType(), nullable=True),
    StructField("revol_util", DoubleType(), nullable=True),
    StructField("total_acc", DoubleType(), nullable=True),
    StructField("collections_12_mths_ex_med", DoubleType(), nullable=True),
    StructField("mths_since_last_major_derog", DoubleType(), nullable=True),
    StructField("open_acc_6m", DoubleType(), nullable=True),
    StructField("open_il_24m", DoubleType(), nullable=True),
    StructField("mths_since_rcnt_il", DoubleType(), nullable=True),
    StructField("total_bal_il", DoubleType(), nullable=True),
    StructField("open_rv_24m", DoubleType(), nullable=True),
    StructField("all_util", DoubleType(), nullable=True),
    StructField("total_rev_hi_lim", DoubleType(), nullable=True),
    StructField("tot_coll_amt", DoubleType(), nullable=True),
    StructField("tot_cur_bal", DoubleType(), nullable=True),
    StructField("inq_fi", DoubleType(), nullable=True),
    StructField("total_cu_tl", DoubleType(), nullable=True),
    StructField("inq_last_12m", DoubleType(), nullable=True),
    StructField("annual_inc", DoubleType(), nullable=True),
    StructField("emp_length_num", DoubleType(), nullable=False),
    StructField("is_mortgage", IntegerType(), nullable=False),
    StructField("is_rent", IntegerType(), nullable=False),
    StructField("is_own", IntegerType(), nullable=False),
    StructField("is_source_verified", IntegerType(), nullable=False),
    StructField("is_verified", IntegerType(), nullable=False),
    StructField("is_not_verified", IntegerType(), nullable=False),
    StructField("loan_status_label", IntegerType(), nullable=True)
])

In [6]:
df = (
    spark.read.option("header", True)
    .option("quote", '"')
    .option("escape", '"')
    .schema(final_schema)
    .csv("hdfs://namenode:9000/bigdata/data/final_data")
)

In [7]:
label_col = "loan_status_label"

feature_cols = [c for c in df.columns if c != label_col]

train_df, test_df = df.randomSplit([0.85, 0.15], seed=42)
X_train_df, Y_train_df = train_df.select(*feature_cols), train_df.select(label_col)
X_test_df, Y_test_df = test_df.select(*feature_cols), test_df.select(label_col)

# I. Feature Scaling

In [8]:
# for c in feature_cols:
#     n_unique = X_train_df.select(c).distinct().count()
    
#     if n_unique <= 12:
#         print(f"Column: {c}, Unique values: {n_unique}")
#         unique_vals = [row[c] for row in df.select(c).distinct().collect()]
#         print(f"  Values: {unique_vals}\n")

### Như vậy, các cột có loại dữ liệu Categorical là:
`initial_list_status_w`, `initial_list_status_w`, `purpose_debt_consolidation`, `purpose_credit_card`, `purpose_home_improvement`, `purpose_major_purchase`, `purpose_small_business`, `purpose_car`, `purpose_medical`, `purpose_other`, `is_mortgage`, `is_rent`, `is_own`, `is_source_verified`, `is_verified`, `is_not_verified`, `emp_length_num`

### Với các cột còn lại, loại dữ liệu là Numerical, sẽ cần xem phân phối dữ liệu và kiểm tra outlier để lựa chọn phương pháp scale phù hợp

In [9]:
exclude_cols = [
    'initial_list_status_w', 'is_60_months', 'purpose_debt_consolidation', 'purpose_credit_card',
    'purpose_home_improvement', 'purpose_major_purchase', 'purpose_small_business',
    'purpose_car', 'purpose_medical', 'purpose_other', 
    'is_mortgage', 'is_rent', 'is_own', 
    'is_source_verified', 'is_verified', 'is_not_verified', 'emp_length_num'
]

numeric_cols = [c for c in feature_cols if c not in exclude_cols]

# for col in numeric_cols:
#     data = X_train_df.select(col).toPandas()[col]
    
#     fig, axes = plt.subplots(1, 2, figsize=(9, 3))

#     # Histogram
#     axes[0].hist(data, bins=30, color='skyblue', edgecolor='black')
#     axes[0].set_title(f'Histogram of {col}')
    
#     # Boxplot
#     axes[1].boxplot(data, vert=False)
#     axes[1].set_title(f'Boxplot of {col}')

#     plt.tight_layout()
#     plt.show()

### Có thể thấy hầu hết các cột đều có phân phối lệch, ngoại trừ `loan_amnt`. Vì vậy, để chuẩn hóa giá trị trong các cột, sẽ sử dụng: Log_Transform + Standard_Scaler cho các cột phân phối lệch; với cột phân phối không lệch thì sẽ sử dụng Standard_Scaler để chuẩn hóa

In [10]:
skewed_cols = [c for c in numeric_cols if c != 'loan_amnt']
unskewed_cols = ['loan_amnt']
has_neg_cols = ['mths_since_last_delinq', 'mths_since_last_record', 'mths_since_last_major_derog']
categorical_cols = [
    'initial_list_status_w', 'is_60_months', 'purpose_debt_consolidation', 'purpose_credit_card',
    'purpose_home_improvement', 'purpose_major_purchase', 'purpose_small_business',
    'purpose_car', 'purpose_medical', 'purpose_other', 
    'is_mortgage', 'is_rent', 'is_own', 
    'is_source_verified', 'is_verified', 'is_not_verified', 'emp_length_num'
]

#### Log-transform cho các cột lệch

In [11]:
for c in skewed_cols:
    X_train_df = X_train_df.withColumn(c, F.log1p(F.col(c)))
    X_test_df = X_test_df.withColumn(c, F.log1p(F.col(c)))

In [12]:
assembler = VectorAssembler(
    inputCols=skewed_cols + unskewed_cols,
    outputCol="features_unscaled"
)
train_vec = assembler.transform(X_train_df)
test_vec = assembler.transform(X_test_df)

scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features_scaled",
    withMean=True,
    withStd=True
)

scaler_model = scaler.fit(train_vec)
train_scaled = scaler_model.transform(train_vec)
test_scaled = scaler_model.transform(test_vec)

In [13]:
X_train_df = train_scaled.select("features_scaled", *categorical_cols)
X_test_df = test_scaled.select("features_scaled", *categorical_cols)

In [14]:
X_train_df.head()

Row(features_scaled=DenseVector([-3.8986, -1.5484, 1.1811, -4.9467, -4.4176, -0.1178, -0.1654, -0.4492, 1.4436, 1.1555, -0.4221, -0.3442, -0.4041, -5.9411, 1.4006, -0.1135, -0.5672, -0.1154, -0.1258, -0.1444, -0.1468, -0.1359, -0.1554, -0.3619, -0.3761, 0.1248, -0.0989, -0.0962, -0.1251, 0.7322, -1.506]), initial_list_status_w=0, is_60_months=0, purpose_debt_consolidation=1, purpose_credit_card=0, purpose_home_improvement=0, purpose_major_purchase=0, purpose_small_business=0, purpose_car=0, purpose_medical=0, purpose_other=0, is_mortgage=0, is_rent=1, is_own=0, is_source_verified=0, is_verified=1, is_not_verified=0, emp_length_num=10.0)

In [15]:
Y_train_df.head()

Row(loan_status_label=0)

In [16]:
def prepare_data_for_model(X_df, Y_df):
    df = X_df.withColumn("features_array", vector_to_array("features_scaled"))
    
    extra_cols = [
        "initial_list_status_w", "is_60_months",
        "purpose_debt_consolidation", "purpose_credit_card", "purpose_home_improvement",
        "purpose_major_purchase", "purpose_small_business", "purpose_car", "purpose_medical",
        "purpose_other", "is_mortgage", "is_rent", "is_own",
        "is_source_verified", "is_verified", "is_not_verified",
        "emp_length_num"
    ]
    
    arrays = [F.col("features_array")] + [F.array(F.col(c)) for c in extra_cols]
    df = df.withColumn("all_features", reduce(lambda x, y: F.concat(x, y), arrays))
    X_list, y_list = [], []

    x_iter = df.select("all_features").toLocalIterator()
    y_iter = Y_df.toLocalIterator()
    
    n_rows = df.count()
    
    for row_x, row_y in tqdm(
        zip(x_iter, y_iter),
        total=n_rows,
        desc="Preparing data"
    ):
        X_list.append(row_x["all_features"])
        y_list.append(row_y["loan_status_label"])

    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.float32)

    
    # X_tensor = torch.tensor(X, dtype=torch.float32)
    # y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)
    
    return {
        "X": X,
        "y": y,
        # "X_tensor": X_tensor,
        # "y_tensor": y_tensor
    }

In [23]:
train_data = prepare_data_for_model(X_train_df, Y_train_df)

Preparing data: 100%|██████████| 753510/753510 [00:37<00:00, 20179.58it/s]


In [24]:
test_data = prepare_data_for_model(X_test_df, Y_test_df)

Preparing data: 100%|██████████| 132799/132799 [00:15<00:00, 8641.33it/s] 


In [25]:
# X_train
print("X_train info:")
print(f"Shape: {train_data['X'].shape}")
print(f"Dtype: {train_data['X'].dtype}")
print("\n")

# y_train
print("y_train info:")
print(f"Shape: {train_data['y'].shape}")
print(f"Dtype: {train_data['y'].dtype}")
print("\n")

# X_test
print("X_test info:")
print(f"Shape: {test_data['X'].shape}")
print(f"Dtype: {test_data['X'].dtype}")
print("\n")

# y_test
print("y_test info:")
print(f"Shape: {test_data['y'].shape}")
print(f"Dtype: {test_data['y'].dtype}")

X_train info:
Shape: (753510, 48)
Dtype: float32


y_train info:
Shape: (753510,)
Dtype: float32


X_test info:
Shape: (132799, 48)
Dtype: float32


y_test info:
Shape: (132799,)
Dtype: float32


# II. Modeling

In [26]:
train_data

{'X': array([[-3.8985558 , -1.5484096 ,  1.181145  , ...,  1.        ,
          0.        , 10.        ],
        [-3.8985558 , -1.5484096 ,  0.01006276, ...,  0.        ,
          1.        ,  5.        ],
        [-3.8985558 , -1.5484096 ,  0.31381395, ...,  0.        ,
          1.        , 10.        ],
        ...,
        [ 2.5383134 , -1.5484096 , -0.12208486, ...,  1.        ,
          0.        , 10.        ],
        [ 2.543024  , -1.5484096 ,  0.8875873 , ...,  1.        ,
          0.        ,  2.        ],
        [ 2.545331  , -1.5484096 ,  1.7928197 , ...,  1.        ,
          0.        ,  8.        ]], dtype=float32),
 'y': array([0., 0., 0., ..., 1., 1., 1.], dtype=float32)}

In [27]:
import numpy as np
from sklearn.metrics import roc_auc_score, classification_report
import joblib

## Data

In [28]:
# Data
X_train = train_data["X"]
y_train = train_data["y"]

X_test =  test_data['X']
y_test = test_data['y']

## Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression

In [31]:
import joblib

for C in [0.01, 0.1, 1]:
    model = LogisticRegression(
        solver="saga",
        penalty="l2",
        C=C,
        max_iter=500,
        n_jobs=-1
    )

    # Train
    model.fit(X_train, y_train)

    # Predict
    pred = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, pred)

    print(f"C = {C}, AUC = {auc:.4f}")
    print(classification_report(y_test, y_pred, digits=3))

    filename = f"logistic_model_C{C}.pkl"
    joblib.dump(model, filename)




C = 0.01, AUC = 0.5136
              precision    recall  f1-score   support

         0.0      0.086     0.064     0.074     10325
         1.0      0.923     0.943     0.933    122474

    accuracy                          0.874    132799
   macro avg      0.505     0.503     0.503    132799
weighted avg      0.858     0.874     0.866    132799

C = 0.1, AUC = 0.5150
              precision    recall  f1-score   support

         0.0      0.086     0.066     0.075     10325
         1.0      0.923     0.941     0.932    122474

    accuracy                          0.873    132799
   macro avg      0.505     0.504     0.503    132799
weighted avg      0.858     0.873     0.865    132799

C = 1, AUC = 0.5150
              precision    recall  f1-score   support

         0.0      0.086     0.066     0.075     10325
         1.0      0.923     0.941     0.932    122474

    accuracy                          0.873    132799
   macro avg      0.505     0.504     0.503    132799
weighted 

## XGBoost

In [32]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-nccl-cu12
  Downloading nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Installing collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.28.9 xgboost-3.1.1
Note: you may need to restart the kernel to use updated packages.


In [33]:
import xgboost as xgb

In [34]:
for depth in [6, 8]:
    for lr in [0.05, 0.1]:
        model = xgb.XGBClassifier(
            n_estimators=300,
            max_depth=depth,
            learning_rate=lr,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="auc",
            n_jobs=-1,
            tree_method="hist"
        )
        
        model.fit(X_train, y_train)
        pred = model.predict_proba(X_test)[:, 1]
        y_pred = model.predict(X_test)
        print(f"depth={depth}, lr={lr}, AUC={roc_auc_score(y_test, pred):.4f}")
        print(classification_report(y_test, y_pred, digits=3))
        filename = f"xgboost_model_depth{depth}_lr{lr}.pkl"
        # Lưu mô hình
        joblib.dump(model,filename)

depth=6, lr=0.05, AUC=0.9560
              precision    recall  f1-score   support

         0.0      0.945     0.731     0.824     10213
         1.0      0.978     0.996     0.987    122586

    accuracy                          0.976    132799
   macro avg      0.962     0.864     0.906    132799
weighted avg      0.975     0.976     0.975    132799

depth=6, lr=0.1, AUC=0.9571
              precision    recall  f1-score   support

         0.0      0.949     0.728     0.824     10213
         1.0      0.978     0.997     0.987    122586

    accuracy                          0.976    132799
   macro avg      0.963     0.863     0.906    132799
weighted avg      0.976     0.976     0.975    132799

depth=8, lr=0.05, AUC=0.9577
              precision    recall  f1-score   support

         0.0      0.951     0.734     0.828     10213
         1.0      0.978     0.997     0.987    122586

    accuracy                          0.977    132799
   macro avg      0.964     0.865     0.90

In [26]:
# Khởi tạo mô hình XGBoost
xgb_model = xgb.XGBClassifier(
    n_estimators=300,       
    max_depth=6,            
    learning_rate=0.05,     
    subsample=0.8,          
    colsample_bytree=0.8,   
    eval_metric='auc',
    n_jobs=-1,              
    tree_method='hist',     
    random_state=42
)

# Huấn luyện
xgb_model.fit(X_train, y_train)

# Dự đoán
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# Đánh giá
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC: {auc:.4f}")
print(classification_report(y_test, y_pred, digits=3))

# Lưu mô hình
joblib.dump(model, "xgboost_model.pkl")

AUC (XGBoost): 0.9560
              precision    recall  f1-score   support

         0.0      0.949     0.729     0.824     10213
         1.0      0.978     0.997     0.987    122586

    accuracy                          0.976    132799
   macro avg      0.963     0.863     0.906    132799
weighted avg      0.976     0.976     0.975    132799



## LightGBM

In [35]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [36]:
from lightgbm import LGBMClassifier

In [37]:
for leaves in [64, 128]:
    lbgm_model = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.03,
        max_depth=-1,
        num_leaves=leaves,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1
    )

    lbgm_model.fit(X_train, y_train)

    # Dự đoán
    y_pred = lbgm_model.predict(X_test)
    y_pred_proba = lbgm_model.predict_proba(X_test)[:, 1]

    # Đánh giá
    auc = roc_auc_score(y_test, y_pred_proba)
    print(f"leaves={leaves}, AUC={roc_auc_score(y_test, pred):.4f}")
    print(classification_report(y_test, y_pred, digits=3))
    filename = f"lightbgm_model_leaves{leaves}.pkl"
    # Lưu mô hình
    joblib.dump(model, filename)

[LightGBM] [Info] Number of positive: 694347, number of negative: 59163
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038492 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4688
[LightGBM] [Info] Number of data points in the train set: 753510, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.921483 -> initscore=2.462675
[LightGBM] [Info] Start training from score 2.462675
leaves=64, AUC=0.9579
              precision    recall  f1-score   support

         0.0      0.968     0.732     0.834     10213
         1.0      0.978     0.998     0.988    122586

    accuracy                          0.978    132799
   macro avg      0.973     0.865     0.911    132799
weighted avg      0.977     0.978     0.976    132799

[LightGBM] [Info] Number of positive: 694347, number of negative: 59163
[LightGBM] [Info] Aut

## MLP

In [21]:
from sklearn.neural_network import MLPClassifier

In [32]:
# Sử dụng cùng tập train/test ở trên
for lr in [0.001, 0.005]:
    for alpha in [1e-5, 1e-4]:
        mlp_model = MLPClassifier(
            hidden_layer_sizes=(128, 64, 32),  
            activation='relu',
            solver='adam',
            alpha=alpha,        
            batch_size=256,
            learning_rate_init=lr,
            max_iter=50,       # epoch
            verbose=True,
            random_state=42
        )

        # Huấn luyện
        mlp_model.fit(X_train, y_train)

        # Dự đoán
        y_pred = mlp_model.predict(X_test)
        y_pred_proba = mlp_model.predict_proba(X_test)[:, 1]

        # Đánh giá
        auc = roc_auc_score(y_test, y_pred_proba)
        print(f"lr={lr},alpha={alpha}, AUC={roc_auc_score(y_test, pred):.4f}")
        print(classification_report(y_test, y_pred, digits=3))
        filename = f"mpl_model_lr{lr}_alpha{alpha}.pkl"
        # Lưu mô hình
        joblib.dump(mlp_model, filename)

Iteration 1, loss = 0.10032717
Iteration 2, loss = 0.08948881
Iteration 3, loss = 0.08825390
Iteration 4, loss = 0.08737925
Iteration 5, loss = 0.08663596
Iteration 6, loss = 0.08620690
Iteration 7, loss = 0.08572415
Iteration 8, loss = 0.08528415
Iteration 9, loss = 0.08487504
Iteration 10, loss = 0.08459917
Iteration 11, loss = 0.08429612
Iteration 12, loss = 0.08396978
Iteration 13, loss = 0.08382028
Iteration 14, loss = 0.08345939
Iteration 15, loss = 0.08315792
Iteration 16, loss = 0.08295861
Iteration 17, loss = 0.08268822
Iteration 18, loss = 0.08248459
Iteration 19, loss = 0.08222440
Iteration 20, loss = 0.08180436
Iteration 21, loss = 0.08165028
Iteration 22, loss = 0.08145213
Iteration 23, loss = 0.08123373
Iteration 24, loss = 0.08081916
Iteration 25, loss = 0.08068796
Iteration 26, loss = 0.08041923
Iteration 27, loss = 0.08009395
Iteration 28, loss = 0.07999890
Iteration 29, loss = 0.07967473
Iteration 30, loss = 0.07944952
Iteration 31, loss = 0.07929126
Iteration 32, los



lr=0.001,alpha=1e-05, AUC=0.5150
              precision    recall  f1-score   support

         0.0      0.085     0.065     0.073     10325
         1.0      0.923     0.941     0.932    122474

    accuracy                          0.873    132799
   macro avg      0.504     0.503     0.503    132799
weighted avg      0.858     0.873     0.865    132799

Iteration 1, loss = 0.10025631
Iteration 2, loss = 0.08953454
Iteration 3, loss = 0.08833178
Iteration 4, loss = 0.08753424
Iteration 5, loss = 0.08684963
Iteration 6, loss = 0.08631850
Iteration 7, loss = 0.08584148
Iteration 8, loss = 0.08537236
Iteration 9, loss = 0.08507237
Iteration 10, loss = 0.08468931
Iteration 11, loss = 0.08443596
Iteration 12, loss = 0.08414324
Iteration 13, loss = 0.08394644
Iteration 14, loss = 0.08371036
Iteration 15, loss = 0.08338619
Iteration 16, loss = 0.08313147
Iteration 17, loss = 0.08288985
Iteration 18, loss = 0.08274390
Iteration 19, loss = 0.08238685
Iteration 20, loss = 0.08215351
Iteration



lr=0.001,alpha=0.0001, AUC=0.5150
              precision    recall  f1-score   support

         0.0      0.083     0.065     0.073     10325
         1.0      0.923     0.939     0.931    122474

    accuracy                          0.871    132799
   macro avg      0.503     0.502     0.502    132799
weighted avg      0.857     0.871     0.864    132799

Iteration 1, loss = 0.09656619
Iteration 2, loss = 0.09013385
Iteration 3, loss = 0.08890692
Iteration 4, loss = 0.08798579
Iteration 5, loss = 0.08737904
Iteration 6, loss = 0.08702557
Iteration 7, loss = 0.08655904
Iteration 8, loss = 0.08629048
Iteration 9, loss = 0.08596890
Iteration 10, loss = 0.08568515
Iteration 11, loss = 0.08550607
Iteration 12, loss = 0.08535498
Iteration 13, loss = 0.08524913
Iteration 14, loss = 0.08493395
Iteration 15, loss = 0.08477886
Iteration 16, loss = 0.08454520
Iteration 17, loss = 0.08443680
Iteration 18, loss = 0.08415458
Iteration 19, loss = 0.08410262
Iteration 20, loss = 0.08399535
Iteratio



lr=0.005,alpha=1e-05, AUC=0.5150
              precision    recall  f1-score   support

         0.0      0.088     0.059     0.071     10325
         1.0      0.923     0.949     0.936    122474

    accuracy                          0.880    132799
   macro avg      0.506     0.504     0.503    132799
weighted avg      0.858     0.880     0.868    132799

Iteration 1, loss = 0.09657566
Iteration 2, loss = 0.09024772
Iteration 3, loss = 0.08894810
Iteration 4, loss = 0.08831118
Iteration 5, loss = 0.08797514
Iteration 6, loss = 0.08741336
Iteration 7, loss = 0.08697941
Iteration 8, loss = 0.08672670
Iteration 9, loss = 0.08652233
Iteration 10, loss = 0.08639159
Iteration 11, loss = 0.08611452
Iteration 12, loss = 0.08594648
Iteration 13, loss = 0.08582546
Iteration 14, loss = 0.08578735
Iteration 15, loss = 0.08555548
Iteration 16, loss = 0.08547935
Iteration 17, loss = 0.08523302
Iteration 18, loss = 0.08520928
Iteration 19, loss = 0.08519159
Iteration 20, loss = 0.08504574
Iteration



lr=0.005,alpha=0.0001, AUC=0.5150
              precision    recall  f1-score   support

         0.0      0.089     0.058     0.071     10325
         1.0      0.923     0.950     0.936    122474

    accuracy                          0.880    132799
   macro avg      0.506     0.504     0.503    132799
weighted avg      0.858     0.880     0.869    132799



## Nhận xét:
- Phân tích từng model:
1. Logistic Regression
+ Ưu: Nhanh, dễ hiểu, ổn định
+ Nhược: Bắt quan hệ phi tuyến kém (f1 cho lớp 0 chỉ 0.760), Recall thấp cho class 0 → chỉ 0.674 → bỏ sót nhiều ca vỡ nợ
+ KL: Logistic làm baseline tốt nhưng độ chính xác không bằng các model khác

2. XGBoost
+ Ưu: AUC tăng từ 0.93 → 0.956, Precision class 0 rất cao (0.949), Recall class 0 khá hơn logistic (0.729).
+ Hạn chế: recall chưa cao lắm
+ Nhận xét: Mạnh, rất phù hợp

3. LightBGM
+ Ưu: AUC cao nhất, Precision class 0 = 0.968 (hơn XGB), Recall class 0 = 0.732 (hơn XGB), F1 class 0 = 0.834 (cao nhất), Accuracy toàn tập = 0.978 (cao nhất)
+ KL: Đây là mô hình cho performance tốt nhất

4. MLP
+ Ưu: recision tốt nhưng recall cho class 0 vẫn thấp (0.729), bắt quan hệ phi tuyến tốt
+ Nhược: Dễ overfit, tốn GPU
+ KL: Phù hợp nếu có GPU khỏe, xử lý kĩ thuật tốt để tránh overfit

In [26]:
spark.stop()