In [1]:
# !pip install torch==1.13.1

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
from tqdm import tqdm

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, count, round, when, countDistinct as _round
from pyspark.sql.functions import sum as _sum
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array


In [2]:
# Get (or create) the Spark session for this notebook, attached to the
# cluster master at spark://spark-master:7077 under the app name below.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("FeatureExtraction")
    .getOrCreate()
)

In [3]:
# Column layout of the prepared dataset, one (name, type, nullable) entry per
# column, in file order. The last entry is the target label.
_FINAL_SCHEMA_SPEC = [
    ("initial_list_status_w", IntegerType, False),
    ("last_pymnt_amnt", DoubleType, True),
    ("out_prncp", DoubleType, True),
    ("loan_amnt", DoubleType, True),
    ("int_rate", DoubleType, True),
    ("is_60_months", IntegerType, False),
    ("purpose_debt_consolidation", IntegerType, False),
    ("purpose_credit_card", IntegerType, False),
    ("purpose_home_improvement", IntegerType, False),
    ("purpose_major_purchase", IntegerType, False),
    ("purpose_small_business", IntegerType, False),
    ("purpose_car", IntegerType, False),
    ("purpose_medical", IntegerType, False),
    ("purpose_other", IntegerType, False),
    ("total_pymnt", DoubleType, True),
    ("total_rec_int", DoubleType, True),
    ("total_rec_late_fee", DoubleType, True),
    ("recoveries", DoubleType, True),
    ("delinq_2yrs", DoubleType, True),
    ("inq_last_6mths", DoubleType, True),
    ("mths_since_last_delinq", DoubleType, True),
    ("mths_since_last_record", DoubleType, True),
    ("open_acc", DoubleType, True),
    ("pub_rec", DoubleType, True),
    ("revol_util", DoubleType, True),
    ("total_acc", DoubleType, True),
    ("collections_12_mths_ex_med", DoubleType, True),
    ("mths_since_last_major_derog", DoubleType, True),
    ("open_acc_6m", DoubleType, True),
    ("open_il_24m", DoubleType, True),
    ("mths_since_rcnt_il", DoubleType, True),
    ("total_bal_il", DoubleType, True),
    ("open_rv_24m", DoubleType, True),
    ("all_util", DoubleType, True),
    ("total_rev_hi_lim", DoubleType, True),
    ("tot_coll_amt", DoubleType, True),
    ("tot_cur_bal", DoubleType, True),
    ("inq_fi", DoubleType, True),
    ("total_cu_tl", DoubleType, True),
    ("inq_last_12m", DoubleType, True),
    ("annual_inc", DoubleType, True),
    ("emp_length_num", DoubleType, False),
    ("is_mortgage", IntegerType, False),
    ("is_rent", IntegerType, False),
    ("is_own", IntegerType, False),
    ("is_source_verified", IntegerType, False),
    ("is_verified", IntegerType, False),
    ("is_not_verified", IntegerType, False),
    ("loan_status_label", IntegerType, True),
]

final_schema = StructType([
    StructField(field_name, field_type(), nullable=is_nullable)
    for field_name, field_type, is_nullable in _FINAL_SCHEMA_SPEC
])

In [4]:
# Read the prepared CSV dataset from HDFS using the explicit schema above.
# The files carry a header row; the double quote acts as both the quote and
# the escape character.
_csv_reader = spark.read.options(header=True, quote='"', escape='"').schema(final_schema)
df = _csv_reader.csv("hdfs://namenode:9000/bigdata/data/final_data")

In [5]:
# Target column; every other column of `df` is treated as a feature.
label_col = "loan_status_label"
feature_cols = list(filter(lambda name: name != label_col, df.columns))

# 85/15 train/test split with a fixed seed.
train_df, test_df = df.randomSplit(weights=[0.85, 0.15], seed=42)

# Separate feature and label frames for each split.
X_train_df = train_df.select(*feature_cols)
Y_train_df = train_df.select(label_col)
X_test_df = test_df.select(*feature_cols)
Y_test_df = test_df.select(label_col)

# I. Feature Scaling

In [6]:
# for c in feature_cols:
#     n_unique = X_train_df.select(c).distinct().count()
    
#     if n_unique <= 12:
#         print(f"Column: {c}, Unique values: {n_unique}")
#         unique_vals = [row[c] for row in df.select(c).distinct().collect()]
#         print(f"  Values: {unique_vals}\n")

### Như vậy, các cột có loại dữ liệu Categorical là:
`initial_list_status_w`, `is_60_months`, `purpose_debt_consolidation`, `purpose_credit_card`, `purpose_home_improvement`, `purpose_major_purchase`, `purpose_small_business`, `purpose_car`, `purpose_medical`, `purpose_other`, `is_mortgage`, `is_rent`, `is_own`, `is_source_verified`, `is_verified`, `is_not_verified`, `emp_length_num`

### Với các cột còn lại, loại dữ liệu là Numerical, sẽ cần xem phân phối dữ liệu và kiểm tra outlier để lựa chọn phương pháp scale phù hợp

In [7]:
# Columns treated as categorical / indicator features; they are left out of
# the numeric scaling pipeline.
exclude_cols = [
    'initial_list_status_w',
    'is_60_months',
    'purpose_debt_consolidation',
    'purpose_credit_card',
    'purpose_home_improvement',
    'purpose_major_purchase',
    'purpose_small_business',
    'purpose_car',
    'purpose_medical',
    'purpose_other',
    'is_mortgage',
    'is_rent',
    'is_own',
    'is_source_verified',
    'is_verified',
    'is_not_verified',
    'emp_length_num',
]

# Everything else in feature_cols is numeric (original column order kept).
_excluded_names = set(exclude_cols)
numeric_cols = [name for name in feature_cols if name not in _excluded_names]

# for col in numeric_cols:
#     data = X_train_df.select(col).toPandas()[col]
    
#     fig, axes = plt.subplots(1, 2, figsize=(9, 3))

#     # Histogram
#     axes[0].hist(data, bins=30, color='skyblue', edgecolor='black')
#     axes[0].set_title(f'Histogram of {col}')
    
#     # Boxplot
#     axes[1].boxplot(data, vert=False)
#     axes[1].set_title(f'Boxplot of {col}')

#     plt.tight_layout()
#     plt.show()

### Có thể thấy hầu hết các cột đều có phân phối lệch, ngoại trừ `loan_amnt`. Vì vậy, để chuẩn hóa giá trị trong các cột, sẽ sử dụng: Log_Transform + Standard_Scaler cho các cột phân phối lệch; với cột phân phối không lệch thì sẽ sử dụng Standard_Scaler để chuẩn hóa

In [8]:
# loan_amnt is the only numeric column considered unskewed; every other
# numeric column gets a log transform before standardisation.
unskewed_cols = ['loan_amnt']
skewed_cols = [name for name in numeric_cols if name not in unskewed_cols]

# NOTE(review): this list is not referenced anywhere else in the notebook.
# These columns are also part of skewed_cols, so they are log-transformed too;
# presumably they hold negative values -- confirm log1p is valid for them.
has_neg_cols = ['mths_since_last_delinq', 'mths_since_last_record', 'mths_since_last_major_derog']

# The categorical columns are exactly the ones excluded from numeric_cols.
categorical_cols = list(exclude_cols)

#### Log-transform cho các cột lệch

In [9]:
# Log-transform the skewed columns with a single projection per split.
#
# The projection is built from train_df / test_df rather than by rebinding
# X_train_df / X_test_df from themselves, so re-running this cell does not
# apply log1p twice. One select() also avoids calling withColumn in a loop,
# which adds a projection to the query plan on every iteration.
#
# NOTE(review): log1p is undefined for inputs <= -1; if any skewed column
# (see has_neg_cols) can hold such values, confirm how they should be handled.
_skewed_names = set(skewed_cols)
_log1p_projection = [
    F.log1p(F.col(name)).alias(name) if name in _skewed_names else F.col(name)
    for name in feature_cols
]

X_train_df = train_df.select(*_log1p_projection)
X_test_df = test_df.select(*_log1p_projection)

In [10]:
# Pack the numeric columns (skewed first, then unskewed) into one vector column.
assembler = VectorAssembler(
    outputCol="features_unscaled",
    inputCols=[*skewed_cols, *unskewed_cols],
)
train_vec, test_vec = (assembler.transform(frame) for frame in (X_train_df, X_test_df))

# Standardise the assembled vector. The scaler is fitted on the training
# split only and then applied to both splits.
scaler = StandardScaler(
    withStd=True,
    withMean=True,
    outputCol="features_scaled",
    inputCol="features_unscaled",
)
scaler_model = scaler.fit(train_vec)
train_scaled, test_scaled = (scaler_model.transform(vec) for vec in (train_vec, test_vec))

In [11]:
# Keep only the scaled numeric vector plus the untouched categorical columns.
_model_input_cols = ["features_scaled", *categorical_cols]
X_train_df = train_scaled.select(*_model_input_cols)
X_test_df = test_scaled.select(*_model_input_cols)

In [12]:
# Peek at the first prepared training row.
X_train_df.first()

Row(features_scaled=DenseVector([-3.8986, -1.5484, 1.1811, -4.9467, -4.4176, -0.1178, -0.1654, -0.4492, 1.4436, 1.1555, -0.4221, -0.3442, -0.4041, -5.9411, 1.4006, -0.1135, -0.5672, -0.1154, -0.1258, -0.1444, -0.1468, -0.1359, -0.1554, -0.3619, -0.3761, 0.1248, -0.0989, -0.0962, -0.1251, 0.7322, -1.506]), initial_list_status_w=0, is_60_months=0, purpose_debt_consolidation=1, purpose_credit_card=0, purpose_home_improvement=0, purpose_major_purchase=0, purpose_small_business=0, purpose_car=0, purpose_medical=0, purpose_other=0, is_mortgage=0, is_rent=1, is_own=0, is_source_verified=0, is_verified=1, is_not_verified=0, emp_length_num=10.0)

In [13]:
# Peek at the first training label row.
Y_train_df.first()

Row(loan_status_label=0)

In [None]:
def prepare_data_for_model(X_df, Y_df, extra_cols=None, label_col="loan_status_label"):
    """Collect a Spark feature frame and label frame into NumPy arrays.

    The scaled feature vector in ``features_scaled`` is expanded to an array
    and the columns in ``extra_cols`` are appended to it, in the given order,
    giving one flat feature row per record.

    Rows of ``X_df`` and ``Y_df`` are paired purely by iteration order.
    NOTE(review): this assumes both frames come from the same parent frame
    through row-preserving transformations, so their rows line up -- confirm
    against the caller. Keeping the label in the same frame as the features
    would remove this assumption.

    Args:
        X_df: Spark DataFrame with a ``features_scaled`` vector column and
            every column named in ``extra_cols``.
        Y_df: Spark DataFrame containing the ``label_col`` column.
        extra_cols: Names of the scalar columns appended after the scaled
            vector. Defaults to the notebook's categorical/indicator columns.
        label_col: Name of the label column in ``Y_df``.

    Returns:
        dict with ``"X"`` (float32 array, one row per record) and ``"y"``
        (float32 array, one label per record).

    Raises:
        ValueError: if ``X_df`` and ``Y_df`` do not yield the same number of
            rows (the previous implementation silently truncated to the
            shorter of the two).
    """
    if extra_cols is None:
        extra_cols = [
            "initial_list_status_w", "is_60_months",
            "purpose_debt_consolidation", "purpose_credit_card", "purpose_home_improvement",
            "purpose_major_purchase", "purpose_small_business", "purpose_car", "purpose_medical",
            "purpose_other", "is_mortgage", "is_rent", "is_own",
            "is_source_verified", "is_verified", "is_not_verified",
            "emp_length_num"
        ]

    # Cast every appended column to double so all arrays share one element
    # type, then concatenate them in a single concat() call.
    feature_arrays = [vector_to_array("features_scaled")] + [
        F.array(F.col(name).cast("double")) for name in extra_cols
    ]
    features_df = X_df.select(F.concat(*feature_arrays).alias("all_features"))

    # Row count is used for the progress bar and for the length check below.
    n_rows = features_df.count()

    x_iter = iter(features_df.toLocalIterator())
    y_iter = iter(Y_df.toLocalIterator())

    X_list, y_list = [], []
    for row_x, row_y in tqdm(
        zip(x_iter, y_iter),
        total=n_rows,
        desc="Preparing data"
    ):
        X_list.append(row_x["all_features"])
        y_list.append(row_y[label_col])

    # zip() stops at the shorter iterator: fewer collected rows than n_rows
    # means Y_df ran out first; a leftover label row means X_df ran out first.
    if len(X_list) != n_rows or next(y_iter, None) is not None:
        raise ValueError(
            "X_df and Y_df yielded a different number of rows; "
            "features and labels cannot be paired reliably."
        )

    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.float32)

    return {
        "X": X,
        "y": y,
    }

In [15]:
# Materialise the training split as NumPy arrays.
train_data = prepare_data_for_model(X_df=X_train_df, Y_df=Y_train_df)

Preparing data: 100%|██████████| 753510/753510 [00:38<00:00, 19365.93it/s]


In [16]:
# Materialise the test split as NumPy arrays.
test_data = prepare_data_for_model(X_df=X_test_df, Y_df=Y_test_df)

Preparing data: 100%|██████████| 132799/132799 [00:15<00:00, 8405.95it/s]


In [17]:
# Report shape and dtype of each prepared array, with a blank-line separator
# between consecutive reports.
_array_reports = [
    ("X_train", train_data['X']),
    ("y_train", train_data['y']),
    ("X_test", test_data['X']),
    ("y_test", test_data['y']),
]

for _position, (_title, _values) in enumerate(_array_reports):
    if _position > 0:
        print("\n")
    print(f"{_title} info:")
    print(f"Shape: {_values.shape}")
    print(f"Dtype: {_values.dtype}")

X_train info:
Shape: (753510, 48)
Dtype: float32


y_train info:
Shape: (753510,)
Dtype: float32


X_test info:
Shape: (132799, 48)
Dtype: float32


y_test info:
Shape: (132799,)
Dtype: float32


# II. Modeling

In [19]:
# Show the prepared training dictionary (keys "X" and "y") as the cell output.
train_data

{'X': array([[-3.8985558 , -1.5484096 ,  1.181145  , ...,  1.        ,
          0.        , 10.        ],
        [-3.8985558 , -1.5484096 ,  0.01006276, ...,  0.        ,
          1.        ,  5.        ],
        [-3.8985558 , -1.5484096 ,  0.31381395, ...,  0.        ,
          1.        , 10.        ],
        ...,
        [ 2.5383134 , -1.5484096 , -0.12208486, ...,  1.        ,
          0.        , 10.        ],
        [ 2.543024  , -1.5484096 ,  0.8875873 , ...,  1.        ,
          0.        ,  2.        ],
        [ 2.545331  , -1.5484096 ,  1.7928197 , ...,  1.        ,
          0.        ,  8.        ]], dtype=float32),
 'y': array([0., 0., 0., ..., 1., 1., 1.], dtype=float32)}

In [21]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
import joblib

In [22]:
# Data
X_train = train_data["X"]
y_train = train_data["y"]

X_test =  test_data['X']
y_test = test_data['y']

# Huấn luyện logistic regression 
model = LogisticRegression(
    solver="saga",          
    max_iter=500,
    penalty="l2",
    C=1.0,                 
    n_jobs=-1,              
    verbose=1              
)
model.fit(X_train, y_train)

# Dự đoán
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba >= 0.5).astype(int)

# Đánh giá
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC: {auc:.4f}")
print(classification_report(y_test, y_pred, digits=3))

# Lưu mô hình
joblib.dump(model, "logistic_model.pkl")


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 38 epochs took 21 seconds
AUC: 0.9327


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   22.0s finished


              precision    recall  f1-score   support

         0.0      0.871     0.674     0.760     10213
         1.0      0.973     0.992     0.982    122586

    accuracy                          0.967    132799
   macro avg      0.922     0.833     0.871    132799
weighted avg      0.966     0.967     0.965    132799



['logistic_model.pkl']

In [26]:
# Stop the Spark session created at the top of the notebook.
spark.stop()