In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


1. Load và xem dữ liệu

In [None]:
# Import các thư viện
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Project paths used throughout the notebook.
BASE = Path(".")
# The dataset sits at an absolute Colab path, so it is written as a standalone
# path: joining an absolute component onto BASE / "data" would make pathlib
# silently discard those leading segments and resolve to this same location.
DATA = Path("/content/bank-additional-full.csv")
REPORT_DIR = BASE / "reports"
CHART_DIR = REPORT_DIR / "charts"
# Create the output folders up front; exist_ok makes re-running the cell harmless.
REPORT_DIR.mkdir(parents=True, exist_ok=True)
CHART_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the semicolon-delimited CSV, report its dimensions and preview the first rows
df = pd.read_csv(DATA, delimiter=";")
print("Shape:", df.shape)
df.head()

Shape: (41188, 21)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


2. Làm sạch dữ liệu

In [None]:
# Treat the literal string "unknown" as a missing value in every column
df = df.replace(to_replace="unknown", value=np.nan)

In [None]:
if "pdays" in df.columns:
    # pdays uses 999 as a placeholder; turn it into NaN in a separate column
    pdays_clean = df["pdays"].replace(999, np.nan)
    df["pdays_clean"] = pdays_clean
    # Flag is 1 where pdays_clean holds a real value and 0 where it is NaN
    df["has_prev_contact"] = pdays_clean.notna().astype(int)


In [None]:
# Fraction of missing values per column; show the ten columns with the most gaps
missing_share = df.isna().mean()
missing_share.sort_values(ascending=False).head(10)

Unnamed: 0,0
pdays_clean,0.963217
default,0.208726
education,0.042027
loan,0.024036
housing,0.024036
job,0.008012
marital,0.001942
age,0.0
contact,0.0
day_of_week,0.0


In [None]:
# Columns used in the analysis
keep_cols = [
    "age", "job", "marital", "education",
    "default", "housing", "loan",
    "campaign", "pdays_clean", "previous", "poutcome", "has_prev_contact",
    "contact", "month", "day_of_week",
    "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
    "y",
]
# Keep only the requested columns that exist in df, in the order listed above
available = set(df.columns)
use_cols = [name for name in keep_cols if name in available]
data = df.loc[:, use_cols].copy()
data.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,campaign,pdays_clean,previous,...,has_prev_contact,contact,month,day_of_week,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,1,,0,...,0,telephone,may,mon,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,,no,no,1,,0,...,0,telephone,may,mon,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,1,,0,...,0,telephone,may,mon,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,1,,0,...,0,telephone,may,mon,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,1,,0,...,0,telephone,may,mon,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
# Impute missing values and build the numeric target.
# Both column lists are restricted to columns actually present in `data`, in
# line with the presence checks used by the surrounding cells.
# NOTE(review): medians/modes are computed on the full dataset, i.e. before the
# train/test split — confirm this is acceptable for the intended evaluation.

# Numeric columns
num_cols = [c for c in [
    "age", "campaign", "pdays_clean", "previous",
    "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
] if c in data.columns]

# Categorical columns
cat_cols = [c for c in [
    "job", "marital", "education", "default", "housing", "loan",
    "poutcome", "contact", "month", "day_of_week",
] if c in data.columns]

# Numeric gaps are filled with the column median
for c in num_cols:
    data[c] = data[c].fillna(data[c].median())

# Categorical gaps are filled with the most frequent value; a column with no
# observed values at all has no mode and is left untouched
for c in cat_cols:
    mode_val = data[c].mode(dropna=True)
    if not mode_val.empty:
        data[c] = data[c].fillna(mode_val.iloc[0])

# Encode the target as 0/1 for easier computation
data["target"] = data["y"].map({"no": 0, "yes": 1})
# .map() yields NaN for any label other than "no"/"yes"; fail here with a clear
# message instead of letting NaN targets propagate into later cells
assert data["target"].notna().all(), "column 'y' contains labels other than 'no'/'yes'"

In [None]:
def hist(series, bins, title, xlabel, filename):
    """Save a histogram of `series` as a PNG inside CHART_DIR.

    Missing values are dropped before plotting. The y axis is labelled
    "Count". The figure is closed after saving, so nothing is rendered inline.

    Args:
        series: values to plot (must support ``.dropna()``).
        bins: bin specification forwarded to matplotlib's ``hist``.
        title: figure title.
        xlabel: label for the x axis.
        filename: file name of the image, relative to CHART_DIR.
    """
    fig, ax = plt.subplots()
    ax.hist(series.dropna(), bins=bins)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(CHART_DIR / filename, dpi=150)
    plt.close(fig)

def barh_from_series(s, title, xlabel, ylabel, filename):
    """Save a horizontal bar chart of the Series `s` as a PNG inside CHART_DIR.

    The figure is closed after saving, so nothing is rendered inline.

    Args:
        s: pandas Series; its index supplies the bar labels, its values the bar lengths.
        title: figure title.
        xlabel: label for the x axis.
        ylabel: label for the y axis.
        filename: file name of the image, relative to CHART_DIR.
    """
    fig, ax = plt.subplots()
    s.plot(kind="barh", ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    fig.savefig(CHART_DIR / filename, dpi=150)
    plt.close(fig)

def subscription_rate_by(col):
    """Compute the mean of `target` per group of `col` and chart it.

    Reads the module-level DataFrame `data`. The per-group means are sorted
    from highest to lowest, saved as a bar chart named ``sub_rate_by_<col>.png``
    and returned.

    Args:
        col: name of the column in `data` to group by.

    Returns:
        pandas Series indexed by the group values, sorted in descending order.
    """
    group_means = data.groupby(col)["target"].mean()
    rate = group_means.sort_values(ascending=False)
    chart_title = f"Subscription Rate by {col}"
    chart_file = f"sub_rate_by_{col}.png"
    barh_from_series(rate, chart_title, "Rate", col, chart_file)
    return rate
# Distributions of age and of the campaign contact count
hist(data["age"], bins=20, title="Age Distribution", xlabel="Age", filename="age_dist.png")
hist(data["campaign"], bins=20, title="Campaign Count Distribution", xlabel="campaign", filename="campaign_dist.png")

# Overall share of each target class
target_share = data["y"].value_counts(normalize=True).sort_index()
barh_from_series(target_share, "Target Share (no/yes)", "Share", "y", "target_share.png")

# Subscription rate by selected categorical variables.
# Each result is None when its column is absent, so later cells can test for it.
rate_job = subscription_rate_by("job") if "job" in data.columns else None
rate_edu = subscription_rate_by("education") if "education" in data.columns else None
rate_pout = subscription_rate_by("poutcome") if "poutcome" in data.columns else None
rate_contact = subscription_rate_by("contact") if "contact" in data.columns else None

# Subscription rate by has_prev_contact (0/1). Uses the same "... else None"
# pattern as above so that rate_prev is always bound, even without the column.
rate_prev = subscription_rate_by("has_prev_contact") if "has_prev_contact" in data.columns else None

In [None]:
# Age groups. Bins are right-closed and the lowest edge is included, so the
# intervals are [17, 30], (30, 45], (45, 60] and (60, 120].
age_bins = [17, 30, 45, 60, 120]
age_labels = ["17-30", "31-45", "46-60", "60+"]
data["age_group"] = pd.cut(
    data["age"],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

# Cap campaign at 20 to damp the influence of a few very large values:
# anything above 20 becomes 20 in the new column, the original stays untouched.
if "campaign" in data.columns:
    campaign_counts = data["campaign"]
    data["campaign_capped"] = campaign_counts.clip(upper=20)

In [None]:
# Features for the model ('duration' is left out; macro and behavioural variables are kept)
feature_cols = [
    "age", "age_group",
    "job", "marital", "education", "default", "housing", "loan",
    "previous", "has_prev_contact", "poutcome",
    "campaign_capped",
    "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
    # (optional) "contact", "month", "day_of_week"
]
# Drop any requested feature that is not present in `data`
feature_cols = [name for name in feature_cols if name in data.columns]
# Modelling frame: the selected features plus the 0/1 target, as an independent copy
model_df = data.loc[:, [*feature_cols, "target"]].copy()

In [None]:
# Split the modelling frame into predictors and target
y = model_df["target"]
predictors = model_df.drop(columns=["target"])

# One-hot encode every object/category column; drop_first removes one level
# per variable so the dummies are not perfectly collinear
cat_for_dummy = model_df.select_dtypes(include=["object", "category"]).columns.tolist()
X = pd.get_dummies(predictors, columns=cat_for_dummy, drop_first=True)

In [None]:
# Stratified 70/30 train/test split so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Standardize the features. The scaler is fitted on the training split only and
# then applied to both splits, so no statistic from the test rows enters
# preprocessing. Fitting on raw, differently-scaled columns is what the solver's
# "TOTAL NO. OF ITERATIONS REACHED LIMIT" warning asks to be addressed.
# Column names and row index are preserved, so later cells that rely on
# X.columns or on alignment with y_train / y_test keep working.
scaler = StandardScaler().fit(X_train.astype(float))
X_train = pd.DataFrame(
    scaler.transform(X_train.astype(float)),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test.astype(float)),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Logistic regression with class weights balanced against the class imbalance
clf = LogisticRegression(class_weight="balanced", max_iter=1000)
# Train on the training split (kept as the last expression so the fitted
# estimator is displayed as the cell output)
clf.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Evaluation inputs
# Hard class predictions (0/1) for every test row
y_pred = clf.predict(X_test)
# Predicted probability of the second class column (index 1) for every test row
class_proba = clf.predict_proba(X_test)
y_proba = class_proba[:, 1]


In [None]:
# Threshold-based metrics use the hard predictions; ROC AUC uses the probabilities
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

In [None]:
# Text summary of the evaluation on the test split
report = classification_report(y_test, y_pred, digits=3)
print(f"Accuracy: {acc:.4f}")
print(f"ROC AUC : {auc:.4f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

Accuracy: 0.7876
ROC AUC : 0.7767
Confusion Matrix:
 [[8765 2200]
 [ 425  967]]

Classification Report:
               precision    recall  f1-score   support

           0      0.954     0.799     0.870     10965
           1      0.305     0.695     0.424      1392

    accuracy                          0.788     12357
   macro avg      0.630     0.747     0.647     12357
weighted avg      0.881     0.788     0.820     12357



In [None]:
# ROC curve: roc_curve returns one (fpr, tpr) pair per decision threshold in thr
fpr, tpr, thr = roc_curve(y_test, y_proba)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr)  # the ROC curve itself
roc_ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal reference line
roc_ax.set_title(f"ROC Curve (AUC={auc:.3f})")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_fig.tight_layout()
roc_fig.savefig(CHART_DIR / "roc_curve.png", dpi=150)
plt.close(roc_fig)

# Simple confusion-matrix heatmap with the cell counts written on top
cm_fig, cm_ax = plt.subplots()
cm_ax.imshow(cm)
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
for (i, j), v in np.ndenumerate(cm):
    # imshow puts rows on the y axis, hence text(x=j, y=i)
    cm_ax.text(j, i, str(v), ha="center", va="center")
cm_fig.tight_layout()
cm_fig.savefig(CHART_DIR / "confusion_matrix.png", dpi=150)
plt.close(cm_fig)

In [None]:
# Model coefficients labelled by feature name, sorted from most positive to most negative.
# NOTE(review): magnitudes are only comparable across features when the inputs
# given to clf were on a common scale — confirm before ranking features by them.
coef = pd.Series(clf.coef_[0], index=X.columns).sort_values(ascending=False)

# Quick look at the 10 largest and 10 smallest coefficients. display() is used
# because a bare expression is only rendered when it is the last statement of
# the cell; in the middle of a cell it is evaluated and thrown away.
display(coef.head(10), coef.tail(10))

# Twelve coefficients from each end, re-sorted ascending for the bar charts
top_pos = coef.head(12).sort_values()
top_neg = coef.tail(12).sort_values()

# One horizontal bar chart per end, saved under CHART_DIR
for series, title, filename in (
    (top_pos, "Top Positive Coefficients (↑ likelihood of YES)", "top_positive_coeffs.png"),
    (top_neg, "Top Negative Coefficients (↓ likelihood of YES)", "top_negative_coeffs.png"),
):
    fig, ax = plt.subplots()
    series.plot(kind="barh", ax=ax)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(CHART_DIR / filename, dpi=150)
    plt.close(fig)

In [None]:
def safe_head_val(s):
    """Return the first index label and first value of a Series as ``(label, float)``.

    Args:
        s: a pandas Series (or None).

    Returns:
        ``(s.index[0], float(s.iloc[0]))`` on success, or ``(None, None)`` when
        `s` is None or otherwise not Series-like, is empty, or its first value
        cannot be converted to float.
    """
    try:
        return s.index[0], float(s.iloc[0])
    # Only the failures that mean "no usable first element" are absorbed:
    #   AttributeError - s is None / has no .index or .iloc
    #   IndexError     - s is empty
    #   TypeError / ValueError - first value is not convertible to float
    # Anything else (e.g. KeyboardInterrupt) propagates instead of being hidden.
    except (AttributeError, IndexError, TypeError, ValueError):
        return None, None

# Headline numbers for the report, starting with the overall mean of the target
summ = {"overall_subscribe_rate": float(data["target"].mean())}

# For every breakdown that was computed, record its first entry (the rate
# Series are sorted in descending order, so this is the highest-rate group)
for name_key, rate_key, rate_series in (
    ("best_poutcome", "best_poutcome_rate", rate_pout),
    ("best_job", "best_job_rate", rate_job),
):
    if rate_series is None:
        continue
    k, v = safe_head_val(rate_series)
    summ[name_key] = k
    summ[rate_key] = v

# Persist the summary as a CSV file
pd.Series(summ).to_csv(REPORT_DIR / "summary_insights.csv")
print("Saved: reports/summary_insights.csv")

Saved: reports/summary_insights.csv
