<a href="https://colab.research.google.com/github/michael-0907/tibami/blob/main/bank_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier



In [None]:
# Load the semicolon-delimited bank dataset from the Colab sample-data folder.
csv_path = "/content/sample_data/bank-full.csv"
data = pd.read_csv(csv_path, sep=";", encoding="utf-8")
data.shape

(45211, 17)

In [None]:
# Drop every row that contains the placeholder value "unknown" in any column,
# and work on an independent copy so `data` stays untouched.
rows_with_unknown = data.isin(["unknown"]).any(axis=1)
df = data.loc[~rows_with_unknown].copy()

df.shape


(7842, 17)

In [None]:
# Summarise every column: dtype, whether it has missing values, the number of
# distinct values, and up to the first 20 distinct values.
display(df.shape)
columns = ["col_name", "dtype", "isna", "unique_num", "unique_20"]
column_info = [
    [name, values.dtype, values.isna().any(), values.nunique(), values.unique()[:20]]
    for name, values in df.items()
]
column_info_df = pd.DataFrame(column_info, columns=columns)
display(column_info_df)



(7842, 17)

Unnamed: 0,col_name,dtype,isna,unique_num,unique_20
0,age,int64,False,70,"[33, 42, 36, 44, 26, 51, 30, 34, 49, 47, 40, 3..."
1,job,object,False,11,"[admin., services, management, blue-collar, te..."
2,marital,object,False,3,"[married, single, divorced]"
3,education,object,False,3,"[tertiary, secondary, primary]"
4,default,object,False,2,"[no, yes]"
5,balance,int64,False,3090,"[882, -247, 3444, 2415, 0, 1324, 172, 3132, 10..."
6,housing,object,False,2,"[no, yes]"
7,loan,object,False,2,"[no, yes]"
8,contact,object,False,2,"[telephone, cellular]"
9,day,int64,False,31,"[21, 22, 23, 25, 4, 5, 10, 12, 13, 17, 18, 19,..."


In [None]:
# Sanity check: list each column that still contains the "unknown" placeholder
# (with its count), then report how many rows are affected overall.
unknown_mask = df.isin(["unknown"])
for col, unknown_count in unknown_mask.sum().items():
  if unknown_count > 0:
    print(f"含有unknown的特徵: {col}, 數量: {unknown_count}")
unknown_row = unknown_mask.any(axis=1)
print(f"總行數: {df.shape[0]}")
print(f"含有unknown的行數: {unknown_row.sum()}")

總行數: 7842
含有unknown的行數: 0


In [None]:
# Show how many columns there are of each dtype.
df.dtypes.value_counts()

Unnamed: 0,count
object,10
int64,7


In [None]:
# Encode the month abbreviation as its calendar number (1-12).
month_to_number = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}
# Only map while the column still holds text: re-running this cell on the
# already-encoded integers would otherwise turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["month"]):
  month_numbers = df["month"].map(month_to_number)
  # Fail loudly on values outside the mapping instead of silently producing NaN.
  unmapped = df.loc[month_numbers.isna(), "month"].unique()
  if len(unmapped) > 0:
    raise ValueError(f"Unrecognised month value(s): {list(unmapped)}")
  # Keep an integer dtype so the column is picked up with the other int64 columns.
  df["month"] = month_numbers.astype("int64")

In [None]:
# Encode yes/no columns as 1/0.
binary_mapping = {"yes": 1, "no": 0}
for col in df.select_dtypes(include=["object"]).columns:
  # Require that *every* value is "yes" or "no". Checking with .any() would also
  # match a column that merely contains one such value, and .map() would then
  # turn all of its other categories into NaN.
  if df[col].isin(list(binary_mapping)).all():
    df[col] = df[col].map(binary_mapping)

In [None]:
# Collect the names of the int64 columns (this includes the target variable).
numerical_columns = df.select_dtypes(include=["int64"]).columns.tolist()
print(f"{len(numerical_columns)}, {numerical_columns}")

12, ['age', 'default', 'balance', 'housing', 'loan', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'y']


In [None]:
# Inspect the pairwise correlation coefficients of the numeric columns.
corr_matrix = df[numerical_columns].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Matrix")

In [None]:
# Plot the distribution of each numeric column, one histogram per grid panel.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(11.33, 7.5))
ax = axes.ravel()
for idx in range(len(numerical_columns)):
  sns.histplot(data=df, x=numerical_columns[idx], ax=ax[idx])
fig.tight_layout()

In [None]:
# Box plot of each numeric feature, split by the target class `y`.
# The target itself is skipped: plotting y against y carries no information.
boxplot_columns = [col for col in numerical_columns if col != "y"]
fig, axes = plt.subplots(3, 4, figsize=(11.33, 7.5))
ax = axes.flatten()
for i, col in enumerate(boxplot_columns):
  sns.boxplot(data=df, x="y", y=col, ax=ax[i])
# Hide the grid panels that have no feature assigned to them.
for unused_panel in ax[len(boxplot_columns):]:
  unused_panel.set_visible(False)
plt.tight_layout()


In [None]:
# Count of rows per month, split by the target class `y`.
sns.countplot(data=df, x="month", hue="y")

In [None]:
# Count plot of each categorical feature (overall class frequencies, no hue).
plotted_features = ['job', 'marital', 'education', 'contact', 'poutcome']
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(11.33, 7.5))
ax = axes.ravel()
for panel, col in zip(ax, plotted_features):
  sns.countplot(data=df, x=col, ax=panel)
fig.tight_layout()

In [None]:
# Count plot of each categorical feature, split by the target class `y`.
plotted_features = ['job', 'marital', 'education', 'contact', 'poutcome']
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(11.33, 7.5))
ax = axes.ravel()
for panel, col in zip(ax, plotted_features):
  sns.countplot(data=df, x=col, hue="y", ax=panel)
fig.tight_layout()

In [None]:
# Separate the target column from the feature matrix.
target_column = "y"
y = df[target_column]
X = df.drop(columns=[target_column])
print(X.shape, y.shape)
print(X.dtypes.value_counts())

(7842, 16) (7842,)
int64     11
object     5
Name: count, dtype: int64


In [None]:
# Feature groups: one list per preprocessing strategy.

# Label-encoded columns.
categorical_features = ['marital', 'education', 'contact', 'poutcome']
# Frequency-encoded columns.
frequency_features = ['job', 'day', 'month']
# Already converted to 0/1 earlier; no further processing needed.
binary_features = ['default', 'housing', 'loan']
# Columns with outliers; RobustScaler scaling is under consideration.
outlier_features = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']

# Print the total number of grouped features.
feature_groups = (categorical_features, frequency_features, binary_features, outlier_features)
print(sum(len(group) for group in feature_groups))



16


In [None]:
# Split the dataset: 20% held out for testing, shuffled, stratified on y,
# with a fixed seed.
split_parts = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)
X_train, X_test, y_train, y_test = split_parts
for features, target in ((X_train, y_train), (X_test, y_test)):
  print(features.shape, target.shape)
for features in (X_train, X_test):
  display(features.head(3))

(6273, 16) (6273,)
(1569, 16) (1569,)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
31160,35,technician,divorced,secondary,0,2823,1,0,cellular,26,2,74,1,24,1,failure
34803,40,management,married,tertiary,0,-606,1,0,cellular,6,5,90,1,345,1,failure
40055,42,management,married,tertiary,0,2665,1,0,cellular,4,6,280,2,126,11,failure


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
31466,59,retired,married,secondary,0,1035,1,1,cellular,1,4,126,2,239,1,failure
30415,49,blue-collar,married,primary,0,1,0,1,cellular,5,2,98,1,210,1,other
34439,27,services,single,secondary,0,-72,1,0,cellular,5,5,276,1,309,9,failure


In [None]:
# Frequency encoder
class FrequencyEncoder(BaseEstimator, TransformerMixin):
  """Replace each value with the relative frequency it had in the fitted data.

  Frequencies are learned once in ``fit`` and reused in ``transform``, so a
  test set is encoded with the training-set frequencies instead of its own.
  Values that were not seen during ``fit`` are encoded as 0.0.
  """

  def fit(self, X, y=None):
    """Learn, per column, the share of rows taken by each distinct value.

    Args:
      X: DataFrame (or array-like convertible to one) of columns to encode.
      y: Ignored; accepted for scikit-learn API compatibility.

    Returns:
      self
    """
    frame = pd.DataFrame(X)
    self.frequency_maps_ = {
        col: frame[col].value_counts(normalize=True) for col in frame.columns
    }
    return self

  def transform(self, X):
    """Return a new float DataFrame with values replaced by learned frequencies.

    The input is not modified. Columns must match those seen in ``fit``.
    """
    frame = pd.DataFrame(X)
    encoded = {
        # Unseen categories map to NaN; treat them as frequency 0.
        col: frame[col].map(self.frequency_maps_[col]).fillna(0.0).astype(float)
        for col in frame.columns
    }
    return pd.DataFrame(encoded, index=frame.index)


def frequency_encoder(frequency_df):
  """Frequency-encode every column of ``frequency_df`` using its own frequencies.

  Kept for backward compatibility. Unlike ``FrequencyEncoder`` it is stateless:
  the frequencies always come from the frame passed in. The input frame is
  left unmodified and a new DataFrame is returned.
  """
  return FrequencyEncoder().fit_transform(frequency_df)

# Build the preprocessor and assign each transformer its columns.
preprocessor = ColumnTransformer(transformers=[
    ("outlier", "passthrough", outlier_features),
    ("binary", "passthrough", binary_features),
    ("categorical", OrdinalEncoder(handle_unknown="error"), categorical_features),
    # Stateful encoder: frequencies are fitted on the training data only.
    ("frequency", FrequencyEncoder(), frequency_features)
])


In [None]:
# Fit the preprocessor on the training features and transform them, then apply
# the already-fitted preprocessor to the test features.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
# Print the shapes to confirm the row counts still match the targets.
print(X_train_preprocessed.shape, y_train.shape)
print(X_test_preprocessed.shape, y_test.shape)


(6273, 16) (6273,)
(1569, 16) (1569,)


In [None]:
# Column names for the preprocessed matrices, listed group by group in the
# same order as the transformers are declared in `preprocessor`.
all_feature_names = [
    *outlier_features,
    *binary_features,
    *categorical_features,
    *frequency_features,
]


def _as_named_frame(matrix, like):
  """Wrap `matrix` in a DataFrame with the feature names and the row index of `like`."""
  return pd.DataFrame(matrix, index=like.index, columns=all_feature_names)


X_train_preprocessed = _as_named_frame(X_train_preprocessed, X_train)
X_test_preprocessed = _as_named_frame(X_test_preprocessed, X_test)
for features, target in ((X_train_preprocessed, y_train), (X_test_preprocessed, y_test)):
  print(features.shape, target.shape)
for features in (X_train_preprocessed, X_test_preprocessed):
  display(features.head(3))

(6273, 16) (6273,)
(1569, 16) (1569,)


Unnamed: 0,age,balance,duration,campaign,pdays,previous,default,housing,loan,marital,education,contact,poutcome,job,day,month
31160,35,2823,74,1,24,1,0,1,0,0.0,1.0,0.0,0.0,0.162602,0.017376,0.115256
34803,40,-606,90,1,345,1,0,1,0,1.0,2.0,0.0,0.0,0.223338,0.04336,0.307349
40055,42,2665,280,2,126,11,0,1,0,1.0,2.0,0.0,0.0,0.223338,0.041766,0.038897


Unnamed: 0,age,balance,duration,campaign,pdays,previous,default,housing,loan,marital,education,contact,poutcome,job,day,month
31466,59,1035,126,2,239,1,0,1,1,1.0,1.0,0.0,0.0,0.056087,0.01211,0.136393
30415,49,1,98,1,210,1,0,0,1,1.0,0.0,0.0,1.0,0.189293,0.050351,0.100701
34439,27,-72,276,1,309,9,0,1,0,2.0,1.0,0.0,0.0,0.08413,0.050351,0.323773


In [None]:
# Experiment: keep only a hand-picked subset of the preprocessed columns.
fil = ['poutcome', 'pdays', 'housing', 'balance', 'age', 'previous', 'job', 'campaign', 'education']
X_train_preprocessed = X_train_preprocessed.loc[:, fil]
X_test_preprocessed = X_test_preprocessed.loc[:, fil]

for features, target in ((X_train_preprocessed, y_train), (X_test_preprocessed, y_test)):
  print(features.shape, target.shape)
for features in (X_train_preprocessed, X_test_preprocessed):
  display(features.head(3))

(6273, 9) (6273,)
(1569, 9) (1569,)


Unnamed: 0,poutcome,pdays,housing,balance,age,previous,job,campaign,education
31160,0.0,24,1,2823,35,1,0.162602,1,1.0
34803,0.0,345,1,-606,40,1,0.223338,1,2.0
40055,0.0,126,1,2665,42,11,0.223338,2,2.0


Unnamed: 0,poutcome,pdays,housing,balance,age,previous,job,campaign,education
31466,0.0,239,1,1035,59,1,0.056087,2,1.0
30415,1.0,210,0,1,49,1,0.189293,1,0.0
34439,0.0,309,1,-72,27,9,0.08413,1,1.0


In [None]:
# Best hyperparameters found so far, used with threshold=0.65 (all features included).
rf_params = dict(
    n_estimators=150,
    max_depth=20,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=42,
)
# Alternative that was tried: identical settings except min_samples_leaf=3.
model = RandomForestClassifier(**rf_params)

model.fit(X_train_preprocessed, y_train)


In [None]:
def _score_at_threshold(features, targets, cutoff):
  """Predict with `model` and score the result at a custom probability cutoff.

  Returns a tuple of (class-1 probabilities, 0/1 predictions, accuracy,
  classification report text). A row is predicted as 1 when its class-1
  probability is at least `cutoff`.
  """
  positive_proba = model.predict_proba(features)[:, 1]
  predicted = (positive_proba >= cutoff).astype("int")
  accuracy = accuracy_score(targets, predicted)
  report = classification_report(targets, predicted)
  return positive_proba, predicted, accuracy, report


# Probability cutoff applied to both the training and the test predictions.
threshold = 0.65

y_train_proba, y_train_pred, train_accuracy, train_report = _score_at_threshold(
    X_train_preprocessed, y_train, threshold
)
print(f"train accuracy: {train_accuracy}")
print(train_report)

print("-" * 50)
y_test_proba, y_test_pred, test_accuracy, test_report = _score_at_threshold(
    X_test_preprocessed, y_test, threshold
)
print(f"test accuracy: {test_accuracy}")
print(test_report)



train accuracy: 0.8901641957596047
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      4844
           1       0.82      0.66      0.73      1429

    accuracy                           0.89      6273
   macro avg       0.86      0.81      0.83      6273
weighted avg       0.89      0.89      0.89      6273

--------------------------------------------------
test accuracy: 0.8304652644996813
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      1212
           1       0.68      0.48      0.57       357

    accuracy                           0.83      1569
   macro avg       0.77      0.71      0.73      1569
weighted avg       0.82      0.83      0.82      1569



In [None]:
# Rank the features by the fitted model's importance scores, highest first.
feature_importance = model.feature_importances_
sorted_index = np.argsort(feature_importance)[::-1]
sorted_feature = np.array(X_train_preprocessed.columns)[sorted_index]
sorted_importance = feature_importance[sorted_index]

# Minimum importance for a feature to be kept. This uses its own name so it
# does not overwrite the global `threshold` used as the classification cutoff.
importance_threshold = 0.01
important_features = []
for name, importance in zip(sorted_feature, sorted_importance):
  if importance >= importance_threshold:
    important_features.append(name)
    print(f"{name}: {importance}")
print(important_features)

poutcome: 0.2685445078968816
pdays: 0.2144358668947337
balance: 0.13040771830268105
housing: 0.12248408811547427
age: 0.10240555725200809
previous: 0.05053515672568413
job: 0.04622419877318538
campaign: 0.03936940650723887
education: 0.02559349953211286
['poutcome', 'pdays', 'balance', 'housing', 'age', 'previous', 'job', 'campaign', 'education']


In [None]:
# List the column names of the originally loaded frame `data`.
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')