<a href="https://colab.research.google.com/github/michael-0907/tibami/blob/main/bank_after_discussion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBClassifier

import joblib
# import gradio as gr


In [16]:
# Read the semicolon-delimited bank CSV, then show its shape and first three rows.
data = pd.read_csv(
    "/content/sample_data/bank-full.csv",
    sep=";",
    encoding="utf-8",
)
display(data.shape, data.head(3))

(45211, 17)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no


In [17]:
# Drop the questionable columns (month, day). `drop` already returns a new
# frame, so no extra copy is needed at this step.
df = data.drop(columns=["month", "day"])

# Remove every row that holds the literal "unknown" in any column.
# The trailing .copy() makes `df` an independent frame: later cells assign
# columns on `df`, and doing that on a boolean-filtered slice triggers
# pandas' SettingWithCopyWarning.
has_unknown = df.isin(["unknown"]).any(axis=1)
df = df.loc[~has_unknown].copy()

df.shape


(7842, 15)

In [18]:
# Sanity check: list every column that still contains "unknown" (with its
# count), then print the total row count and the number of affected rows.
unknown_mask = df.isin(["unknown"])
for col in df.columns:
  n_unknown = unknown_mask[col].sum()
  if n_unknown > 0:
    print(f"含有unknown的特徵: {col}, 數量: {n_unknown}")
unknown_row = unknown_mask.any(axis=1)
print(f"總行數: {df.shape[0]}")
print(f"含有unknown的行數: {unknown_row.sum()}")


總行數: 7842
含有unknown的行數: 0


In [23]:
def _summarize_column(name, values):
  """Build one profile row for a column.

  Returns a list with: column name, dtype, whether any value is NaN, number
  of distinct values, minimum, maximum, and the first 20 distinct values.
  """
  return [
      name,
      values.dtype,
      values.isna().any(),
      values.nunique(),
      values.min(),
      values.max(),
      values.unique()[:20],
  ]


# Profile every column of df, display the table, and save it as CSV.
# NOTE(review): the saved output lists default/housing/loan as float64, which
# only holds after the binary-encoding cell has run; the execution counts show
# this cell was last executed after that cell even though it appears before it.
columns = ["col_name", "dtype", "isna", "unique_num", "min", "max", "unique_20"]
column_info = [_summarize_column(col, df[col]) for col in df.columns]
column_info_df = pd.DataFrame(column_info, columns=columns)
display(column_info_df)

column_info_df.to_csv("column_info_bank.csv")


Unnamed: 0,col_name,dtype,isna,unique_num,min,max,unique_20
0,age,int64,False,70,18,89,"[33, 42, 36, 44, 26, 51, 30, 34, 49, 47, 40, 3..."
1,job,object,False,11,admin.,unemployed,"[admin., services, management, blue-collar, te..."
2,marital,object,False,3,divorced,single,"[married, single, divorced]"
3,education,object,False,3,primary,tertiary,"[tertiary, secondary, primary]"
4,default,float64,False,2,0.0,1.0,"[0.0, 1.0]"
5,balance,int64,False,3090,-1884,81204,"[882, -247, 3444, 2415, 0, 1324, 172, 3132, 10..."
6,housing,float64,False,2,0.0,1.0,"[0.0, 1.0]"
7,loan,float64,False,2,0.0,1.0,"[0.0, 1.0]"
8,contact,object,False,2,cellular,telephone,"[telephone, cellular]"
9,duration,int64,False,973,5,2219,"[39, 519, 144, 73, 140, 119, 21, 449, 175, 86,..."


In [20]:
# Show how many columns df has of each dtype
df.dtypes.value_counts()

Unnamed: 0,count
object,9
int64,6


In [22]:
# Encode the binary yes/no features and the target variable as 0.0 / 1.0.
ordinal_features = ["default", "housing", "loan", "y"]
binary_map = {"yes": 1, "no": 0}

for col in ordinal_features:
  # Series.map turns every value that is missing from the mapping into NaN,
  # so running this cell a second time (values already 0.0/1.0) would
  # silently wipe the column. Skip columns that are already numeric so the
  # cell can be re-run safely.
  if pd.api.types.is_numeric_dtype(df[col]):
    continue
  encoded = df[col].map(binary_map)
  # Fail loudly instead of letting unexpected labels turn into NaN.
  if encoded.isna().any():
    unexpected = sorted(set(df.loc[encoded.isna(), col].astype(str)))
    raise ValueError(f"Unexpected values in binary column {col!r}: {unexpected}")
  df[col] = encoded.astype(float)

display(df["y"].value_counts())



Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
0.0,6056
1.0,1786


In [24]:
# Separate the target column (y) from the predictor columns (X).
target_col = "y"
y = df[target_col]
X = df.drop(columns=[target_col])
print(X.shape, y.shape)


(7842, 14) (7842,)


In [25]:
# Split into training and test sets: 20% test, shuffled, stratified on y, fixed seed.
split_options = dict(test_size=0.2, random_state=42, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Print the shape of each split, then confirm that features and labels share
# the same index within each split, and finally preview two rows of each.
split_pairs = ((X_train, y_train), (X_test, y_test))
for features, labels in split_pairs:
  print(features.shape, labels.shape)
for features, labels in split_pairs:
  print(features.index.equals(labels.index))
display(X_train.head(2), X_test.head(2))



(6273, 14) (6273,)
(1569, 14) (1569,)
True
True


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome
31160,35,technician,divorced,secondary,0.0,2823,1.0,0.0,cellular,74,1,24,1,failure
34803,40,management,married,tertiary,0.0,-606,1.0,0.0,cellular,90,1,345,1,failure


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome
31466,59,retired,married,secondary,0.0,1035,1.0,1.0,cellular,126,2,239,1,failure
30415,49,blue-collar,married,primary,0.0,1,0.0,1.0,cellular,98,1,210,1,other


In [26]:
# List the feature columns present in the training split
X_train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome'],
      dtype='object')

In [27]:
# Frequency-encode the job column.
#
# The raw job labels are read from X (which the cells above never modify)
# instead of from X_train / X_test, because this cell overwrites the job
# column of both splits. Series.map returns NaN for keys missing from the
# mapping, so a version that read its own output would silently fill the
# column with NaN when the cell is run a second time.
# Assumes the index of X is unique, as produced by read_csv plus row filtering.
job_train_raw = X.loc[X_train.index, "job"]
job_test_raw = X.loc[X_test.index, "job"]

# Build the frequency map from the training split only and keep it for reuse.
frequency_map = job_train_raw.value_counts(normalize=True).round(4).to_dict()

# Apply the encoding to both splits. A job that never occurs in the training
# split has no entry in the map; give it frequency 0.0 rather than NaN.
# assign() returns a new frame, so nothing is written into a slice in place.
X_train = X_train.assign(job=job_train_raw.map(frequency_map).fillna(0.0))
X_test = X_test.assign(job=job_test_raw.map(frequency_map).fillna(0.0))

frequency_map


{'management': 0.2233,
 'blue-collar': 0.1977,
 'technician': 0.1626,
 'admin.': 0.132,
 'services': 0.0877,
 'retired': 0.059,
 'self-employed': 0.0327,
 'student': 0.0322,
 'entrepreneur': 0.0279,
 'unemployed': 0.0266,
 'housemaid': 0.0183}

In [34]:
# Ordinal-encode the remaining categorical columns, using an explicit
# category order for each column (position in the list = encoded value).
category_orders = {
    "marital": ['single', 'married', 'divorced'],
    "education": ["primary", "secondary", "tertiary"],
    "contact": ['cellular', 'telephone'],
    "poutcome": ["success", "failure", "other"],
}
categories_features = list(category_orders)
categories_order = list(category_orders.values())

# Fit on the training split only, then apply the fitted encoder to both splits.
ordinal_encoder = OrdinalEncoder(categories=categories_order)
ordinal_encoder.fit(X_train[categories_features])
X_train[categories_features] = ordinal_encoder.transform(X_train[categories_features])
X_test[categories_features] = ordinal_encoder.transform(X_test[categories_features])



In [37]:
# Inspect the fitted preprocessing artifacts: the job frequency map, the
# category lists held by the ordinal encoder, and the encoder itself.
display(frequency_map, ordinal_encoder.categories_, ordinal_encoder)

# Persist both artifacts together as a (frequency_map, ordinal_encoder) tuple.
preprocessor_bundle = (frequency_map, ordinal_encoder)
joblib.dump(preprocessor_bundle, "preprocessor.pkl")


{'management': 0.2233,
 'blue-collar': 0.1977,
 'technician': 0.1626,
 'admin.': 0.132,
 'services': 0.0877,
 'retired': 0.059,
 'self-employed': 0.0327,
 'student': 0.0322,
 'entrepreneur': 0.0279,
 'unemployed': 0.0266,
 'housemaid': 0.0183}

[array(['single', 'married', 'divorced'], dtype=object),
 array(['primary', 'secondary', 'tertiary'], dtype=object),
 array(['cellular', 'telephone'], dtype=object),
 array(['success', 'failure', 'other'], dtype=object)]

['preprocessor.pkl']

In [38]:
# Train a class-weighted random forest on the encoded training split.
rf_params = dict(
    n_estimators=120,
    max_depth=15,
    min_samples_leaf=3,
    class_weight="balanced",
    random_state=42,
)
model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Save the fitted model to disk.
joblib.dump(model, "rfc_model.pkl")


['rfc_model.pkl']

In [42]:
def _evaluate_split(split_name, features, labels, cutoff):
  """Score one split with the global `model` and print its metrics.

  Predicts the positive-class probability, labels a row positive when the
  probability is >= cutoff, then prints the accuracy and the classification
  report. Returns (probabilities, predictions, accuracy, report).
  """
  proba = model.predict_proba(features)[:, 1]
  pred = (proba >= cutoff).astype("int")
  accuracy = accuracy_score(labels, pred)
  report = classification_report(labels, pred)
  print(f"{split_name} accuracy: {accuracy}")
  print(report)
  return proba, pred, accuracy, report


# Probability cut-off used to turn predicted probabilities into class labels.
threshold = 0.55

# Evaluate on the training split.
y_train_proba, y_train_pred, train_accuracy, train_report = _evaluate_split(
    "train", X_train, y_train, threshold
)

print("-" * 50)

# Evaluate on the test split with the same cut-off.
y_test_proba, y_test_pred, test_accuracy, test_report = _evaluate_split(
    "test", X_test, y_test, threshold
)


train accuracy: 0.9418141240235932
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96      4844
         1.0       0.85      0.91      0.88      1429

    accuracy                           0.94      6273
   macro avg       0.91      0.93      0.92      6273
weighted avg       0.94      0.94      0.94      6273

--------------------------------------------------
test accuracy: 0.8387507966857871
              precision    recall  f1-score   support

         0.0       0.90      0.89      0.90      1212
         1.0       0.64      0.66      0.65       357

    accuracy                           0.84      1569
   macro avg       0.77      0.78      0.77      1569
weighted avg       0.84      0.84      0.84      1569



In [43]:
# Rank the features by the model's importance scores (highest first) and
# collect the ones whose importance reaches a minimum value.

feature_importance = model.feature_importances_
sorted_index = np.argsort(feature_importance)[::-1]
sorted_feature = np.array(X_train.columns)[sorted_index]
sorted_importance = feature_importance[sorted_index]

# Use a dedicated name for this cut-off: `threshold` already holds the 0.55
# classification threshold set in the evaluation cell, and overwriting it here
# would change the meaning of `threshold` for any code that runs afterwards.
importance_threshold = 0.01
important_features = []
for name, importance in zip(sorted_feature, sorted_importance):
  if importance >= importance_threshold:
    # Store plain Python strings rather than numpy string scalars so the list
    # prints as ordinary quoted names regardless of the numpy version.
    important_features.append(str(name))
    print(f"{name}: {importance}")
print(important_features)



duration: 0.3050761909844126
poutcome: 0.19137509473825373
pdays: 0.13107733341734443
housing: 0.08553521216396552
balance: 0.07427163362327494
age: 0.06584216539055286
previous: 0.03424109029912419
job: 0.0342136964951251
campaign: 0.024680539824049573
education: 0.02226277753536293
marital: 0.013607425647016023
loan: 0.013354823402494624
['duration', 'poutcome', 'pdays', 'housing', 'balance', 'age', 'previous', 'job', 'campaign', 'education', 'marital', 'loan']
