<a href="https://colab.research.google.com/github/michael-0907/tibami/blob/main/bank_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio

In [138]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBClassifier

import joblib
import gradio as gr


In [32]:
# Read bank-full.csv (fields are separated by ";") and show its shape and first three rows
data = pd.read_csv(
    "/content/sample_data/bank-full.csv",
    sep=";",
    encoding="utf-8",
)
display(data.shape, data.head(3))

(45211, 17)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no


In [63]:
# Flag every row that holds the literal "unknown" in any column of the raw data
has_unknown = data.isin(["unknown"]).any(axis=1)

# Columns kept for modelling (the features plus the target "y")
fil = ["pdays", "balance", "age", "previous", "campaign", "housing", "poutcome", "education", "job", "y"]

# Drop the flagged rows and keep only the selected columns, as an independent copy
df = data.loc[~has_unknown, fil].copy()
df.shape


(7842, 10)

In [64]:
# Sanity check: report any column / row of df that still holds the literal "unknown"
unknown_mask = df.isin(["unknown"])
for col in df.columns:
  unknown_col = unknown_mask[col]
  if not unknown_col.any():
    continue
  print(f"含有unknown的特徵: {col}, 數量: {unknown_col.sum()}")

unknown_row = unknown_mask.any(axis=1)
print(f"總行數: {df.shape[0]}")
print(f"含有unknown的行數: {unknown_row.sum()}")

總行數: 7842
含有unknown的行數: 0


In [65]:
def _describe_column(col):
  """Return [name, dtype, has-NaN flag, distinct count, min, max, first 20 distinct values] for df[col]."""
  series = df[col]
  return [
      col,
      series.dtype,
      series.isna().any(),
      series.nunique(),
      series.min(),
      series.max(),
      series.unique()[0: 20],
  ]

# One summary row per column of df
columns = ["col_name", "dtype", "isna", "unique_num", "min", "max", "unique_20"]
column_info = [_describe_column(col) for col in df.columns]
column_info_df = pd.DataFrame(column_info, columns=columns)
display(column_info_df)


Unnamed: 0,col_name,dtype,isna,unique_num,min,max,unique_20
0,pdays,int64,False,527,1,871,"[151, 166, 91, 86, 143, 89, 140, 176, 174, 167..."
1,balance,int64,False,3090,-1884,81204,"[882, -247, 3444, 2415, 0, 1324, 172, 3132, 10..."
2,age,int64,False,70,18,89,"[33, 42, 36, 44, 26, 51, 30, 34, 49, 47, 40, 3..."
3,previous,int64,False,39,1,275,"[3, 1, 4, 2, 16, 6, 5, 10, 12, 7, 18, 9, 8, 11..."
4,campaign,int64,False,16,1,16,"[1, 2, 3, 5, 4, 6, 7, 8, 9, 11, 10, 12, 14, 15..."
5,housing,object,False,2,no,yes,"[no, yes]"
6,poutcome,object,False,3,failure,success,"[failure, other, success]"
7,education,object,False,3,primary,tertiary,"[tertiary, secondary, primary]"
8,job,object,False,11,admin.,unemployed,"[admin., services, management, blue-collar, te..."
9,y,object,False,2,no,yes,"[no, yes]"


In [66]:
# Show how many columns of df there are for each dtype
df.dtypes.value_counts()

Unnamed: 0,count
int64,5
object,5


In [68]:
# Encode the binary feature `housing` and the target `y` as 1.0 ("yes") / 0.0 ("no").
#
# The dtype guard makes the cell safe to re-run: without it, a second run would
# push the already-encoded 0/1 values through the yes/no map and silently turn
# both columns into NaN.
for binary_col in ("housing", "y"):
  if not pd.api.types.is_numeric_dtype(df[binary_col]):
    df[binary_col] = df[binary_col].map({"yes": 1, "no": 0}).astype(float)

# Any label other than yes/no would have been mapped to NaN; fail loudly instead.
assert df[["housing", "y"]].notna().all().all(), "unexpected value in housing / y"

display(df["housing"].value_counts())
display(df["y"].value_counts())

Unnamed: 0_level_0,count
housing,Unnamed: 1_level_1
1.0,4942
0.0,2900


Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
0.0,6056
1.0,1786


In [112]:
# Separate the target column "y" from the remaining feature columns
y = df["y"]
X = df.drop(columns="y")
print(X.shape, y.shape)


(7842, 9) (7842,)


In [116]:
# Hold out 20% of the rows as a test split, stratified on y with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

# Shapes of both splits, then confirm features and target share the same index
for features, target in ((X_train, y_train), (X_test, y_test)):
  print(features.shape, target.shape)
for features, target in ((X_train, y_train), (X_test, y_test)):
  print(features.index.equals(target.index))

display(X_train.head(2), X_test.head(2))



(6273, 9) (6273,)
(1569, 9) (1569,)
True
True


Unnamed: 0,pdays,balance,age,previous,campaign,housing,poutcome,education,job
31160,24,2823,35,1,1,1.0,failure,secondary,technician
34803,345,-606,40,1,1,1.0,failure,tertiary,management


Unnamed: 0,pdays,balance,age,previous,campaign,housing,poutcome,education,job
31466,239,1035,59,1,2,1.0,failure,secondary,retired
30415,210,1,49,1,1,0.0,other,primary,blue-collar


In [117]:
# Frequency-encode `job`.

# Always read the raw job labels from the untouched feature frame X. Reading them
# from X_train / X_test would, on a re-run of this cell, rebuild the map from
# values that are already encoded.
job_train_raw = X.loc[X_train.index, "job"]
job_test_raw = X.loc[X_test.index, "job"]

# Learn the label -> relative-frequency map on the training split only and keep it
frequency_map = job_train_raw.value_counts(normalize=True).round(4).to_dict()

# Apply the map to both splits; a job that never occurs in training gets 0.0 rather than NaN
X_train["job"] = job_train_raw.map(frequency_map)
X_test["job"] = job_test_raw.map(frequency_map).fillna(0.0)

# A bare expression in the middle of a cell is discarded, so display it explicitly
display(X_test["job"].value_counts())
frequency_map

{'management': 0.2233,
 'blue-collar': 0.1977,
 'technician': 0.1626,
 'admin.': 0.132,
 'services': 0.0877,
 'retired': 0.059,
 'self-employed': 0.0327,
 'student': 0.0322,
 'entrepreneur': 0.0279,
 'unemployed': 0.0266,
 'housemaid': 0.0183}

In [118]:
# Preview the first two rows of the training and test features
display(X_train.head(2), X_test.head(2))

Unnamed: 0,pdays,balance,age,previous,campaign,housing,poutcome,education,job
31160,24,2823,35,1,1,1.0,failure,secondary,0.1626
34803,345,-606,40,1,1,1.0,failure,tertiary,0.2233


Unnamed: 0,pdays,balance,age,previous,campaign,housing,poutcome,education,job
31466,239,1035,59,1,2,1.0,failure,secondary,0.059
30415,210,1,49,1,1,0.0,other,primary,0.1977


In [119]:
# List the distinct values of the two columns that are ordinal-encoded next
for cat_col in ["poutcome", "education"]:
  display(df[cat_col].unique())

array(['failure', 'other', 'success'], dtype=object)

array(['tertiary', 'secondary', 'primary'], dtype=object)

In [120]:
# Ordinal-encode the categorical columns `poutcome` and `education`.

# Explicit category order, one list per encoded column (in column order);
# each value is replaced by its position in the list.
categories_order = [
    ["success", "failure", "other"],
    ["primary", "secondary", "tertiary"]
]
ordinal_encoder = OrdinalEncoder(categories=categories_order)

# Fit on the training split only. The test split must go through `transform`
# with the already-fitted encoder; calling `fit_transform` on it would re-fit
# the encoder on test data.
X_train[["poutcome", "education"]] = ordinal_encoder.fit_transform(X_train[["poutcome", "education"]])
X_test[["poutcome", "education"]] = ordinal_encoder.transform(X_test[["poutcome", "education"]])



In [110]:
# Preview the first two rows of the training and test features
for split in (X_train, X_test):
  display(split.head(2))

Unnamed: 0,pdays,balance,age,previous,campaign,housing,poutcome,education,job
31160,24,2823,35,1,1,1.0,1.0,1.0,0.1626
34803,345,-606,40,1,1,1.0,1.0,2.0,0.2233


Unnamed: 0,pdays,balance,age,previous,campaign,housing,poutcome,education,job
31466,239,1035,59,1,2,1.0,1.0,1.0,0.059
30415,210,1,49,1,1,0.0,2.0,0.0,0.1977


In [126]:
# Inspect the two preprocessing artifacts: the job frequency map and the ordinal encoder
display(frequency_map, ordinal_encoder.categories_, ordinal_encoder)

# Persist both as one (frequency_map, ordinal_encoder) tuple in a single file
joblib.dump(
    (frequency_map, ordinal_encoder),
    "preprocessor.pkl",
)


{'management': 0.2233,
 'blue-collar': 0.1977,
 'technician': 0.1626,
 'admin.': 0.132,
 'services': 0.0877,
 'retired': 0.059,
 'self-employed': 0.0327,
 'student': 0.0322,
 'entrepreneur': 0.0279,
 'unemployed': 0.0266,
 'housemaid': 0.0183}

[array(['success', 'failure', 'other'], dtype=object),
 array(['primary', 'secondary', 'tertiary'], dtype=object)]

['preprocessor.pkl']

In [131]:
# Train a random forest on the encoded training split
rf_params = dict(
    n_estimators=150,
    max_depth=20,
    min_samples_leaf=3,
    class_weight="balanced",
    random_state=42,
)
model = RandomForestClassifier(**rf_params)

model.fit(X_train, y_train)


In [135]:
# Persist the fitted model to rfc_model.pkl with joblib
joblib.dump(model, "rfc_model.pkl")


['rfc_model.pkl']

In [132]:
def _score_split(features, labels, cutoff):
  """Score one split: return (positive-class probabilities, 0/1 predictions, accuracy, text report)."""
  proba = model.predict_proba(features)[:, 1]
  pred = (proba >= cutoff).astype("int")
  return proba, pred, accuracy_score(labels, pred), classification_report(labels, pred)

# Probability cutoff at or above which a row is predicted as class 1
threshold = 0.5

# Training split
y_train_proba, y_train_pred, train_accuracy, train_report = _score_split(X_train, y_train, threshold)
print(f"train accuracy: {train_accuracy}")
print(train_report)

print("-" * 50)

# Test split
y_test_proba, y_test_pred, test_accuracy, test_report = _score_split(X_test, y_test, threshold)
print(f"test accuracy: {test_accuracy}")
print(test_report)


train accuracy: 0.9354375896700143
              precision    recall  f1-score   support

         0.0       0.97      0.94      0.96      4844
         1.0       0.83      0.90      0.86      1429

    accuracy                           0.94      6273
   macro avg       0.90      0.92      0.91      6273
weighted avg       0.94      0.94      0.94      6273

--------------------------------------------------
test accuracy: 0.8253664754620778
              precision    recall  f1-score   support

         0.0       0.88      0.90      0.89      1212
         1.0       0.63      0.58      0.60       357

    accuracy                           0.83      1569
   macro avg       0.75      0.74      0.74      1569
weighted avg       0.82      0.83      0.82      1569



In [134]:
# Order the features from most to least important according to the fitted model
feature_importance = model.feature_importances_
sorted_index = np.argsort(feature_importance)[::-1]
sorted_feature = np.array(X_train.columns)[sorted_index]
sorted_importance = feature_importance[sorted_index]

# Keep (and print) only the features whose importance reaches the cutoff
threshold = 0.01
important_features = []
for name, score in zip(sorted_feature, sorted_importance):
  if not score >= threshold:
    continue
  important_features.append(name)
  print(f"{name}: {score}")
print(important_features)



poutcome: 0.23350393488520158
pdays: 0.22283702365880118
balance: 0.145595676920045
age: 0.11723726539495208
housing: 0.09524086347041599
previous: 0.058945408883318784
job: 0.053818573956217974
campaign: 0.04512522651695135
education: 0.027696026314095956
['poutcome', 'pdays', 'balance', 'age', 'housing', 'previous', 'job', 'campaign', 'education']


In [146]:
# Load the preprocessing artifacts and the model back from disk.
# NOTE: joblib.load unpickles arbitrary objects; only load files you created yourself.

# Use the same relative paths that joblib.dump wrote to, so loading works from
# whatever the working directory is instead of assuming it is /content.
frequency_map_loaded, ordinal_encoder_loaded = joblib.load("preprocessor.pkl")
display(frequency_map_loaded)
display(ordinal_encoder_loaded.categories_)
display(ordinal_encoder_loaded)

model_loaded = joblib.load("rfc_model.pkl")
display(model_loaded)


{'management': 0.2233,
 'blue-collar': 0.1977,
 'technician': 0.1626,
 'admin.': 0.132,
 'services': 0.0877,
 'retired': 0.059,
 'self-employed': 0.0327,
 'student': 0.0322,
 'entrepreneur': 0.0279,
 'unemployed': 0.0266,
 'housemaid': 0.0183}

[array(['success', 'failure', 'other'], dtype=object),
 array(['primary', 'secondary', 'tertiary'], dtype=object)]

In [144]:
# Show the names and order of the feature columns in the training frame
X_train.columns

Index(['pdays', 'balance', 'age', 'previous', 'campaign', 'housing',
       'poutcome', 'education', 'job'],
      dtype='object')

In [147]:
def func_predict(pdays, balance, age, previous, campaign, housing, poutcome, education, job):
  """Return the loaded model's class probabilities for a single customer.

  The raw inputs are frequency-encoded (`job`, via `frequency_map_loaded`) and
  ordinal-encoded (`poutcome`, `education`, via `ordinal_encoder_loaded`)
  before being passed to `model_loaded.predict_proba`.

  Args:
    pdays, balance, age, previous, campaign: numeric feature values.
    housing: numeric flag (the UI sends 1.0 for yes, 0.0 for no).
    poutcome: category label understood by `ordinal_encoder_loaded`.
    education: category label understood by `ordinal_encoder_loaded`.
    job: job label looked up in `frequency_map_loaded`.

  Returns:
    The array produced by `predict_proba` for the one-row input.
  """
  # Feature order of the training frame. Defined locally because the
  # notebook-level `fil` list also contains the target "y" (10 names), which
  # does not match the 9 values supplied here.
  feature_columns = ["pdays", "balance", "age", "previous", "campaign", "housing", "poutcome", "education", "job"]
  values = [pdays, balance, age, previous, campaign, housing, poutcome, education, job]

  # Build the row from a dict so numeric inputs stay numeric; packing numbers
  # and strings into one numpy array would coerce every value to a string.
  gr_df = pd.DataFrame([dict(zip(feature_columns, values))], columns=feature_columns)

  # A job missing from the map gets frequency 0.0 instead of NaN
  gr_df["job"] = gr_df["job"].map(frequency_map_loaded).fillna(0.0)

  # Assign the encoded columns one by one so each is replaced by a float column
  encoded = ordinal_encoder_loaded.transform(gr_df[["poutcome", "education"]])
  gr_df["poutcome"] = encoded[:, 0]
  gr_df["education"] = encoded[:, 1]

  # Every column is numeric at this point; make the dtype explicit for the model
  gr_df = gr_df.astype(float)

  y_proba = model_loaded.predict_proba(gr_df)

  return y_proba

# Gradio form around func_predict: one input widget per model feature, in the
# same order as the function's parameters. Each slider default must lie inside
# its own [minimum, maximum] range and on its step grid.
model_predict = gr.Interface(
    fn = func_predict,
    inputs = [
        gr.Slider(minimum=-1, maximum=854, step=1, value=1, label="上次活動以來與客戶聯繫後的天數(-1表示之前未與客戶聯繫): "), # pdays
        gr.Slider(minimum=-1800, maximum=80000, step=100, value=0, label="平均年餘額: "), # balance (default 0 is on the 100-step grid; 1 was not)
        gr.Slider(minimum=18, maximum=90, step=1, value=18, label="年齡: "), # age (default was 1, below the minimum of 18)
        gr.Slider(minimum=1, maximum=275, step=1, value=1, label="在此活動之前的聯繫次數: "), # previous
        gr.Slider(minimum=1, maximum=14, step=1, value=1, label="聯繫次數: "), # campaign
        gr.Radio([("yes", 1.0), ("no", 0.0)], label="是否有房貸: ", value=1.0), # housing
        gr.Radio([("success", "success"), ("failure", "failure"), ("no_record", "other")], label="前一次行銷活動的結果: ", value="success"), # poutcome
        gr.Radio(['primary', 'secondary', 'tertiary'], label="education: ", value="primary"),  # education
        gr.Radio(['technician', 'management', 'admin.', 'retired', 'blue-collar', 'entrepreneur',
                  'student', 'services', 'unemployed', 'self-employed',
                  'housemaid'], label="職業: ", value="technician") # job
    ],
    outputs = gr.Textbox(label="result: "),
    title = "這是 title",
    description = "這是 description"
)


# Start the Gradio app. share=True also requests a temporary public URL,
# so anyone who has the link can query the model while it is running.
model_predict.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1556eab64878ceb69a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


