<a href="https://colab.research.google.com/github/michael-0907/tibami/blob/main/bank_after_discussion_important.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBClassifier

import joblib
import gradio as gr


In [None]:
# Read the semicolon-separated CSV, then show its shape and its first three rows.
data = pd.read_csv("/content/sample_data/bank-full.csv", sep=";", encoding="utf-8")
display(data.shape, data.head(3))

(45211, 17)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no


In [None]:
# Remove the columns flagged as questionable ("month", "day"); the copy keeps `data` untouched.
df = data.drop(columns=["month", "day"]).copy()

# Keep only the rows in which no cell equals the placeholder string "unknown".
has_unknown = df.isin(["unknown"]).any(axis=1)
df = df.loc[~has_unknown]

df.shape


(7842, 15)

In [None]:
# Keep only the rows whose balance is zero or positive.
# Original author's note: once negative balances are removed, no pdays == -1 rows
# remain and the minimum pdays becomes 1.
# NOTE(review): those rows may already have been removed by the earlier "unknown"
# filter rather than by this one — confirm.
non_negative_balance = df["balance"].ge(0)
df = df.loc[non_negative_balance]
df.shape


(7371, 15)

In [None]:
# Keep the ten features chosen as important, plus the target column "y".
selected_columns = ['duration', 'poutcome', 'pdays', 'housing', 'balance', 'age', 'previous', 'job', 'campaign', 'education']

# .copy() makes `df` an independent frame. A later cell assigns into its columns,
# and assigning into a column-subset selection raises pandas' SettingWithCopyWarning.
df = df[selected_columns + ["y"]].copy()

df.shape


(7371, 11)

In [None]:
# Report, column by column, how many cells still hold the placeholder "unknown",
# then the total row count and the number of rows with at least one such cell.
unknown_cells = df.isin(["unknown"])
for col, unknown_count in unknown_cells.sum().items():
  if unknown_count > 0:
    print(f"含有unknown的特徵: {col}, 數量: {unknown_count}")
unknown_row = unknown_cells.any(axis=1)
print(f"總行數: {df.shape[0]}")
print(f"含有unknown的行數: {unknown_row.sum()}")


總行數: 7371
含有unknown的行數: 0


In [None]:
# Build a one-row-per-column profile of `df`: dtype, whether any value is missing,
# number of distinct values, min, max, and up to the first 20 distinct values.
columns = ["col_name", "dtype", "isna", "unique_num", "min", "max", "unique_20"]
column_info = [
    [
        name,
        series.dtype,
        series.isna().any(),
        series.nunique(),
        series.min(),
        series.max(),
        series.unique()[:20],
    ]
    for name, series in df.items()
]
column_info_df = pd.DataFrame(column_info, columns=columns)
display(column_info_df)

# Optional export of the profile, left disabled:
# column_info_df.to_csv("column_info_bank.csv")


Unnamed: 0,col_name,dtype,isna,unique_num,min,max,unique_20
0,duration,int64,False,957,5,2219,"[39, 144, 73, 140, 119, 21, 449, 175, 86, 81, ..."
1,poutcome,object,False,3,failure,success,"[failure, other, success]"
2,pdays,int64,False,525,1,871,"[151, 91, 86, 143, 89, 140, 176, 174, 167, 195..."
3,housing,float64,False,2,0.0,1.0,"[0.0, 1.0]"
4,balance,int64,False,2764,0,81204,"[882, 3444, 2415, 0, 1324, 172, 3132, 1005, 87..."
5,age,int64,False,70,18,89,"[33, 36, 44, 26, 51, 30, 34, 49, 47, 40, 38, 5..."
6,previous,int64,False,39,1,275,"[3, 4, 2, 1, 16, 6, 5, 10, 12, 7, 18, 9, 8, 11..."
7,job,object,False,11,admin.,unemployed,"[admin., services, management, blue-collar, te..."
8,campaign,int64,False,16,1,16,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 10, 14, 12, 15..."
9,education,object,False,3,primary,tertiary,"[tertiary, secondary, primary]"


In [None]:
# Show how many columns of `df` there are for each dtype.
df.dtypes.value_counts()

Unnamed: 0,count
int64,6
object,5


In [None]:
# Encode the yes/no feature "housing" and the yes/no target "y" as 1.0 / 0.0.
# NOTE: despite its name, `ordinal_features` holds binary yes/no columns.
ordinal_features = ["housing", "y"]
yes_no_codes = {"yes": 1, "no": 0}

encoded_columns = {}
for col in ordinal_features:
  # Skip columns that are already numeric: running this cell a second time would
  # otherwise map 0.0 / 1.0 through `yes_no_codes` and silently yield all-NaN.
  if pd.api.types.is_numeric_dtype(df[col]):
    continue
  encoded = df[col].map(yes_no_codes)
  # .map() turns every value missing from the mapping into NaN; fail loudly
  # instead of passing NaN on to the model.
  if encoded.isna().any():
    unexpected = sorted(df.loc[encoded.isna(), col].astype(str).unique())
    raise ValueError(f"Column {col!r} has values other than 'yes'/'no': {unexpected}")
  encoded_columns[col] = encoded.astype(float)

# assign() returns a new frame rather than writing into a filtered selection,
# so pandas does not raise SettingWithCopyWarning here.
df = df.assign(**encoded_columns)

display(df["y"].value_counts())



Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
0.0,5622
1.0,1749


In [None]:
# Separate the target column "y" from the predictors (every other column).
y = df["y"]
X = df.drop(columns="y")
print(X.shape, y.shape)


(7371, 10) (7371,)


In [None]:
# Split into training and test sets: 20% test, shuffled, stratified on the
# target, with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, shuffle=True, random_state=42
)

split_pairs = ((X_train, y_train), (X_test, y_test))

# Shapes of the features / target for each split.
for features, target in split_pairs:
  print(features.shape, target.shape)

# Check that features and target of each split share the same index.
for features, target in split_pairs:
  print(features.index.equals(target.index))

display(X_train.head(2), X_test.head(2))



(5896, 10) (5896,)
(1475, 10) (1475,)
True
True


Unnamed: 0,duration,poutcome,pdays,housing,balance,age,previous,job,campaign,education
40871,251,failure,117,1.0,245,38,4,admin.,2,secondary
44432,231,success,2,1.0,0,34,3,technician,2,secondary


Unnamed: 0,duration,poutcome,pdays,housing,balance,age,previous,job,campaign,education
40199,148,success,101,0.0,3434,46,1,technician,2,secondary
42783,186,failure,433,0.0,1181,56,1,technician,1,tertiary


In [None]:
# Frequency-encode the "job" column.

# Learn each job's relative frequency (rounded to 4 decimals) from the training
# split only, and keep the mapping as a dict so it can be saved and reused.
frequency_map = X_train["job"].value_counts(normalize=True).round(4).to_dict()

# Apply the training-split mapping to both splits. A job that never occurs in the
# training split has no entry in the mapping and .map() would turn it into NaN,
# so such rows are encoded as frequency 0.0 instead.
X_train["job"] = X_train["job"].map(frequency_map)
X_test["job"] = X_test["job"].map(frequency_map).fillna(0.0)

# Cell output: the learned mapping.
frequency_map


{'management': 0.2285,
 'blue-collar': 0.1861,
 'technician': 0.161,
 'admin.': 0.1353,
 'services': 0.084,
 'retired': 0.0629,
 'self-employed': 0.0341,
 'student': 0.031,
 'unemployed': 0.029,
 'entrepreneur': 0.0288,
 'housemaid': 0.0193}

In [None]:
# List the columns of X_train whose dtype is object.
X_train.select_dtypes(include=["object"]).columns

Index(['poutcome', 'education'], dtype='object')

In [None]:
# Ordinal-encode the categorical columns using an explicit category order per column.
category_orders = {
    "poutcome": ["success", "failure", "other"],
    "education": ["primary", "secondary", "tertiary"],
}
categories_features = list(category_orders)
categories_order = list(category_orders.values())

# Fit the encoder on the training split only, then apply that same fitted
# encoder to both splits.
ordinal_encoder = OrdinalEncoder(categories=categories_order).fit(X_train[categories_features])
for split in (X_train, X_test):
  split[categories_features] = ordinal_encoder.transform(split[categories_features])




In [None]:
# Preview the first two rows of the training and test feature frames.
display(X_train.head(2), X_test.head(2))

Unnamed: 0,duration,poutcome,pdays,housing,balance,age,previous,job,campaign,education
40871,251,1.0,117,1.0,245,38,4,0.1353,2,1.0
44432,231,0.0,2,1.0,0,34,3,0.161,2,1.0


Unnamed: 0,duration,poutcome,pdays,housing,balance,age,previous,job,campaign,education
40199,148,0.0,101,0.0,3434,46,1,0.161,2,1.0
42783,186,1.0,433,0.0,1181,56,1,0.161,1,2.0


In [None]:
# Inspect the preprocessing objects: the job-frequency mapping, the encoder's
# category lists, and the encoder itself.
display(frequency_map, ordinal_encoder.categories_, ordinal_encoder)

# Save both preprocessing objects together as one (frequency_map, ordinal_encoder) tuple.
joblib.dump((frequency_map, ordinal_encoder), filename="preprocessor.pkl")


{'management': 0.2285,
 'blue-collar': 0.1861,
 'technician': 0.161,
 'admin.': 0.1353,
 'services': 0.084,
 'retired': 0.0629,
 'self-employed': 0.0341,
 'student': 0.031,
 'unemployed': 0.029,
 'entrepreneur': 0.0288,
 'housemaid': 0.0193}

[array(['success', 'failure', 'other'], dtype=object),
 array(['primary', 'secondary', 'tertiary'], dtype=object)]

['preprocessor.pkl']

In [None]:
# Train a random forest on the training split. class_weight="balanced" and a
# fixed random_state are passed to the estimator.
model = RandomForestClassifier(
    n_estimators=120,
    max_depth=15,
    min_samples_leaf=3,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)

# Save the fitted model to disk.
joblib.dump(model, "rfc_model.pkl")


['rfc_model.pkl']

In [None]:
def _predict_and_score(features, target, cutoff):
  """Score one split with the fitted `model` at a given probability cutoff.

  Returns a 4-tuple: the probabilities from column 1 of `model.predict_proba`,
  the 0/1 predictions (probability >= cutoff), the accuracy score, and the
  text classification report.
  """
  proba = model.predict_proba(features)[:, 1]
  pred = (proba >= cutoff).astype("int")
  return proba, pred, accuracy_score(target, pred), classification_report(target, pred)


# Probability cutoff at or above which a sample is predicted as class 1.
threshold = 0.55

# Training split: predict, then report accuracy and the classification report.
y_train_proba, y_train_pred, train_accuracy, train_report = _predict_and_score(X_train, y_train, threshold)
print(f"train accuracy: {train_accuracy}")
print(train_report)

print("-" * 50)

# Test split: same procedure with the same cutoff.
y_test_proba, y_test_pred, test_accuracy, test_report = _predict_and_score(X_test, y_test, threshold)
print(f"test accuracy: {test_accuracy}")
print(test_report)


train accuracy: 0.9484396200814111
              precision    recall  f1-score   support

         0.0       0.97      0.96      0.97      4497
         1.0       0.87      0.92      0.89      1399

    accuracy                           0.95      5896
   macro avg       0.92      0.94      0.93      5896
weighted avg       0.95      0.95      0.95      5896

--------------------------------------------------
test accuracy: 0.8481355932203389
              precision    recall  f1-score   support

         0.0       0.90      0.90      0.90      1125
         1.0       0.68      0.67      0.68       350

    accuracy                           0.85      1475
   macro avg       0.79      0.79      0.79      1475
weighted avg       0.85      0.85      0.85      1475



In [None]:
# Rank the features by the fitted model's importance scores, highest first.
feature_importance = model.feature_importances_
sorted_index = np.argsort(feature_importance)[::-1]
sorted_feature = np.array(X_train.columns)[sorted_index]
sorted_importance = feature_importance[sorted_index]

# This cutoff gets its own name on purpose: the evaluation cell uses `threshold`
# for the 0.55 probability cutoff, and reassigning that name here would silently
# replace it for anything executed afterwards.
importance_threshold = 0.02

# Print and collect every feature whose importance reaches the cutoff.
important_features = []
for name, importance in zip(sorted_feature, sorted_importance):
  if importance >= importance_threshold:
    important_features.append(name)
    print(f"{name}: {importance}")
print(important_features)



duration: 0.31623978255645885
poutcome: 0.19206554076657115
pdays: 0.14714205433221855
balance: 0.0789612429800955
housing: 0.07783820359863007
age: 0.06888055323934257
previous: 0.037450974574978674
job: 0.03676142324043618
campaign: 0.023818735253726967
education: 0.020841489457541457
['duration', 'poutcome', 'pdays', 'balance', 'housing', 'age', 'previous', 'job', 'campaign', 'education']


In [None]:
# Load the preprocessing objects and the model back from disk.
# The file names are the same relative names the save cells write to
# ("preprocessor.pkl", "rfc_model.pkl"), so loading works from whichever working
# directory the notebook saved into, not only when that directory is /content.
# NOTE: joblib.load unpickles arbitrary objects — only load files you created yourself.
frequency_map_loaded, ordinal_encoder_loaded = joblib.load("preprocessor.pkl")
display(frequency_map_loaded)
display(ordinal_encoder_loaded.categories_)
display(ordinal_encoder_loaded)

model_loaded = joblib.load("rfc_model.pkl")
display(model_loaded)


{'management': 0.2285,
 'blue-collar': 0.1861,
 'technician': 0.161,
 'admin.': 0.1353,
 'services': 0.084,
 'retired': 0.0629,
 'self-employed': 0.0341,
 'student': 0.031,
 'unemployed': 0.029,
 'entrepreneur': 0.0288,
 'housemaid': 0.0193}

[array(['success', 'failure', 'other'], dtype=object),
 array(['primary', 'secondary', 'tertiary'], dtype=object)]

In [None]:
# ['duration', 'poutcome', 'pdays', 'housing', 'balance', 'age', 'previous', 'job', 'campaign', 'education']


def func_predict(duration, poutcome, pdays, housing, balance, age, previous, job, campaign, education):
  """Predict class probabilities for one record given as raw feature values.

  The ten arguments are placed into a one-row frame whose column names equal
  the parameter names. "job" is replaced by its value in
  `frequency_map_loaded`, "poutcome" and "education" are encoded with
  `ordinal_encoder_loaded`, and every column is cast to float before the row
  is handed to `model_loaded.predict_proba`.

  Returns:
    Whatever `model_loaded.predict_proba` returns for that single row.
  """
  raw_row = dict(
      duration=duration, poutcome=poutcome, pdays=pdays, housing=housing,
      balance=balance, age=age, previous=previous,
      job=job, campaign=campaign, education=education,
  )
  gr_df = pd.DataFrame([raw_row])

  # Apply the same encodings that were saved at training time.
  ordinal_columns = ["poutcome", "education"]
  ordinal_codes = ordinal_encoder_loaded.transform(gr_df[ordinal_columns])
  gr_df["job"] = gr_df["job"].map(frequency_map_loaded)
  gr_df[ordinal_columns] = ordinal_codes

  return model_loaded.predict_proba(gr_df.astype(float))

# Gradio UI: one input widget per model feature, listed in the same order as the
# parameters of func_predict (see the trailing comment on each widget).
# Every slider default lies inside [minimum, maximum] and on the slider's step
# grid, so the form's initial state is a value the user could actually select.
model_predict = gr.Interface(
    fn = func_predict,
    inputs = [
        # step=1 so that both the default (5) and the maximum (2219) are reachable.
        gr.Slider(minimum=0, maximum=2219, step=1, value=5, label="最後一次聯繫的時長，以秒為單位: "), # duration
        # The choice shown as "no_record" is sent to the model as "other".
        gr.Radio([("success", "success"), ("failure", "failure"), ("no_record", "other")], label="前一次行銷活動的結果: ", value="success"), # poutcome
        # NOTE(review): the label mentions -1 as a possible value, but the slider starts at 1 — confirm which is intended.
        gr.Slider(minimum=1, maximum=871, step=1, value=1, label="上次活動以來與客戶聯繫後的天數(-1表示之前未與客戶聯繫): "), # pdays
        gr.Radio([("yes", 1), ("no", 0)], label="是否有房貸: ", value=1), # housing
        # Default 0 sits on the 100-wide step grid that starts at the minimum.
        gr.Slider(minimum=0, maximum=81204, step=100, value=0, label="平均年餘額: "), # balance
        # Default equals the minimum; a default below the minimum is not a selectable value.
        gr.Slider(minimum=18, maximum=89, step=1, value=18, label="年齡: "), # age
        gr.Slider(minimum=1, maximum=275, step=1, value=1, label="在此活動之前的聯繫次數: "), # previous
        gr.Radio(['technician', 'management', 'admin.', 'retired', 'blue-collar', 'entrepreneur',
                  'student', 'services', 'unemployed', 'self-employed', 'housemaid'], label="職業: ", value="technician"), # job
        gr.Slider(minimum=1, maximum=16, step=1, value=1, label="聯繫次數: "), # campaign
        gr.Radio(['primary', 'secondary', 'tertiary'], label="education: ", value="primary")  # education

    ],
    outputs = gr.Textbox(label="result: "),
    title = "這是 title",
    description = "這是 description"
)


# share=True asks Gradio for a public share link in addition to the local server.
model_predict.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://88fa1ec531338ed2fd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


