In [35]:
import pandas as pd

# Read bank-full.csv (fields are separated by ';') into a DataFrame.
data = pd.read_csv("bank-full.csv", delimiter=";")

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [36]:
# Data overview: size, dtypes and per-column cardinality.
n_rows, n_cols = data.shape
total_bytes = data.memory_usage(deep=True).sum()  # deep=True also counts object (string) payloads

print("=== 資料集基本資訊 ===")
print(f"資料筆數: {n_rows:,}")
print(f"欄位數量: {n_cols}")
print(f"資料大小: {total_bytes / 1024**2:.2f} MB")

print("\n=== 欄位名稱和資料型態 ===")
data.info()

print("\n=== 各欄位的唯一值數量 ===")
# DataFrame.nunique() yields one distinct-value count per column.
for col, unique_count in data.nunique().items():
    print(f"{col}: {unique_count} 個唯一值")
    # Low-cardinality columns (10 or fewer distinct values): list every value.
    if not unique_count > 10:
        distinct_values = sorted(data[col].unique())
        print(f"  → {distinct_values}")
    print()

=== 資料集基本資訊 ===
資料筆數: 45,211
欄位數量: 17
資料大小: 29.20 MB

=== 欄位名稱和資料型態 ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB

=== 各欄位的唯一值數量 ===
age: 77 

In [37]:
# Manual feature selection: drop columns judged unnecessary.
#   - 'day': related to 'month', and the specific day is assumed to matter little.
#   - 'marital', 'education', 'contact', 'pdays': considered by the author to be
#     largely unrelated to the outcome.
columns_to_drop = ['day', 'marital', 'education', 'contact', 'pdays']
my_choice_data = data.drop(columns=columns_to_drop)
display(my_choice_data)


Unnamed: 0,age,job,default,balance,housing,loan,month,duration,campaign,previous,poutcome,y
0,58,management,no,2143,yes,no,may,261,1,0,unknown,no
1,44,technician,no,29,yes,no,may,151,1,0,unknown,no
2,33,entrepreneur,no,2,yes,yes,may,76,1,0,unknown,no
3,47,blue-collar,no,1506,yes,no,may,92,1,0,unknown,no
4,33,unknown,no,1,no,no,may,198,1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,no,825,no,no,nov,977,3,0,unknown,yes
45207,71,retired,no,1729,no,no,nov,456,2,0,unknown,yes
45208,72,retired,no,5715,no,no,nov,1127,5,3,success,yes
45209,57,blue-collar,no,668,no,no,nov,508,4,0,unknown,no


In [38]:
# Check the selected columns for missing values.
print("=== 檢查缺失值 ===")
missing_values = my_choice_data.isna().sum()  # null count per column
total_missing = missing_values.sum()
if total_missing != 0:
    print(f"⚠️ 資料集中有 {total_missing} 個缺失值，請進行處理。")
    # Only list the columns that actually contain nulls.
    print(missing_values.loc[missing_values > 0])
else:
    print("✅ 資料集中沒有缺失值。")
# Show the per-column counts as the cell's rich output.
missing_values

=== 檢查缺失值 ===
✅ 資料集中沒有缺失值。


age         0
job         0
default     0
balance     0
housing     0
loan        0
month       0
duration    0
campaign    0
previous    0
poutcome    0
y           0
dtype: int64

In [39]:
# Convert categorical (object-dtype) columns into integer codes with LabelEncoder.
#
# A separate encoder is fitted and kept for every column. Re-fitting one shared
# encoder would overwrite its `classes_` on each iteration, so only the mapping
# of the last column could be recovered afterwards. With `label_encoders` the
# codes of any column can be mapped back, e.g.
#   label_encoders['job'].inverse_transform([4])
#
# NOTE(review): integer codes impose an arbitrary order on nominal columns
# (e.g. 'job', 'month'); consider one-hot encoding before fitting linear models.
from sklearn.preprocessing import LabelEncoder

label_encoders = {}  # column name -> fitted LabelEncoder
for col in my_choice_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    my_choice_data[col] = le.fit_transform(my_choice_data[col])
    label_encoders[col] = le
# After the loop `le` is the encoder of the last encoded column, as before.

display(my_choice_data)

Unnamed: 0,age,job,default,balance,housing,loan,month,duration,campaign,previous,poutcome,y
0,58,4,0,2143,1,0,8,261,1,0,3,0
1,44,9,0,29,1,0,8,151,1,0,3,0
2,33,2,0,2,1,1,8,76,1,0,3,0
3,47,1,0,1506,1,0,8,92,1,0,3,0
4,33,11,0,1,0,0,8,198,1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,0,825,0,0,9,977,3,0,3,1
45207,71,5,0,1729,0,0,9,456,2,0,3,1
45208,72,5,0,5715,0,0,9,1127,5,3,2,1
45209,57,1,0,668,0,0,9,508,4,0,3,0


In [40]:
# # 將資料框匯出成CSV檔案
# print("=== 匯出資料到CSV檔案 ===")

# # 方法1：匯出到當前目錄
# my_choice_data.to_csv("bank_my_choice.csv", index=False)
# print("✅ 已匯出到 bank_my_choice.csv")

# # 方法2：匯出到指定路徑（可選）
# # my_choice_data.to_csv("/path/to/your/bank_my_choice.csv", index=False)

# # 方法3：匯出時指定編碼（處理中文字符）
# # my_choice_data.to_csv("bank_my_choice_utf8.csv", index=False, encoding='utf-8')

# # 檢查匯出的檔案資訊
# print(f"匯出資料維度: {my_choice_data.shape}")
# print(f"匯出欄位: {list(my_choice_data.columns)}")

# # 驗證匯出結果 - 讀取剛匯出的檔案來確認
# verify_data = pd.read_csv("bank_my_choice.csv")
# print(f"驗證讀取維度: {verify_data.shape}")
# print("✅ CSV檔案匯出成功！")

In [41]:
# Separate the feature matrix X from the target y, then hold out a test set.
# Scaling is done in a later cell, after the split and on the features only.
from sklearn.model_selection import train_test_split

print("=== 正確的資料分割和正規化流程 ===")

# Features are every column except 'y'; the target is the 'y' column itself.
X = my_choice_data.drop(columns=['y'])
y = my_choice_data.loc[:, 'y']

print(f"特徵矩陣形狀: {X.shape}")
print(f"目標變數形狀: {y.shape}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)


=== 正確的資料分割和正規化流程 ===
特徵矩陣形狀: (45211, 11)
目標變數形狀: (45211,)


In [42]:

# Standardise the feature columns; the target is left as-is.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# parameters to both splits so the test data never influences the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the shape of every split.
for split_label, split_data in (
    ("訓練集特徵形狀", X_train_scaled),
    ("測試集特徵形狀", X_test_scaled),
    ("訓練集目標形狀", y_train),
    ("測試集目標形狀", y_test),
):
    print(f"{split_label}: {split_data.shape}")

print("\n✅ 資料分割和正規化完成！")

訓練集特徵形狀: (36168, 11)
測試集特徵形狀: (9043, 11)
訓練集目標形狀: (36168,)
測試集目標形狀: (9043,)

✅ 資料分割和正規化完成！


In [43]:
# Summary of the preprocessing workflow used in this notebook, followed by a
# quick look at the prepared training data.
workflow_notes = [
    "=== 機器學習資料處理的正確順序 ===",
    "",
    "✅ 正確流程：",
    "1. 資料清理 (移除缺失值、異常值)",
    "2. 特徵工程 (編碼類別變數)",
    "3. 分離特徵(X)和目標變數(y)",
    "4. 資料分割 (train/test split)",
    "5. 特徵正規化 (只對特徵，不對目標變數)",
    "6. 模型訓練",
    "",
    "⚠️ 重要注意事項：",
    "• 正規化應該在資料分割之後進行",
    "• 只對特徵進行正規化，目標變數通常保持原樣",
    "• 測試集使用訓練集的scaler參數(避免資料洩露)",
    "• StandardScaler輸出是numpy array，不是DataFrame",
]
print("\n".join(workflow_notes))

# Inspect the type and shape of the prepared training data.
print("\n📊 目前資料狀態：")
for var_name, var_value in (("X_train_scaled", X_train_scaled), ("y_train", y_train)):
    print(f"{var_name} type: {type(var_value)}")
    print(f"{var_name} shape: {var_value.shape}")

# Wrap the scaled array in a DataFrame so the columns are labelled when displayed.
# NOTE: the frame gets a fresh 0..n-1 index, so its row labels differ from the
# original row labels that y_train still carries.
print("\n前5行正規化後的特徵資料：")
feature_names = my_choice_data.columns.drop('y')
X_train_df = pd.DataFrame(data=X_train_scaled, columns=feature_names)
display(X_train_df.head())

print("\n前5行目標變數：")
display(y_train.head())

=== 機器學習資料處理的正確順序 ===

✅ 正確流程：
1. 資料清理 (移除缺失值、異常值)
2. 特徵工程 (編碼類別變數)
3. 分離特徵(X)和目標變數(y)
4. 資料分割 (train/test split)
5. 特徵正規化 (只對特徵，不對目標變數)
6. 模型訓練

⚠️ 重要注意事項：
• 正規化應該在資料分割之後進行
• 只對特徵進行正規化，目標變數通常保持原樣
• 測試集使用訓練集的scaler參數(避免資料洩露)
• StandardScaler輸出是numpy array，不是DataFrame

📊 目前資料狀態：
X_train_scaled type: <class 'numpy.ndarray'>
X_train_scaled shape: (36168, 11)
y_train type: <class 'pandas.core.series.Series'>
y_train shape: (36168,)

前5行正規化後的特徵資料：


Unnamed: 0,age,job,default,balance,housing,loan,month,duration,campaign,previous,poutcome
0,-0.933868,-0.107122,-0.13528,-0.440447,-1.119078,-0.436621,-1.509293,2.285006,-0.57189,-0.24319,0.443734
1,0.008528,-1.023563,-0.13528,0.049741,0.893593,-0.436621,-1.842333,-0.821114,0.394687,-0.24319,0.443734
2,1.704842,-0.107122,-0.13528,-0.28813,-1.119078,-0.436621,-1.509293,-0.023283,-0.57189,-0.24319,0.443734
3,-0.274191,-1.329043,-0.13528,-0.080954,0.893593,2.290317,0.155903,0.573154,-0.249697,-0.24319,0.443734
4,0.008528,-1.023563,-0.13528,-0.417212,-1.119078,-0.436621,-0.177137,-0.251788,0.394687,-0.24319,0.443734



前5行目標變數：


20017    1
31835    0
41232    1
10704    0
13670    0
Name: y, dtype: int64

In [52]:
# Linear regression (ordinary least squares) fitted on the encoded 0/1 target.
# NOTE: `y` is a binary label, so `.score()` below reports R^2 (coefficient of
# determination), not a classification accuracy.
from sklearn.linear_model import LinearRegression

# Use a dedicated name for this model: the logistic-regression cell binds
# `lr_model` as well, and sharing the name makes the live object depend on the
# order in which the cells were executed.
linreg_model = LinearRegression()
linreg_model.fit(X_train_scaled, y_train)

# R^2 on the held-out test split.
score = linreg_model.score(X_test_scaled, y_test)

print(f"線性迴歸擬合成度: {score}")
print(f"係數: {linreg_model.coef_}")


線性迴歸擬合成度: 0.17991087230917635
係數: [-0.00071481  0.00668627 -0.00362423  0.00941586 -0.04795645 -0.01767056
  0.00234711  0.12679776 -0.01056783  0.02009697 -0.02032867]


In [47]:
# Logistic regression classifier trained on the standardised features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training can be chained.
lr_model = LogisticRegression().fit(X_train_scaled, y_train)

# Predict class labels for the held-out split and measure the share predicted correctly.
# NOTE(review): accuracy alone can be misleading if the classes are imbalanced —
# consider also inspecting a confusion matrix.
y_pred = lr_model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"邏輯迴歸準確率: {accuracy}")

邏輯迴歸準確率: 0.8865420767444432


In [None]:
# # 多項式迴歸模型訓練 (線性)
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression

# # 建立多項式特徵轉換器
# poly_features = PolynomialFeatures(degree=2)

# # 轉換訓練和測試資料
# X_train_poly = poly_features.fit_transform(X_train)
# X_test_poly = poly_features.transform(X_test)

# # 建立線性迴歸模型並訓練
# poly_model = LinearRegression()
# poly_model.fit(X_train_poly, y_train)

# # 計算模型擬合成度
# poly_score = poly_model.score(X_test_poly, y_test)
# print(f"多項式迴歸擬合成度: {poly_score}")
