In [253]:
import pandas as pd

# Load the bank dataset; the file uses ';' as its field separator.
data = pd.read_csv(filepath_or_buffer="bank-full.csv", sep=";")

# Preview the first rows as the cell's rich output.
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [254]:
# Dataset overview: row/column counts, in-memory size, dtypes and per-column cardinality.
row_count, column_count = data.shape
size_mb = data.memory_usage(deep=True).sum() / 1024**2

print("=== 資料集基本資訊 ===")
print(f"資料筆數: {row_count:,}")
print(f"欄位數量: {column_count}")
print(f"資料大小: {size_mb:.2f} MB")

print("\n=== 欄位名稱和資料型態 ===")
data.info()

print("\n=== 各欄位的唯一值數量 ===")
# One pass over the per-column distinct-value counts.
for col, unique_count in data.nunique().items():
    print(f"{col}: {unique_count} 個唯一值")
    # Columns with at most 10 distinct values are small enough to list in full.
    if unique_count <= 10:
        distinct_values = sorted(data[col].unique())
        print(f"  → {distinct_values}")
    print()

=== 資料集基本資訊 ===
資料筆數: 45,211
欄位數量: 17
資料大小: 29.20 MB

=== 欄位名稱和資料型態 ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB

=== 各欄位的唯一值數量 ===
age: 77 

In [255]:
# Render the raw frame; the notebook output shows a truncated view (head and tail rows).
display(data)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [256]:
# Missing-value check: count nulls per column, print a verdict, then show the counts.
missing_values = data.isnull().sum()
total_missing = missing_values.sum()

print("=== 檢查缺失值 ===")
if total_missing != 0:
    print(f"⚠️ 資料集中有 {total_missing} 個缺失值，請進行處理。")
    # Only the columns that actually contain nulls are listed.
    print(missing_values[missing_values > 0])
else:
    print("✅ 資料集中沒有缺失值。")

# Per-column null counts as the cell's output value.
missing_values

=== 檢查缺失值 ===
✅ 資料集中沒有缺失值。


age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [257]:
# Convert every categorical (object-dtype) column of `data` to integer codes.
#
# A separate LabelEncoder is fitted for each column and stored in
# `label_encoders` (column name -> fitted encoder). Reusing one encoder for all
# columns would overwrite its fitted classes on every iteration, leaving no way
# to map the integer codes back to the original category strings.
# Decode later with label_encoders[col].inverse_transform(codes), or inspect
# the code order through label_encoders[col].classes_.
#
# Note: `data` is modified in place. Re-running this cell once the columns are
# numeric finds no object columns and therefore rebuilds `label_encoders` empty.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()  # keeps `le` defined even when no column needs encoding
label_encoders = {}
for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # after the loop, `le` is the encoder of the last encoded column

display(data)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,9,977,3,-1,0,3,1
45207,71,5,0,0,0,1729,0,0,0,17,9,456,2,-1,0,3,1
45208,72,5,1,1,0,5715,0,0,0,17,9,1127,5,184,3,2,1
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3,0


In [258]:
# Separate features from the target, then hold out a test set.
# Scaling is not done in this cell; it is applied to the training split afterwards.
from sklearn.model_selection import train_test_split

print("=== 正確的資料分割和正規化流程 ===")

# Target is the 'y' column; features are every other column.
y = data['y']
X = data.drop(columns='y')

print(f"特徵矩陣形狀: {X.shape}")
print(f"目標變數形狀: {y.shape}")

# 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): the split is not stratified; the exported class distribution is
# roughly 88% / 12%, so stratify=y may be worth considering — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

display(X_train)

=== 正確的資料分割和正規化流程 ===
特徵矩陣形狀: (45211, 16)
目標變數形狀: (45211,)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
20017,31,4,2,2,0,2,0,0,0,8,1,848,1,-1,0,3
31835,41,1,1,1,0,1521,1,0,0,8,0,46,4,-1,0,3
41232,59,4,1,2,0,474,0,0,0,21,1,252,1,-1,0,3
10704,38,0,1,1,0,1116,1,1,2,16,6,406,2,-1,0,3
13670,41,1,2,0,0,74,0,0,2,9,5,193,4,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,25,4,1,2,0,937,1,0,0,23,5,115,1,-1,0,3
79,55,7,0,1,1,1,1,0,2,5,8,208,1,-1,0,3
12119,54,1,1,0,0,1033,0,0,2,20,6,152,3,-1,0,3
14147,44,3,1,0,0,71,1,0,0,11,5,251,2,-1,0,3


In [259]:
# Scale the training features.
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm; it
# does not standardize each feature (column) the way StandardScaler does.
# StandardScaler is imported but unused here — confirm row-wise scaling is intended.
from sklearn.preprocessing import StandardScaler, Normalizer

# The fitted transformer stays in `no` so the same transform can be applied to the test split.
no = Normalizer().fit(X_train)
scalered_data = no.transform(X_train)
display(scalered_data.shape)


(36168, 16)

In [260]:
# Feature filtering on the scaled training matrix:
#   step 1 removes near-constant columns, step 2 removes highly correlated columns.
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# --- Step 1: Low Variance Filter ---
print("=== Step 1: Low Variance Filter ===")

# Features whose training-set variance falls below the threshold are removed.
lvf = VarianceThreshold(threshold=1e-06)
X_train_lvf = lvf.fit_transform(scalered_data)

n_original = scalered_data.shape[1]
n_after_lvf = X_train_lvf.shape[1]
print(f"原始特徵數量: {n_original}")
print(f"Low Variance Filter 後: {n_after_lvf}")
print(f"移除了 {n_original - n_after_lvf} 個低變異特徵")

# Diagnostic only: per-column variance of the matrix that was fed to the filter.
variances = scalered_data.var(axis=0)
print("\n各特徵變異數:")
for i, var in enumerate(variances):
    print(f"Feature {i}: {var:.6f}")

# --- Step 2: High Correlation Filter ---
print("\n=== Step 2: High Correlation Filter ===")

# Wrap the array in a DataFrame (default integer column labels) to use pandas' corr().
X_train_df = pd.DataFrame(X_train_lvf)

# Absolute pairwise correlations between the remaining columns.
corr_matrix = X_train_df.corr().abs()
print("相關性矩陣:")
display(corr_matrix)

# Keep only the strict upper triangle so each column pair is examined once
# and the diagonal (self-correlation) is ignored.
correlation_threshold = 0.80
strict_upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(strict_upper_mask)

# A column is dropped when it exceeds the threshold against any earlier column.
exceeds_threshold = (upper_triangle > correlation_threshold).any(axis=0)
to_drop = exceeds_threshold.index[exceeds_threshold.to_numpy()].tolist()

print(f"\n高相關性特徵（相關性 > {correlation_threshold}）:")
if not to_drop:
    print("沒有發現高相關性特徵")
    X_train_hcf = X_train_df
else:
    print(f"準備移除: {to_drop}")
    X_train_hcf = X_train_df.drop(columns=to_drop)

n_after_hcf = X_train_hcf.shape[1]
print(f"\nHigh Correlation Filter 前: {X_train_df.shape[1]} 個特徵")
print(f"High Correlation Filter 後: {n_after_hcf} 個特徵")
print(f"移除了 {len(to_drop)} 個高相關特徵")

# Summary of the feature count after each stage.
print("\n🎯 特徵篩選完成:")
print(f"原始: {n_original} → Low Variance: {n_after_lvf} → High Correlation: {n_after_hcf}")

# Final filtered training matrix as a NumPy array.
X_train_filtered = X_train_hcf.to_numpy()
display(X_train_filtered.shape)

=== Step 1: Low Variance Filter ===
原始特徵數量: 16
Low Variance Filter 後: 15
移除了 1 個低變異特徵

各特徵變異數:
Feature 0: 0.015048
Feature 1: 0.000348
Feature 2: 0.000019
Feature 3: 0.000022
Feature 4: 0.000001
Feature 5: 0.262440
Feature 6: 0.000006
Feature 7: 0.000003
Feature 8: 0.000019
Feature 9: 0.003751
Feature 10: 0.000391
Feature 11: 0.117867
Feature 12: 0.000596
Feature 13: 0.032095
Feature 14: 0.000019
Feature 15: 0.000086

=== Step 2: High Correlation Filter ===
相關性矩陣:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.672145,0.714807,0.71984,0.407524,0.527009,0.316887,0.516327,0.79297,0.769697,0.42433,0.524755,0.049026,0.003961,0.926515
1,0.672145,1.0,0.619337,0.624849,0.285594,0.350289,0.202864,0.320689,0.619007,0.519952,0.309042,0.42602,0.041539,0.006292,0.69575
2,0.714807,0.619337,1.0,0.679199,0.323561,0.461181,0.247573,0.426797,0.690011,0.673215,0.35009,0.493137,0.031401,0.014407,0.801784
3,0.71984,0.624849,0.679199,1.0,0.306212,0.415116,0.224678,0.354452,0.654141,0.608308,0.33457,0.439233,0.03267,0.016294,0.751497
4,0.407524,0.285594,0.323561,0.306212,1.0,0.289295,0.19725,0.214545,0.346026,0.35634,0.60498,0.182917,0.123945,0.061193,0.380745
5,0.527009,0.350289,0.461181,0.415116,0.289295,1.0,0.228406,0.417539,0.455043,0.665274,0.26347,0.316019,0.027846,0.02611,0.568883
6,0.316887,0.202864,0.247573,0.224678,0.19725,0.228406,1.0,0.13325,0.282988,0.297498,0.16133,0.196336,0.019807,0.004029,0.322414
7,0.516327,0.320689,0.426797,0.354452,0.214545,0.417539,0.13325,1.0,0.3662,0.647176,0.206619,0.243046,0.132729,0.070733,0.54778
8,0.79297,0.619007,0.690011,0.654141,0.346026,0.455043,0.282988,0.3662,1.0,0.636383,0.340346,0.600915,0.068829,0.018762,0.819714
9,0.769697,0.519952,0.673215,0.608308,0.35634,0.665274,0.297498,0.647176,0.636383,1.0,0.354097,0.391921,0.023308,0.012599,0.80072



高相關性特徵（相關性 > 0.8）:
準備移除: [14]

High Correlation Filter 前: 15 個特徵
High Correlation Filter 後: 14 個特徵
移除了 1 個高相關特徵

🎯 特徵篩選完成:
原始: 16 → Low Variance: 15 → High Correlation: 14


(36168, 14)

In [261]:
# Feature Importance Filter: keep the 10 best-scoring columns of the filtered training matrix.
# NOTE(review): f_regression is a regression score function while y is a binary
# label here; f_classif is the classification counterpart — confirm the choice.
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression

kb_regressor = SelectKBest(score_func=f_regression, k=10)
X_train_kb = kb_regressor.fit_transform(X_train_filtered, y_train)

print(f"Feature Importance Filter 後: {X_train_kb.shape[1]} 個特徵")

# ============================================================
# Re-run the variance and correlation filters, this time carrying the original
# column names through each stage so the selected features can be reported by name.

# 1. Low Variance Filter
print("=== Step 1: Low Variance Filter ===")

# Column names of the feature frame X (everything except the target 'y').
original_columns = list(X.columns)
print(f"原始欄位: {original_columns}")

lvf = VarianceThreshold(threshold=1e-06)
X_train_lvf = lvf.fit_transform(scalered_data)

# Boolean mask of retained columns -> names of retained columns.
lvf_mask = lvf.get_support()
lvf_columns = X.columns[lvf_mask].tolist()
print(f"Low Variance Filter 保留的欄位: {lvf_columns}")

# 2. High Correlation Filter, with named columns
X_train_df = pd.DataFrame(X_train_lvf, columns=lvf_columns)

# Absolute correlations, restricted to the strict upper triangle so each pair counts once.
corr_matrix = X_train_df.corr().abs()
strict_upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(strict_upper_mask)

correlation_threshold = 0.80
exceeds_threshold = (upper_triangle > correlation_threshold).any(axis=0)
to_drop = exceeds_threshold.index[exceeds_threshold.to_numpy()].tolist()

X_train_hcf = X_train_df.drop(columns=to_drop) if to_drop else X_train_df

# Names of the columns that survive the correlation filter.
hcf_columns = list(X_train_hcf.columns)
print(f"High Correlation Filter 保留的欄位: {hcf_columns}")

# Map the positional indices chosen by SelectKBest back onto those names.
# This relies on X_train_filtered (used to fit kb_regressor) having the same
# column order as X_train_hcf.
selected_indices = kb_regressor.get_support(indices=True)
final_columns = X_train_hcf.columns[selected_indices].tolist()
print(f"最終選中的欄位: {final_columns}")

Feature Importance Filter 後: 10 個特徵
=== Step 1: Low Variance Filter ===
原始欄位: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
Low Variance Filter 保留的欄位: ['age', 'job', 'marital', 'education', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
High Correlation Filter 保留的欄位: ['age', 'job', 'marital', 'education', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous']
最終選中的欄位: ['age', 'job', 'marital', 'education', 'housing', 'contact', 'day', 'month', 'duration', 'campaign']


In [262]:
# Export the preprocessed dataset (selected features + target, train and test combined) to CSV.
print("=== 匯出完整資料集到CSV檔案 ===\n")

# 1. Training set: the already-filtered training frame restricted to the selected columns.
print("1️⃣ 準備訓練集資料...")
train_features = X_train_hcf[final_columns].copy()
# .values attaches the target by position: train_features has a fresh integer
# index, while y_train still carries the shuffled original row labels.
train_features['y'] = y_train.values
print(f"   訓練集維度: {train_features.shape}")

# 2. Test set: apply the transformers fitted on the training split, in the same order.
print("\n2️⃣ 準備測試集資料...")
# Step 1: scaling
X_test_normalized = no.transform(X_test)

# Step 2: Low Variance Filter
X_test_lvf = lvf.transform(X_test_normalized)

# Step 3: High Correlation Filter (drop the same columns as for the training set)
X_test_df = pd.DataFrame(X_test_lvf, columns=lvf_columns)
X_test_hcf = X_test_df.drop(columns=to_drop) if to_drop else X_test_df

# Step 4: keep the same final feature columns
test_features = X_test_hcf[final_columns].copy()
# Positional assignment, for the same index reason as above.
test_features['y'] = y_test.values
print(f"   測試集維度: {test_features.shape}")

# 3. Stack train on top of test; ignore_index gives the result a clean 0..n-1 index.
print("\n3️⃣ 合併訓練集和測試集...")
complete_data = pd.concat([train_features, test_features], axis=0, ignore_index=True)
print(f"   合併後維度: {complete_data.shape}")
print(f"   包含: 訓練集 {train_features.shape[0]} 筆 + 測試集 {test_features.shape[0]} 筆")

# 4. Write the combined frame to disk without the index column.
print("\n4️⃣ 匯出到CSV檔案...")
complete_data.to_csv("data_choice_complete.csv", index=False)
print("✅ 已匯出完整資料集: data_choice_complete.csv")

# 5. Summary of what was exported, including the class balance of the target.
print("\n" + "="*60)
print("📊 匯出資料摘要:")
print(f"   總筆數: {complete_data.shape[0]:,}")
print(f"   特徵數量: {len(final_columns)}")
print(f"   欄位: {list(complete_data.columns)}")
print(f"   目標變數 (y) 分布:")
y_counts = complete_data['y'].value_counts().sort_index()
for val, count in y_counts.items():
    pct = count / len(complete_data) * 100
    print(f"      {val}: {count:,} 筆 ({pct:.2f}%)")

# 6. Read the file back and verify shape, column names and absence of nulls.
print("\n" + "="*60)
print("🔍 驗證匯出的檔案:")
verify_data = pd.read_csv("data_choice_complete.csv")
print(f"   讀取維度: {verify_data.shape}")
print(f"   欄位檢查: {list(verify_data.columns) == list(complete_data.columns)}")
# The conditional must sit inside the message: written as
# print(f"...label: ok" if cond else "bad"), the failure branch loses the label.
integrity_status = "無缺失值" if verify_data.isnull().sum().sum() == 0 else "有缺失值"
print(f"   資料完整性: {integrity_status}")

print("\n✅ CSV檔案匯出成功！")
print("="*60)

# Preview of the first rows of the exported data.
print("\n前5行資料預覽:")
display(complete_data.head())

=== 匯出完整資料集到CSV檔案 ===

1️⃣ 準備訓練集資料...
   訓練集維度: (36168, 11)

2️⃣ 準備測試集資料...
   測試集維度: (9043, 11)

3️⃣ 合併訓練集和測試集...
   合併後維度: (45211, 11)
   包含: 訓練集 36168 筆 + 測試集 9043 筆

4️⃣ 匯出到CSV檔案...
✅ 已匯出完整資料集: data_choice_complete.csv

📊 匯出資料摘要:
   總筆數: 45,211
   特徵數量: 10
   欄位: ['age', 'job', 'marital', 'education', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'y']
   目標變數 (y) 分布:
      0: 39,922 筆 (88.30%)
      1: 5,289 筆 (11.70%)

🔍 驗證匯出的檔案:
   讀取維度: (45211, 11)
   欄位檢查: True
   資料完整性: 無缺失值

✅ CSV檔案匯出成功！

前5行資料預覽:
✅ 已匯出完整資料集: data_choice_complete.csv

📊 匯出資料摘要:
   總筆數: 45,211
   特徵數量: 10
   欄位: ['age', 'job', 'marital', 'education', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'y']
   目標變數 (y) 分布:
      0: 39,922 筆 (88.30%)
      1: 5,289 筆 (11.70%)

🔍 驗證匯出的檔案:
   讀取維度: (45211, 11)
   欄位檢查: True
   資料完整性: 無缺失值

✅ CSV檔案匯出成功！

前5行資料預覽:


Unnamed: 0,age,job,marital,education,housing,contact,day,month,duration,campaign,y
0,0.03653,0.004713,0.002357,0.002357,0.0,0.0,0.009427,0.001178,0.99926,0.001178,1
1,0.026933,0.000657,0.000657,0.000657,0.000657,0.0,0.005255,0.0,0.030218,0.002628,0
2,0.109159,0.007401,0.00185,0.0037,0.0,0.0,0.038853,0.00185,0.466239,0.00185,1
3,0.031979,0.0,0.000842,0.000842,0.000842,0.001683,0.013465,0.005049,0.341665,0.001683,0
4,0.194256,0.004738,0.009476,0.0,0.0,0.009476,0.042642,0.02369,0.914425,0.018952,0


In [251]:
# Linear regression baseline.
# The test split must go through the same preprocessing chain as the training
# split, reusing the transformers that were fitted on the training data.
from sklearn.linear_model import LinearRegression

# 1. Scale the test data
X_test_normalized = no.transform(X_test)

# 2. Apply the Low Variance Filter
X_test_lvf = lvf.transform(X_test_normalized)

# 3. Apply the High Correlation Filter (drop the same columns as in training)
X_test_df = pd.DataFrame(X_test_lvf, columns=lvf_columns)
X_test_hcf = X_test_df.drop(columns=to_drop) if to_drop else X_test_df

# 4. Apply the Feature Importance Filter (select the same features)
X_test_kb = kb_regressor.transform(X_test_hcf.values)

# Fit on X_train_kb, the SelectKBest output for the training split. It has the
# same feature columns as X_test_kb. The previous `filtered_data` name is not
# defined anywhere in this notebook and fails on a fresh kernel.
lr_model = LinearRegression()
lr_model.fit(X_train_kb, y_train)

# LinearRegression.score returns the R^2 of the predictions on the given data.
score = lr_model.score(X_test_kb, y_test)

print(f"線性迴歸擬合成度: {score}")
print(f"係數: {lr_model.coef_}")

線性迴歸擬合成度: 0.06004949835498463
係數: [ -0.44638719  -0.08101529  -1.34557328   0.98209086 -12.47638466
  -3.80741218  -0.26032869   0.68546113   0.17996294   0.5251328 ]




In [252]:
# Logistic regression classifier on the selected features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Note: this rebinds `lr_model`, replacing the linear regression model fitted earlier.
# Fit on X_train_kb, the SelectKBest output for the training split; the previous
# `filtered_data` name is not defined anywhere in this notebook.
lr_model = LogisticRegression()
lr_model.fit(X_train_kb, y_train)

# X_test_kb is the test split after the same preprocessing chain.
y_pred = lr_model.predict(X_test_kb)
accuracy = accuracy_score(y_test, y_pred)
# NOTE(review): the exported class distribution is about 88% class 0, so accuracy
# should be read against that majority-class baseline — confirm with other metrics.
print(f"邏輯迴歸準確率: {accuracy}")

邏輯迴歸準確率: 0.8858785801172178


