In [6]:
import pandas as pd

# Load the semicolon-delimited bank marketing dataset from the working directory.
data = pd.read_csv("bank-full.csv", delimiter=";")

# Preview the first five rows as the cell's output.
data.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Keep only the columns used by the model: three features plus the target.
# Selecting by name (instead of dropping the 13 other columns) means an
# unexpected extra column in the CSV can never silently become a feature.
# .copy() makes `final` independent of `data`, so re-running this cell is safe.
final = data[['age', 'balance', 'loan', 'y']].copy()

# Encode the yes/no columns `loan` and `y` as integers.
# An explicit .map is used instead of Series.replace: replace on an object
# column relies on implicit downcasting, which newer pandas versions deprecate
# (it emits a FutureWarning).
yes_no_to_int = {'yes': 1, 'no': 0}
for binary_col in ('loan', 'y'):
    encoded = final[binary_col].map(yes_no_to_int)
    # .map turns anything outside the mapping into NaN; fail loudly rather
    # than feeding NaN into the model.
    if encoded.isna().any():
        unexpected = sorted(final.loc[encoded.isna(), binary_col].astype(str).unique())
        raise ValueError(f"Unexpected values in column '{binary_col}': {unexpected}")
    final[binary_col] = encoded.astype('int64')

# Split into feature matrix X and target y (y stays a one-column DataFrame).
X = final[['age', 'balance', 'loan']]
y = final[['y']]

# Split into training data and test data (80/20, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Linear regression was tried first (scored worse); kept for reference:
# from sklearn.linear_model import LinearRegression
# lr = LinearRegression()
# lr.fit(X_train, y_train)   # fit() trains the model
# lr.score(X_test, y_test)   # score() measures how well the trained model fits
# lr.coef_                   # regression coefficients: each feature's weight on the target

# Logistic regression (scored better).
lr = LogisticRegression()
# y_train is a one-column DataFrame; flatten it to the 1-D array fit() expects.
lr.fit(X_train, y_train.values.ravel())

# Predict on the held-out set and evaluate.
# NOTE(review): accuracy alone can be misleading if the classes are imbalanced;
# consider comparing against the majority-class rate — TODO confirm class balance.
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"準確率: {accuracy}")

# Coefficients, listed in the same order as the feature names printed below.
print(f"係數: {lr.coef_}")
print(f"特徵名稱: {X.columns.tolist()}")

準確率: 0.8848833351763795
係數: [[ 5.87252035e-03  3.56424966e-05 -7.21278083e-01]]
特徵名稱: ['age', 'balance', 'loan']


  final['y'] = final['y'].replace(('yes', 'no'), (1, 0))
  final['loan'] = final['loan'].replace(('yes', 'no'), (1, 0))


In [9]:
# Data overview: dataset size, schema, and per-column cardinality.
n_rows, n_cols = data.shape
size_mb = data.memory_usage(deep=True).sum() / 1024**2  # bytes -> MiB

print("=== 資料集基本資訊 ===")
print(f"資料筆數: {n_rows:,}")
print(f"欄位數量: {n_cols}")
print(f"資料大小: {size_mb:.2f} MB")

print("\n=== 欄位名稱和資料型態 ===")
data.info()

print("\n=== 各欄位的唯一值數量 ===")
for col in data.columns:
    column = data[col]
    unique_count = column.nunique()
    print(f"{col}: {unique_count} 個唯一值")
    # For low-cardinality columns (10 or fewer distinct values), list every value.
    if unique_count <= 10:
        distinct_values = sorted(column.unique())
        print(f"  → {distinct_values}")
    # Blank line between columns.
    print()

=== 資料集基本資訊 ===
資料筆數: 45,211
欄位數量: 17
資料大小: 29.20 MB

=== 欄位名稱和資料型態 ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB

=== 各欄位的唯一值數量 ===
age: 77 