<a href="https://colab.research.google.com/github/michael-0907/tibami/blob/main/kaggle_car_insurance_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from imblearn.over_sampling import SMOTE



In [66]:
data = pd.read_csv("/content/sample_data/train.csv", encoding="utf-8")
data.shape

(58592, 44)

In [67]:
# 統計目標變數的分佈
print(data["is_claim"].value_counts())
print(data["is_claim"].value_counts(normalize=True))

is_claim
0    54844
1     3748
Name: count, dtype: int64
is_claim
0    0.936032
1    0.063968
Name: proportion, dtype: float64


In [68]:
# 刪除不必要的欄位
data = data.drop(columns=["policy_id"])
data.shape

(58592, 43)

In [69]:
# 提取並轉換 max_torque, max_power 欄位
def torque_transform(torque):
  torque = float(torque.lower().split("n")[0])
  return torque
data["max_torque"] = data["max_torque"].apply(torque_transform)

def power_transform(power):
  power = float(power.lower().split("b")[0])
  return power
data["max_power"] = data["max_power"].apply(power_transform)



In [70]:
# 對二元欄位進行編碼
for col in data.columns:
  if (data[col].dtype == "object") & (data[col].nunique() == 2) & ("is_" in col):
    data[col] = data[col].map({"Yes": 1, "No": 0})

In [71]:
# 提取特徵變數、目標變數
X = data.drop(columns=["is_claim"])
y = data["is_claim"]

In [72]:
# 數據分割成訓練組、測試組
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True ,stratify=y)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print(Counter(y_train))
print(Counter(y_test))

(46873, 42) (11719, 42)
(46873,) (11719,)
Counter({0: 43875, 1: 2998})
Counter({0: 10969, 1: 750})


In [None]:

for i, col in enumerate(X_train.columns):
  if X_train[col].dtype != "object":
    print(f"no: {i}, col_name: {col}, dtype: {X_train[col].dtype}, nunique: {X_train[col].nunique()}")
    print(X_train[col].unique())
    print("-" * 30)


In [73]:
# 將特徵欄位做分類
unchanged_columns = ["policy_tenure", "age_of_car", "age_of_policyholder", 'is_esc', 'is_adjustable_steering', 'is_tpms',
            'is_parking_sensors', 'is_parking_camera', 'is_front_fog_lights', 'is_rear_window_wiper', 'is_rear_window_washer',
            'is_rear_window_defogger', 'is_brake_assist', 'is_power_door_locks', 'is_central_locking', 'is_power_steering',
            'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror', 'is_ecw', 'is_speed_alert']
numerical_columns = ["population_density", "make", "max_torque", "max_power", "airbags", "displacement", 'cylinder',
            'gear_box', 'turning_radius', 'length', 'width', 'height', 'gross_weight', "ncap_rating"]
categorical_columns = list(X_train.select_dtypes(include=["object"]).columns)
print(len(unchanged_columns) + len(numerical_columns) + len(categorical_columns))
print(X_train.shape)

42
(46873, 42)


In [74]:
# 設置前處理器並執行資料前處理
preprocessor = ColumnTransformer(transformers=[
    ("passthrough", "passthrough", unchanged_columns),
     ("numerical", StandardScaler(), numerical_columns),
      ("categorical", OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns)])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
print(X_train.shape, X_test.shape)
print(X_train_preprocessed.shape, X_test_preprocessed.shape)



(46873, 42) (11719, 42)
(46873, 94) (11719, 94)


In [75]:
# 取得 ohe 的編碼欄位名稱，並重整合成 dataframe
cat_ohe_columns = preprocessor.named_transformers_["categorical"].get_feature_names_out()
all_column_names = unchanged_columns + numerical_columns + list(cat_ohe_columns)
len(all_column_names)
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=all_column_names)
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, columns=all_column_names)
print(X_train_preprocessed_df.shape, X_test_preprocessed_df.shape)
X_train_preprocessed_df.head(3)


(46873, 94) (11719, 94)


Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,is_front_fog_lights,is_rear_window_wiper,...,engine_type_K Series Dual jet,engine_type_K10C,engine_type_i-DTEC,rear_brakes_type_Disc,rear_brakes_type_Drum,transmission_type_Automatic,transmission_type_Manual,steering_type_Electric,steering_type_Manual,steering_type_Power
0,0.293321,0.18,0.471154,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,0.393585,0.09,0.336538,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,1.068827,0.03,0.605769,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


In [None]:
# 預處理結束，開始訓練

In [116]:
# 用決策樹訓練模型
model_dtc = DecisionTreeClassifier(max_depth=20, min_samples_leaf=5, class_weight={0: 1, 1:10}, random_state=42)
model_dtc.fit(X_train_preprocessed_df, y_train)
print(f"最大深度: {model_dtc.tree_.max_depth}, 節點總數: {model_dtc.tree_.node_count}")
print(f"類別標籤: {model_dtc.classes_}, 類別數量: {model_dtc.n_classes_}")

最大深度: 20, 節點總數: 4255
類別標籤: [0 1], 類別數量: 2


In [117]:
# 計算訓練組的預測結果
y_train_pred_dtc = model_dtc.predict(X_train_preprocessed_df)
train_acc_dtc = accuracy_score(y_train, y_train_pred_dtc)
train_report_dtc = classification_report(y_train, y_train_pred_dtc)

# 計算測試組的預測結果
y_test_pred_dtc = model_dtc.predict(X_test_preprocessed_df)
test_acc_dtc = accuracy_score(y_test, y_test_pred_dtc)
test_report_dtc = classification_report(y_test, y_test_pred_dtc)

# 顯示結果
print(f"Train Accuracy: {train_acc_dtc:.4f}")
print(f"Test Accuracy: {test_acc_dtc:.4f}")
print("Train Classification Report:\n", train_report_dtc)
print("Test Classification Report:\n", test_report_dtc)

Train Accuracy: 0.8005
Test Accuracy: 0.7193
Train Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.79      0.88     43875
           1       0.23      0.92      0.37      2998

    accuracy                           0.80     46873
   macro avg       0.61      0.86      0.63     46873
weighted avg       0.94      0.80      0.85     46873

Test Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.75      0.83     10969
           1       0.08      0.31      0.13       750

    accuracy                           0.72     11719
   macro avg       0.51      0.53      0.48     11719
weighted avg       0.89      0.72      0.79     11719



In [108]:
# 用隨機森林訓練模型
model_rfc = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=10, class_weight={0: 1, 1: 10}, random_state=42)
# model_rfc.fit(X_train_preprocessed_df, y_train)
model_rfc.fit(X_train_preprocessed_df[fil], y_train)
fil = ['policy_tenure', 'age_of_car', 'age_of_policyholder', 'population_density',]

print(f"樹數量: {len(model_rfc.estimators_)}")

樹數量: 100


In [109]:
# 計算訓練組的預測結果
# y_train_pred_rfc = model_dtc.predict(X_train_preprocessed_df)
y_train_pred_rfc = model_dtc.predict(X_train_preprocessed_df[fil])

train_acc_rfc = accuracy_score(y_train, y_train_pred_rfc)
train_report_rfc = classification_report(y_train, y_train_pred_rfc)

# 計算測試組的預測結果
# y_test_pred_rfc = model_dtc.predict(X_test_preprocessed_df)
y_test_pred_rfc = model_dtc.predict(X_test_preprocessed_df[fil])

test_acc_rfc = accuracy_score(y_test, y_test_pred_rfc)
test_report_rfc = classification_report(y_test, y_test_pred_rfc)

# 顯示結果
print(f"Train Accuracy: {train_acc_rfc:.4f}")
print(f"Test Accuracy: {test_acc_rfc:.4f}")
print("Train Classification Report:\n", train_report_rfc)
print("Test Classification Report:\n", test_report_rfc)

Train Accuracy: 0.8027
Test Accuracy: 0.7286
Train Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.80      0.88     43875
           1       0.23      0.89      0.37      2998

    accuracy                           0.80     46873
   macro avg       0.61      0.84      0.62     46873
weighted avg       0.94      0.80      0.85     46873

Test Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.76      0.84     10969
           1       0.08      0.29      0.12       750

    accuracy                           0.73     11719
   macro avg       0.51      0.52      0.48     11719
weighted avg       0.88      0.73      0.79     11719



In [None]:
# 以下是分割數據集後，對訓練組做 smote

In [17]:
# SMOTE 之前的類別分佈
print("SMOTE 之前類別分佈:", Counter(y_train))
# 使用 SMOTE 進行過採樣
smote = SMOTE(sampling_strategy=0.3, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed_df, y_train)
# SMOTE 之後的類別分佈
print("SMOTE 之後類別分佈:", Counter(y_train_smote))
print(X_train_smote.shape)
print(y_train_smote.shape)

SMOTE 之前類別分佈: Counter({0: 43875, 1: 2998})
SMOTE 之後類別分佈: Counter({0: 43875, 1: 13162})
(57037, 94)
(57037,)


In [18]:
model_smote_dtc = DecisionTreeClassifier(max_depth=5, min_samples_leaf=50, class_weight="balanced", random_state=42)
model_smote_dtc.fit(X_train_smote, y_train_smote)
y_pred_smote_dtc = model_smote_dtc.predict(X_test_preprocessed_df)

In [19]:
print(f"accuracy: {accuracy_score(y_test, y_pred_smote_dtc)}")
print(classification_report(y_test, y_pred_smote_dtc))
print(f"最大深度: {model_smote_dtc.tree_.max_depth}, 節點總數: {model_smote_dtc.tree_.node_count}")
print(f"類別標籤: {model_smote_dtc.classes_}, 類別數量: {model_smote_dtc.n_classes_}")


accuracy: 0.5826435702705008
              precision    recall  f1-score   support

           0       0.95      0.58      0.72     10969
           1       0.08      0.56      0.15       750

    accuracy                           0.58     11719
   macro avg       0.52      0.57      0.44     11719
weighted avg       0.90      0.58      0.69     11719

最大深度: 5, 節點總數: 59
類別標籤: [0 1], 類別數量: 2


In [20]:
model_smote_rfc = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=5, max_features="sqrt", class_weight="balanced", random_state=42)
model_smote_rfc.fit(X_train_smote, y_train_smote)
y_pred_smote_rfc = model_smote_rfc.predict(X_test_preprocessed_df)

In [21]:
print(f"accuracy: {accuracy_score(y_test, y_pred_smote_rfc)}")
print(classification_report(y_test, y_pred_smote_rfc))
print(f"樹數量: {len(model_rfc.estimators_)}")


accuracy: 0.5877634610461644
              precision    recall  f1-score   support

           0       0.96      0.58      0.73     10969
           1       0.09      0.63      0.16       750

    accuracy                           0.59     11719
   macro avg       0.53      0.61      0.45     11719
weighted avg       0.90      0.59      0.69     11719

樹數量: 50


In [None]:
# 取重要特徵欄位重新訓練

In [22]:
# model_select = model_dtc
# model_select = model_rfc
# model_select = model_smote_dtc
model_select = model_smote_rfc

feature_importance = model_select.feature_importances_
sorted_index = np.argsort(feature_importance)[::-1]
sorted_feature_names = X_train_smote.columns[sorted_index]
sorted_feature_importance = feature_importance[sorted_index]
print(sorted_feature_importance.shape, sorted_feature_names.shape)
important_features = []
threshold = 0.001
for i, j in zip(sorted_feature_names, sorted_feature_importance):
  if j >= threshold:
    print(f"{i}: {j}")
    important_features.append(i)
display(important_features)

(94,) (94,)
policy_tenure: 0.2907425442068121
age_of_car: 0.2717363775812463
age_of_policyholder: 0.09872627326165705
population_density: 0.03815402950106605
area_cluster_C10: 0.027932058400188278
area_cluster_C17: 0.026165518620180083
max_power: 0.014043556139207125
area_cluster_C3: 0.013705683071015384
area_cluster_C12: 0.011781028974037048
length: 0.010913151126238875
area_cluster_C15: 0.010856883864770576
height: 0.009874606698944053
area_cluster_C16: 0.009817923954025684
width: 0.009128271792031458
displacement: 0.008883723676857143
area_cluster_C7: 0.008609106883792232
area_cluster_C9: 0.007992410691736196
is_adjustable_steering: 0.007477692791201224
segment_B2: 0.0074292197650376954
engine_type_F8D Petrol Engine: 0.006699862605921855
gross_weight: 0.00578285435166556
max_torque: 0.005612027460309586
ncap_rating: 0.005248547205997884
turning_radius: 0.005025340076054302
area_cluster_C8: 0.004340493183685569
area_cluster_C19: 0.004111923452792223
model_M1: 0.003919045100639477
cyl

['policy_tenure',
 'age_of_car',
 'age_of_policyholder',
 'population_density',
 'area_cluster_C10',
 'area_cluster_C17',
 'max_power',
 'area_cluster_C3',
 'area_cluster_C12',
 'length',
 'area_cluster_C15',
 'height',
 'area_cluster_C16',
 'width',
 'displacement',
 'area_cluster_C7',
 'area_cluster_C9',
 'is_adjustable_steering',
 'segment_B2',
 'engine_type_F8D Petrol Engine',
 'gross_weight',
 'max_torque',
 'ncap_rating',
 'turning_radius',
 'area_cluster_C8',
 'area_cluster_C19',
 'model_M1',
 'cylinder',
 'fuel_type_CNG',
 'is_power_door_locks',
 'make',
 'area_cluster_C18',
 'steering_type_Electric',
 'is_central_locking',
 'area_cluster_C5',
 'area_cluster_C2',
 'area_cluster_C1',
 'area_cluster_C20',
 'is_brake_assist',
 'area_cluster_C11',
 'airbags',
 'segment_A',
 'model_M6',
 'segment_B1',
 'engine_type_K10C',
 'model_M8',
 'transmission_type_Manual',
 'is_front_fog_lights',
 'is_rear_window_defogger',
 'area_cluster_C6',
 'steering_type_Power',
 'transmission_type_Autom

In [23]:
X_train_important = X_train_smote[important_features]
X_test_important = X_test_preprocessed_df[important_features]
print(X_train_important.shape, X_test_important.shape)

# X_train_important = X_train_smote[["age_of_car"]]
# X_test_important = X_test_preprocessed_df[["age_of_car"]]

(57037, 53) (11719, 53)


In [24]:
model_important_dtc = DecisionTreeClassifier(max_depth=5, min_samples_leaf=20, class_weight="balanced", random_state=42)
model_important_dtc.fit(X_train_important, y_train_smote)
y_pred_important_dtc = model_important_dtc.predict(X_test_important)

In [25]:
print(f"accuracy: {accuracy_score(y_test, y_pred_important_dtc)}")
print(classification_report(y_test, y_pred_important_dtc))
print(f"最大深度: {model_important_dtc.tree_.max_depth}, 節點總數: {model_important_dtc.tree_.node_count}")
print(f"類別標籤: {model_important_dtc.classes_}, 類別數量: {model_important_dtc.n_classes_}")

accuracy: 0.58230224421879
              precision    recall  f1-score   support

           0       0.95      0.58      0.72     10969
           1       0.08      0.56      0.15       750

    accuracy                           0.58     11719
   macro avg       0.52      0.57      0.43     11719
weighted avg       0.90      0.58      0.69     11719

最大深度: 5, 節點總數: 57
類別標籤: [0 1], 類別數量: 2


In [None]:
model_important_rfc = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=5, max_features="sqrt", class_weight="balanced_subsample", random_state=42)
model_important_rfc.fit(X_train_important, y_train_smote)
y_pred_important_rfc = model_important_rfc.predict(X_test_important)
y_prob_important_rfc = model_important_rfc.predict_proba(X_test_important)[:, 1]


In [None]:
y_pred_important_rfc

array([1, 0, 0, ..., 0, 1, 1])

In [None]:
y_prob_important_rfc

array([[0.46229278, 0.53770722],
       [0.58638651, 0.41361349],
       [0.56380609, 0.43619391],
       ...,
       [0.61032147, 0.38967853],
       [0.43865995, 0.56134005],
       [0.42834665, 0.57165335]])

In [None]:
print(f"accuracy: {accuracy_score(y_test, y_pred_important_rfc)}")
print(classification_report(y_test, y_pred_important_rfc))
print(f"樹數量: {len(model_important_rfc.estimators_)}")

threshold = 0.6
y_pred_pro = (y_prob_important_rfc >= threshold).astype(int)
print(classification_report(y_test, y_pred_pro))


accuracy: 0.5744517450294394
              precision    recall  f1-score   support

           0       0.96      0.57      0.71     10969
           1       0.09      0.65      0.16       750

    accuracy                           0.57     11719
   macro avg       0.53      0.61      0.44     11719
weighted avg       0.90      0.57      0.68     11719

樹數量: 50
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     10969
           1       0.00      0.00      0.00       750

    accuracy                           0.94     11719
   macro avg       0.47      0.50      0.48     11719
weighted avg       0.88      0.94      0.91     11719



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
