<a href="https://colab.research.google.com/github/michael-0907/tibami/blob/main/kaggle_car_insurance_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [262]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


from imblearn.over_sampling import SMOTE





In [291]:
# Read the training table from the hard-coded CSV path and show its (rows, columns) shape.
train_csv_path = "/content/sample_data/train.csv"
data = pd.read_csv(train_csv_path, encoding="utf-8")
data.shape

(58592, 44)

In [292]:
# Distribution of the target variable: absolute counts first, then proportions.
target = data["is_claim"]
for as_fraction in (False, True):
  print(target.value_counts(normalize=as_fraction))

is_claim
0    54844
1     3748
Name: count, dtype: int64
is_claim
0    0.936032
1    0.063968
Name: proportion, dtype: float64


In [293]:
# Drop the column that is not needed.
# errors="ignore" keeps this cell re-runnable: on a second execution the column
# is already gone and the drop becomes a no-op instead of raising KeyError.
data = data.drop(columns=["policy_id"], errors="ignore")
data.shape

(58592, 43)

In [294]:
# Extract and convert the max_torque, max_power columns
def torque_transform(torque):
  """Return the leading numeric part of a torque value as a float.

  For a string, the text is lower-cased and everything from the first "n"
  onward is discarded, so "60Nm@3500rpm" becomes 60.0.

  A value that is not a string (for example one that was already converted
  by a previous run of the cell applying this function) is passed through
  float() as-is, so re-applying the function does not fail with
  AttributeError.

  Raises ValueError if the text before the first "n" is not a valid number.
  """
  if not isinstance(torque, str):
    return float(torque)
  return float(torque.lower().split("n")[0])
# Overwrite max_torque with the numeric value produced by torque_transform for each row.
data["max_torque"] = data["max_torque"].map(torque_transform)

def power_transform(power):
  """Return the leading numeric part of a power value as a float.

  For a string, the text is lower-cased and everything from the first "b"
  onward is discarded, so "40.36bhp@6000rpm" becomes 40.36.

  A value that is not a string (for example one that was already converted
  by a previous run of the cell applying this function) is passed through
  float() as-is, so re-applying the function does not fail with
  AttributeError.

  Raises ValueError if the text before the first "b" is not a valid number.
  """
  if not isinstance(power, str):
    return float(power)
  return float(power.lower().split("b")[0])
# Overwrite max_power with the numeric value produced by power_transform for each row.
data["max_power"] = data["max_power"].map(power_transform)



In [295]:
# Encode the binary Yes/No columns as 1/0.
for col in data.columns:
  # Plain `and` (not bitwise `&`) short-circuits, so the value scan below only
  # runs for object-typed columns whose name contains "is_".
  if data[col].dtype == "object" and "is_" in col:
    # Map only columns whose non-null values are exactly Yes and No. Any other
    # two-valued column would be silently turned into all-NaN by the mapping.
    if set(data[col].dropna().unique()) == {"Yes", "No"}:
      data[col] = data[col].map({"Yes": 1, "No": 0})

In [296]:
# Separate the target column from the feature columns.
target_column = "is_claim"
y = data[target_column]
X = data.drop(columns=[target_column])

In [297]:
# Split into training and test sets (80/20, shuffled, stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)
# Shapes of the feature splits, then of the target splits.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
  print(train_part.shape, test_part.shape)
# Class counts per split.
for labels in (y_train, y_test):
  print(Counter(labels))

(46873, 42) (11719, 42)
(46873,) (11719,)
Counter({0: 43875, 1: 2998})
Counter({0: 10969, 1: 750})


In [None]:

# For every non-object column, print its position, dtype, cardinality and distinct values.
for position, column_name in enumerate(X_train.columns):
  column = X_train[column_name]
  if column.dtype == "object":
    continue
  print(f"no: {position}, col_name: {column_name}, dtype: {column.dtype}, nunique: {column.nunique()}")
  print(column.unique())
  print("-" * 30)


In [299]:
# Group the feature columns by how they will be preprocessed.
# Columns passed through unchanged.
unchanged_columns = [
    "policy_tenure",
    "age_of_car",
    "age_of_policyholder",
    "is_esc",
    "is_adjustable_steering",
    "is_tpms",
    "is_parking_sensors",
    "is_parking_camera",
    "is_front_fog_lights",
    "is_rear_window_wiper",
    "is_rear_window_washer",
    "is_rear_window_defogger",
    "is_brake_assist",
    "is_power_door_locks",
    "is_central_locking",
    "is_power_steering",
    "is_driver_seat_height_adjustable",
    "is_day_night_rear_view_mirror",
    "is_ecw",
    "is_speed_alert",
]
# Columns that will be standardized.
numerical_columns = [
    "population_density",
    "make",
    "max_torque",
    "max_power",
    "airbags",
    "displacement",
    "cylinder",
    "gear_box",
    "turning_radius",
    "length",
    "width",
    "height",
    "gross_weight",
    "ncap_rating",
]
# Every remaining object-typed column will be one-hot encoded.
categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
# Sanity check: the group sizes should add up to the column count printed next.
column_groups = (unchanged_columns, numerical_columns, categorical_columns)
print(sum(len(group) for group in column_groups))
print(X_train.shape)

42
(46873, 42)


In [303]:
# Set up the preprocessor and run it: fitted on the training split, then applied to the test split.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
column_steps = [
    ("passthrough", "passthrough", unchanged_columns),
    ("numerical", StandardScaler(), numerical_columns),
    ("categorical", one_hot_encoder, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_steps)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
# Shapes before and after preprocessing.
for train_part, test_part in (
    (X_train, X_test),
    (X_train_preprocessed, X_test_preprocessed),
):
  print(train_part.shape, test_part.shape)



(46873, 42) (11719, 42)
(46873, 94) (11719, 94)


In [307]:
# Get the one-hot encoded column names and reassemble the arrays into labelled DataFrames.
cat_ohe_columns = preprocessor.named_transformers_["categorical"].get_feature_names_out()
# Output columns follow the order of the transformers list given to the ColumnTransformer.
all_column_names = unchanged_columns + numerical_columns + list(cat_ohe_columns)
# Reuse the original row labels so the frames stay aligned with y_train / y_test;
# a fresh RangeIndex would silently mismatch in any label-aligned pandas operation.
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=all_column_names, index=X_train.index)
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, columns=all_column_names, index=X_test.index)
print(X_train_preprocessed_df.shape, X_test_preprocessed_df.shape)
X_train_preprocessed_df.head(3)


(46873, 94) (11719, 94)


Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,is_front_fog_lights,is_rear_window_wiper,...,engine_type_K Series Dual jet,engine_type_K10C,engine_type_i-DTEC,rear_brakes_type_Disc,rear_brakes_type_Drum,transmission_type_Automatic,transmission_type_Manual,steering_type_Electric,steering_type_Manual,steering_type_Power
0,0.293321,0.18,0.471154,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,0.393585,0.09,0.336538,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,1.068827,0.03,0.605769,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


In [None]:
# 預處理結束，開始訓練

In [308]:
# Class-weighted decision tree fitted on the preprocessed training frame, then scored on the test frame.
dtc_params = dict(max_depth=20, min_samples_leaf=20, class_weight="balanced", random_state=42)
model_dtc = DecisionTreeClassifier(**dtc_params).fit(X_train_preprocessed_df, y_train)
y_pred_dtc = model_dtc.predict(X_test_preprocessed_df)

In [309]:
# Test-set metrics for the decision tree, followed by the fitted tree's size and class labels.
acc_dtc = accuracy_score(y_test, y_pred_dtc)
dtc_tree = model_dtc.tree_
print(f"accuracy: {acc_dtc}")
print(classification_report(y_test, y_pred_dtc))
print(f"最大深度: {dtc_tree.max_depth}, 節點總數: {dtc_tree.node_count}")
print(f"類別標籤: {model_dtc.classes_}, 類別數量: {model_dtc.n_classes_}")


accuracy: 0.6185681372130728
              precision    recall  f1-score   support

           0       0.95      0.63      0.76     10969
           1       0.08      0.48      0.14       750

    accuracy                           0.62     11719
   macro avg       0.51      0.55      0.45     11719
weighted avg       0.89      0.62      0.72     11719

最大深度: 20, 節點總數: 2477
類別標籤: [0 1], 類別數量: 2


In [310]:
# Class-weighted random forest fitted on the preprocessed training frame, then scored on the test frame.
model_rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42,
)
model_rfc.fit(X_train_preprocessed_df, y_train)
y_pred_rfc = model_rfc.predict(X_test_preprocessed_df)

In [311]:
# Test-set metrics for the random forest, followed by the number of fitted trees.
acc_rfc = accuracy_score(y_test, y_pred_rfc)
n_trees_rfc = len(model_rfc.estimators_)
print(f"accuracy: {acc_rfc}")
print(classification_report(y_test, y_pred_rfc))
print(f"樹數量: {n_trees_rfc}")


accuracy: 0.8162812526666098
              precision    recall  f1-score   support

           0       0.95      0.85      0.90     10969
           1       0.11      0.28      0.16       750

    accuracy                           0.82     11719
   macro avg       0.53      0.56      0.53     11719
weighted avg       0.89      0.82      0.85     11719

樹數量: 100


In [None]:
# 以下是分割數據集後，對訓練組做 smote

In [312]:
# Oversample the training split with SMOTE; the test frame is not passed to the resampler.
print("SMOTE 之前類別分佈:", Counter(y_train))  # class distribution before SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_preprocessed_df, y_train)
print("SMOTE 之後類別分佈:", Counter(y_train_smote))  # class distribution after SMOTE
# Shapes of the resampled features and labels.
for resampled in (X_train_smote, y_train_smote):
  print(resampled.shape)

SMOTE 之前類別分佈: Counter({0: 43875, 1: 2998})
SMOTE 之後類別分佈: Counter({0: 43875, 1: 43875})
(87750, 94)
(87750,)


In [313]:
# Decision tree without class weighting or depth limit, fitted on the SMOTE-resampled training data.
smote_dtc_params = dict(max_depth=None, min_samples_leaf=20, class_weight=None, random_state=42)
model_smote_dtc = DecisionTreeClassifier(**smote_dtc_params).fit(X_train_smote, y_train_smote)
y_pred_smote_dtc = model_smote_dtc.predict(X_test_preprocessed_df)

In [314]:
# Test-set metrics for the SMOTE-trained decision tree, followed by the tree's size and class labels.
acc_smote_dtc = accuracy_score(y_test, y_pred_smote_dtc)
smote_dtc_tree = model_smote_dtc.tree_
print(f"accuracy: {acc_smote_dtc}")
print(classification_report(y_test, y_pred_smote_dtc))
print(f"最大深度: {smote_dtc_tree.max_depth}, 節點總數: {smote_dtc_tree.node_count}")
print(f"類別標籤: {model_smote_dtc.classes_}, 類別數量: {model_smote_dtc.n_classes_}")


accuracy: 0.8887277071422476
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     10969
           1       0.09      0.08      0.09       750

    accuracy                           0.89     11719
   macro avg       0.51      0.51      0.51     11719
weighted avg       0.88      0.89      0.89     11719

最大深度: 37, 節點總數: 3397
類別標籤: [0 1], 類別數量: 2


In [315]:
# Random forest without class weighting, fitted on the SMOTE-resampled training data.
model_smote_rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_leaf=20,
    class_weight=None,
    random_state=42,
)
model_smote_rfc.fit(X_train_smote, y_train_smote)
y_pred_smote_rfc = model_smote_rfc.predict(X_test_preprocessed_df)

In [316]:
# Test-set metrics for the SMOTE-trained random forest, followed by its number of fitted trees.
print(f"accuracy: {accuracy_score(y_test, y_pred_smote_rfc)}")
print(classification_report(y_test, y_pred_smote_rfc))
# Read the tree count from the model being reported (model_smote_rfc), not from
# the earlier class-weighted forest model_rfc.
print(f"樹數量: {len(model_smote_rfc.estimators_)}")


accuracy: 0.7242938817305231
              precision    recall  f1-score   support

           0       0.95      0.75      0.84     10969
           1       0.10      0.40      0.16       750

    accuracy                           0.72     11719
   macro avg       0.52      0.58      0.50     11719
weighted avg       0.89      0.72      0.79     11719

樹數量: 100


In [None]:
# 取重要特徵欄位重新訓練

In [335]:
# Model whose feature importances drive the selection.
# Alternatives defined above: model_dtc, model_rfc, model_smote_rfc.
model_select = model_smote_dtc

# Rank all features from most to least important.
feature_importance = model_select.feature_importances_
sorted_index = np.argsort(feature_importance)[::-1]
sorted_feature_names = X_train_smote.columns[sorted_index]
sorted_feature_importance = feature_importance[sorted_index]
print(sorted_feature_importance.shape, sorted_feature_names.shape)

# Keep the features whose importance reaches the threshold, in ranked order.
threshold = 0.01
keep_mask = sorted_feature_importance >= threshold
important_features = list(sorted_feature_names[keep_mask])
for feature_name, importance in zip(important_features, sorted_feature_importance[keep_mask]):
  print(f"{feature_name}: {importance}")
display(important_features)

(94,) (94,)
age_of_car: 0.7885804466481362
policy_tenure: 0.08834290002671184
age_of_policyholder: 0.04759295243776581
length: 0.011190805350179111
population_density: 0.01065069577492912


['age_of_car',
 'policy_tenure',
 'age_of_policyholder',
 'length',
 'population_density']

In [336]:
# Restrict the SMOTE-resampled training frame and the test frame to the selected columns.
X_train_important = X_train_smote.loc[:, important_features]
X_test_important = X_test_preprocessed_df.loc[:, important_features]
print(X_train_important.shape, X_test_important.shape)


(87750, 5) (11719, 5)


In [337]:
# Decision tree refitted on the selected columns of the SMOTE-resampled training data.
important_dtc_params = dict(max_depth=None, min_samples_leaf=20, class_weight=None, random_state=42)
model_important_dtc = DecisionTreeClassifier(**important_dtc_params).fit(X_train_important, y_train_smote)
y_pred_important_dtc = model_important_dtc.predict(X_test_important)

In [338]:
# Test-set metrics for the reduced-feature decision tree, followed by the tree's size and class labels.
acc_important_dtc = accuracy_score(y_test, y_pred_important_dtc)
important_dtc_tree = model_important_dtc.tree_
print(f"accuracy: {acc_important_dtc}")
print(classification_report(y_test, y_pred_important_dtc))
print(f"最大深度: {important_dtc_tree.max_depth}, 節點總數: {important_dtc_tree.node_count}")
print(f"類別標籤: {model_important_dtc.classes_}, 類別數量: {model_important_dtc.n_classes_}")

accuracy: 0.887362402935404
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     10969
           1       0.09      0.09      0.09       750

    accuracy                           0.89     11719
   macro avg       0.51      0.51      0.51     11719
weighted avg       0.88      0.89      0.89     11719

最大深度: 37, 節點總數: 3483
類別標籤: [0 1], 類別數量: 2


In [334]:
# Random forest refitted on the selected columns of the SMOTE-resampled training data.
model_important_rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_leaf=20,
    class_weight=None,
    random_state=42,
)
model_important_rfc.fit(X_train_important, y_train_smote)
y_pred_important_rfc = model_important_rfc.predict(X_test_important)

In [327]:
# Test-set metrics for the reduced-feature random forest, followed by its number of fitted trees.
acc_important_rfc = accuracy_score(y_test, y_pred_important_rfc)
n_trees_important_rfc = len(model_important_rfc.estimators_)
print(f"accuracy: {acc_important_rfc}")
print(classification_report(y_test, y_pred_important_rfc))
print(f"樹數量: {n_trees_important_rfc}")


accuracy: 0.9059646727536479
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     10969
           1       0.10      0.06      0.07       750

    accuracy                           0.91     11719
   macro avg       0.52      0.51      0.51     11719
weighted avg       0.88      0.91      0.89     11719

樹數量: 100
