<a href="https://colab.research.google.com/github/michael-0907/tibami/blob/main/kaggle_car_insurance_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from collections import Counter
from imblearn.over_sampling import SMOTE




In [None]:
# Read the training CSV into a DataFrame (path is the Colab sample_data location).
data = pd.read_csv(
  "/content/sample_data/train.csv",
  encoding="utf-8",
)


In [None]:
# Show the target balance: absolute counts first, then the share of each class.
print(
  data["is_claim"].value_counts(),
  data["is_claim"].value_counts(normalize=True),
  sep="\n",
)

is_claim
0    54844
1     3748
Name: count, dtype: int64
is_claim
0    0.936032
1    0.063968
Name: proportion, dtype: float64


In [None]:
# Remove the identifier column before modelling.
# errors="ignore" keeps the cell re-runnable: a second execution, when the column
# is already gone, is a no-op instead of raising KeyError.
data = data.drop(columns=["policy_id"], errors="ignore")


In [None]:
def torque_transform(torque):
  """Return the leading numeric part of a torque spec as a float.

  The text is lower-cased and everything from the first "n" onward is
  discarded, so "250Nm@2750rpm" becomes 250.0.

  Values that are not strings (for example a column that has already been
  converted) are passed through ``float`` unchanged, so applying the
  transform a second time does not fail.

  Raises:
    ValueError: if the text before the first "n" is not a valid number.
  """
  if not isinstance(torque, str):
    # Already numeric: nothing to parse.
    return float(torque)
  return float(torque.lower().split("n")[0])
# Replace the raw torque text with its numeric value, element by element.
data["max_torque"] = data["max_torque"].map(torque_transform)

def power_transform(power):
  """Return the leading numeric part of a power spec as a float.

  The text is lower-cased and everything from the first "b" onward is
  discarded, so "88.5bhp@6000rpm" becomes 88.5.

  Values that are not strings (for example a column that has already been
  converted) are passed through ``float`` unchanged, so applying the
  transform a second time does not fail.

  Raises:
    ValueError: if the text before the first "b" is not a valid number.
  """
  if not isinstance(power, str):
    # Already numeric: nothing to parse.
    return float(power)
  return float(power.lower().split("b")[0])
# Replace the raw power text with its numeric value, element by element.
data["max_power"] = data["max_power"].map(power_transform)



In [None]:
# Encode Yes/No flag columns (object columns whose name contains "is_") as 1/0.
for col in data.columns:
  # Cheap checks first; `or` short-circuits so other columns are not scanned.
  if "is_" not in col or data[col].dtype != "object":
    continue
  # Only map columns whose non-null values are exactly {"Yes", "No"}. Any other
  # two-valued text column would be turned entirely into NaN by the mapping.
  if set(data[col].dropna().unique()) == {"Yes", "No"}:
    data[col] = data[col].map({"Yes": 1, "No": 0})

In [None]:
# Separate the target column from the feature matrix.
y = data["is_claim"]
X = data.drop("is_claim", axis=1)

In [None]:
# Shuffled 80/20 split, stratified on the target so both parts keep its class ratio.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42,
  shuffle=True,
  stratify=y,
)

# Sanity-check the resulting shapes and the class counts in each part.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print(Counter(y_train), Counter(y_test), sep="\n")

(46873, 42) (11719, 42)
(46873,) (11719,)
Counter({0: 43875, 1: 2998})
Counter({0: 10969, 1: 750})


In [None]:

# Per-column overview of the training features: position, name, dtype,
# number of distinct values, and the distinct values themselves.
for i, col in enumerate(X_train.columns):
  print(
    f"no: {i}, col_name: {col}, dtype: {X_train[col].dtype}, nunique: {X_train[col].nunique()}",
    X_train[col].unique(),
    "-" * 30,
    sep="\n",
  )

In [None]:
# Names of the object-dtype feature columns; these are one-hot encoded next.
categorical_columns = X_train.select_dtypes(include="object").columns
categorical_columns

Index(['area_cluster', 'segment', 'model', 'fuel_type', 'engine_type',
       'rear_brakes_type', 'transmission_type', 'steering_type'],
      dtype='object')

In [None]:
# One-hot encode the categorical columns of each split.
X_train_dummies = pd.get_dummies(X_train, columns=categorical_columns, dtype=float)
X_test_dummies = pd.get_dummies(X_test, columns=categorical_columns, dtype=float)

# The two splits are encoded independently, so their dummy columns only match
# when every category occurs in both. Align the test frame to the training
# layout: categories seen only in test are dropped, categories missing from
# test become all-zero columns, and the column order follows the training frame.
X_test_dummies = X_test_dummies.reindex(columns=X_train_dummies.columns, fill_value=0.0)

print(X_train_dummies.shape)
print(X_test_dummies.shape)



(46873, 94)
(11719, 94)


In [None]:
# 預處理結束，開始訓練

In [None]:
# Baseline decision tree: depth and leaf size are capped, and class_weight="balanced"
# is used instead of resampling the training data.
model_dtc = DecisionTreeClassifier(
  max_depth=20,
  min_samples_leaf=20,
  class_weight="balanced",
  random_state=42,
).fit(X_train_dummies, y_train)

y_pred_dtc = model_dtc.predict(X_test_dummies)

In [None]:
# Test-set metrics for the baseline tree, followed by its size and class labels.
print(
  f"accuracy: {accuracy_score(y_test, y_pred_dtc)}",
  classification_report(y_test, y_pred_dtc),
  f"最大深度: {model_dtc.tree_.max_depth}, 節點總數: {model_dtc.tree_.node_count}",
  f"類別標籤: {model_dtc.classes_}, 類別數量: {model_dtc.n_classes_}",
  sep="\n",
)


accuracy: 0.6198481099069887
              precision    recall  f1-score   support

           0       0.95      0.63      0.76     10969
           1       0.08      0.47      0.14       750

    accuracy                           0.62     11719
   macro avg       0.51      0.55      0.45     11719
weighted avg       0.89      0.62      0.72     11719

最大深度: 20, 節點總數: 2485
類別標籤: [0 1], 類別數量: 2


In [None]:
# Baseline random forest: 100 unrestricted-depth trees with balanced class weights.
model_rfc = RandomForestClassifier(
  n_estimators=100,
  max_depth=None,
  min_samples_leaf=10,
  class_weight="balanced",
  random_state=42,
).fit(X_train_dummies, y_train)

y_pred_rfc = model_rfc.predict(X_test_dummies)

In [None]:
# Test-set metrics for the baseline forest, followed by its number of trees.
print(
  f"accuracy: {accuracy_score(y_test, y_pred_rfc)}",
  classification_report(y_test, y_pred_rfc),
  f"樹數量: {len(model_rfc.estimators_)}",
  sep="\n",
)


accuracy: 0.8102227152487413
              precision    recall  f1-score   support

           0       0.95      0.85      0.89     10969
           1       0.12      0.29      0.17       750

    accuracy                           0.81     11719
   macro avg       0.53      0.57      0.53     11719
weighted avg       0.89      0.81      0.85     11719

樹數量: 100


In [None]:
# 以下是分割數據集後，對訓練組做 smote

In [None]:
# SMOTE 之前的類別分佈
print("SMOTE 之前類別分佈:", Counter(y_train))
# 使用 SMOTE 進行過採樣
smote = SMOTE(random_state=42)
X_train_dummies_smote, y_train_smote = smote.fit_resample(X_train_dummies, y_train)
# SMOTE 之後的類別分佈
print("SMOTE 之後類別分佈:", Counter(y_train_smote))
print(X_train_dummies_smote.shape)
print(y_train_smote.shape)

SMOTE 之前類別分佈: Counter({0: 43875, 1: 2998})
SMOTE 之後類別分佈: Counter({0: 43875, 1: 43875})
(87750, 94)
(87750,)


In [None]:
# Decision tree trained on the SMOTE-resampled data; no class weights and no
# depth cap here.
model_smote_dtc = DecisionTreeClassifier(
  max_depth=None,
  min_samples_leaf=20,
  class_weight=None,
  random_state=42,
).fit(X_train_dummies_smote, y_train_smote)

y_pred_smote_dtc = model_smote_dtc.predict(X_test_dummies)

In [None]:
# Test-set metrics for the SMOTE-trained tree, followed by its size and class labels.
print(
  f"accuracy: {accuracy_score(y_test, y_pred_smote_dtc)}",
  classification_report(y_test, y_pred_smote_dtc),
  f"最大深度: {model_smote_dtc.tree_.max_depth}, 節點總數: {model_smote_dtc.tree_.node_count}",
  f"類別標籤: {model_smote_dtc.classes_}, 類別數量: {model_smote_dtc.n_classes_}",
  sep="\n",
)


accuracy: 0.8912023210171517
              precision    recall  f1-score   support

           0       0.94      0.95      0.94     10969
           1       0.10      0.09      0.09       750

    accuracy                           0.89     11719
   macro avg       0.52      0.52      0.52     11719
weighted avg       0.88      0.89      0.89     11719

最大深度: 37, 節點總數: 3395
類別標籤: [0 1], 類別數量: 2


In [None]:
# Random forest trained on the SMOTE-resampled data; no class weights.
model_smote_rfc = RandomForestClassifier(
  n_estimators=100,
  max_depth=None,
  min_samples_leaf=20,
  class_weight=None,
  random_state=42,
).fit(X_train_dummies_smote, y_train_smote)

y_pred_smote_rfc = model_smote_rfc.predict(X_test_dummies)

In [None]:
# Test-set metrics for the SMOTE-trained random forest.
print(f"accuracy: {accuracy_score(y_test, y_pred_smote_rfc)}")
print(classification_report(y_test, y_pred_smote_rfc))
# Report the tree count of the model evaluated in this cell (model_smote_rfc),
# not of the earlier class-weighted forest.
print(f"樹數量: {len(model_smote_rfc.estimators_)}")


accuracy: 0.7215632733168359
              precision    recall  f1-score   support

           0       0.95      0.74      0.83     10969
           1       0.10      0.43      0.17       750

    accuracy                           0.72     11719
   macro avg       0.53      0.59      0.50     11719
weighted avg       0.90      0.72      0.79     11719

樹數量: 100


In [None]:
# 取重要特徵欄位重新訓練

In [None]:
# Choose which fitted model supplies the feature importances
# (uncomment exactly one alternative to switch).
# model_select = model_dtc
# model_select = model_rfc
# model_select = model_smote_dtc
model_select = model_smote_rfc

# Minimum importance a feature needs in order to be kept.
threshold = 0.01

# Rank all features from most to least important.
feature_importance = model_select.feature_importances_
sorted_index = np.argsort(feature_importance)[::-1]
sorted_feature_names = X_train_dummies.columns[sorted_index]
sorted_feature_importance = feature_importance[sorted_index]
print(sorted_feature_importance.shape, sorted_feature_names.shape)

# Keep the features at or above the threshold, preserving the ranking order,
# and list each kept feature with its importance.
is_important = sorted_feature_importance >= threshold
important_features = sorted_feature_names[is_important].tolist()
for i, j in zip(important_features, sorted_feature_importance[is_important]):
  print(f"{i}: {j}")
display(important_features)

(94,) (94,)
age_of_car: 0.3571306233544617
policy_tenure: 0.27510592511366383
age_of_policyholder: 0.18249831265251207
population_density: 0.02528226465191083


['age_of_car', 'policy_tenure', 'age_of_policyholder', 'population_density']

In [None]:
# Restrict both sets to the selected features; the training side uses the
# SMOTE-resampled frame so it stays aligned with y_train_smote.
X_train_important = X_train_dummies_smote.loc[:, important_features]
X_test_important = X_test_dummies.loc[:, important_features]


# Single-feature alternatives (commented out):
# X_train_important = X_train_dummies_smote[["age_of_car"]]
# X_test_important = X_test_dummies[["age_of_car"]]

# X_train_important = X_train_dummies_smote[["policy_tenure"]]
# X_test_important = X_test_dummies[["policy_tenure"]]

In [None]:
# Decision tree retrained on the selected features only (SMOTE-resampled rows).
model_important_dtc = DecisionTreeClassifier(
  max_depth=None,
  min_samples_leaf=20,
  class_weight=None,
  random_state=42,
).fit(X_train_important, y_train_smote)

y_pred_important_dtc = model_important_dtc.predict(X_test_important)

In [None]:
# Test-set metrics for the reduced-feature tree, followed by its size and class labels.
print(
  f"accuracy: {accuracy_score(y_test, y_pred_important_dtc)}",
  classification_report(y_test, y_pred_important_dtc),
  f"最大深度: {model_important_dtc.tree_.max_depth}, 節點總數: {model_important_dtc.tree_.node_count}",
  f"類別標籤: {model_important_dtc.classes_}, 類別數量: {model_important_dtc.n_classes_}",
  sep="\n",
)

accuracy: 0.566515914327161
              precision    recall  f1-score   support

           0       0.94      0.57      0.71     10969
           1       0.07      0.50      0.13       750

    accuracy                           0.57     11719
   macro avg       0.51      0.54      0.42     11719
weighted avg       0.89      0.57      0.67     11719

最大深度: 59, 節點總數: 6795
類別標籤: [0 1], 類別數量: 2


In [None]:
# Random forest retrained on the selected features only (SMOTE-resampled rows).
model_important_rfc = RandomForestClassifier(
  n_estimators=100,
  max_depth=None,
  min_samples_leaf=20,
  class_weight=None,
  random_state=42,
).fit(X_train_important, y_train_smote)

y_pred_important_rfc = model_important_rfc.predict(X_test_important)

In [None]:
# Test-set metrics for the reduced-feature forest, followed by its number of trees.
print(
  f"accuracy: {accuracy_score(y_test, y_pred_important_rfc)}",
  classification_report(y_test, y_pred_important_rfc),
  f"樹數量: {len(model_important_rfc.estimators_)}",
  sep="\n",
)


accuracy: 0.9033193958528885
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     10969
           1       0.09      0.05      0.07       750

    accuracy                           0.90     11719
   macro avg       0.51      0.51      0.51     11719
weighted avg       0.88      0.90      0.89     11719

樹數量: 100
