In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
import joblib
import re

In [18]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [19]:
# Step 2: Parse Numerical Columns
def parse_torque_power(value, unit):
    if isinstance(value, str):
        match = re.match(r'(\d+\.?\d*)', value)
        if match:
            return float(match.group(1))
        else:
            raise ValueError(f"Invalid format for {unit}: {value}")
    return float(value)

# List of numerical columns
num_cols = ['policy_tenure', 'age_of_car', 'age_of_policyholder', 'population_density',
            'gross_weight', 'ncap_rating', 'displacement', 'cylinder', 'gear_box',
            'turning_radius', 'length', 'width', 'height', 'max_torque', 'max_power']

# Apply parsing and convert to float for train and test data
for col in num_cols:
    if col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].apply(lambda x: parse_torque_power(x, col))
        df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)
        df[col].fillna(df[col].mean(), inplace=True)
    if col in test_df.columns:
        if test_df[col].dtype == 'object':
            test_df[col] = test_df[col].apply(lambda x: parse_torque_power(x, col))
        test_df[col] = pd.to_numeric(test_df[col], errors='coerce').astype(float)
        test_df[col].fillna(df[col].mean(), inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A

In [20]:
# Step 3: Exploratory Data Analysis (EDA)
def perform_eda(df):
    print("Dataset Info:")
    print(df.info())
    print("\nClaim Distribution:")
    print(df['is_claim'].value_counts(normalize=True))
    
    sns.countplot(x='is_claim', data=df)
    plt.title('Claim Distribution')
    plt.savefig('claim_distribution.png')
    plt.close()
    
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    corr_matrix = df[numerical_cols].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.savefig('correlation_heatmap.png')
    plt.close()
    
    sns.barplot(x='fuel_type', y='is_claim', data=df, estimator=lambda x: sum(x)/len(x)*100)
    plt.title('Claim Rate by Fuel Type (%)')
    plt.savefig('claim_by_fuel.png')
    plt.close()
    
    app = Dash(__name__)
    fig1 = px.pie(df, names='is_claim', title='Claim Distribution')
    fig2 = px.histogram(df, x='age_of_policyholder', color='is_claim', title='Age Distribution by Claim')
    fig3 = go.Figure(data=go.Heatmap(z=corr_matrix.values, x=corr_matrix.columns, y=corr_matrix.columns, colorscale='RdBu'))
    fig3.update_layout(title='Correlation Heatmap')
    
    app.layout = html.Div([
        dcc.Graph(figure=fig1),
        dcc.Graph(figure=fig2),
        dcc.Graph(figure=fig3)
    ])
    
    # To run dashboard: 
    app.run(debug=True)

perform_eda(df)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58592 entries, 0 to 58591
Data columns (total 44 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   policy_id                         58592 non-null  object 
 1   policy_tenure                     58592 non-null  float64
 2   age_of_car                        58592 non-null  float64
 3   age_of_policyholder               58592 non-null  float64
 4   area_cluster                      58592 non-null  object 
 5   population_density                58592 non-null  float64
 6   make                              58592 non-null  int64  
 7   segment                           58592 non-null  object 
 8   model                             58592 non-null  object 
 9   fuel_type                         58592 non-null  object 
 10  max_torque                        58592 non-null  float64
 11  max_power                         58592 non-null  flo

In [21]:
# Step 4: Data Preprocessing
import os
X = df.drop('is_claim', axis=1)
y = df['is_claim']

cat_cols = X.select_dtypes(include=['object']).columns.drop('policy_id', errors='ignore')
num_cols = X.select_dtypes(include=['float64', 'int64']).columns.drop('policy_id', errors='ignore')

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), cat_cols)
])

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)
X_test_proc = preprocessor.transform(X_test)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_proc, y_train)

joblib.dump(preprocessor, 'preprocessor.pkl')
if os.path.exists('preprocessor.pkl'):
    print("Preprocessor saved as preprocessor.pkl")
else:
    print("Error: Failed to save preprocessor.pkl")

Preprocessor saved as preprocessor.pkl


In [22]:
# Step 5: Train Baseline Models
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_res, y_train_res)
y_pred_lr = lr.predict(X_test_proc)
print('LR AUC:', roc_auc_score(y_test, lr.predict_proba(X_test_proc)[:, 1]))
print('LR F1:', f1_score(y_test, y_pred_lr))

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_res, y_train_res)
y_pred_dt = dt.predict(X_test_proc)
print('DT AUC:', roc_auc_score(y_test, dt.predict_proba(X_test_proc)[:, 1]))
print('DT F1:', f1_score(y_test, y_pred_dt))

cv_auc_lr = cross_val_score(lr, X_train_res, y_train_res, cv=5, scoring='roc_auc').mean()
print('LR CV AUC:', cv_auc_lr)

LR AUC: 0.5917452602683552
LR F1: 0.1422961422961423
DT AUC: 0.5164697266659948
DT F1: 0.10162314749470713
LR CV AUC: 0.6310620066425923


In [23]:
# Step 6: Train Advanced Models
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_res, y_train_res)
y_pred_rf = rf.predict(X_test_proc)
print('RF AUC:', roc_auc_score(y_test, rf.predict_proba(X_test_proc)[:, 1]))
print('RF F1:', f1_score(y_test, y_pred_rf))

xgb = XGBClassifier(eval_metric='auc', random_state=42)
xgb.fit(X_train_res, y_train_res)
y_pred_xgb = xgb.predict(X_test_proc)
print('XGB AUC:', roc_auc_score(y_test, xgb.predict_proba(X_test_proc)[:, 1]))
print('XGB F1:', f1_score(y_test, y_pred_xgb))

lgbm = lgb.LGBMClassifier(random_state=42)
lgbm.fit(X_train_res, y_train_res)
y_pred_lgbm = lgbm.predict(X_test_proc)
print('LGBM AUC:', roc_auc_score(y_test, lgbm.predict_proba(X_test_proc)[:, 1]))
print('LGBM F1:', f1_score(y_test, y_pred_lgbm))

cv_auc_xgb = cross_val_score(xgb, X_train_res, y_train_res, cv=5, scoring='roc_auc').mean()
print('XGB CV AUC:', cv_auc_xgb)

RF AUC: 0.5779344074518977
RF F1: 0.11162079510703364
XGB AUC: 0.6108968949128964
XGB F1: 0.0642570281124498
[LightGBM] [Info] Number of positive: 38390, number of negative: 38390
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008691 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5324
[LightGBM] [Info] Number of data points in the train set: 76780, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000



X does not have valid feature names, but LGBMClassifier was fitted with feature names


X does not have valid feature names, but LGBMClassifier was fitted with feature names



LGBM AUC: 0.6325933141764357
LGBM F1: 0.027607361963190184
XGB CV AUC: 0.9703935768823839


In [24]:
# Step 7: Hyperparameter Tuning (for XGB)
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0]
}

grid_xgb = GridSearchCV(XGBClassifier(eval_metric='auc', random_state=42), param_grid, cv=5, scoring='roc_auc')
grid_xgb.fit(X_train_res, y_train_res)
print('Best Params:', grid_xgb.best_params_)
print('Best AUC:', grid_xgb.best_score_)

Best Params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best AUC: 0.9715630718878192


In [25]:
# Final Model: Tuned XGB
final_model = grid_xgb.best_estimator_
final_model.fit(X_train_res, y_train_res)
joblib.dump(final_model, 'final_xgb_model.pkl')
if os.path.exists('final_xgb_model.pkl'):
    print("Model saved as final_xgb_model.pkl")
else:
    print("Error: Failed to save final_xgb_model.pkl")

Model saved as final_xgb_model.pkl


In [26]:
# Evaluation on Test
y_pred_final = final_model.predict(X_test_proc)
print('Final Model AUC:', roc_auc_score(y_test, final_model.predict_proba(X_test_proc)[:, 1]))
print('Final Model F1:', f1_score(y_test, y_pred_final))
print(classification_report(y_test, y_pred_final))
sns.heatmap(confusion_matrix(y_test, y_pred_final), annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Final Model')
plt.savefig('confusion_matrix.png')
plt.close()

Final Model AUC: 0.6124884126435524
Final Model F1: 0.07332490518331226
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      8227
           1       0.13      0.05      0.07       562

    accuracy                           0.92      8789
   macro avg       0.53      0.51      0.51      8789
weighted avg       0.89      0.92      0.90      8789



In [27]:
# Step 8: Generate Predictions for Test Dataset
print("Processing test dataset...")
test_ids = test_df['policy_id']
test_features = test_df.drop('policy_id', axis=1, errors='ignore')
test_proc = preprocessor.transform(test_features)
test_predictions = final_model.predict(test_proc)

submission = pd.DataFrame({
    'policy_id': test_ids,
    'is_claim': test_predictions
})

sample_cols = sample_submission.columns
submission = submission[sample_cols]
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Processing test dataset...
Submission file created: submission.csv
