# Imports

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import random 

from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_log_error 
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, early_stopping 
# from catboost import CatBoostRegressor

import optuna

random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


# Reading Data

In [3]:
sample_submission = pd.read_csv('data/sample_submission.csv')
test_data = pd.read_csv('data/test.csv')
train_data = pd.read_csv('data/train.csv')

# Exploring Data

In [4]:
print('Train data:')
print(f'Columns: {train_data.shape[1]} | Rows: {train_data.shape[0]}')
print('-----------------------------')
print('Test data:')
print(f'Columns: {test_data.shape[1]} | Rows: {test_data.shape[0]}')

Train data:
Columns: 21 | Rows: 1200000
-----------------------------
Test data:
Columns: 20 | Rows: 800000


In [5]:
new_cols = []
for col in train_data.columns:
    new_cols.append(col.lower().replace(' ', '_'))
    
train_data.columns = new_cols


new_cols = []
for col in test_data.columns:
    new_cols.append(col.lower().replace(' ', '_'))

test_data.columns = new_cols

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   age                   1181295 non-null  float64
 2   gender                1200000 non-null  object 
 3   annual_income         1155051 non-null  float64
 4   marital_status        1181471 non-null  object 
 5   number_of_dependents  1090328 non-null  float64
 6   education_level       1200000 non-null  object 
 7   occupation            841925 non-null   object 
 8   health_score          1125924 non-null  float64
 9   location              1200000 non-null  object 
 10  policy_type           1200000 non-null  object 
 11  previous_claims       835971 non-null   float64
 12  vehicle_age           1199994 non-null  float64
 13  credit_score          1062118 non-null  float64
 14  insurance_duration    1199999 non-

# Preprocessing Data

In [7]:
def fill_nan_columns(data):
    numeric_columns = data.select_dtypes(include=['number']).columns
    for col in numeric_columns:
        data[col].fillna(data[col].median(), inplace=True)
    
    object_columns = data.select_dtypes(include=['object']).columns
    for col in object_columns:
        data[col].fillna("Unknown", inplace=True)

In [8]:
binary_columns = ['gender', 'smoking_status']

dummy_columns = [
    'marital_status',
    'occupation',
    'location',
    'property_type',
    'education_level',
    'policy_type',
    'customer_feedback'
]

ordinal_columns = {
    'exercise_frequency': ['Rarely', 'Monthly', 'Weekly', 'Daily']
}


In [9]:
ordinal_columns.items()

dict_items([('exercise_frequency', ['Rarely', 'Monthly', 'Weekly', 'Daily'])])

In [10]:
def encode_features(train_data, test_data):
    # encode binary features
    le = LabelEncoder()
    for feature in binary_columns:
        train_data[feature] = le.fit_transform(train_data[feature])
        test_data[feature] = le.fit_transform(test_data[feature])
    
    # encode ordinal features
    for feature, order in ordinal_columns.items():
        oe = OrdinalEncoder(categories=[order])
        train_data[feature] = oe.fit_transform(train_data[[feature]]).flatten()
        test_data[feature] = oe.fit_transform(test_data[[feature]]).flatten() 
    
    # encode categorical features
    train_data = pd.get_dummies(train_data, columns=dummy_columns, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=dummy_columns, drop_first=True)
    
    return train_data, test_data
    

In [11]:
def preprocess_datetime_columns(data):
    data['policy_start_date'] = pd.to_datetime(data['policy_start_date'])
    data['policy_start_date'] = data['policy_start_date'].astype(np.int64) / 10**9 # converts nanoseconds to seconds from epoch
    return data

In [12]:
# Preprocessing data pipeline
def preprocess_data(train_data, test_data):
    train = train_data.copy()
    test = test_data.copy()
    
    fill_nan_columns(train)
    fill_nan_columns(test)
    
    encoded_train, encoded_test = encode_features(train, test)
    
    encoded_train = preprocess_datetime_columns(encoded_train)
    encoded_test = preprocess_datetime_columns(encoded_test)
    
    return encoded_train, encoded_test

In [13]:
preprocessed_train, preprocessed_test = preprocess_data(train_data, test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [14]:
preprocessed_train

Unnamed: 0,id,age,gender,annual_income,number_of_dependents,health_score,previous_claims,vehicle_age,credit_score,insurance_duration,...,property_type_Condo,property_type_House,education_level_High School,education_level_Master's,education_level_PhD,policy_type_Comprehensive,policy_type_Premium,customer_feedback_Good,customer_feedback_Poor,customer_feedback_Unknown
0,0,19.0,0,10049.0,1.0,22.598761,2.0,17.0,372.0,5.0,...,False,True,False,False,False,False,True,False,True,False
1,1,39.0,0,31678.0,3.0,15.569731,1.0,12.0,694.0,2.0,...,False,True,False,True,False,True,False,False,False,False
2,2,23.0,1,25602.0,3.0,47.177549,1.0,14.0,595.0,3.0,...,False,True,True,False,False,False,True,True,False,False
3,3,21.0,1,141855.0,2.0,10.938144,1.0,0.0,367.0,1.0,...,False,False,False,False,False,False,False,False,True,False
4,4,21.0,1,39651.0,1.0,20.376094,0.0,8.0,598.0,4.0,...,False,True,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,1199995,36.0,0,27316.0,0.0,13.772907,1.0,5.0,372.0,3.0,...,False,False,False,True,False,False,True,False,True,False
1199996,1199996,54.0,1,35786.0,2.0,11.483482,1.0,10.0,597.0,4.0,...,False,False,False,True,False,True,False,False,True,False
1199997,1199997,19.0,1,51884.0,0.0,14.724469,0.0,19.0,595.0,6.0,...,True,False,False,True,False,False,False,True,False,False
1199998,1199998,55.0,1,23911.0,1.0,18.547381,1.0,7.0,407.0,4.0,...,False,False,False,False,True,False,True,False,True,False


In [15]:
preprocessed_test

Unnamed: 0,id,age,gender,annual_income,number_of_dependents,health_score,previous_claims,vehicle_age,credit_score,insurance_duration,...,property_type_Condo,property_type_House,education_level_High School,education_level_Master's,education_level_PhD,policy_type_Comprehensive,policy_type_Premium,customer_feedback_Good,customer_feedback_Poor,customer_feedback_Unknown
0,1200000,28.0,0,2310.0,4.0,7.657981,1.0,19.0,595.0,1.0,...,False,True,False,False,False,False,False,False,True,False
1,1200001,31.0,0,126031.0,2.0,13.381379,1.0,14.0,372.0,8.0,...,False,False,False,True,False,False,True,True,False,False
2,1200002,47.0,0,17092.0,0.0,24.354527,1.0,16.0,819.0,9.0,...,True,False,False,False,True,True,False,False,False,False
3,1200003,28.0,0,30424.0,3.0,5.136225,1.0,3.0,770.0,5.0,...,False,True,False,False,True,True,False,False,True,False
4,1200004,24.0,1,10863.0,2.0,11.844155,1.0,14.0,755.0,7.0,...,False,True,True,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,1999995,50.0,0,38782.0,1.0,14.498639,1.0,8.0,309.0,2.0,...,True,False,False,False,False,False,True,False,False,False
799996,1999996,41.0,0,73462.0,0.0,8.145748,2.0,0.0,595.0,2.0,...,False,False,False,True,False,False,False,True,False,False
799997,1999997,26.0,0,35178.0,0.0,6.636583,1.0,10.0,595.0,6.0,...,False,False,False,True,False,True,False,False,True,False
799998,1999998,34.0,0,45661.0,3.0,15.937248,2.0,17.0,467.0,7.0,...,True,False,False,True,False,False,True,False,False,False


In [16]:
new_cols = []
for col in preprocessed_train.columns:
    new_cols.append(col.lower().replace("'", '').replace(' ', '_').replace('-', '_'))

preprocessed_train.columns = new_cols


new_cols = []
for col in preprocessed_test.columns:
    new_cols.append(col.lower().replace("'", '').replace(' ', '_').replace('-', '_'))

preprocessed_test.columns = new_cols

# Models

Notes:
- Policy Start Date could be an interesting variable. In the real world when you switch insurers you typically get a better rate because they need to entice you to switch providers.

In [17]:
train = preprocessed_train.copy()
train.drop('id', axis=1, inplace=True)

In [18]:
target = 'premium_amount'

X = train.drop(target, axis=1)
y = train[target]

In [19]:
y_log = np.log1p(y)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.20, random_state=42)

## LightGBM

In [21]:
%%time

def objective_lgbm(trial):
    params = {
        'boosting_type': 'gbdt',
        # 'device': 'gpu',
        'colsample_bytree':trial.suggest_float('colsample_bytree', 0.5, 1),
        'learning_rate':trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth':trial.suggest_int('max_depth', 10, 25),
        'min_child_samples':trial.suggest_int('min_child_samples', 70, 200),
        'n_estimators':trial.suggest_int('n_estimators', 500, 2000),
        'num_leaves':trial.suggest_int('num_leaves', 20, 200),
        'reg_alpha':trial.suggest_float('reg_alpha', 1e-8, 100),
        'reg_lambda':trial.suggest_float('reg_lambda', 1e-8, 500),
        'subsample':trial.suggest_float('subsample', 0.5, 1),
        'metric':'rmse',
        'objective':'regression',
        'verbose':-1
    }
    
    model = LGBMRegressor(
        **params
    )
    
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[early_stopping(stopping_rounds=30, verbose=False)]
    )
    
    y_pred = model.predict(X_test)
    msle = mean_squared_log_error(np.expm1(y_test), np.expm1(y_pred))
    return np.sqrt(msle)

study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(objective_lgbm, n_trials=5)

print("Best parameters:", study_lgbm.best_params)
print("Best RMSLE:", study_lgbm.best_value)

[I 2024-12-29 23:14:07,400] A new study created in memory with name: no-name-bd3e3129-384f-4b41-baac-6dc26c92a48f
[I 2024-12-29 23:19:08,472] Trial 0 finished with value: 1.046516232185766 and parameters: {'colsample_bytree': 0.7646350415687834, 'learning_rate': 0.023041849887639437, 'max_depth': 18, 'min_child_samples': 91, 'n_estimators': 783, 'num_leaves': 111, 'reg_alpha': 68.65097007344342, 'reg_lambda': 461.37657084298075, 'subsample': 0.8890205554321071}. Best is trial 0 with value: 1.046516232185766.
[I 2024-12-29 23:20:18,935] Trial 1 finished with value: 1.046238576113237 and parameters: {'colsample_bytree': 0.909220866966931, 'learning_rate': 0.06002934237336119, 'max_depth': 18, 'min_child_samples': 147, 'n_estimators': 594, 'num_leaves': 126, 'reg_alpha': 3.384049220835285, 'reg_lambda': 440.00175619496156, 'subsample': 0.6396937711823723}. Best is trial 1 with value: 1.046238576113237.
[I 2024-12-29 23:23:25,010] Trial 2 finished with value: 1.0469085011316355 and paramet

Best parameters: {'colsample_bytree': 0.909220866966931, 'learning_rate': 0.06002934237336119, 'max_depth': 18, 'min_child_samples': 147, 'n_estimators': 594, 'num_leaves': 126, 'reg_alpha': 3.384049220835285, 'reg_lambda': 440.00175619496156, 'subsample': 0.6396937711823723}
Best RMSLE: 1.046238576113237
CPU times: user 1h 23min 16s, sys: 45.2 s, total: 1h 24min 1s
Wall time: 27min 43s
Compiler : 154 ms
Parser   : 164 ms


## CatBoost

In [None]:
%%time

def objective_cat(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 2000, 6000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 4, 16),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 5.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0)
        # 'task_type': 'GPU',
        # 'devices': '0'
    }
    
    model = CatBoostRegressor(
        **params
    )
    
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=30,
        verbose=False
    )
    
    y_pred = model.predict(X_test)
    msle = mean_squared_log_error(np.expm1(y_test), np.expm1(y_pred))
    return np.sqrt(msle)

study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(objective_cat, n_trials=10)

print("Best parameters:", study_cat.best_params)
print("Best RMSLE:", study_cat.best_value)

## XGBoost

In [None]:
%%time

def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 5, 18),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1, 20),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 15),
        'tree_method': 'hist',
        # 'device': 'cuda',
        'early_stopping_rounds':30
    }
    
    model = XGBRegressor(
        **params
    )
    
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )
    
    y_pred = model.predict(X_test)
    msle = mean_squared_log_error(np.expm1(y_test), np.expm1(y_pred))
    return np.sqrt(msle)

study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=10)

print("Best parameters:", study_xgb.best_params)
print("Best RMSLE:", study_xgb.best_value)

# Submission

In [None]:
# Get best parameters
best_params = study_lgbm.best_params

# Create and train model with best parameters
final_model = LGBMRegressor(**best_params)
final_model.fit(
    X_train, 
    y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[early_stopping(stopping_rounds=30, verbose=False)]
)

# Make predictions 
y_pred = final_model.predict(preprocessed_test.iloc[:, 1:]) # all columns minus 'id'

In [38]:
y_pred = np.expm1(y_pred)

In [39]:
submission = pd.DataFrame(
    data= {
        'id': preprocessed_test['id'],
        'Premium Amount': y_pred
    }
)

submission

Unnamed: 0,id,Premium Amount
0,1200000,852.581896
1,1200001,829.420179
2,1200002,773.678509
3,1200003,787.569905
4,1200004,749.006860
...,...,...
799995,1999995,989.970844
799996,1999996,554.959800
799997,1999997,828.937705
799998,1999998,815.503344


In [40]:
submission.to_csv('data/submission.csv', index=False)