# Imports

In [27]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import random 

from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_log_error 
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, early_stopping 
# from catboost import CatBoostRegressor

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# Single source of truth for randomness; reuse SEED for any `random_state=` argument.
SEED = 42

# Seed both the stdlib generator and NumPy's global generator: seeding only
# `random` leaves NumPy-based randomness unseeded.
random.seed(SEED)
np.random.seed(SEED)

# Reading Data

In [30]:
# Load the three CSV inputs from the local `data/` directory in one pass.
sample_submission, test_data, train_data = map(
    pd.read_csv,
    ('data/sample_submission.csv', 'data/test.csv', 'data/train.csv'),
)

# Exploring Data

In [31]:
# DataFrame.shape is (rows, columns); unpack once per split and emit the
# whole report with a single print call.
train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape

print(
    'Train data:',
    f'Columns: {train_cols} | Rows: {train_rows}',
    '-----------------------------',
    'Test data:',
    f'Columns: {test_cols} | Rows: {test_rows}',
    sep='\n',
)

Train data:
Columns: 21 | Rows: 1200000
-----------------------------
Test data:
Columns: 20 | Rows: 800000


In [33]:
def _normalize_column_names(columns):
    """Return the labels lower-cased, with every space replaced by an underscore."""
    return [col.lower().replace(' ', '_') for col in columns]


# Apply the same naming rule to both splits so later cells can address
# columns by identical snake_case names.
train_data.columns = _normalize_column_names(train_data.columns)
test_data.columns = _normalize_column_names(test_data.columns)

In [35]:
# Print the per-column dtype and non-null count summary of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   age                   1181295 non-null  float64
 2   gender                1200000 non-null  object 
 3   annual_income         1155051 non-null  float64
 4   marital_status        1181471 non-null  object 
 5   number_of_dependents  1090328 non-null  float64
 6   education_level       1200000 non-null  object 
 7   occupation            841925 non-null   object 
 8   health_score          1125924 non-null  float64
 9   location              1200000 non-null  object 
 10  policy_type           1200000 non-null  object 
 11  previous_claims       835971 non-null   float64
 12  vehicle_age           1199994 non-null  float64
 13  credit_score          1062118 non-null  float64
 14  insurance_duration    1199999 non-

# Preprocessing Data

In [36]:
def fill_nan_columns(data):
    """Fill missing values in ``data`` in place and return the same frame.

    Numeric columns are filled with their own median; object-dtype columns
    are filled with the literal string "Unknown".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, so the call can be chained or re-assigned.

    Notes
    -----
    The medians are computed from ``data`` itself, so calling this separately
    on two frames imputes each with its own statistics.
    """
    numeric_columns = data.select_dtypes(include=['number']).columns
    for col in numeric_columns:
        # Assign the result back instead of `data[col].fillna(..., inplace=True)`:
        # the chained in-place form acts on an intermediate object and leaves
        # `data` untouched under pandas Copy-on-Write.
        data[col] = data[col].fillna(data[col].median())

    object_columns = data.select_dtypes(include=['object']).columns
    for col in object_columns:
        data[col] = data[col].fillna("Unknown")

    return data

In [37]:
# Features that encode_features() runs through LabelEncoder.
binary_columns = [
    'gender',
    'smoking_status',
]

# Features that encode_features() expands with pd.get_dummies.
dummy_columns = [
    'marital_status', 'occupation', 'location', 'property_type',
    'education_level', 'policy_type', 'customer_feedback',
]

# Feature name -> category order handed to OrdinalEncoder
# (the position in the list becomes the encoded value).
ordinal_columns = {'exercise_frequency': ['Rarely', 'Monthly', 'Weekly', 'Daily']}


In [39]:
# Display the (feature, ordered categories) pairs that the ordinal-encoding loop iterates over.
ordinal_columns.items()

dict_items([('exercise_frequency', ['Rarely', 'Monthly', 'Weekly', 'Daily'])])

In [40]:
def encode_features(train_data, test_data):
    """Encode the categorical features of both splits with one shared mapping.

    Reads the module-level ``binary_columns``, ``ordinal_columns`` and
    ``dummy_columns`` definitions:

    * ``binary_columns`` are label-encoded to integers,
    * ``ordinal_columns`` are encoded to the position of each value in its
      declared category order,
    * ``dummy_columns`` are expanded into indicator columns with the first
      level dropped.

    Every encoder is fitted once and applied to both frames, so a given
    category always maps to the same code / indicator column in train and
    test, and both outputs carry the same indicator columns.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the columns listed above. The binary and ordinal
        columns are overwritten in place on the passed frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)`` with the dummy columns expanded. These are
        new frames; the inputs still hold the un-expanded dummy columns.

    Raises
    ------
    ValueError
        Raised by ``OrdinalEncoder`` when an ordinal column holds a value that
        is not in its declared category list.
    """
    # encode binary features
    for feature in binary_columns:
        le = LabelEncoder()
        # Fit on the labels of both splits: re-fitting on test alone could
        # assign different integers to the same label when the label sets of
        # the two splits differ.
        le.fit(pd.concat([train_data[feature], test_data[feature]], ignore_index=True))
        train_data[feature] = le.transform(train_data[feature])
        test_data[feature] = le.transform(test_data[feature])

    # encode ordinal features
    for feature, order in ordinal_columns.items():
        oe = OrdinalEncoder(categories=[order])
        train_data[feature] = oe.fit_transform(train_data[[feature]]).ravel()
        # Reuse the encoder fitted on train instead of fitting a second time.
        test_data[feature] = oe.transform(test_data[[feature]]).ravel()

    # encode categorical features
    # Pin each column to the levels observed in train before expanding it.
    # get_dummies then emits the same indicator columns (and drops the same
    # first level) for both frames; a test value never seen in train becomes
    # an all-zero indicator row.
    train_categoricals = {}
    test_categoricals = {}
    for feature in dummy_columns:
        train_values = pd.Categorical(train_data[feature])
        train_categoricals[feature] = train_values
        test_categoricals[feature] = pd.Categorical(
            test_data[feature], categories=train_values.categories
        )

    train_data = pd.get_dummies(
        train_data.assign(**train_categoricals), columns=dummy_columns, drop_first=True
    )
    test_data = pd.get_dummies(
        test_data.assign(**test_categoricals), columns=dummy_columns, drop_first=True
    )

    return train_data, test_data
    

In [41]:
def preprocess_datetime_columns(data, columns=('policy_start_date',)):
    """Convert datetime columns of ``data`` to float seconds since the Unix epoch.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to convert. It is modified in place.
    columns : iterable of str, optional
        Names of the columns to convert. Defaults to ``('policy_start_date',)``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with each listed column replaced by float
        seconds since 1970-01-01. Missing timestamps become NaN.
    """
    epoch = pd.Timestamp('1970-01-01')
    one_second = pd.Timedelta(seconds=1)

    for col in columns:
        parsed = pd.to_datetime(data[col])
        if parsed.dt.tz is not None:
            # Express timezone-aware values in UTC so they can be compared
            # with the naive epoch below.
            parsed = parsed.dt.tz_convert('UTC').dt.tz_localize(None)
        # Timedelta arithmetic instead of `.astype(np.int64) / 10**9`: the
        # integer cast is only correct when the column is stored with
        # nanosecond resolution and cannot represent NaT.
        data[col] = (parsed - epoch) / one_second

    return data