# Imports

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import logging
from lightgbm import LGBMRegressor

# Configure logging
# The root logger defaults to WARNING; INFO is needed so the logging.info calls
# inside the custom pipeline transformers (column names, DataFrame previews)
# are actually emitted.
logging.basicConfig(level=logging.INFO)

# Read Data

In [3]:
# Load the train and test CSVs from the local data folder.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Data Preprocessing/Transformations

## Explore

In [103]:
# Summarise column dtypes and non-null counts to see which columns contain missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [5]:
class CleanseColumnNames(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises DataFrame column labels.

    Each label is lower-cased and its spaces are replaced by underscores
    (e.g. ``'Annual Income'`` -> ``'annual_income'``). The input frame is not
    modified; a relabelled copy is returned.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with cleansed column labels, logging them before and after."""
        cleaned = X.copy()
        logging.info(f"CleanseColumnNames: Initial columns: {cleaned.columns.tolist()}")
        cleaned.columns = [
            label.lower().replace(' ', '_')
            for label in cleaned.columns
        ]
        logging.info(f"CleanseColumnNames: Transformed columns: {cleaned.columns.tolist()}")
        return cleaned

class FeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends three derived columns.

    Expects the cleansed (snake_case) columns ``insurance_duration``,
    ``annual_income``, ``number_of_dependents`` and ``credit_score`` and adds:

    * ``contract_length`` - ``insurance_duration`` bucketed into
      0 (d <= 1), 1 (1 < d <= 3) or 2 (d > 3). Missing durations are filled
      with 99 before bucketing and therefore land in bucket 2.
    * ``income_per_dependent`` - ``annual_income`` divided by
      ``number_of_dependents``, with zero dependents treated as one.
    * ``credit_score_per_insurance_duration`` - ``credit_score`` divided by
      ``insurance_duration``; a zero duration yields NaN instead of inf.

    The input frame is not modified; a copy with the extra columns is returned.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns added."""
        X = X.copy()
        logging.info("FeatureEngineering: Before:")
        logging.info(X.head())  # Adjust based on desired details

        X['contract_length'] = pd.cut(
            X['insurance_duration'].fillna(99),
            bins=[-float('inf'), 1, 3, float('inf')],
            labels=[0, 1, 2]
        ).astype(int)

        X['income_per_dependent'] = X['annual_income'] / X['number_of_dependents'].replace(0, 1)

        # Guard the denominator like the ratio above: dividing by a zero
        # duration would produce inf, which scikit-learn's imputer/scaler
        # reject. NaN is left for the downstream numeric imputer to fill.
        nonzero_duration = X['insurance_duration'].replace(0, np.nan)
        X['credit_score_per_insurance_duration'] = X['credit_score'] / nonzero_duration

        logging.info("FeatureEngineering: After:")
        logging.info(X.head())
        return X


def create_pipeline(numeric_features=None, ordinal_features=None, ordinal_categories=None, category_features=None):
    """Build the end-to-end preprocessing + LightGBM regression pipeline.

    Steps, in order: column-name cleansing, feature engineering, a
    ColumnTransformer that preprocesses each feature group, and an
    ``LGBMRegressor`` with default settings. Because the ColumnTransformer
    runs after the cleansing step, feature names must be given in their
    cleansed (lower-case, underscore) form.

    Parameters
    ----------
    numeric_features : column selector
        Columns that are median-imputed and standardised.
    ordinal_features : list of str
        Columns that are ordinal-encoded. Missing values are first replaced
        by the string ``'missing'``.
    ordinal_categories : dict
        Maps every name in ``ordinal_features`` to its ordered list of levels.
        Values outside that list (including the ``'missing'`` placeholder,
        unless it is listed) are encoded as ``-1``.
    category_features : column selector
        Columns that are imputed with ``'missing'`` and one-hot encoded with
        the first level dropped.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.

    Raises
    ------
    KeyError
        If a name in ``ordinal_features`` has no entry in ``ordinal_categories``.
    """
    # Normalise the ordinal selection so the category orders can be looked up
    # per feature, in the same order the columns are handed to the encoder.
    if ordinal_features is None:
        ordinal_columns = []
    elif isinstance(ordinal_features, str):
        ordinal_columns = [ordinal_features]
    else:
        ordinal_columns = list(ordinal_features)

    category_orders = ordinal_categories if ordinal_categories is not None else {}
    without_order = [name for name in ordinal_columns if name not in category_orders]
    if without_order:
        raise KeyError(f"ordinal_categories has no category order for: {without_order}")

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OrdinalEncoder(
            # One ordered level list per ordinal column (previously hard-coded
            # to 'exercise_frequency' regardless of ordinal_features).
            categories=[list(category_orders[name]) for name in ordinal_columns],
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ))
    ])

    category_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

    # TODO: add a date transformer for policy_start_date.

    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric_features', numeric_transformer, numeric_features),
            ('ordinal_features', ordinal_transformer, ordinal_columns),
            ('category_features', category_transformer, category_features)
        ])

    full_pipeline = Pipeline([
        ('cleanse column names', CleanseColumnNames()),
        ('feature engineering', FeatureEngineering()),
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor())
    ])

    return full_pipeline

- Instead of imputing the median for numeric features, build a model to predict each missing value.
    - Start by predicting vehicle age.

- Need to engineer features from Policy Start Date.
    - Assumption: the longer a person has held a policy, the higher the premium.

- The value of what is being insured should largely determine the premium (although the insurer also has an incentive to maximize profit), so use the features that act as proxies for value to create an 'estimated value' feature.

## Imputation: Predict Vehicle Age

In [4]:
# Split the training rows on whether Vehicle Age is observed.
# The mask is computed once, BEFORE train_data is filtered: selecting the
# missing rows from the already-filtered frame would always give an empty
# result. The rows to impute also get their own name so that the test_data
# loaded from data/test.csv is not overwritten.
vehicle_age_is_missing = train_data['Vehicle Age'].isna()
vehicle_age_missing_rows = train_data[vehicle_age_is_missing].copy()
# .copy() makes train_data an independent frame, so later column edits do not
# act on a slice of the original.
train_data = train_data[~vehicle_age_is_missing].copy()

In [6]:
# Drop the row identifier. Rebinding (instead of inplace=True) with
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError because 'id' is already gone.
train_data = train_data.drop(columns='id', errors='ignore')

# Vehicle Age is the target of the imputation model; everything else is a candidate predictor.
target = 'Vehicle Age'
X = train_data.drop(columns=target)
y = train_data[target]

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Predictors for the vehicle-age model, named as they appear after column-name
# cleansing. 'vehicle_age' is deliberately left out: it is the target here.
numeric_features = [
    'age', 'annual_income', 'number_of_dependents', 'health_score',
    'previous_claims', 'credit_score', 'insurance_duration',
]

category_features = [
    'marital_status', 'occupation', 'location',
    'property_type', 'education_level', 'policy_type',
    'customer_feedback', 'gender', 'smoking_status',
]

# Ordered levels for each ordinal feature, in the order the encoder should rank them.
ordinal_categories = {
    'exercise_frequency': ['Rarely', 'Monthly', 'Weekly', 'Daily'],
}
ordinal_features = list(ordinal_categories)

# date_features = ['policy_start_date']

In [12]:
# Assemble the pipeline from the feature lists above and fit it on the training split.
pipeline = create_pipeline(
    numeric_features=numeric_features,
    ordinal_features=ordinal_features,
    ordinal_categories=ordinal_categories,
    category_features=category_features,
)
pipeline.fit(X_train, y_train)

# Pipeline.fit returns the pipeline itself, so this name refers to the same fitted object.
predict_vehicle_age = pipeline



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 880
[LightGBM] [Info] Number of data points in the train set: 959995, number of used features: 28
[LightGBM] [Info] Start training from score 9.565900


In [13]:
# Predict vehicle age for the held-out split (X_test) with the fitted pipeline.
y_pred = predict_vehicle_age.predict(X_test)



In [14]:
# Mean absolute error of the vehicle-age predictions on the held-out split.
error = (y_pred - y_test).abs()
error.mean()

5.002807719305263

In [28]:
# Fill in only the rows whose Vehicle Age is actually missing. Assigning the
# predictions to the whole column would replace every observed value with a
# model estimate.
# NOTE(review): train_data was filtered to rows with a known Vehicle Age
# earlier in this section, so no rows may be left to impute here - confirm
# which frame is meant to receive the imputed values.
vehicle_age_missing = train_data['Vehicle Age'].isna()
vehicle_age_imputation = np.array([])

if vehicle_age_missing.any():
    vehicle_age_imputation = predict_vehicle_age.predict(train_data.loc[vehicle_age_missing])
    train_data.loc[vehicle_age_missing, 'Vehicle Age'] = vehicle_age_imputation



## Final Model

In [6]:
# Separate the predictors from the premium target. The model is trained on
# log1p(premium); predictions are mapped back with expm1 at prediction time.
target = 'Premium Amount'
y = train_data[target]
X = train_data.drop(columns=[target])

y_log = np.log1p(y)

In [31]:
# test_data.drop('id', axis=1, inplace=True)

In [7]:
# Hold out 20% of the rows, splitting against the log-transformed target; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

## Predict

In [8]:
# Predictors for the premium model, named as they appear after column-name
# cleansing: the raw numeric columns (now including 'vehicle_age') followed by
# the three columns added by the feature-engineering step.
numeric_features = [
    'age', 'annual_income', 'number_of_dependents', 'health_score',
    'previous_claims', 'vehicle_age', 'credit_score', 'insurance_duration',
    'contract_length', 'income_per_dependent', 'credit_score_per_insurance_duration',
]

category_features = [
    'marital_status', 'occupation', 'location',
    'property_type', 'education_level', 'policy_type',
    'customer_feedback', 'gender', 'smoking_status',
]

# Ordered levels for each ordinal feature, in the order the encoder should rank them.
ordinal_categories = {
    'exercise_frequency': ['Rarely', 'Monthly', 'Weekly', 'Daily'],
}
ordinal_features = list(ordinal_categories)

# date_features = ['policy_start_date']

In [9]:
# Assemble the pipeline from the feature lists above and fit it on the log-premium training split.
pipeline = create_pipeline(
    numeric_features=numeric_features,
    ordinal_features=ordinal_features,
    ordinal_categories=ordinal_categories,
    category_features=category_features,
)
pipeline.fit(X_train, y_train)

# Pipeline.fit returns the pipeline itself, so this name refers to the same fitted object.
predict_premium_amount = pipeline

INFO:root:CleanseColumnNames: Initial columns: ['id', 'Age', 'Gender', 'Annual Income', 'Marital Status', 'Number of Dependents', 'Education Level', 'Occupation', 'Health Score', 'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']
INFO:root:CleanseColumnNames: Transformed columns: ['id', 'age', 'gender', 'annual_income', 'marital_status', 'number_of_dependents', 'education_level', 'occupation', 'health_score', 'location', 'policy_type', 'previous_claims', 'vehicle_age', 'credit_score', 'insurance_duration', 'policy_start_date', 'customer_feedback', 'smoking_status', 'exercise_frequency', 'property_type']
INFO:root:FeatureEngineering: Before:
INFO:root:            id   age  gender  annual_income marital_status  \
404339  404339  45.0    Male        91315.0       Divorced   
748487  748487  51.0  Female         5280.0        Married   
435951  

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.118144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1414
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 32
[LightGBM] [Info] Start training from score 6.593848


In [10]:
# Predict on the test set and undo the log1p applied to the training target.
y_pred = np.expm1(pipeline.predict(test_data))

INFO:root:CleanseColumnNames: Initial columns: ['id', 'Age', 'Gender', 'Annual Income', 'Marital Status', 'Number of Dependents', 'Education Level', 'Occupation', 'Health Score', 'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']
INFO:root:CleanseColumnNames: Transformed columns: ['id', 'age', 'gender', 'annual_income', 'marital_status', 'number_of_dependents', 'education_level', 'occupation', 'health_score', 'location', 'policy_type', 'previous_claims', 'vehicle_age', 'credit_score', 'insurance_duration', 'policy_start_date', 'customer_feedback', 'smoking_status', 'exercise_frequency', 'property_type']
INFO:root:FeatureEngineering: Before:
INFO:root:        id   age  gender  annual_income marital_status  number_of_dependents  \
0  1200000  28.0  Female         2310.0            NaN                   4.0   
1  1200001  31.0  Female       126

# Submission

In [11]:
# Load the submission template that the predictions are written into.
sample_submission = pd.read_csv(filepath_or_buffer='data/sample_submission.csv')

In [12]:
# Put the predictions in the target column (matched by row position) and write the file without the index.
sample_submission = sample_submission.assign(**{'Premium Amount': y_pred})
sample_submission.to_csv('data/submission.csv', index=False)