# Regression Prediction Notebook

In [4]:
file_path = './data/raw/'

## Introduction
This notebook explores a regression problem using an insurance dataset. The objective is to predict insurance charges based on various features such as age, income, marital status, and health conditions, among others. The notebook will guide you through data preprocessing, model selection, training, evaluation, hyperparameter tuning, and model interpretation.


## Importing Libraries 

In [2]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.linear_model import Lasso, LinearRegression, BayesianRidge, GammaRegressor
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, mean_squared_log_error, mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
import os
import datetime as dt
from category_encoders import BinaryEncoder


## Data Preprocessing

here we are making a function to clean the data in the data to be able to repeat it on any prediction data, to view the details of the preprocessing please view [insurance_eda](./insurance_eda.ipynb)

In [20]:
def fix_data_types(df):
    #converting the data types of the columns to the appropriate data types
    df['number_of_dependents'] = df['number_of_dependents'].astype(int)
    df['credit_score'] = df['credit_score'].astype(int)
    df['previous_claims'] = df['previous_claims'].astype(int)
    df['insurance_duration'] = df['insurance_duration'].astype(int)
    df['vehicle_age'] = df['vehicle_age'].astype(int)
    df['age'] = df['age'].astype(int)
    
    df['gender'] = df['gender'].astype('category')
    df['marital_status'] = df['marital_status'].astype('category')
    df['education_level'] = df['education_level'].astype('category')
    df['occupation'] = df['occupation'].astype('category')
    df['customer_feedback'] = df['customer_feedback'].astype('category')
    df['policy_type'] = df['policy_type'].astype('category')
    df['smoking_status'] = df['smoking_status'].astype('category')
    df['exercise_frequency'] = df['exercise_frequency'].astype('category')
    df['property_type'] = df['property_type'].astype('category')
    df['location'] = df['location'].astype('category')

In [21]:
def assert_data_types(df):
    #asserting that the data types of the columns are as expected
    assert df['number_of_dependents'].dtype == int
    assert df['credit_score'].dtype == int
    assert df['previous_claims'].dtype == int
    assert df['insurance_duration'].dtype == int
    assert df['vehicle_age'].dtype == int
    assert df['age'].dtype == int
    
    assert df['gender'].dtype == 'category'
    assert df['marital_status'].dtype == 'category'
    assert df['education_level'].dtype == 'category'
    assert df['occupation'].dtype == 'category'
    assert df['customer_feedback'].dtype == 'category'
    assert df['policy_type'].dtype == 'category'
    assert df['smoking_status'].dtype == 'category'
    assert df['exercise_frequency'].dtype == 'category'
    assert df['property_type'].dtype == 'category'
    assert df['location'].dtype == 'category'

In [22]:
def label_encode(df):
    le = LabelEncoder()
    df['gender'] = le.fit_transform(df['gender'])
    df['marital_status'] = le.fit_transform(df['marital_status'])
    df['education_level'] = le.fit_transform(df['education_level'])
    df['occupation'] = le.fit_transform(df['customer_feedback'])
    df['customer_feedback'] = le.fit_transform(df['customer_feedback'])
    df['policy_type'] = le.fit_transform(df['policy_type'])
    df['smoking_status'] = le.fit_transform(df['smoking_status'])
    df['exercise_frequency'] = le.fit_transform(df['exercise_frequency'])
    df['property_type'] = le.fit_transform(df['property_type'])
    df['location'] = le.fit_transform(df['location'])

In [23]:
def convert_date(data):
    dates = data["policy_start_date"]
    data["start_year"] = dates.apply(lambda x: int(x.year))
    data["start_month_sin"] = dates.apply(lambda x: np.sin(int(x.month)*(2.*np.pi/12)) )
    data["start_month_cos"] = dates.apply(lambda x: np.cos(int(x.month)*(2.*np.pi/12)) )
    data["start_day_sin"] = dates.apply(lambda x: np.sin(int(x.month)*(2.*np.pi/30)) )
    data["start_day_cos"] = dates.apply(lambda x: np.cos(int(x.month)*(2.*np.pi/30)) )
    data.drop(columns=['policy_start_date'], axis=1, inplace=True)

In [24]:
def impute_data(data, category_columns=['property_type', 'policy_type', 'education_level']):
    
    # Ensure numeric columns are selected, excluding 'premium_amount'
    numeric_columns = data.select_dtypes(include=[np.number]).columns


    imputer = SimpleImputer(strategy='median')
    # Impute missing values by the median value within each group
    for value, group in data.groupby(category_columns):
        group[numeric_columns] = imputer.fit_transform(group[numeric_columns])
        data.loc[group.index] = group
    
    

    return data

In [25]:
def clean_data(data): 
    #dropping the occupation column
    data.columns = data.columns.str.lower().str.replace(' ', '_')
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns
    #imputing missing values
    data = impute_data(data)
    data[categorical_cols] = SimpleImputer(strategy='most_frequent').fit_transform(data[categorical_cols])
    assert data.isnull().sum().sum() == 0
    label_encode(data)
    convert_date(data)
    
    #fixing data types
    fix_data_types(data)
    #asserting data types
    assert_data_types(data)
    return data

## Model Selection and Training
- Choose regression models (e.g., Linear Regression, Decision Tree, Random Forest)
- Train the models on the training set
- Evaluate the models using cross-validation

#### loading the data

In [26]:
df = pd.read_csv(os.path.join(file_path, 'train.csv'), parse_dates=['Policy Start Date'])

#### cleaning the data

In [27]:
df_clean = clean_data(df)

In [28]:
df_clean.dtypes

id                         int64
age                        int64
gender                  category
annual_income            float64
marital_status          category
number_of_dependents       int64
education_level         category
occupation              category
health_score             float64
location                category
policy_type             category
previous_claims            int64
vehicle_age                int64
credit_score               int64
insurance_duration         int64
customer_feedback       category
smoking_status          category
exercise_frequency      category
property_type           category
premium_amount           float64
start_year                 int64
start_month_sin          float64
start_month_cos          float64
start_day_sin            float64
start_day_cos            float64
dtype: object

#### splitting the data

In [29]:
# splitting the data into features and target
train_x, test_x, train_y, test_y = train_test_split(df_clean.drop(columns=['premium_amount', 'id'], axis=1), df_clean['premium_amount'], test_size=0.18, random_state=50)

In [30]:
train_x.head()

Unnamed: 0,age,gender,annual_income,marital_status,number_of_dependents,education_level,occupation,health_score,location,policy_type,...,insurance_duration,customer_feedback,smoking_status,exercise_frequency,property_type,start_year,start_month_sin,start_month_cos,start_day_sin,start_day_cos
188975,26,0,103901.0,1,0,3,1,9.990982,1,0,...,6,1,1,3,2,2022,0.5,-0.866025,0.866025,0.5
882899,24,1,1365.0,1,3,3,2,24.812165,1,0,...,1,2,0,0,0,2023,-2.449294e-16,1.0,0.587785,-0.809017
410229,59,1,1226.0,1,3,3,0,28.443072,0,1,...,3,0,0,2,1,2023,1.224647e-16,-1.0,0.951057,0.309017
547418,51,0,12091.0,1,1,1,1,24.482584,2,1,...,4,1,1,3,0,2021,-0.5,-0.866025,0.994522,0.104528
209201,64,0,2840.0,0,2,0,0,8.982331,1,1,...,2,0,1,1,0,2020,0.8660254,-0.5,0.743145,0.669131


In [31]:
train_x.dtypes

age                        int64
gender                  category
annual_income            float64
marital_status          category
number_of_dependents       int64
education_level         category
occupation              category
health_score             float64
location                category
policy_type             category
previous_claims            int64
vehicle_age                int64
credit_score               int64
insurance_duration         int64
customer_feedback       category
smoking_status          category
exercise_frequency      category
property_type           category
start_year                 int64
start_month_sin          float64
start_month_cos          float64
start_day_sin            float64
start_day_cos            float64
dtype: object

#### training the model

In [40]:
# training a HistGradientBoostingRegressor on the data
# trans_test = pd.DataFrame(column_transformer.fit_transform(test_x), columns=test_x.columns)
regressor = HistGradientBoostingRegressor(categorical_features='from_dtype', max_leaf_nodes=250, max_iter=500, max_features=0.7)
regressor.fit(train_x, train_y)
mean_squared_log_error(test_y, regressor.predict(test_x))

1.3100352386750207

### Applying the Regression to test data

#### loading the test data

In [33]:
test_df = pd.read_csv(os.path.join(file_path, 'test.csv'), parse_dates=['Policy Start Date'])
test_df.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


#### cleaning the test data

In [42]:
test_clean = clean_data(test_df)
test_clean.head()

Unnamed: 0,id,age,gender,annual_income,marital_status,number_of_dependents,education_level,occupation,health_score,location,...,insurance_duration,customer_feedback,smoking_status,exercise_frequency,property_type,start_year,start_month_sin,start_month_cos,start_day_sin,start_day_cos
0,1200000,28,0,2310.0,2,4,0,2,7.657981,0,...,1,2,1,3,2,2023,1.224647e-16,-1.0,0.951057,0.309017
1,1200001,31,0,126031.0,1,2,2,1,13.381379,1,...,8,1,1,2,0,2024,0.8660254,-0.5,0.743145,0.669131
2,1200002,47,0,17092.0,0,0,3,0,24.354527,2,...,9,0,1,1,1,2023,0.8660254,-0.5,0.743145,0.669131
3,1200003,28,0,30424.0,0,3,3,2,5.136225,1,...,5,2,1,0,2,2023,-0.8660254,0.5,0.866025,-0.5
4,1200004,24,1,10863.0,0,2,1,0,11.844155,1,...,7,0,0,3,2,2021,-0.5,0.866025,0.743145,-0.669131


In [43]:
ids = test_clean['id']
test_clean.drop(columns=['id'], axis=1, inplace=True)

#### predicting premium amount

In [44]:
prediction = regressor.predict(test_clean)

#### saving the data into a csv file

In [45]:
#merge prediction with id
result = ids.to_frame()

In [46]:
result.insert(1, 'Premium Amount', prediction)

In [47]:
result.head()

Unnamed: 0,id,Premium Amount
0,1200000,1568.460035
1,1200001,1136.308638
2,1200002,1089.221314
3,1200003,1072.13514
4,1200004,1047.894551


In [49]:
result.to_csv(os.path.join('./data/clean/', 'submission.csv'), index=False)