In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import set_config
set_config(display="diagram")

import warnings
warnings.filterwarnings('ignore')

### Project Setup

In [15]:
# Read the raw customer data set, report its dimensions and preview the first record.
df = pd.read_csv(
    filepath_or_buffer='files_for_lab/we_fn_use_c_marketing_customer_value_analysis.csv'
)
print(df.shape)
df.head(n=1)

(9134, 24)


Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize


### Data Cleaning

In [16]:
# Custom class for the initial cleaning steps:
# remove duplicates, drop features, handle date features and standardize headers

class CustomInitialDataCleaner:
    """Initial, model-independent cleaning of a raw DataFrame.

    ``clean`` applies four steps in this order:

    1. remove fully duplicated rows,
    2. drop the columns listed in ``drop_features``,
    3. replace every column listed in ``date_features`` by three integer
       columns ``<name>_year``, ``<name>_month`` and ``<name>_day``,
    4. standardize the headers (lower case, spaces replaced by underscores).

    Parameters
    ----------
    drop_features : list of str, optional
        Column names (as spelled in the raw frame) to remove. Defaults to
        no columns.
    date_features : list of str, optional
        Column names (as spelled in the raw frame) holding dates parseable
        by ``pd.to_datetime``. Defaults to no columns.
    """

    def __init__(self, drop_features=None, date_features=None):
        # ``None`` sentinels instead of ``[]`` defaults: a list default would be
        # created once and shared by every instance built without arguments.
        self.drop_features = list(drop_features) if drop_features is not None else []
        self.date_features = list(date_features) if date_features is not None else []

    def __remove_duplicates(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` without fully duplicated rows (first occurrence kept)."""
        return X[~X.duplicated()]

    def __drop_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` without the configured ``drop_features``."""
        return X.drop(columns=self.drop_features)

    def __standardize_feature_headers(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` with lower-cased, underscore-separated column names."""
        # ``rename`` returns a new frame, so the frame passed in keeps its headers.
        return X.rename(columns=lambda col: col.lower().replace(' ', '_'))

    def __handle_dates(self, X: pd.DataFrame) -> pd.DataFrame:
        """Expand every configured date column into year / month / day parts.

        The frame is always returned, also when ``date_features`` is empty,
        and every listed feature is processed (the ``return`` sits after the
        loop, not inside it).
        """
        # Work on a copy so the column assignments below never write into
        # the frame handed in by the caller.
        X = X.copy()
        for date_f in self.date_features:
            date_feature = pd.to_datetime(X[date_f])
            X[f'{date_f}_year'] = date_feature.dt.year
            X[f'{date_f}_month'] = date_feature.dt.month
            X[f'{date_f}_day'] = date_feature.dt.day
            X = X.drop(columns=date_f)
        return X

    def clean(self, X: pd.DataFrame) -> pd.DataFrame:
        """Run all cleaning steps on ``X`` and return the cleaned frame.

        ``X`` itself is left unmodified.
        """
        # NOTE: duplicates are removed *before* columns are dropped, so rows
        # that only become identical after dropping a feature are kept.
        X = self.__remove_duplicates(X)
        X = self.__drop_features(X)
        X = self.__handle_dates(X)
        return self.__standardize_feature_headers(X)

In [17]:
# Drop the customer identifier and expand the policy date into year/month/day parts.
cleaner = CustomInitialDataCleaner(
    drop_features=['Customer'],
    date_features=['Effective To Date'],
)
df = cleaner.clean(df)
df.head(n=1)

Unnamed: 0,state,customer_lifetime_value,response,coverage,education,employmentstatus,gender,income,location_code,marital_status,...,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size,effective_to_date_year,effective_to_date_month,effective_to_date_day
0,Washington,2763.519279,No,Basic,Bachelor,Employed,F,56274,Suburban,Married,...,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize,2011,2,24


### Check Null Values

In [18]:
# Number of missing values in each column
df.isna().sum(axis=0)

state                            0
customer_lifetime_value          0
response                         0
coverage                         0
education                        0
employmentstatus                 0
gender                           0
income                           0
location_code                    0
marital_status                   0
monthly_premium_auto             0
months_since_last_claim          0
months_since_policy_inception    0
number_of_open_complaints        0
number_of_policies               0
policy_type                      0
policy                           0
renew_offer_type                 0
sales_channel                    0
total_claim_amount               0
vehicle_class                    0
vehicle_size                     0
effective_to_date_year           0
effective_to_date_month          0
effective_to_date_day            0
dtype: int64

In [19]:
# Inspect the year part of the policy date; it holds a single value, so it is dropped.
print(pd.unique(df['effective_to_date_year']))
df.drop(columns=['effective_to_date_year'], inplace=True)

[2011]


In [21]:
# Dimensions (rows, columns) of the frame after cleaning and dropping the year column
df.shape

(9134, 24)

## Pipeline Setup

#### Setup for custom pipeline:
<br>
    1) no null values were found, so the pipeline contains no step to handle them<br>
    2) scaling for numerical features<br>
    3) OrdinalEncoder for selected categorical features<br>
    4) OneHotEncoder for the remaining categorical features<br>
    5) estimator // model<br>

#### Define ordinal features hierarchy and split the features 

In [22]:
# Numerical predictors: every numeric column except the regression target.
numerical = df.select_dtypes('number').drop(columns='total_claim_amount')
numerical_features = numerical.columns

# Categorical predictors (object dtype), divided into ordinal features,
# which get an explicit category order, and the remaining nominal features.
categorical = df.select_dtypes('object')
cat_ordinal_features = categorical.loc[
    :, ['coverage', 'location_code', 'employmentstatus', 'vehicle_size']
].columns
cat_onehot_features = categorical.drop(columns=cat_ordinal_features).columns

# Category order handed to OrdinalEncoder: one list per ordinal feature,
# given in the same order as cat_ordinal_features.
predefined_order = [
    ['Premium', 'Extended', 'Basic'],                                     # coverage
    ['Urban', 'Suburban', 'Rural'],                                       # location_code
    ['Employed', 'Medical Leave', 'Unemployed', 'Retired', 'Disabled'],   # employmentstatus
    ['Large', 'Medsize', 'Small'],                                        # vehicle_size
]


In [31]:
# custom func for pipeline definition
def _make_dense_onehot_encoder():
    """Return a OneHotEncoder producing dense output on old and new scikit-learn.

    The ``sparse`` keyword was renamed to ``sparse_output`` in scikit-learn 1.2
    and removed in 1.4, so the new name is tried first and the old one is used
    as a fallback for older installations.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 does not know ``sparse_output``
        return OneHotEncoder(sparse=False)


def make_custom_pipeline(model):
    """Wrap ``model`` in the shared preprocessing pipeline.

    The pipeline has two steps:

    * ``'scaling'`` - a ColumnTransformer that standard-scales
      ``numerical_features``, ordinal-encodes ``cat_ordinal_features`` using
      ``predefined_order`` and one-hot encodes ``cat_onehot_features``;
    * ``'model'`` - the estimator passed in.

    The feature lists and ``predefined_order`` are module-level names that are
    read when this function is called. Columns not listed in any of the three
    feature groups are dropped (ColumnTransformer's default ``remainder``).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator used as the final pipeline step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline combining preprocessing and ``model``.
    """
    preprocessing = ColumnTransformer([
        ('numerical', StandardScaler(), numerical_features),
        ('cat_ordinal', OrdinalEncoder(categories=predefined_order), cat_ordinal_features),
        ('cat_onehot', _make_dense_onehot_encoder(), cat_onehot_features),
    ])
    # Step names are kept as in the original ('scaling', 'model') because
    # they are part of the pipeline's public interface (named_steps, params).
    return Pipeline([
        ('scaling', preprocessing),
        ('model', model),
    ])

## Split data train / test 

In [38]:
# Separate the predictors from the regression target.
X = df.drop(columns=['total_claim_amount'])
y = df['total_claim_amount']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Train and evaluate models

In [39]:
# Candidate regressors to compare; each one is wrapped in the same preprocessing pipeline.
models = [
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=5),
    MLPRegressor(random_state=1, max_iter=500),
]
experiments = list()

In [40]:
# Build one preprocessing + estimator pipeline per candidate model.
# The list is rebuilt from scratch (instead of appended to) so that
# re-running this cell never leaves duplicate pipelines behind.
experiments = [make_custom_pipeline(m) for m in models]

In [50]:
# Collector for the evaluation rows (one list of scores per model and data set)
metrics = list()

In [52]:
def _regression_scores(model_name, data_set, y_true, y_hat):
    """Return one metrics row: [model, data_set, R2, MAE, RMSE, MSE, MAPE].

    RMSE is derived as the square root of the MSE instead of passing
    ``squared=False`` to ``mean_squared_error``; that keyword is deprecated
    since scikit-learn 1.4 and removed in 1.6.
    """
    mse = mean_squared_error(y_true, y_hat)
    return [
        model_name,
        data_set,
        r2_score(y_true, y_hat),
        mean_absolute_error(y_true, y_hat),
        np.sqrt(mse),
        mse,
        mean_absolute_percentage_error(y_true, y_hat),
    ]


# Start from an empty list so that re-running this cell does not append
# a second copy of every row.
metrics = []

for pipe in experiments:
    pipe.fit(X_train, y_train)

    y_pred_train = pipe.predict(X_train)
    y_pred = pipe.predict(X_test)

    # The estimator is the pipeline step named 'model' (the last step).
    model_name = type(pipe.named_steps['model']).__name__

    metrics.append(_regression_scores(model_name, 'train_set', y_train, y_pred_train))
    metrics.append(_regression_scores(model_name, 'test_set', y_test, y_pred))
    


In [53]:
# One row per (model, data set) pair; columns follow the order in which the scores were collected.
model_metrics = pd.DataFrame(
    data=metrics,
    columns=['model', 'data_set', 'r2_score', 'MAE', 'RMSE', 'MSE', 'MAPE'],
)
model_metrics

Unnamed: 0,model,data_set,r2_score,MAE,RMSE,MSE,MAPE
0,LinearRegression,train_set,0.625137,129.534921,179.085679,32071.680552,1.542381
1,LinearRegression,test_set,0.621274,127.804549,175.832489,30917.064078,1.602365
2,KNeighborsRegressor,train_set,0.730633,100.725629,151.808773,23045.90359,1.393611
3,KNeighborsRegressor,test_set,0.613453,123.621136,177.638741,31555.522325,1.689422
4,MLPRegressor,train_set,0.853202,75.431618,112.068649,12559.382042,0.731717
5,MLPRegressor,test_set,0.811791,81.445949,123.952732,15364.279859,0.731362


## Observations
- MLPRegressor delivers the best r2 score with 0.81 on the test set - its RMSE there is 123.95, i.e. the typical prediction error is on the order of 124 dollars (the MAE, the average absolute deviation from the target, is roughly 81 dollars)
- also, KNeighborsRegressor seems to overfit, since we see a large gap between its scores on the train and test data sets (r2 of 0.73 vs. 0.61)
- LinearRegression delivers the second 'best' r2 score on the test data (0.62)