In [1]:
# Relevant Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

import warnings
warnings.filterwarnings('ignore')


In [2]:
pd.set_option('display.max_columns', None) 

In [3]:
data=pd.read_csv('./data/marketing_customer_analysis.csv')

In [4]:
def clean_data(data):
    data.columns=[e.lower().replace(' ', '_') for e in data.columns]
    data=data.drop(columns=['unnamed:_0', 'vehicle_type'])
    data=data.dropna()
    data['effective_to_date']=pd.to_datetime(data['effective_to_date'], errors='coerce')
    return data

In [5]:
clean_data(data)

Unnamed: 0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,location_code,marital_status,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size
0,DK49336,Arizona,4809.216960,No,Basic,College,2011-02-18,Employed,M,48029,Suburban,Married,61,7.0,52,0.0,9,Corporate Auto,Corporate L3,Offer3,Agent,292.800000,Four-Door Car,Medsize
1,KX64629,California,2228.525238,No,Basic,College,2011-01-18,Unemployed,F,0,Suburban,Single,64,3.0,26,0.0,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize
2,LZ68649,Washington,14947.917300,No,Basic,Bachelor,2011-02-10,Employed,M,22139,Suburban,Single,100,34.0,31,0.0,2,Personal Auto,Personal L3,Offer3,Call Center,480.000000,SUV,Medsize
3,XL78013,Oregon,22332.439460,Yes,Extended,College,2011-01-11,Employed,M,49078,Suburban,Single,97,10.0,3,0.0,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize
6,IW72280,California,5035.035257,No,Basic,Doctor,2011-02-14,Employed,F,37405,Urban,Married,63,8.0,99,3.0,4,Corporate Auto,Corporate L2,Offer2,Branch,287.556107,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10903,SU71163,Arizona,2771.663013,No,Basic,College,2011-01-07,Employed,M,59855,Suburban,Married,74,30.0,82,4.0,1,Personal Auto,Personal L2,Offer2,Branch,355.200000,Two-Door Car,Medsize
10904,QI63521,Nevada,19228.463620,No,Basic,High School or Below,2011-02-24,Unemployed,M,0,Suburban,Single,187,14.0,32,0.0,2,Personal Auto,Personal L2,Offer1,Branch,897.600000,Luxury SUV,Medsize
10906,KX53892,Oregon,5259.444853,No,Basic,College,2011-01-06,Employed,F,61146,Urban,Married,65,7.0,68,0.0,6,Personal Auto,Personal L3,Offer2,Branch,273.018929,Four-Door Car,Medsize
10907,TL39050,Arizona,23893.304100,No,Extended,Bachelor,2011-02-06,Employed,F,39837,Rural,Married,201,11.0,63,0.0,2,Corporate Auto,Corporate L3,Offer1,Web,381.306996,Luxury SUV,Medsize


In [6]:
data.isnull().values.any()

True

In [7]:
data=data.dropna()

In [8]:
data.isnull().values.any()

False

In [9]:
# X-y split.
X=data.drop('total_claim_amount', axis=1)
y=data.total_claim_amount

In [10]:
# drop useless columns
X=X.drop(columns=['customer', 'effective_to_date'])

In [11]:
# numerical and categorical split
X_num=X._get_numeric_data()
X_cat=X.drop(columns=X_num.columns)
# Check data
print (X.shape, X_num.shape, X_cat.shape)

(4543, 23) (4543, 8) (4543, 15)


In [12]:
# numerical
X_num.describe()

Unnamed: 0,unnamed:_0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
count,4543.0,4543.0,4543.0,4543.0,4543.0,4543.0,4543.0,4543.0
mean,5540.24125,8033.714059,37453.033238,93.773278,15.11182,48.153863,0.386309,2.980189
std,3149.146534,6959.529443,30257.399536,35.093661,10.157925,28.146904,0.907553,2.390362
min,2.0,1904.000852,0.0,61.0,0.0,0.0,0.0,1.0
25%,2884.0,4039.089902,0.0,69.0,6.0,24.0,0.0,1.0
50%,5549.0,5846.520588,34455.0,83.0,14.0,47.0,0.0,2.0
75%,8283.0,8936.596938,61560.0,110.0,23.0,72.0,0.0,4.0
max,10908.0,83325.38119,99961.0,298.0,35.0,99.0,5.0,9.0


In [13]:
# using standard scaler
transformer = StandardScaler() 

transformer.fit(X_num)

x_standardized = transformer.transform(X_num)
x_standardized.shape

(4543, 8)

In [14]:
#cat_data = pd.get_dummies(X, drop_first=True) # if you need to save the encoding info, this won't do
#cat_data

In [15]:
# One Hot/Label Encoding (categorical)

encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_cat)

OneHotEncoder(drop='first')

In [16]:
encoded = encoder.transform(X_cat).toarray()
encoded #.shape

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.]])

In [17]:
encoder.categories_

[array(['Arizona', 'California', 'Nevada', 'Oregon', 'Washington'],
       dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Basic', 'Extended', 'Premium'], dtype=object),
 array(['Bachelor', 'College', 'Doctor', 'High School or Below', 'Master'],
       dtype=object),
 array(['Disabled', 'Employed', 'Medical Leave', 'Retired', 'Unemployed'],
       dtype=object),
 array(['F', 'M'], dtype=object),
 array(['Rural', 'Suburban', 'Urban'], dtype=object),
 array(['Divorced', 'Married', 'Single'], dtype=object),
 array(['Corporate Auto', 'Personal Auto', 'Special Auto'], dtype=object),
 array(['Corporate L1', 'Corporate L2', 'Corporate L3', 'Personal L1',
        'Personal L2', 'Personal L3', 'Special L1', 'Special L2',
        'Special L3'], dtype=object),
 array(['Offer1', 'Offer2', 'Offer3', 'Offer4'], dtype=object),
 array(['Agent', 'Branch', 'Call Center', 'Web'], dtype=object),
 array(['Four-Door Car', 'Luxury Car', 'Luxury SUV', 'SUV', 'Sports Car',
        'Two-Door Car'],

In [18]:
# Concat DataFrames
X = np.concatenate([X_num, encoded], axis=1)
X.shape

(4543, 51)

In [19]:
# traint-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
model = LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [21]:
predictions  = model.predict(X_test)
predictions.shape

(1363,)

In [22]:
r2_score(y_test, predictions), mean_squared_error(y_test, predictions, squared=False), mean_squared_error(y_test, predictions)

(0.755482172413151, 139.63842768014655, 19498.890484983516)

### Model Validation

In [23]:
#R2

In [24]:
#MSE

In [25]:
#RMSE

In [26]:
#MAE