In [1]:
import pandas as pd
import numpy as np
import warnings
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
warnings.filterwarnings('ignore') # no more 'useless' red warnings


In [2]:
# Read the marketing customer analysis CSV into a DataFrame.
csv_path = '../files_for_lab/csv_files/marketing_customer_analysis.csv'
df = pd.read_csv(csv_path)


In [9]:
# Work on an independent (deep) copy so the frame read from the CSV stays untouched.
data = df.copy(deep=True)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [11]:
# Standardise the column labels: lower-case them and replace spaces with underscores.
# (A single pass is enough; an earlier lower-case-only assignment was dead code
# because its result was overwritten before being used.)
standard_columns = [col_name.lower().replace(' ', '_') for col_name in data.columns]
data.columns = standard_columns

In [14]:
# Display the column labels to check the renaming applied above.
data.columns

Index(['customer', 'state', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'effective_to_date', 'employmentstatus', 'gender',
       'income', 'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',
       'vehicle_class', 'vehicle_size'],
      dtype='object')

In [32]:
# X-y split: `total_claim_amount` is the target, every other column is a feature.
y = data['total_claim_amount']
X = data.drop(columns=['total_claim_amount'])

In [33]:
#Normalizing numerical variables

def boxcox_transform(data):
    """Box-Cox transform every numeric column of ``data``.

    Box-Cox is only defined for strictly positive input, so non-positive
    entries (and existing NaNs) are treated as missing and replaced with the
    mean of the column's positive values before the transform is fitted.

    The caller's frame is NOT modified; a transformed copy is returned. This
    keeps the function safe to call repeatedly on the same input.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns (``select_dtypes(np.number)``) are
        transformed. Non-numeric columns are carried over unchanged.

    Returns
    -------
    tuple
        ``(transformed, lambdas)`` where ``transformed`` is a new DataFrame
        and ``lambdas`` maps each numeric column name to a one-element list
        holding the lambda fitted by ``scipy.stats.boxcox`` for that column.

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.boxcox`` when a column cannot be
        transformed (e.g. it is constant after imputation).
    """
    # Copy first: assigning columns on the argument would silently alter the
    # caller's frame and make re-running the calling cell compound the transform.
    transformed = data.copy()
    numeric_cols = transformed.select_dtypes(np.number).columns
    lambdas = {}
    for column in numeric_cols:
        values = transformed[column]
        # Keep strictly positive values only; everything else becomes NaN
        # (avoids -inf / errors inside Box-Cox).
        positive = values.where(values > 0)
        filled = positive.fillna(positive.mean())
        boxcox_values, fitted_lambda = stats.boxcox(filled.to_numpy())
        transformed[column] = boxcox_values
        # Stored as a one-element list to preserve the original return shape.
        lambdas[column] = [fitted_lambda]
    return transformed, lambdas

In [52]:
# Box-Cox transform the numeric columns of `data`; `_ci` receives the per-column
# fitted lambdas returned by boxcox_transform.
# NOTE(review): `data` is rebound to the transformed result, so running this cell
# again feeds already-transformed values back into boxcox_transform. That is
# presumably why some columns in the saved output are entirely NaN — confirm by
# re-running on a fresh kernel (Restart & Run All).
data, _ci = boxcox_transform(data)
data

Unnamed: 0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,...,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size
0,BU79786,Washington,,No,Basic,Bachelor,2/24/11,Employed,F,1240.517275,...,13.266781,,,Corporate Auto,Corporate L3,Offer1,Agent,24.474279,Two-Door Car,Medsize
1,QZ44356,Arizona,,No,Extended,Bachelor,1/31/11,Unemployed,F,1160.795864,...,11.734653,,,Personal Auto,Personal L3,Offer3,Agent,42.816969,Four-Door Car,Medsize
2,AI49188,Nevada,,No,Premium,Bachelor,2/19/11,Employed,F,1136.972639,...,10.687428,,,Personal Auto,Personal L3,Offer1,Agent,30.078798,Two-Door Car,Medsize
3,WW63253,California,,No,Basic,Bachelor,1/20/11,Unemployed,M,1160.795864,...,17.165467,,,Corporate Auto,Corporate L2,Offer1,Call Center,29.041858,SUV,Medsize
4,HB64268,Washington,,No,Basic,Bachelor,2/3/11,Employed,M,1065.510159,...,12.244391,,,Personal Auto,Personal L1,Offer1,Agent,13.489783,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,LA72316,California,,No,Basic,Bachelor,2/10/11,Employed,M,1440.464950,...,22.116484,,,Personal Auto,Personal L1,Offer2,Web,16.816465,Four-Door Car,Medsize
9130,PK87824,California,,Yes,Extended,College,2/12/11,Employed,F,692.060984,...,7.870945,,,Corporate Auto,Corporate L3,Offer1,Branch,24.279531,Four-Door Car,Medsize
9131,TD14365,California,,No,Extended,Bachelor,2/6/11,Unemployed,M,1160.795864,...,10.419353,,,Corporate Auto,Corporate L2,Offer1,Branch,35.739662,Four-Door Car,Medsize
9132,UP19263,California,,No,Extended,College,2/3/11,Employed,M,698.634769,...,13.041299,,,Personal Auto,Personal L2,Offer3,Branch,33.355499,Four-Door Car,Large
