In [65]:
import pandas as pd
import numpy as np
import warnings
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries used below - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')


In [3]:
# Load the raw marketing customer dataset (path is relative to the notebook).
df = pd.read_csv(
    filepath_or_buffer='../files_for_lab/csv_files/marketing_customer_analysis.csv'
)


In [4]:
# Work on an independent copy so the frame loaded from disk stays untouched.
data = df.copy(deep=True)
# Preview the first five rows.
data.head(5)

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [6]:
# Standardize the column labels to snake_case: lower-case them and replace
# spaces with underscores (e.g. 'Total Claim Amount' -> 'total_claim_amount').
# A single pass is enough; the earlier lower-case-only list was overwritten
# immediately and never used.
standard_columns = [col_name.lower().replace(' ', '_') for col_name in data.columns]
data.columns = standard_columns

In [7]:
data.columns

Index(['customer', 'state', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'effective_to_date', 'employmentstatus', 'gender',
       'income', 'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',
       'vehicle_class', 'vehicle_size'],
      dtype='object')

In [8]:
# X-y split: the target is the claim amount, the features are every other column.
y = data['total_claim_amount']
X = data.drop(columns=['total_claim_amount'])

In [22]:
#Normalizing numerical variables

def boxcox_transform(X):
    """Box-Cox transform every numeric column of ``X`` in place.

    Box-Cox is only defined for strictly positive data, so within each numeric
    column the values ``<= 0`` are first turned into missing values and then,
    together with any values that were already missing, filled with the mean
    of the column's remaining (positive) values.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to transform. It is modified in place; non-numeric columns are
        left untouched.

    Returns
    -------
    X : pandas.DataFrame
        The same object that was passed in, with its numeric columns replaced
        by their Box-Cox transformed values.
    _ci : dict
        Maps each numeric column name to a one-element list holding the
        lambda that ``stats.boxcox`` fitted for that column.
    """
    numeric_cols = X.select_dtypes(np.number).columns #numerical variables
    _ci = {}
    for column in numeric_cols:
        # Cast up front so integer columns can hold NaN placeholders.
        values = X[column].astype(float)
        # Keep strictly positive entries only; everything else becomes NaN.
        # (np.nan, not np.NAN: the upper-case alias was removed in NumPy 2.0.)
        values = values.where(values > 0, np.nan)
        values = values.fillna(values.mean())
        transformed_data, fitted_lambda = stats.boxcox(values.to_numpy())
        X[column] = transformed_data
        _ci[column] = [fitted_lambda]
    return X, _ci

In [23]:
# Apply the Box-Cox transform to the feature frame and keep the fitted lambdas.
# The result is bound back to X rather than to `data`: rebinding `data` would
# replace the cleaned frame (which still holds the target column) with the
# target-less feature frame, so re-running the X-y split cell would fail.
X, _ci = boxcox_transform(X)
X

Unnamed: 0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,...,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,BU79786,Washington,2.703839,No,Basic,Bachelor,2/24/11,Employed,F,1247.937066,...,10.876059,3.017887,0.648045,0.000000,Corporate Auto,Corporate L3,Offer1,Agent,Two-Door Car,Medsize
1,QZ44356,Arizona,2.754926,No,Extended,Bachelor,1/31/11,Unemployed,F,1167.961720,...,5.808248,18.780377,0.648045,1.424359,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,Medsize
2,AI49188,Nevada,2.780772,No,Premium,Bachelor,2/19/11,Employed,F,1144.063081,...,7.347815,17.389171,0.648045,0.607328,Personal Auto,Personal L3,Offer1,Agent,Two-Door Car,Medsize
3,WW63253,California,2.759125,No,Basic,Bachelor,1/20/11,Unemployed,M,1167.961720,...,7.347815,26.160800,0.648045,1.363462,Corporate Auto,Corporate L2,Offer1,Call Center,SUV,Medsize
4,HB64268,Washington,2.704995,No,Basic,Bachelor,2/3/11,Employed,M,1072.375744,...,5.471294,19.461641,0.648045,0.000000,Personal Auto,Personal L1,Offer1,Agent,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,LA72316,California,2.801170,No,Basic,Bachelor,2/10/11,Employed,M,1448.531450,...,7.347815,33.090025,0.648045,0.607328,Personal Auto,Personal L1,Offer2,Web,Four-Door Car,Medsize
9130,PK87824,California,2.711030,Yes,Extended,College,2/12/11,Employed,F,697.790558,...,6.134207,13.709541,0.648045,0.000000,Corporate Auto,Corporate L3,Offer1,Branch,Four-Door Car,Medsize
9131,TD14365,California,2.762062,No,Extended,Bachelor,2/6/11,Unemployed,M,1167.961720,...,4.379465,17.034934,1.176115,0.607328,Corporate Auto,Corporate L2,Offer1,Branch,Four-Door Car,Medsize
9132,UP19263,California,2.758397,No,Extended,College,2/3/11,Employed,M,704.383672,...,11.319628,1.667871,0.648045,0.893486,Personal Auto,Personal L2,Offer3,Branch,Four-Door Car,Large


In [34]:
# Numeric feature columns (already Box-Cox transformed at this point).
numerical = X.select_dtypes(include=np.number)
numerical

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,2.703839,1247.937066,0.685347,10.876059,3.017887,0.648045,0.000000
1,2.754926,1167.961720,0.685871,5.808248,18.780377,0.648045,1.424359
2,2.780772,1144.063081,0.686039,7.347815,17.389171,0.648045,0.607328
3,2.759125,1167.961720,0.686018,7.347815,26.160800,0.648045,1.363462
4,2.704995,1072.375744,0.685461,5.471294,19.461641,0.648045,0.000000
...,...,...,...,...,...,...,...
9129,2.801170,1448.531450,0.685461,7.347815,33.090025,0.648045,0.607328
9130,2.711030,697.790558,0.685606,6.134207,13.709541,0.648045,0.000000
9131,2.762062,1167.961720,0.685725,4.379465,17.034934,1.176115,0.607328
9132,2.758397,704.383672,0.685898,11.319628,1.667871,0.648045,0.893486


In [26]:
# Categorical (object-dtype) feature columns.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# pd.get_dummies is deliberately not used here: it does not keep the encoding
# information that is needed to transform new data later.
# NOTE(review): this selection still contains the `customer` identifier (and
# `effective_to_date`); one-hot encoding an id that looks unique per row adds
# roughly one dummy column per row - confirm whether it should be dropped.
categorical = X.select_dtypes(include=object)
categorical

Unnamed: 0,customer,state,response,coverage,education,effective_to_date,employmentstatus,gender,location_code,marital_status,policy_type,policy,renew_offer_type,sales_channel,vehicle_class,vehicle_size
0,BU79786,Washington,No,Basic,Bachelor,2/24/11,Employed,F,Suburban,Married,Corporate Auto,Corporate L3,Offer1,Agent,Two-Door Car,Medsize
1,QZ44356,Arizona,No,Extended,Bachelor,1/31/11,Unemployed,F,Suburban,Single,Personal Auto,Personal L3,Offer3,Agent,Four-Door Car,Medsize
2,AI49188,Nevada,No,Premium,Bachelor,2/19/11,Employed,F,Suburban,Married,Personal Auto,Personal L3,Offer1,Agent,Two-Door Car,Medsize
3,WW63253,California,No,Basic,Bachelor,1/20/11,Unemployed,M,Suburban,Married,Corporate Auto,Corporate L2,Offer1,Call Center,SUV,Medsize
4,HB64268,Washington,No,Basic,Bachelor,2/3/11,Employed,M,Rural,Single,Personal Auto,Personal L1,Offer1,Agent,Four-Door Car,Medsize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9129,LA72316,California,No,Basic,Bachelor,2/10/11,Employed,M,Urban,Married,Personal Auto,Personal L1,Offer2,Web,Four-Door Car,Medsize
9130,PK87824,California,Yes,Extended,College,2/12/11,Employed,F,Suburban,Divorced,Corporate Auto,Corporate L3,Offer1,Branch,Four-Door Car,Medsize
9131,TD14365,California,No,Extended,Bachelor,2/6/11,Unemployed,M,Suburban,Single,Corporate Auto,Corporate L2,Offer1,Branch,Four-Door Car,Medsize
9132,UP19263,California,No,Extended,College,2/3/11,Employed,M,Suburban,Married,Personal Auto,Personal L2,Offer3,Branch,Four-Door Car,Large


In [29]:
# One-hot encoder: the first level of every feature is dropped, and categories
# not seen during fit raise an error at transform time.
encoder = OneHotEncoder(drop='first', handle_unknown='error')
encoder.fit(X=categorical)

OneHotEncoder(drop='first')

In [31]:
# Encode the categorical columns, then densify the sparse result to an ndarray.
encoded_sparse = encoder.transform(categorical)
encoded = encoded_sparse.toarray()
encoded.shape

(9134, 9234)

In [60]:
# Final design matrix: numeric columns first, one-hot columns appended to the right.
X_1 = np.hstack((numerical.to_numpy(), encoded))
X_1.shape

(9134, 9241)

In [61]:
# Hold out 30% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_1,
    y,
    random_state=42,
    test_size=0.3,
)

In [62]:
# Ordinary least-squares regression fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [63]:
# Predict the claim amount for the held-out rows.
predictions = model.predict(X=X_test)
predictions.shape

(2741,)

In [66]:
# Evaluate the hold-out predictions.
# MSE is computed once and RMSE is derived from it: the `squared=` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current releases.
mse = mean_squared_error(y_test, predictions)
rmse = float(np.sqrt(mse))
mae = mean_absolute_error(y_test, predictions)
print("R2_score:", round(r2_score(y_test, predictions),2)) # or r2_score(Y, predictions)
print("RMSE:", rmse)
print("MAE:", mae)
print("MSE:", mse)


R2_score: 0.8
RMSE: 5.239119136339147
MAE: 3.918969960133127
MSE: 27.44836932475505
