In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy.stats as stats
from sklearn import linear_model
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

In [42]:
# To make run all you ask in one cell, not only the last required
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [43]:
# data=pd.read_csv('C:/Student/IRONHACK/Week1/D3/Lessons/merged_clean_reduced.csv')
data=pd.read_csv('C:/Student/IRONHACK/Week1/D4_Regression/Labs/lab-customer-analysis-round-5/files_for_lab/csv_files/marketing_customer_analysis.csv')
data.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


### DATA PROCESSING

#### x,y split

In [44]:
# identify numerical variables
numerical = data.select_dtypes(np.number)
categorical = data.select_dtypes(np.object)
#separate the features from the labels
y_num = numerical['Total Claim Amount']
X_num = numerical.drop(['Total Claim Amount'], axis=1)

#### Normalize (numerical)

In [45]:
# does not deal well with getting rid of outliers
from sklearn.preprocessing import MinMaxScaler
MinMaxtransformer = MinMaxScaler().fit(X_num)
x_normalized = MinMaxtransformer.transform(X_num)
print(x_normalized.shape)
x_normalized = pd.DataFrame(x_normalized,columns=X_num.columns)
x_normalized.head()

(9134, 7)


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies
0,0.010629,0.562847,0.033755,0.914286,0.050505,0.0,0.0
1,0.062406,0.0,0.139241,0.371429,0.424242,0.0,0.875
2,0.13496,0.487763,0.198312,0.514286,0.383838,0.0,0.125
3,0.070589,0.0,0.189873,0.514286,0.656566,0.0,0.75
4,0.011245,0.438443,0.050633,0.342857,0.444444,0.0,0.0


#### One Hot/Label Encoding (categorical).

In [46]:
X_cat = data.select_dtypes(include = np.object)
# X_cat.columns
X_cat_reduced = X_cat[['Coverage', 'Education', 'Gender']]
X_cat_reduced.head()

Unnamed: 0,Coverage,Education,Gender
0,Basic,Bachelor,F
1,Extended,Bachelor,F
2,Premium,Bachelor,F
3,Basic,Bachelor,M
4,Basic,Bachelor,M


In [47]:
#one hot encoding 
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder().fit(X_cat_reduced)
print(encoder.categories_)
encoded = encoder.transform(X_cat_reduced).toarray()

enc = [columname for sublist in encoder.categories_ for columname in sublist]
onehot_encoded = pd.DataFrame(encoded,columns=enc)
onehot_encoded.head()

[array(['Basic', 'Extended', 'Premium'], dtype=object), array(['Bachelor', 'College', 'Doctor', 'High School or Below', 'Master'],
      dtype=object), array(['F', 'M'], dtype=object)]


Unnamed: 0,Basic,Extended,Premium,Bachelor,College,Doctor,High School or Below,Master,F,M
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [48]:
#because one of the variables can be deduced from the others, no point in keeping all these columns around
onehot_encoded = onehot_encoded.drop(['Basic'],axis=1)
onehot_encoded.head()

Unnamed: 0,Extended,Premium,Bachelor,College,Doctor,High School or Below,Master,F,M
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [49]:
#ANOTHER METHOD FOR LABELING: label encoding keeps just one column and makes it numerical, but watch out: you may be introducing unintended semantics
# turn table into a single column (array)
# [] - creates array, [[]] - creates df

# from sklearn.preprocessing import LabelEncoder
# label_cat= X_cat_reduced['Coverage']
# label_encoded = LabelEncoder().fit(label_cat).transform(label_cat) # ordered wrt value counts

# label_encoded = pd.DataFrame(label_encoded,columns=X_cat_reduced.columns)
# # print(label_encoded)
# label_encoded.head()

# # Concatenate first and then normalise

#### Concat dataframes

In [50]:
X_num.shape
onehot_encoded.shape
X = pd.concat([x_normalized, onehot_encoded],axis=1)
X.head(2)

(9134, 7)

(9134, 9)

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Extended,Premium,Bachelor,College,Doctor,High School or Below,Master,F,M
0,0.010629,0.562847,0.033755,0.914286,0.050505,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.062406,0.0,0.139241,0.371429,0.424242,0.0,0.875,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [51]:
# wont work...
X.rename(columns={('Extended',): 'Extended Coverage', ('Premium',):'Premium Coverage'}, inplace=True)
X.head(2)

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Extended,Premium,Bachelor,College,Doctor,High School or Below,Master,F,M
0,0.010629,0.562847,0.033755,0.914286,0.050505,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.062406,0.0,0.139241,0.371429,0.424242,0.0,0.875,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


### LINEAR REGRESSION

#### Train, test, split

In [52]:
y = numerical['Total Claim Amount']
y.head()

0     384.811147
1    1131.464935
2     566.472247
3     529.881344
4     138.130879
Name: Total Claim Amount, dtype: float64

In [53]:
# train test split is the way ML generates its claim to fame: we build the model on a portion of the data but we then validate it in another "fresh" portion
# our model has no opportunity to "cheat": it must accurately guess the values in the "fresh" dataset that it never saw before
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [54]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7307, 16)
(1827, 16)
(7307,)
(1827,)


In [55]:
X_train.head(2)
X_test.head(2)

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Extended,Premium,Bachelor,College,Doctor,High School or Below,Master,F,M
7706,0.01679,0.258249,0.088608,0.285714,0.69697,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
779,0.115687,0.792851,0.14346,0.8,0.616162,0.6,0.125,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Extended,Premium,Bachelor,College,Doctor,High School or Below,Master,F,M
7175,0.039478,0.565978,0.016878,0.114286,0.888889,0.2,0.625,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7255,0.03665,0.2124,0.261603,0.085714,0.121212,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


#### Apply linear regression.

In [56]:
#we train/fit our model like yesterday
lm = linear_model.LinearRegression()
model = lm.fit(X_train,y_train)

In [57]:
model.predict(X_test)

array([239.1441607 , 671.30191693, 336.86459702, ..., 160.17371193,
       668.26856253, 342.60559576])

In [58]:
from sklearn.metrics import r2_score

predictions = lm.predict(X_test)
r2_score(y_test, predictions)

0.4928571755396277

### Model Validation

#### R2 / MSE / RMSE / MAE

In [59]:
from sklearn.metrics import mean_squared_error, r2_score

In [60]:
mse = mean_squared_error(y_test, predictions)
print(mse)

40395.03307937574


In [61]:
rmse = math.sqrt(mse)
print(rmse)

200.98515636577676


In [62]:
r2 = r2_score(y_test, predictions)
r2

0.4928571755396277

### Version 2
We will select another categorical variable and use a different ratio of train test split in order to improve the accuracy of our model

In [63]:
X_cat_reduced2 = X_cat[['State', 'Sales Channel']]
#one hot encoding 
encoder2 = OneHotEncoder().fit(X_cat_reduced2)
encoded2 = encoder2.transform(X_cat_reduced2).toarray()
enc = [columname for sublist in encoder2.categories_ for columname in sublist]
onehot_encoded2 = pd.DataFrame(encoded2,columns=enc)
onehot_encoded2.head()

Unnamed: 0,Arizona,California,Nevada,Oregon,Washington,Agent,Branch,Call Center,Web
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [64]:
X_num.shape
onehot_encoded.shape
X2 = pd.concat([x_normalized, onehot_encoded2],axis=1)
X2.head(2)

(9134, 7)

(9134, 9)

Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Arizona,California,Nevada,Oregon,Washington,Agent,Branch,Call Center,Web
0,0.010629,0.562847,0.033755,0.914286,0.050505,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.062406,0.0,0.139241,0.371429,0.424242,0.0,0.875,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [65]:
y2 = numerical['Total Claim Amount']
y2.head()
y2.shape
X2.shape

0     384.811147
1    1131.464935
2     566.472247
3     529.881344
4     138.130879
Name: Total Claim Amount, dtype: float64

(9134,)

(9134, 16)

In [66]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.15, random_state=100)
lm = linear_model.LinearRegression()
model = lm.fit(X2_train,y2_train)
model.predict(X2_test)

array([219.63933695, 660.74907437, 318.95766556, ..., 214.34626973,
       404.14799983, 345.69005595])

In [67]:
predictions2 = lm.predict(X2_test)
r2_score(y2_test, predictions2)

0.4844238781521152

### Version 3
We will remove all categorical variables and use a different ratio of train test split in order to improve the accuracy of our model

In [68]:
y2 = numerical['Total Claim Amount']
y2.shape
X3 = x_normalized
y2.shape
X3.shape

(9134,)

(9134,)

(9134, 7)

In [69]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=100)
lm = linear_model.LinearRegression()
model = lm.fit(X2_train,y2_train)
model.predict(X2_test)

array([224., 668., 304., ..., 140., 688., 328.])

In [70]:
predictions2 = lm.predict(X2_test)
r2_score(y2_test, predictions2)

0.47260801583830503