# Predicting the “Customer Life Time Value”



### Problem Description: Predict the customer lifetime value (CLV) for an auto insurance company from the quantitative and qualitative features provided.



### Primary Evaluation Metric: RMSE (root mean squared error)

##### Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm 
import warnings
warnings.simplefilter("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

##### Train data

In [2]:
## Load the training data. The path is relative to the notebook's working directory.
## NOTE(review): the saved output of this cell is a FileNotFoundError, so the CSV
## has to sit next to the notebook before any later cell can run.
data_train= pd.read_csv('./train-1574429526318.csv')

FileNotFoundError: [Errno 2] File b'./train-1574429526318.csv' does not exist: b'./train-1574429526318.csv'

In [None]:
## creating a copy of the file
data_train1= data_train.copy()

In [None]:
## get and set option is to display all the rows and columns 
pd.get_option('display.max_rows')
pd.set_option('display.max_rows', None)
pd.get_option('display.max_columns')
pd.set_option('display.max_columns', None)
data_train.head(5)

In [None]:
## this is to set the index from count 1
data_train.index = np.arange(1, len(data_train) + 1)

In [None]:
## Per-column overview: dtype, number of distinct levels, the levels themselves,
## and the count / percentage of missing values.
def inspect_data(data):
    """Summarise every column of ``data`` in one table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns "Data Type",
        "No of Levels" (distinct non-null values), "Levels" (string form of
        the unique values), "NaN" (number of missing values) and "NaN%"
        (missing values as a percentage of the row count, one decimal).
    """
    nan_count = data.isnull().sum()
    # Divide by the actual row count (not a fixed 100) so the figure is a real
    # percentage; max(..., 1) keeps an empty frame from dividing by zero.
    nan_pct = (nan_count / max(len(data), 1) * 100).round(1)
    return pd.DataFrame({
        "Data Type": data.dtypes,
        "No of Levels": data.nunique(),
        "Levels": data.apply(lambda col: str(col.unique()), axis=0),
        "NaN": nan_count,
        "NaN%": nan_pct,
    })
## Column-by-column overview of the raw training data.
inspect_data(data_train)

In [None]:
## Summary statistics of the training data (describe() with its default settings).
data_train.describe()

In [None]:
## removing the attributes which are not important to the data
data_train=data_train.drop(['CustomerID','Location.Geo'],axis=1)


In [None]:
## to show the column names present in data
data_train.columns

In [None]:
## separating the categoric and numeric variables
cat_cols = ['Coverage','Education','EmploymentStatus','Gender','Location.Code','Marital.Status','Policy.Type','Policy','Renew.Offer.Type','Sales.Channel','Vehicle.Class','Vehicle.Size']
num_cols = data_train.columns.difference(cat_cols)
num_cols

In [None]:
## removing the ?
data_train.Income=data_train.Income.replace('?',np.nan)

In [None]:
## as per test data i have removed the retired level
data_train.EmploymentStatus=data_train.EmploymentStatus.replace('Retired',np.nan)

In [None]:
data_train['Vehicle.Size'].value_counts()

In [None]:
data_train[cat_cols] = data_train[cat_cols].apply(lambda x: x.astype('category'))
data_train[num_cols] = data_train[num_cols].apply(lambda x: x.astype('float'))
data_train.dtypes

In [None]:
data_train.shape

In [None]:
num_data = data_train.loc[:,num_cols]
cat_data = data_train.loc[:,cat_cols]

In [None]:
#Numeric columns imputation
imp = SimpleImputer(missing_values=np.nan, strategy='median')
num_data = pd.DataFrame(imp.fit_transform(num_data),columns=num_cols)

print(num_data.isnull().sum())

In [None]:
# categoric columns imputation
imp_c = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cat_data = pd.DataFrame(imp_c.fit_transform(cat_data),columns=cat_cols)

print(cat_data.isnull().sum())

In [None]:
cat_data = pd.get_dummies(cat_data,columns=['Coverage','Education','EmploymentStatus','Gender','Location.Code','Marital.Status','Policy.Type','Policy','Renew.Offer.Type','Sales.Channel','Vehicle.Class','Vehicle.Size'],drop_first=True)

In [None]:
data_train.shape

In [None]:
#standardizer = StandardScaler()
#standardizer.fit(num_data)
#num_data = pd.DataFrame(standardizer.transform(num_data),columns=num_cols)

data_train = pd.concat([num_data,cat_data],axis=1)

In [None]:
data_train.dtypes

In [None]:
x = data_train.drop("Customer.Lifetime.Value",axis=1)
y = data_train["Customer.Lifetime.Value"]

In [None]:
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.3,random_state=1)

In [None]:
print(x_train.shape)
print(x_validation.shape)
print(y_train.shape)
print(y_validation.shape)


## Model Implementation

In [None]:
## libraries for model
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [None]:
y_pred_valid1 = regressor.predict(x_validation)
y_pred_train1= regressor.predict(x_train)

In [None]:
rmse_train = sqrt(mean_squared_error(y_train, y_pred_train1))

print(rmse_train)

In [None]:

rmse_validation = sqrt(mean_squared_error(y_validation, y_pred_valid1))

print(rmse_validation)

In [None]:
from sklearn.svm import SVR

In [None]:
svr= SVR()
svr

In [None]:
svr.fit(x_train, y_train)
y_pred_valid3 = svr.predict(x_validation)
y_pred_train3= svr.predict(x_train)

In [None]:
rmse_train = sqrt(mean_squared_error(y_train, y_pred_train3))

print(rmse_train)

In [None]:

rmse_validation = sqrt(mean_squared_error(y_validation, y_pred_valid3))

print(rmse_validation)

In [None]:
decision= DecisionTreeRegressor(max_depth=5,ccp_alpha=0.5)
decision.fit(x_train,y_train)

In [None]:
y_pred_train5 = decision.predict(x_train)
y_pred_valid5 = decision.predict(x_validation)

In [None]:
rmse_train = sqrt(mean_squared_error(y_train, y_pred_train5))

print(rmse_train)

In [None]:
rmse_validation = sqrt(mean_squared_error(y_validation, y_pred_valid5))

print(rmse_validation)

#### Test data

In [None]:
data_test= pd.read_csv('./test-1574429501088.csv')

In [None]:
data_test1= data_test.copy()

In [None]:
data_test['Vehicle.Size'].value_counts()

In [None]:
data_test['Vehicle.Size'] = data_test['Vehicle.Size'].map({'Medsize':2.0 , 'Small':3.0,'Large':1.0})

In [None]:
data_test.head(5)

In [None]:
inspect_data(data_test)

In [None]:
data_test.describe()

In [None]:
data_test=data_test.drop(['CustomerID','Location.Geo'],axis=1)


In [None]:
cat_cols_test = ['Coverage','Education','EmploymentStatus','Gender','Location.Code','Marital.Status','Policy.Type','Policy','Renew.Offer.Type','Sales.Channel','Vehicle.Class','Vehicle.Size']
num_cols_test = data_test.columns.difference(cat_cols_test)
num_cols_test

In [None]:
data_test.Income=data_test.Income.replace('?',np.nan)

In [None]:
data_test[cat_cols_test] = data_test[cat_cols_test].apply(lambda x: x.astype('category'))
data_test[num_cols_test] = data_test[num_cols_test].apply(lambda x: x.astype('float'))
data_test.dtypes

In [None]:
num_data_test = data_test.loc[:,num_cols_test]
cat_data_test = data_test.loc[:,cat_cols_test]

In [None]:
## Median-impute the numeric test columns with the dedicated imp_test imputer,
## so the training imputer `imp` keeps the medians it learned from the training data.
## NOTE(review): the medians used here come from the test data itself; reusing the
## training medians would need an imputer fitted on the feature columns only
## (`imp` was fitted on a frame that also contains the target column).
imp_test = SimpleImputer(missing_values=np.nan, strategy='median')
num_data_test = pd.DataFrame(imp_test.fit_transform(num_data_test), columns=num_cols_test)


In [None]:
cat_data_test = pd.get_dummies(cat_data_test,columns=['Coverage','Education','EmploymentStatus','Gender','Location.Code','Marital.Status','Policy.Type','Policy','Renew.Offer.Type','Sales.Channel','Vehicle.Class','Vehicle.Size'],drop_first=True)

In [None]:
#standardizer_test = StandardScaler()
#standardizer_test.fit(num_data_test)
#num_data_test = pd.DataFrame(standardizer_test.transform(num_data_test),columns=num_cols_test)

data_test = pd.concat([num_data_test,cat_data_test],axis=1)

In [None]:
## Compare the shapes of the prepared test and training frames.
## NOTE(review): data_train still holds the target column, so it presumably has
## one more column than data_test - verify.
data_test.shape

In [None]:
data_train.shape

In [None]:
## First rows of the prepared test features.
data_test.head()

In [None]:
## Missing values per column in the prepared test features.
data_test.isnull().sum()

In [None]:
## Submission 1: linear regression predictions for the test data.
y_pred_test1 = regressor.predict(data_test)


In [None]:
y_pred_test1

In [None]:
data_test.shape

In [None]:
data_train.shape

In [None]:
## Read the sample submission file; its target column is overwritten below.
predic1 = pd.read_csv('./sample_submission-1577482703002.csv')

In [None]:
predic1.columns

In [None]:
## Overwrite the target column with the model's predictions.
predic1['Customer.Lifetime.Value']= y_pred_test1

In [None]:
predic1.head()

In [None]:
## Write the submission without the row index.
predic1.to_csv('final1.csv',index=False)

In [None]:
## Submission 2: SVR predictions for the test data.
y_pred_test2 = svr.predict(data_test) 

In [None]:
y_pred_test2

In [None]:
## Read the sample submission file; its target column is overwritten below.
predic2 = pd.read_csv('./sample_submission-1577482703002.csv')

In [None]:
## Overwrite the target column with the model's predictions.
predic2['Customer.Lifetime.Value']= y_pred_test2

In [None]:
## Write the submission without the row index.
predic2.to_csv('final2.csv',index=False)

In [None]:
predic2.head()

In [None]:
## Submission 3: decision tree predictions for the test data.
y_pred_test3= decision.predict(data_test)

In [None]:
y_pred_test3

In [None]:
## Read the sample submission file; its target column is overwritten below.
predic3 = pd.read_csv('./sample_submission-1577482703002.csv')

In [None]:
## Overwrite the target column with the model's predictions.
predic3['Customer.Lifetime.Value']= y_pred_test3

In [None]:
## Write the submission without the row index.
predic3.to_csv('final3.csv',index=False)