## Data Understanding

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

from sklearn.metrics import mean_squared_error,r2_score
from math import sqrt
from sklearn.ensemble import RandomForestRegressor

## Load Dataset

In [2]:
# Read the customer lifetime value records from the CSV next to the notebook
# and preview the first five rows.
df = pd.read_csv('customer_lifetime.csv')
df.head(5)

Unnamed: 0,Customer_Age,Annual_Income,Tenure_Months,Monthly_Spend,Visits_Per_Month,Avg_Basket_Value,Support_Tickets,CLV
0,56,33343,4,8645,3,3161,0,156418
1,69,43500,40,7232,13,1732,3,212379
2,46,83222,4,13485,13,4374,2,208248
3,32,59375,56,7797,6,2901,1,258410
4,60,39662,25,14342,1,1994,1,227847


In [3]:
# Print the column names, dtypes and non-null counts to sanity-check the schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Customer_Age      500 non-null    int64
 1   Annual_Income     500 non-null    int64
 2   Tenure_Months     500 non-null    int64
 3   Monthly_Spend     500 non-null    int64
 4   Visits_Per_Month  500 non-null    int64
 5   Avg_Basket_Value  500 non-null    int64
 6   Support_Tickets   500 non-null    int64
 7   CLV               500 non-null    int64
dtypes: int64(8)
memory usage: 31.4 KB


## Data Preparation

In [4]:
# Features are every column except the target; the target is the CLV column.
x = df.drop(columns='CLV')
y = df['CLV']

In [5]:
# Display the feature frame (df with the 'CLV' column dropped) for a visual check.
x

Unnamed: 0,Customer_Age,Annual_Income,Tenure_Months,Monthly_Spend,Visits_Per_Month,Avg_Basket_Value,Support_Tickets
0,56,33343,4,8645,3,3161,0
1,69,43500,40,7232,13,1732,3
2,46,83222,4,13485,13,4374,2
3,32,59375,56,7797,6,2901,1
4,60,39662,25,14342,1,1994,1
...,...,...,...,...,...,...,...
495,65,101774,18,17711,6,4227,3
496,42,36090,1,4920,2,6772,4
497,57,41637,14,18002,7,6495,5
498,62,52415,51,16025,12,1784,0


In [6]:
# Count missing values per feature column; all zeros means no imputation is needed.
x.isna().sum()

Customer_Age        0
Annual_Income       0
Tenure_Months       0
Monthly_Spend       0
Visits_Per_Month    0
Avg_Basket_Value    0
Support_Tickets     0
dtype: int64

## Splitting the Dataset into Training and Test Sets

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Regression Modelling

In [8]:
# Linear regression with default settings, fitted on the training split.
# The fit call is left as the last expression so the estimator is displayed.
Linear = LinearRegression()
Linear.fit(X=x_train, y=y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [9]:
# Score the held-out rows with the fitted linear model and show the estimates.
Predictions = Linear.predict(X=x_test)
Predictions

array([232417.55345755, 158739.15185384, 319724.4940298 , 220863.31291873,
       316829.92595353, 297809.60221788, 266073.38966088, 200107.94793728,
       169237.32618962, 190047.72202027, 145643.79193179, 279983.48386611,
       200000.77079834, 172208.30898493, 147190.12071709, 259667.56720652,
       304032.14801188, 294781.89046065, 375250.84093117, 344004.59690025,
       224194.41064076, 206275.96409058, 190515.71605259, 253724.41021251,
       313604.27559766, 189499.47952701, 295861.98658895, 139188.99666886,
       291588.96241325, 193612.92777133, 136436.02486951, 302506.32937317,
       217491.53647332, 168210.35382708, 223942.16394405, 151851.76778132,
        86709.86760815, 188587.85885275, 258378.37392352, 327874.15706967,
       333119.44877819, 275986.46431049, 206870.07679706, 275737.83471506,
       226204.76842813, 255743.42985126, 149475.2828013 , 328549.8588258 ,
       244917.02458857, 166294.80307177, 193353.48960876, 299262.10431077,
       129746.96352508, 1

## Random Forest Regressor

In [10]:
# Random forest of 200 trees, seeded so the fitted ensemble is reproducible.
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
)
rf.fit(X=x_train, y=y_train)

# Predict CLV for the same held-out rows used to evaluate the linear model.
random_prediction = rf.predict(X=x_test)
random_prediction

array([242486.275, 163626.42 , 308422.285, 194809.08 , 318495.255,
       297696.225, 280925.095, 192481.71 , 155645.675, 172377.18 ,
       150737.765, 283909.005, 196734.955, 178276.89 , 149080.015,
       248944.195, 287837.33 , 291627.21 , 371420.8  , 341579.41 ,
       217556.635, 201690.585, 205467.32 , 259989.975, 308385.335,
       176760.415, 303101.9  , 149985.13 , 279358.235, 183555.725,
       164713.77 , 314998.68 , 211791.055, 197963.915, 251721.8  ,
       160813.86 ,  98596.575, 177651.41 , 275661.815, 319512.53 ,
       328855.29 , 296742.035, 207043.375, 282843.345, 221859.765,
       244567.535, 169997.965, 320258.67 , 236344.45 , 182879.315,
       211523.47 , 291559.035, 130923.19 , 186845.895, 224684.935,
       138793.58 , 212732.92 , 198767.53 , 241499.705, 338230.135,
       185069.595, 318430.185, 275817.04 , 288846.62 , 173919.235,
       209012.345, 213940.28 , 180005.315, 209884.865, 154318.005,
       292248.4  , 175435.765, 154636.52 , 151526.755, 108712.

## Model Evaluation

In [11]:
# Mean squared error of the linear model's predictions on the test split.
MSE = mean_squared_error(y_true=y_test, y_pred=Predictions)
MSE


272939026.48102325

In [12]:
# Root mean squared error: the square root of the linear model's test-set MSE,
# which puts the error back on the same scale as the CLV target.
RMSE = sqrt(MSE)
RMSE

16520.86639619797

In [13]:
# Coefficient of determination (R^2) of the linear model on the test split.
r_2 = r2_score(y_true=y_test, y_pred=Predictions)
r_2


0.9397913883234074

In [14]:
# Hold-out metrics for both models, computed on the same test split so the
# numbers are directly comparable.
RMSE_linear = sqrt(mean_squared_error(y_test, Predictions))
r2_linear = r2_score(y_test, Predictions)
RMSE_tree = sqrt(mean_squared_error(y_test, random_prediction))
r2_tree = r2_score(y_test, random_prediction)

# Print one "<label> <value>" line per metric, linear model first.
for metric_label, metric_value in (
    ('RMSE_linear:', RMSE_linear),
    ('r2_linear:', r2_linear),
    ('RMSE_tree:', RMSE_tree),
    ('r2_tree:', r2_tree),
):
    print(metric_label, metric_value)

RMSE_linear: 16520.86639619797
r2_linear: 0.9397913883234074
RMSE_tree: 20786.338883254626
r2_tree: 0.9046876929776694


## Model Persistence

In [15]:
# Persist the fitted linear regression model so it can be reloaded outside
# this notebook.
joblib.dump(Linear, 'CLV_model.joblib')

# Persist only the ordered feature names the model was trained on.
# Dumping the whole `x` DataFrame would write every customer row into an
# artifact whose name promises feature metadata, bloating the file and
# shipping raw customer data alongside the model.
# NOTE(review): a consumer that loads this file and reads `.columns` must
# switch to using the list directly — confirm against the loading code.
joblib.dump(x.columns.tolist(), 'modelfeatures.joblib')

['modelfeatures.joblib']