# 1. Importing Libraries

In [3]:
import lifetimes
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from lifetimes import BetaGeoFitter # BG/NBD
from lifetimes import GammaGammaFitter # Gamma-Gamma Model
from lifetimes.plotting import plot_frequency_recency_matrix
# Widen pandas' display limits so large frames are not truncated in cell output.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)



Reference: [Customer Lifetime Value Prediction in Python](https://medium.com/@ugursavci/customer-lifetime-value-prediction-in-python-89e4a50df12e)

# 2. Reading Data

In [4]:
# Load the raw transaction lines from the local CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer='data/Online_Retail.csv')
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


# 3. Understanding Data

In [5]:
# Column dtypes and non-null counts: shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495478 entries, 0 to 495477
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    495478 non-null  object 
 1   StockCode    495478 non-null  object 
 2   Description  494024 non-null  object 
 3   Quantity     495478 non-null  int64  
 4   InvoiceDate  495478 non-null  object 
 5   UnitPrice    495478 non-null  float64
 6   CustomerID   361878 non-null  float64
 7   Country      495478 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 30.2+ MB


In [6]:
# Summary statistics of the numeric columns before any cleaning.
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,495478.0,495478.0,361878.0
mean,8.605486,4.532422,15547.871368
std,227.588756,99.315438,1594.40259
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,14194.0
50%,3.0,2.1,15514.0
75%,10.0,4.13,16931.0
max,80995.0,38970.0,18287.0


# 4. Data Preprocessing

## Filtering Our Data

In [None]:
# Keep only genuine sales lines:
#   * Quantity > 0  -> drops zero AND negative quantities
#   * UnitPrice > 0 -> drops zero AND negative unit prices
#   * InvoiceNo containing "C" marks a returned order, which we do not want either
# .copy() detaches the result from the original frame, so the in-place column
# assignments made further down do not raise SettingWithCopyWarning.
df = (
    df.loc[df['Quantity'] > 0]
      .loc[lambda d: d['UnitPrice'] > 0]
      .loc[lambda d: ~d['InvoiceNo'].str.contains("C", na=False)]
      .copy()
)

## Missing Values

In [7]:
# Number of missing values per column.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     133600
Country             0
dtype: int64

In [8]:
# Drop every row that has a missing value in any column. Rebinding the name
# (instead of inplace=True) avoids mutating a frame that may be a slice of
# another one and keeps the cell safe to re-run.
df = df.dropna()

## Handling Outliers

In [10]:
def find_boundaries(df, variable, q1=0.05, q2=0.95):
    """Return the capping limits of ``df[variable]`` as ``(upper, lower)``.

    The upper limit is the ``q2`` quantile and the lower limit is the ``q1``
    quantile of the column. Note the return order: upper first, then lower.
    """
    column = df[variable]
    low_limit = column.quantile(q1)
    high_limit = column.quantile(q2)
    return high_limit, low_limit
def capping_outliers(df, variable, q1=0.05, q2=0.95):
    """Cap ``df[variable]`` in place at its ``q1`` / ``q2`` quantiles.

    Values above the upper quantile are replaced by the upper quantile and
    values below the lower quantile are replaced by the lower quantile.
    Everything else (including NaN, which fails both comparisons) is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is overwritten.
    variable : str
        Name of the column to cap.
    q1, q2 : float, optional
        Lower and upper quantiles forwarded to ``find_boundaries``. The
        defaults (0.05 and 0.95) match ``find_boundaries``' own defaults, so
        existing two-argument calls behave exactly as before.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # find_boundaries returns the limits as (upper, lower), in that order.
    upper_boundary, lower_boundary = find_boundaries(df, variable, q1, q2)
    values = df[variable]
    df[variable] = np.where(
        values > upper_boundary,
        upper_boundary,
        np.where(values < lower_boundary, lower_boundary, values),
    )

In [12]:
# Cap both numeric columns in place, using the default quantiles of
# find_boundaries (q1=0.05, q2=0.95).
capping_outliers(df,'UnitPrice')
capping_outliers(df,'Quantity')

In [13]:
# Re-check the summary statistics after dropping missing values and capping.
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,361878.0,361878.0,361878.0
mean,8.195325,2.671928,15547.871368
std,9.208407,2.26392,1594.40259
min,1.0,0.42,12346.0
25%,2.0,1.25,14194.0
50%,4.0,1.95,15514.0
75%,12.0,3.75,16931.0
max,36.0,8.5,18287.0


## Preparing Our Dataset ( RFM Dataset )

In [14]:
# Revenue of each transaction line: unit price times quantity.
df['Total Price'] = df['UnitPrice'].mul(df['Quantity'])

In [16]:
# Collapse the transaction log into one row per customer
# (frequency, recency, T, monetary_value), observed up to 2011-12-09.
clv = lifetimes.utils.summary_data_from_transaction_data(
    transactions=df,
    customer_id_col='CustomerID',
    datetime_col='InvoiceDate',
    monetary_value_col='Total Price',
    observation_period_end='2011-12-09',
)
clv.head()

Unnamed: 0_level_0,frequency,recency,T,monetary_value
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12346.0,0.0,0.0,325.0,0.0
12747.0,10.0,367.0,369.0,375.725
12748.0,114.0,373.0,373.0,253.824825
12749.0,6.0,210.0,213.0,524.588333
12820.0,3.0,323.0,326.0,256.573333


In [18]:
# Keep only customers whose frequency is greater than 1.
# NOTE(review): in lifetimes, frequency appears to count repeat purchases
# (a single-purchase customer has frequency 0), so this keeps customers who
# shopped more than 2 times. Confirm this is the intended cut-off.
# .copy() makes clv an independent frame, so the columns added to it in later
# cells do not raise SettingWithCopyWarning.
clv = clv[clv['frequency'] > 1].copy()
clv.head()

Unnamed: 0_level_0,frequency,recency,T,monetary_value
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12747.0,10.0,367.0,369.0,375.725
12748.0,114.0,373.0,373.0,253.824825
12749.0,6.0,210.0,213.0,524.588333
12820.0,3.0,323.0,326.0,256.573333
12822.0,2.0,17.0,87.0,128.14


## Frequency/Recency analysis using the BG/NBD model

In [19]:
# Fit the BG/NBD purchase-frequency model on the per-customer summary.
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(
    frequency=clv['frequency'],
    recency=clv['recency'],
    T=clv['T'],
)

<lifetimes.BetaGeoFitter: fitted with 1926 subjects, a: 0.02, alpha: 94.37, b: 1.57, r: 2.29>

## Expected Number of Purchases within 6 Months

In [21]:
# Prediction horizon of 180 periods, i.e. about 6 months.
# NOTE(review): assumes recency/T are measured in days - confirm.
t = 180
clv['expected_purc_6_months'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t,
    frequency=clv['frequency'],
    recency=clv['recency'],
    T=clv['T'],
)
# Five customers with the highest expected number of purchases.
clv.sort_values(by='expected_purc_6_months', ascending=False).head(n=5)

Unnamed: 0_level_0,frequency,recency,T,monetary_value,expected_purc_6_months
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12748.0,114.0,373.0,373.0,253.824825,44.65685
17841.0,112.0,372.0,373.0,349.628661,43.886957
15311.0,90.0,373.0,373.0,420.387778,35.438595
14606.0,88.0,372.0,373.0,127.851364,34.669089
13089.0,82.0,367.0,369.0,623.924268,32.641797


## Gamma-Gamma Model

In [22]:
# Correlation between purchase frequency and average order value.
clv.loc[:, ['frequency', 'monetary_value']].corr()

Unnamed: 0,frequency,monetary_value
frequency,1.0,0.117416
monetary_value,0.117416,1.0


In [24]:
# NOTE(review): exact duplicate of the previous cell (same expression, same
# output); it can be removed.
clv[['frequency','monetary_value']].corr()

Unnamed: 0,frequency,monetary_value
frequency,1.0,0.117416
monetary_value,0.117416,1.0


In [25]:
# Correlation between unit price and quantity at transaction-line level.
df.loc[:, ['UnitPrice', 'Quantity']].corr()

Unnamed: 0,UnitPrice,Quantity
UnitPrice,1.0,-0.338928
Quantity,-0.338928,1.0


In [26]:
# Fit the Gamma-Gamma model of average order value on the same customers.
ggf = GammaGammaFitter(penalizer_coef=0.01)
ggf.fit(frequency=clv["frequency"], monetary_value=clv["monetary_value"])

<lifetimes.GammaGammaFitter: fitted with 1926 subjects, p: 3.75, q: 0.37, v: 3.70>

In [28]:
# Combine the BG/NBD purchase forecast with the Gamma-Gamma order-value
# estimate into a customer lifetime value.
# NOTE(review): the column key is misspelled ('monhths'), but later cells
# reference it by this exact name, so it is kept unchanged here.
# NOTE(review): time=6 is presumably in months and freq='D' the unit of T -
# confirm against the lifetimes documentation.
clv['6_monhths_clv'] = ggf.customer_lifetime_value(
    transaction_prediction_model=bgf,
    frequency=clv["frequency"],
    recency=clv["recency"],
    T=clv["T"],
    monetary_value=clv["monetary_value"],
    time=6,
    freq='D',
    discount_rate=0.01,
)

In [30]:
# Five customers with the highest predicted CLV.
clv.sort_values(by='6_monhths_clv', ascending=False).head(n=5)

Unnamed: 0_level_0,frequency,recency,T,monetary_value,expected_purc_6_months,6_monhths_clv
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
14096.0,16.0,97.0,101.0,3020.954375,16.720978,49316.353729
18102.0,26.0,367.0,367.0,2032.233846,10.997172,21729.715117
13089.0,82.0,367.0,369.0,623.924268,32.641797,19714.161258
17511.0,32.0,371.0,373.0,1528.5625,13.159978,19534.838749
14088.0,12.0,312.0,322.0,3073.660833,6.144459,18504.038622


In [32]:
# Split customers into four equally sized CLV quartiles, labelled from the
# lowest to the highest value.
clv['Segment'] = pd.qcut(
    x=clv['6_monhths_clv'],
    q=4,
    labels=['Hibernating', 'Need Attention', 'LoyalCustomers', 'Champions'],
)

In [33]:
# Preview the per-customer table, now including the Segment column.
clv.head()

Unnamed: 0_level_0,frequency,recency,T,monetary_value,expected_purc_6_months,6_monhths_clv,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12747.0,10.0,367.0,369.0,375.725,4.750395,1755.281137,Champions
12748.0,114.0,373.0,373.0,253.824825,44.65685,10966.4977,Champions
12749.0,6.0,210.0,213.0,524.588333,4.817273,2514.423461,Champions
12820.0,3.0,323.0,326.0,256.573333,2.244199,592.073557,Need Attention
12822.0,2.0,17.0,87.0,128.14,4.019133,551.028967,Need Attention


In [34]:
# Average profile of each segment. 'Segment' is categorical (built by pd.qcut),
# so observed=False is passed explicitly: it keeps the current behaviour of
# listing every category and avoids the FutureWarning that newer pandas raises
# when `observed` is left unspecified for categorical groupers.
clv.groupby('Segment', observed=False).mean()

Unnamed: 0_level_0,frequency,recency,T,monetary_value,expected_purc_6_months,6_monhths_clv
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Hibernating,3.161826,196.248963,284.811203,118.883779,2.51811,282.174518
Need Attention,4.293139,234.960499,281.218295,221.464043,3.201085,651.104772
LoyalCustomers,6.0,241.939709,272.723493,304.484255,4.101674,1120.92891
Champions,12.682573,261.628631,279.736515,480.744722,6.933298,2958.477163
