In [1]:
import lifetimes
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from lifetimes import BetaGeoFitter # BG/NBD
from lifetimes import GammaGammaFitter # Gamma-Gamma Model
from lifetimes.plotting import plot_frequency_recency_matrix

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [2]:
# Load the raw transactions once; later cells work on a copy of this frame.
df_base = pd.read_csv(filepath_or_buffer = 'Online_Retail.csv')

In [9]:
# Work on a deep copy so that df_base keeps the data exactly as loaded.
df = df_base.copy(deep = True)

# Data Inspection

In [10]:
# Preview the first five rows to check the column names and sample values.
df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [11]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495478 entries, 0 to 495477
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    495478 non-null  object 
 1   StockCode    495478 non-null  object 
 2   Description  494024 non-null  object 
 3   Quantity     495478 non-null  int64  
 4   InvoiceDate  495478 non-null  object 
 5   UnitPrice    495478 non-null  float64
 6   CustomerID   361878 non-null  float64
 7   Country      495478 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 30.2+ MB


In [12]:
# Summary statistics of the numeric columns, to check value ranges before cleaning.
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,495478.0,495478.0,361878.0
mean,8.605486,4.532422,15547.871368
std,227.588756,99.315438,1594.40259
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,14194.0
50%,3.0,2.1,15514.0
75%,10.0,4.13,16931.0
max,80995.0,38970.0,18287.0


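There are extreme values in `Quantity` and `UnitPrice` at both ends: `Quantity` ranges from -80995 to 80995 and `UnitPrice` from -11062.06 to 38970, while the quartiles of both columns are small positive numbers.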

# Data Wrangling

In [13]:
# Keep only rows where both Quantity and UnitPrice are strictly positive
# (the describe() output above shows negative values in both columns).
has_positive_quantity_and_price = (df['Quantity'] > 0) & (df['UnitPrice'] > 0)
df = df[has_positive_quantity_and_price]
del has_positive_quantity_and_price

In [14]:
# Drop every row whose InvoiceNo contains the letter 'C'
# (presumably cancellation invoices -- confirm against the dataset description).
# na = False counts a missing InvoiceNo as "no match", so such rows are kept.
df = df.loc[~df['InvoiceNo'].str.contains('C', na = False)]

In [16]:
# Number of missing values per column after the filters above.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     130802
Country             0
dtype: int64

In [17]:
# Drop every row that still has a missing value (at this point only CustomerID
# has any) and rebind `df` to an independent copy. The frame comes from
# row filtering, so mutating it in place -- here or in the later column
# assignments -- is what triggers pandas' SettingWithCopyWarning.
df = df.dropna().copy()

In [26]:
# Define a function to handle outliers in our data

def find_boundaries(df, variable, q1 = 0.05, q2 = 0.95):
    """Return the (lower, upper) quantile values of one column.

    Parameters
    ----------
    df : DataFrame holding the column.
    variable : name of the column to inspect.
    q1 : quantile level of the lower boundary (default 0.05, i.e. the 5th
        percentile -- not a quartile).
    q2 : quantile level of the upper boundary (default 0.95).

    Returns
    -------
    tuple of (value at quantile q1, value at quantile q2).
    """
    column = df[variable]
    return column.quantile(q1), column.quantile(q2)
    
def capping_outliers(df, variable, q1 = 0.05, q2 = 0.95):
    """Cap one column of `df` at its q1 / q2 quantile values, in place.

    Values below the lower boundary are raised to it and values above the
    upper boundary are lowered to it; values in between (and NaN) are left
    as they are.

    Parameters
    ----------
    df : DataFrame to modify. The column is overwritten on this object.
    variable : name of the column to cap.
    q1 : quantile level of the lower boundary (default 0.05).
    q2 : quantile level of the upper boundary (default 0.95).

    Returns
    -------
    None. The result is written back into df[variable].
    """
    # Boundaries are computed from the column before any value is changed.
    lower_boundary, upper_boundary = find_boundaries(df, variable, q1, q2)
    # clip() is the vectorised equivalent of the nested np.where construct.
    df[variable] = df[variable].clip(lower = lower_boundary, upper = upper_boundary)

In [30]:
# Cap UnitPrice at its default quantile boundaries; modifies df in place.
capping_outliers(df, variable = 'UnitPrice')

In [31]:
# Cap Quantity at its default quantile boundaries; modifies df in place.
capping_outliers(df, variable = 'Quantity')

In [33]:
# Re-check dtypes and non-null counts after filtering, dropping nulls and capping.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 354321 entries, 0 to 495477
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    354321 non-null  object 
 1   StockCode    354321 non-null  object 
 2   Description  354321 non-null  object 
 3   Quantity     354321 non-null  float64
 4   InvoiceDate  354321 non-null  object 
 5   UnitPrice    354321 non-null  float64
 6   CustomerID   354321 non-null  float64
 7   Country      354321 non-null  object 
dtypes: float64(3), object(5)
memory usage: 24.3+ MB


In [32]:
# Summary statistics after cleaning, to confirm the capped ranges of Quantity and UnitPrice.
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,354321.0,354321.0,354321.0
mean,8.348212,2.651029,15552.486392
std,9.245021,2.248187,1594.52715
min,1.0,0.42,12346.0
25%,2.0,1.25,14194.0
50%,4.0,1.95,15522.0
75%,12.0,3.75,16931.0
max,36.0,8.5,18287.0
