# RFM Analysis
## Method 2: Using Functions

In [19]:
# import Python modules for analysis
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

## Loading the Dataset

In [20]:
url = "https://raw.githubusercontent.com/nyangweso-rodgers/Data_Analytics/main/Analytics-with-Python/Exploratory-Data-Analysis-with-Python/Exploratory-Data-Analysis-for-Online-Retail-Store/grouped_daily_customer_data.csv"

# Read the CSV straight from GitHub, parsing both date columns as datetimes
daily_customers_sales_df = pd.read_csv(
    url,
    parse_dates=['Date', 'MonthYear'],
)

# (rows, columns) of the loaded frame
daily_customers_sales_df.shape

(16766, 6)

* Preview the Dataset

In [21]:
# Inspect the dtype pandas assigned to each column
daily_customers_sales_df.dtypes

CustomerID                      float64
Date                     datetime64[ns]
Country                          object
MonthYear                datetime64[ns]
TotalAmount                     float64
CountOfUniqueInvoices             int64
dtype: object

* Convert CustomerID from float64 to string

In [22]:
## Cast only the columns named in the mapping; every other dtype is left as is.
## NOTE(review): float64 -> str keeps the trailing ".0" (IDs appear as '12346.0'
## in the preview); cast to an integer dtype first if plain IDs are wanted.
convert_dic = dict(CustomerID=str)
daily_customers_sales_df = daily_customers_sales_df.astype(convert_dic)

In [23]:
# Show the first five rows after the CustomerID cast
daily_customers_sales_df.head()

Unnamed: 0,CustomerID,Date,Country,MonthYear,TotalAmount,CountOfUniqueInvoices
0,12346.0,2011-01-18,United Kingdom,2011-01-01,77183.6,1
1,12347.0,2010-12-07,Iceland,2010-12-01,711.79,1
2,12347.0,2011-01-26,Iceland,2011-01-01,475.39,1
3,12347.0,2011-04-07,Iceland,2011-04-01,636.25,1
4,12347.0,2011-06-09,Iceland,2011-06-01,382.52,1


### Summary Statistics

In [24]:
# Summarise every column; include='all' also covers the non-numeric columns
daily_customers_sales_df.describe(include='all')

  daily_customers_sales_df.describe(include='all')
  daily_customers_sales_df.describe(include='all')


Unnamed: 0,CustomerID,Date,Country,MonthYear,TotalAmount,CountOfUniqueInvoices
count,16766.0,16766,16766,16766,16766.0,16766.0
unique,4339.0,305,37,13,,
top,14911.0,2011-11-17 00:00:00,United Kingdom,2011-11-01 00:00:00,,
freq,132.0,125,15095,2391,,
first,,2010-12-01 00:00:00,,2010-12-01 00:00:00,,
last,,2011-12-09 00:00:00,,2011-12-01 00:00:00,,
mean,,,,,531.516635,1.105571
std,,,,,1907.700011,0.44495
min,,,,,0.0,1.0
25%,,,,,178.71,1.0


### Step 1: Creating an RFM Table
* Get the _First Invoice Date_ and _Last Invoice Date_ from the dataset.

In [25]:
## First and last sales dates present in the dataset
sales_dates = daily_customers_sales_df['Date']
print("Minimum Date: ", sales_dates.min(), "Maximum Date: ", sales_dates.max())

Minimum Date:  2010-12-01 00:00:00 Maximum Date:  2011-12-09 00:00:00


* In the dataset, the first Sales Date is __1st Dec. 2010__, and the Last Sales Date is __9th Dec. 2011__.
* We use the _Last Sales Date_ in the calculation of __RECENCY__. 

In [26]:
# Reference ("today") date for the analysis: the day after the last recorded sale
last_sale_date = daily_customers_sales_df['Date'].max()
Now = last_sale_date + dt.timedelta(days=1)
Now

Timestamp('2011-12-10 00:00:00')

### Step: Create a Function to Define Recency and Tenure
* _Remarks_: We have added another two columns:
  * __Tenure__: the number of days since the customer's first purchase
  * __MeanValue__: the customer's average `TotalAmount` per row of the dataset

In [27]:
def get_recency(x, now=None):
    """Return the whole days between the reference date and the latest date in ``x``.

    Parameters
    ----------
    x : pandas.Series
        Datetime values for one customer (the 'Date' column of a group).
    now : datetime-like, optional
        Reference date to measure from. When omitted, the notebook-level
        ``Now`` is used, so ``groupby(...).agg(...)`` calls that pass only
        the series keep working unchanged.

    Returns
    -------
    int
        ``(reference date - x.max()).days``
    """
    # Only fall back to the global when no explicit reference date is given,
    # so the function can be used (and checked) without hidden kernel state.
    reference_date = Now if now is None else now
    last_purchase = x.max()
    return (reference_date - last_purchase).days

def get_tenure(x, now=None):
    """Return the whole days between the reference date and the earliest date in ``x``.

    Parameters
    ----------
    x : pandas.Series
        Datetime values for one customer (the 'Date' column of a group).
    now : datetime-like, optional
        Reference date to measure from. When omitted, the notebook-level
        ``Now`` is used, so ``groupby(...).agg(...)`` calls that pass only
        the series keep working unchanged.

    Returns
    -------
    int
        ``(reference date - x.min()).days``
    """
    # Only fall back to the global when no explicit reference date is given,
    # so the function can be used (and checked) without hidden kernel state.
    reference_date = Now if now is None else now
    first_purchase = x.min()
    return (reference_date - first_purchase).days

# Collapse the sales rows to one row per customer.
# Frequency counts the rows belonging to each customer (presumably one row per
# customer per day, judging by the file name — confirm); the
# CountOfUniqueInvoices column is not used here.
RFM_Table = (
    daily_customers_sales_df
    .groupby('CustomerID')
    .agg(
        Recency=pd.NamedAgg(column='Date', aggfunc=get_recency),
        Frequency=pd.NamedAgg(column='CustomerID', aggfunc='count'),
        MonetaryValue=pd.NamedAgg(column='TotalAmount', aggfunc='sum'),
        Tenure=pd.NamedAgg(column='Date', aggfunc=get_tenure),
        MeanValue=pd.NamedAgg(column='TotalAmount', aggfunc='mean'),
    )
)

# Preview
RFM_Table

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,Tenure,MeanValue
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12346.0,326,1,77183.60,326,77183.600000
12347.0,3,7,4310.00,368,615.714286
12348.0,76,4,1797.24,359,449.310000
12349.0,19,1,1757.55,19,1757.550000
12350.0,311,1,334.40,311,334.400000
...,...,...,...,...,...
18280.0,278,1,180.60,278,180.600000
18281.0,181,1,80.82,181,80.820000
18282.0,8,2,178.05,127,89.025000
18283.0,4,14,2094.88,338,149.634286


### Step 2: Summary Statistics of the RFM Table

In [28]:
# Summary statistics for each RFM_Table column across customers
RFM_Table.describe(include='all')

Unnamed: 0,Recency,Frequency,MonetaryValue,Tenure,MeanValue
count,4339.0,4339.0,4339.0,4339.0,4339.0
mean,93.041484,3.864024,2053.793018,223.782899,458.02906
std,100.007757,5.952745,8988.248381,117.883623,1929.03957
min,1.0,1.0,0.0,1.0,0.0
25%,18.0,1.0,307.245,113.0,185.265833
50%,51.0,2.0,674.45,249.0,304.506364
75%,142.5,4.0,1661.64,327.0,451.193333
max,374.0,132.0,280206.02,374.0,84236.25


### Step 3: RFM Segments
* The next step is to segment __recency__, __frequency__, and __monetary value__ into categories.
* For our use case, we can split each feature into 4 quartiles that roughly divide the sample into 4 segments of equal proportion. 
* We have called these scores __R__, __F__, and __M__ respectively.

In [29]:
# Reset the index so CustomerID becomes a regular column for the next set of
# calculations; the result is bound to a new name, RFM_Segments
RFM_Segments = RFM_Table.reset_index()

In [30]:
# Score each RFM dimension from 1 to 4 using quartiles.
# The series are taken from RFM_Table and attached to RFM_Segments by position
# (.values), which relies on RFM_Segments having the same row order as RFM_Table.

# Recency: a smaller value means a more recent purchase, so labels run 4 -> 1
r_labels = range(4, 0, -1)
Recency = RFM_Table['Recency']
r_quartiles, bins = pd.qcut(Recency, q=4, labels=r_labels, retbins=True)

# Frequency: rank first so that tied counts do not produce duplicate quartile edges.
# NOTE(review): method='first' breaks ties by row order, so customers with the
# same Frequency can receive different F scores (Frequency 1 shows up with both
# F=1 and F=2 in the preview below).
f_labels = range(1, 5)
Frequency = RFM_Table['Frequency'].rank(method='first')
f_quartiles, bins = pd.qcut(Frequency, q=4, labels=f_labels, retbins=True)

# Monetary value: a larger total gets a higher label
m_labels = range(1, 5)
MonetaryValue = RFM_Table['MonetaryValue']
m_quartiles, bins = pd.qcut(MonetaryValue, q=4, labels=m_labels, retbins=True)

# Attach the three quartile scores in one step
RFM_Segments = RFM_Segments.assign(
    R_Quartile=r_quartiles.values,
    F_Quartile=f_quartiles.values,
    M_Quartile=m_quartiles.values,
)

# RFMClass concatenates the three scores as text (e.g. '444');
# RFMScore adds them up as integers
RFM_Segments['RFMClass'] = (
    RFM_Segments['R_Quartile'].astype(str)
    + RFM_Segments['F_Quartile'].astype(str)
    + RFM_Segments['M_Quartile'].astype(str)
)
RFM_Segments['RFMScore'] = (
    RFM_Segments['R_Quartile'].astype(int)
    + RFM_Segments['F_Quartile'].astype(int)
    + RFM_Segments['M_Quartile'].astype(int)
)

# preview
RFM_Segments

Unnamed: 0,CustomerID,Recency,Frequency,MonetaryValue,Tenure,MeanValue,R_Quartile,F_Quartile,M_Quartile,RFMClass,RFMScore
0,12346.0,326,1,77183.60,326,77183.600000,1,1,4,114,6
1,12347.0,3,7,4310.00,368,615.714286,4,4,4,444,12
2,12348.0,76,4,1797.24,359,449.310000,2,3,4,234,9
3,12349.0,19,1,1757.55,19,1757.550000,3,1,4,314,8
4,12350.0,311,1,334.40,311,334.400000,1,1,2,112,4
...,...,...,...,...,...,...,...,...,...,...,...
4334,18280.0,278,1,180.60,278,180.600000,1,2,1,121,4
4335,18281.0,181,1,80.82,181,80.820000,1,2,1,121,4
4336,18282.0,8,2,178.05,127,89.025000,4,3,1,431,8
4337,18283.0,4,14,2094.88,338,149.634286,4,4,4,444,12


### Step 4: Defining RFM Tiers

* We can further divide the customers into __RFM Tiers__ by binning together ranges of scores. 
* For example, we can say that customers with scores of 3–4 are Bronze, 5–8 are Silver, and 9–12 are Gold.

In [31]:
# group customers into different Tiers
def get_tier(df):
    """Map an RFM score to a tier label.

    Parameters
    ----------
    df : mapping-like
        Any object supporting ``df['RFMScore']`` (e.g. one DataFrame row
        when used with ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        'GOLD' when the score is at least 9, 'SILVER' when it is at least 5
        but below 9, and "BRONZE" for anything else.
    """
    score = df['RFMScore']
    if score >= 9:
        return 'GOLD'
    if 5 <= score < 9:
        return 'SILVER'
    return "BRONZE"
    
# Call get_tier on each row (axis=1) and store the returned label in a 'Tier' column
RFM_Segments['Tier'] = RFM_Segments.apply(get_tier, axis=1)
RFM_Segments


Unnamed: 0,CustomerID,Recency,Frequency,MonetaryValue,Tenure,MeanValue,R_Quartile,F_Quartile,M_Quartile,RFMClass,RFMScore,Tier
0,12346.0,326,1,77183.60,326,77183.600000,1,1,4,114,6,SILVER
1,12347.0,3,7,4310.00,368,615.714286,4,4,4,444,12,GOLD
2,12348.0,76,4,1797.24,359,449.310000,2,3,4,234,9,GOLD
3,12349.0,19,1,1757.55,19,1757.550000,3,1,4,314,8,SILVER
4,12350.0,311,1,334.40,311,334.400000,1,1,2,112,4,BRONZE
...,...,...,...,...,...,...,...,...,...,...,...,...
4334,18280.0,278,1,180.60,278,180.600000,1,2,1,121,4,BRONZE
4335,18281.0,181,1,80.82,181,80.820000,1,2,1,121,4,BRONZE
4336,18282.0,8,2,178.05,127,89.025000,4,3,1,431,8,SILVER
4337,18283.0,4,14,2094.88,338,149.634286,4,4,4,444,12,GOLD


* Get Distribution of Customers under Different Tiers

In [32]:
# Number of customers in each tier
RFM_Segments['Tier'].value_counts()

SILVER    1850
GOLD      1680
BRONZE     809
Name: Tier, dtype: int64