# Customer Segmentation using RFM analysis
### Method 2: Defining Functions

In [3]:
## Import appropriate modules
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 8))
plt.style.use('seaborn-white')
import seaborn as sns 
import datetime as dt

ImportError: DLL load failed: The specified module could not be found.

In [2]:
# load the data set
df = pd.read_excel('data_for_RFM_score_analysis.xlsx')
print(df.head()) 

                                     id       date   sales
0  000173c5-978c-4b52-b7a4-5ebf974deb86 2020-08-13  1690.0
1  000173c5-978c-4b52-b7a4-5ebf974deb86 2020-08-14  6145.0
2  000173c5-978c-4b52-b7a4-5ebf974deb86 2020-08-15  4550.0
3  000173c5-978c-4b52-b7a4-5ebf974deb86 2020-08-17  1270.0
4  000173c5-978c-4b52-b7a4-5ebf974deb86 2020-08-20  3830.0


In [3]:
# convert the date column to datetime data type
df['date'] = pd.to_datetime(df['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804659 entries, 0 to 804658
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   id      804659 non-null  object        
 1   date    804659 non-null  datetime64[ns]
 2   sales   804659 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 18.4+ MB


In [5]:
# create a snapshot date with today's date
print('Max. Date:', df['date'].max()) # Max Date: 2020-09-25 00:00:00

snapshot_date = max(df.date) + dt.timedelta(days=1)
print('Current Date:',snapshot_date) # 2020-09-26 00:00:00

Max. Date 2020-09-25 00:00:00
Current Date 2020-09-26 00:00:00


In [6]:
# create functions to get recency and tenure
def get_recency(x):
    last_purchase = x.max()
    return (snapshot_date - last_purchase).days

def get_tenure(x):
    first_purchase = x.min()
    return (snapshot_date - first_purchase).days

# aggregate data by the customers
customers = df.groupby('id').agg(
    recency=('date', get_recency),
    tenure=('date', get_tenure),
    frequency=('id', 'count'),
    total_value=('sales', 'sum'),
    mean_value=('sales', 'mean')
    )
# show 5 samples of the grouped dataframe
print(customers.sample(5))

                                      recency  tenure  frequency  total_value  \
id                                                                              
e1da4790-6a59-4ba3-ba2c-7a0b31188da2        1     162         27      39905.0   
326d72f1-9757-4e51-a969-b4e0e84f6811       42      50          2       4645.0   
cc8b16da-5c19-46b4-bae4-e294fff2347c        1     152        101     171180.0   
14c672da-19d8-4d28-a778-e398f9bfb2ac        2      25          3       1680.0   
7be4fc3d-622d-4b79-90e3-bcebe783be1d      124     144          4      12615.0   

                                       mean_value  
id                                                 
e1da4790-6a59-4ba3-ba2c-7a0b31188da2  1477.962963  
326d72f1-9757-4e51-a969-b4e0e84f6811  2322.500000  
cc8b16da-5c19-46b4-bae4-e294fff2347c  1694.851485  
14c672da-19d8-4d28-a778-e398f9bfb2ac   560.000000  
7be4fc3d-622d-4b79-90e3-bcebe783be1d  3153.750000  


### Remark 1
We have added another two columns:
* the tenure, which represents the time since the customer first purchase 
* mean value

### RFM Segments
* The next thing we need to do is to segment the __recency__, __frequency__, and __total_value__ into the categories.
* For our use case, we decided to split each feature into 4 quartiles that roughly divide the sample into 4 segments of equal proportion. 
* We have called these scores R, F, and M respectively.

In [12]:
# use only the necessary columns
customers = customers.reset_index() # re-set index
print(customers.head(5))
rfm = customers[['id', 'recency', 'frequency', 'total_value']]
print(rfm.head())

   level_0  index                                    id  recency  tenure  \
0        0      0  000173c5-978c-4b52-b7a4-5ebf974deb86        1     161   
1        1      1  0001b0ce-f323-49b5-b381-3348c7a001ab        1      85   
2        2      2  00020337-5321-4d6e-83af-67905edd8006        8      49   
3        3      3  00066d09-6e2e-4104-b03b-6927490e1972       16     163   
4        4      4  00069d0b-c994-4555-a30e-0c0799d77fe1       10     175   

   frequency  total_value   mean_value  
0         54     149415.0  2766.944444  
1         54     266800.0  4940.740741  
2          4      13385.0  3346.250000  
3         84      75388.0   897.476190  
4         48      75630.0  1575.625000  
                                     id  recency  frequency  total_value
0  000173c5-978c-4b52-b7a4-5ebf974deb86        1         54     149415.0
1  0001b0ce-f323-49b5-b381-3348c7a001ab        1         54     266800.0
2  00020337-5321-4d6e-83af-67905edd8006        8          4      13385.0
3  00

In [10]:
# recency quartile segmentation
r_labels = range(4, 0, -1)
recency = rfm['recency']
r_quartiles, bins = pd.qcut(recency, 4, labels=r_labels, retbins=True)
rfm = rfm.assign(R=r_quartiles.values)

# frequency quartile segmentation
f_labels = range(1, 5)
frequency = rfm['frequency'].rank(method='first') # rank to deal with duplicate values
f_quartiles, bins = pd.qcut(frequency, 4, labels=f_labels, retbins=True)
rfm = rfm.assign(F = f_quartiles.values)

# monetary value quartile segmentation
m_labels = range(1, 5)
monetary = rfm['total_value']
m_quartiles, bins = pd.qcut(monetary, 4, labels=m_labels, retbins=True)
rfm = rfm.assign(M = m_quartiles.values)

# show 5 samples of the newly created scores
print(rfm[['R', 'F', 'M']].sample(5))

       R  F  M
25868  3  3  3
6882   3  2  2
23371  1  1  1
14397  4  3  3
32801  4  4  3


In [15]:
r_quartiles

0        4
1        4
2        3
3        2
4        3
        ..
34892    3
34893    1
34894    4
34895    3
34896    4
Name: recency, Length: 34897, dtype: category
Categories (4, int64): [4 < 3 < 2 < 1]

### Remark 2
* To simplify the analysis, it is important that we combine the 3 different scores (R, F, and M) to create a single metric.
* There are a few approaches available:

1. The first one is to create an RFM Segment by concatenating the 3 digits from the individual scores to form a 3 character string that goes from 111 (lowest possible score in all three metrics) to 444 (highest possible score in all three metrics). The drawback of this method is the creation of many distinct segments (4x4x4 = 64 segments) which are not so easy to differentiate and prioritize (who is more valuable, a 432 or a 234 customer?).

2. Another possibility is to sum the 3 individual scores to create the RFM Score, a number ranging from 3 (lowest possible score in all metrics) to 12 (highest possible score in all metrics). Here the drawback is that customers with different buying habits (e.g. from different RFM Segments) can fall on the same score bins. For example, both customers in segments 431 and 134 would get a score of 8. On the other hand, we end up with less distinct scores to compare (4+4+4 = 12 scores), each of them with equal relevance.

### Remark 3
We can further divide the customers into RFM tiers by binning together ranges of scores. For example, we can say that customers with scores ranging from 3 to 5 are Bronze, from 5 to 9 are Silver and from 9 to 12 are Gold.

In [11]:
# group into different tiers
def get_tier(df):
    if df['score'] >= 9:
        return 'Gold'
    elif (df['score'] >= 5) and (df['score'] < 9):
        return 'Silver'
    else:
        return 'Bronze'
rfm['tier'] = rfm.apply(get_tier, axis=1)
print(rfm[['R', 'F', 'M', 'segment', 'score', 'tier']].sample(5))

KeyError: 'score'

In [13]:
rfm.head()

Unnamed: 0,id,recency,frequency,total_value
0,000173c5-978c-4b52-b7a4-5ebf974deb86,1,54,149415.0
1,0001b0ce-f323-49b5-b381-3348c7a001ab,1,54,266800.0
2,00020337-5321-4d6e-83af-67905edd8006,8,4,13385.0
3,00066d09-6e2e-4104-b03b-6927490e1972,16,84,75388.0
4,00069d0b-c994-4555-a30e-0c0799d77fe1,10,48,75630.0
