# 1.0.0 RFM

In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to the local helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import plotly.express as px

# Never truncate displayed DataFrames: show every column and every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
# NOTE(review): this silences ALL warnings for the whole session, including
# deprecation notices from pandas/plotly — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
import sys
# Put the parent directory on the import path; the `utils` package imported
# below is presumably located there — TODO confirm the repository layout.
sys.path.append("../") 

from utils.RFM import rfm_v1, rfm_v2
from utils.paths import make_dir_line

# Selectors passed to make_dir_line; their meaning is defined in utils.paths
# (not visible in this notebook).
modality = 'p'
project = 'Australian'
# `data` is called with a folder key below and its results are joined with
# file names via `/`, so it evidently returns a path-like object per key.
data = make_dir_line(modality, project)

# Location the input CSV is read from.
raw = data('raw')
# Location the final parquet file is written to.
models = data('models')

# Project RFM

RFM is an abbreviation for Recency, Frequency, and Monetary.

It is a technique that helps determine marketing and sales strategies based on customers’ buying habits.

- Recency: Time passed since the customer’s last purchase. In other words, it is the “time passed since the last contact of the customer”.

<center>Recency = RFM analysis date − Last purchase date</center>

- Frequency: Total number of purchases. It shows how often the customer shops, and can be obtained by counting the invoices that belong to the customer.

- Monetary (Monetary Value): Total spending by the customer.


Customer segmentation scores each of these three values from 1 to 5 and then assigns customers to groups according to their scores. These groups can be shown on a Recency–Frequency grid as follows:

In [4]:
# Read the raw transactions and convert the `date` column to datetimes in one chain.
transactions_csv = raw / 'sample.csv'
df = pd.read_csv(transactions_csv).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
df.head()

Unnamed: 0,invoice,date,customer_id,stockcode,price
0,1,2020-07-25,d18734,y7,50.45
1,2,2020-01-17,c21086,x7,25.3
2,3,2019-07-05,d18185,z5,18.4
3,4,2019-02-26,c18331,z2,5.5
4,5,2019-02-10,b16309,y7,18.4


In [5]:
# (number of rows, number of columns)
df.shape

(100000, 5)

In [6]:
# Concise summary of df: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   invoice      100000 non-null  int64         
 1   date         100000 non-null  datetime64[ns]
 2   customer_id  100000 non-null  object        
 3   stockcode    100000 non-null  object        
 4   price        100000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 3.8+ MB


In [7]:
# Number of distinct invoice values; nunique() skips NaN by default
df["invoice"].nunique()

100000

In [8]:
# How often does each price occur? Shows the five most frequent price values.
df["price"].value_counts().head()

price
50.45    20180
25.30    20146
12.99    20001
18.40    19849
5.50     19824
Name: count, dtype: int64

In [9]:
# Total price per date, listed from the most recent date backwards (latest 10 dates)
df.groupby("date").agg({"price":"sum"}).sort_values("date", ascending = False).head(10)

Unnamed: 0_level_0,price
date,Unnamed: 1_level_1
2020-12-31,2975.0
2020-12-30,3370.69
2020-12-29,3768.99
2020-12-28,2914.32
2020-12-27,2692.86
2020-12-26,3719.75
2020-12-25,2898.33
2020-12-24,3746.14
2020-12-23,3682.1
2020-12-22,2366.72


In [10]:
# Total price per (date, invoice) pair, smallest totals first (five rows shown)
df.groupby(["date","invoice"] ).agg({"price":"sum"}).sort_values("price", ascending=True).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,price
date,invoice,Unnamed: 2_level_1
2020-10-07,51869,5.5
2020-03-10,15065,5.5
2019-04-18,74656,5.5
2020-03-10,17426,5.5
2019-04-18,74243,5.5


In [11]:
# Number of missing values in each column
df.isnull().sum()

invoice        0
date           0
customer_id    0
stockcode      0
price          0
dtype: int64

In [12]:
# Earliest transactions: rows ordered by date, oldest first (five rows shown)
df.sort_values("date", ascending=True).head() 

Unnamed: 0,invoice,date,customer_id,stockcode,price
2316,2317,2019-01-01,c19511,y4,50.45
88380,88381,2019-01-01,a16609,z5,18.4
41675,41676,2019-01-01,e20346,z7,5.5
41603,41604,2019-01-01,a16487,x4,12.99
41493,41494,2019-01-01,d19076,z5,50.45


In [13]:
# Which dates brought in the most money? Total price per date, highest totals first.
df.groupby("date").agg({"price":"sum"}).sort_values("price", ascending = False).head()

Unnamed: 0_level_0,price
date,Unnamed: 1_level_1
2019-09-23,4173.63
2020-06-16,3998.65
2020-09-12,3935.83
2019-10-12,3916.61
2019-02-24,3914.46


## 1.2.0 Data Preparation 

In [14]:
# Summary statistics with extra tail percentiles, transposed so that each
# described column of df becomes one row
df.describe([0.01,0.05,0.10,0.25,0.50,0.75,0.90,0.95, 0.99]).T

Unnamed: 0,count,mean,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max,std
invoice,100000.0,50000.5,1.0,1000.99,5000.95,10000.9,25000.75,50000.5,75000.25,90000.1,95000.05,99000.01,100000.0,28867.657797
date,100000.0,2019-12-30 21:32:37.824000,2019-01-01 00:00:00,2019-01-08 00:00:00,2019-02-05 00:00:00,2019-03-14 00:00:00,2019-07-01 00:00:00,2019-12-30 00:00:00,2020-07-01 00:00:00,2020-10-19 00:00:00,2020-11-25 00:00:00,2020-12-24 00:00:00,2020-12-31 00:00:00,
price,100000.0,22.618414,5.5,5.5,5.5,5.5,12.99,18.4,25.3,50.45,50.45,50.45,50.45,15.425024


In [15]:
# Outlier screen, for information only: nothing is removed because no model is built here.
# NOTE: the fences are derived from the 1st and 99th percentiles rather than the
# classical 25th/75th quartiles, so Q1/Q3/IQR below are a deliberately wide
# variant of the usual 1.5*IQR rule.
for feature in ['price']:

    Q1 = df[feature].quantile(0.01)
    Q3 = df[feature].quantile(0.99)
    IQR = Q3-Q1
    upper = Q3 + 1.5*IQR
    lower = Q1 - 1.5*IQR

    # Build the row mask once and test the mask itself. The previous version
    # filtered the frame and called .any(axis=None) on it, which checks whether
    # the *cell values* of the filtered rows are truthy (across every column,
    # whatever its dtype) instead of whether any row was flagged.
    is_outlier = (df[feature] > upper) | (df[feature] < lower)

    if is_outlier.any():
        print(feature,"yes")
        # Number of rows outside the fences
        print(int(is_outlier.sum()))
    else:
        print(feature, "no")

price no


## 1.3.0 RFM

### 1.3.1 First RFM

In [16]:
# First RFM variant. rfm_v1 is a project helper (utils.RFM); judging by the output
# below it yields one row per customer with recency/frequency/monetary values,
# their scores, a combined rfm_score, a named segment and a group number.
# The second return value is not needed in this notebook and is discarded.
# NOTE(review): cut_f presumably lists the bin edges used for the frequency score,
# and None presumably lets the helper pick its own cuts — confirm in utils.RFM.
rfm, _ = rfm_v1(dataset=df, id_customer='customer_id', date='date', id_facture='invoice', money='price', 
                cut_r=None, 
                cut_f=[0,1,2,3,7,14], 
                cut_m=None
               )
rfm.head()

Unnamed: 0,id_customer,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,segment,group
0,a15600,153,3,101.05,3,3,4,334,Potential higt,9
1,a15601,170,2,11.0,3,2,1,321,About to Sleep,2
2,a15602,317,3,106.4,2,3,5,235,Need Attention,8
3,a15603,22,3,43.79,5,3,2,532,New Customers,3
4,a15604,7,5,93.49,5,4,4,544,Champions,10


In [17]:
# 3-D scatter of the three RFM scores, one marker per customer, coloured by segment
# (set color='group' instead to colour by group number).
segment_order = ['Hibernating', 'About to Sleep', 'New Customers', 'Potential low',
                 'At Risk', 'New Customers higt', 'Unique higt - Promising',
                 'Need Attention', 'Potential higt', 'Champions']
segment_hex = ['#684cf6', '#90dde0', '#447FF5', '#78e591', '#1e5274',
               '#FFE343', '#9EFF43', '#4AC3FF', '#D04AFF', '#FF4AC8']
# Colours are listed in the same order as the segments, so pair them up positionally.
segment_palette = dict(zip(segment_order, segment_hex))
display_names = {
    'r_quartile': 'R Quantile',
    'f_quartile': 'F Quantile',
    'm_quartile': 'M Quantile',
    'group': 'Group',
    'segment': 'Segment',
}

fig = px.scatter_3d(
    rfm,
    x='r_quartile',
    y='f_quartile',
    z='m_quartile',
    color='segment',
    title='RFM Rubik Cube',
    category_orders={'segment': segment_order},
    color_discrete_map=segment_palette,
    labels=display_names,
)
fig.show()

In [18]:
# 3-D scatter of the raw recency / frequency / monetary values, coloured by segment
# (set color='group' instead to colour by group number).
segment_order = ['Hibernating', 'About to Sleep', 'New Customers', 'Potential low',
                 'At Risk', 'New Customers higt', 'Unique higt - Promising',
                 'Need Attention', 'Potential higt', 'Champions']
segment_hex = ['#684cf6', '#90dde0', '#447FF5', '#78e591', '#1e5274',
               '#FFE343', '#9EFF43', '#4AC3FF', '#D04AFF', '#FF4AC8']
# Colours are listed in the same order as the segments, so pair them up positionally.
segment_palette = dict(zip(segment_order, segment_hex))
display_names = {
    'recency': 'Recency',
    'frequency': 'Frequency',
    'monetary': 'Monetary',
    'group': 'Group',
    'segment': 'Segment',
}

fig = px.scatter_3d(
    rfm,
    x='recency',
    y='frequency',
    z='monetary',
    color='segment',
    title='RFM original values',
    category_orders={'segment': segment_order},
    color_discrete_map=segment_palette,
    labels=display_names,
)
fig.show()

In [19]:
# Share of customers in each segment (proportions, largest first)
rfm['segment'].value_counts(normalize=True)

segment
Hibernating                0.196340
About to Sleep             0.184675
Champions                  0.158890
New Customers              0.155133
Potential higt             0.121410
Need Attention             0.086020
Potential low              0.036511
At Risk                    0.030451
New Customers higt         0.015907
Unique higt - Promising    0.014665
Name: proportion, dtype: float64

In [20]:
# Average recency, frequency and monetary value within each segment
rfm_metrics = ["recency", "frequency", "monetary"]
rfm.groupby("segment")[rfm_metrics].agg(["mean"])

Unnamed: 0_level_0,recency,frequency,monetary
Unnamed: 0_level_1,mean,mean,mean
segment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
About to Sleep,257.417227,2.331255,46.756356
At Risk,242.510448,4.1801,57.058776
Champions,53.719298,5.132342,124.008785
Hibernating,438.234259,1.391821,25.454224
Need Attention,322.266643,4.019021,108.517179
New Customers,58.758594,2.209766,40.963193
New Customers higt,97.542857,2.0,84.516571
Potential higt,128.112803,3.932368,107.814944
Potential low,55.824896,4.280498,57.728996
Unique higt - Promising,391.878099,2.0,83.70031


In [21]:
# Distribution of recency within each segment, one box per segment
segment_order = ['Hibernating', 'About to Sleep', 'New Customers', 'Potential low',
                 'At Risk', 'New Customers higt', 'Unique higt - Promising',
                 'Need Attention', 'Potential higt', 'Champions']
segment_hex = ['#684cf6', '#90dde0', '#447FF5', '#78e591', '#1e5274',
               '#FFE343', '#9EFF43', '#4AC3FF', '#D04AFF', '#FF4AC8']
# Colours are listed in the same order as the segments, so pair them up positionally.
segment_palette = dict(zip(segment_order, segment_hex))
display_names = {
    'recency': 'Recency',
    'frequency': 'Frequency',
    'monetary': 'Monetary',
    'group': 'Group',
    'segment': 'Segment',
}

fig = px.box(
    rfm,
    y='recency',
    color='segment',
    title='RFM Recency',
    category_orders={'segment': segment_order},
    color_discrete_map=segment_palette,
    labels=display_names,
)
fig.show()

In [22]:
# Distribution of frequency within each segment, one box per segment
segment_order = ['Hibernating', 'About to Sleep', 'New Customers', 'Potential low',
                 'At Risk', 'New Customers higt', 'Unique higt - Promising',
                 'Need Attention', 'Potential higt', 'Champions']
segment_hex = ['#684cf6', '#90dde0', '#447FF5', '#78e591', '#1e5274',
               '#FFE343', '#9EFF43', '#4AC3FF', '#D04AFF', '#FF4AC8']
# Colours are listed in the same order as the segments, so pair them up positionally.
segment_palette = dict(zip(segment_order, segment_hex))
display_names = {
    'recency': 'Recency',
    'frequency': 'Frequency',
    'monetary': 'Monetary',
    'group': 'Group',
    'segment': 'Segment',
}

fig = px.box(
    rfm,
    y='frequency',
    color='segment',
    title='RFM Frequency',
    category_orders={'segment': segment_order},
    color_discrete_map=segment_palette,
    labels=display_names,
)
fig.show()

In [23]:
# Distribution of monetary value within each segment, one box per segment
segment_order = ['Hibernating', 'About to Sleep', 'New Customers', 'Potential low',
                 'At Risk', 'New Customers higt', 'Unique higt - Promising',
                 'Need Attention', 'Potential higt', 'Champions']
segment_hex = ['#684cf6', '#90dde0', '#447FF5', '#78e591', '#1e5274',
               '#FFE343', '#9EFF43', '#4AC3FF', '#D04AFF', '#FF4AC8']
# Colours are listed in the same order as the segments, so pair them up positionally.
segment_palette = dict(zip(segment_order, segment_hex))
display_names = {
    'recency': 'Recency',
    'frequency': 'Frequency',
    'monetary': 'Monetary',
    'group': 'Group',
    'segment': 'Segment',
}

fig = px.box(
    rfm,
    y='monetary',
    color='segment',
    title='RFM Monetary',
    category_orders={'segment': segment_order},
    color_discrete_map=segment_palette,
    labels=display_names,
)
fig.show()

### 1.3.2 Second RFM

In [24]:
# Second RFM variant. rfm_v2 is a project helper (utils.RFM); judging by the output
# below it returns the same per-customer R/F/M columns as the first variant, but with
# a fractional rfm_score and a group number instead of a named segment.
# The second return value is not needed in this notebook and is discarded.
# NOTE(review): the three weights are presumably applied to the R, F and M scores when
# they are combined into rfm_score, and n_groups presumably sets how many groups are
# formed — confirm in utils.RFM.
rfm2, _ = rfm_v2(dataset=df, id_customer='customer_id', date='date', id_facture='invoice', money='price', 
                 cut_r=None, 
                 cut_f=[0,1,2,3,7,14], 
                 cut_m=None,
                 r_weight=1, f_weight=1, m_weight=1,
                 n_groups=8
                )
rfm2.head()

Unnamed: 0,id_customer,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,group
0,a15600,153,3,101.05,3,3,4,3.333333,5
1,a15601,170,2,11.0,3,2,1,2.0,2
2,a15602,317,3,106.4,2,3,5,3.333333,5
3,a15603,22,3,43.79,5,3,2,3.333333,5
4,a15604,7,5,93.49,5,4,4,4.333333,7


In [25]:
# 3-D scatter of the three RFM scores, one marker per customer, coloured by group.
group_order = ['1', '2', '3', '4', '5', '6', '7', '8']
group_palette = {'1': '#684cf6', '2': '#90dde0', '3': '#447FF5', '4': '#78e591', '5': '#1e5274',
                 '6': '#FFE343', '7': '#9EFF43', '8': '#4AC3FF'}

# The order and palette above are keyed by strings. The dtype of rfm2['group'] is
# decided inside rfm_v2 (not visible here); if it is numeric, none of the keys would
# match and plotly would fall back to a continuous colour scale. Casting to str for
# plotting makes the mapping apply either way and is a no-op for string labels.
rfm2_plot = rfm2.assign(group=rfm2['group'].astype(str))

fig = px.scatter_3d(rfm2_plot, x='r_quartile', y='f_quartile', z='m_quartile', color='group',
                    title = 'RFM Rubik Cube',
                    category_orders={'group': group_order},
                    color_discrete_map=group_palette,
                    labels={'r_quartile':'R Quantile',
                            'f_quartile':'F Quantile',
                            'm_quartile':'M Quantile',
                            'group':'Group',
                            'segment':'Segment'
                           }
                   )
fig.show()

In [26]:
# 3-D scatter of the raw recency / frequency / monetary values, coloured by group.
group_order = ['1', '2', '3', '4', '5', '6', '7', '8']
group_palette = {'1': '#684cf6', '2': '#90dde0', '3': '#447FF5', '4': '#78e591', '5': '#1e5274',
                 '6': '#FFE343', '7': '#9EFF43', '8': '#4AC3FF'}

# The order and palette above are keyed by strings. The dtype of rfm2['group'] is
# decided inside rfm_v2 (not visible here); if it is numeric, none of the keys would
# match and plotly would fall back to a continuous colour scale. Casting to str for
# plotting makes the mapping apply either way and is a no-op for string labels.
rfm2_plot = rfm2.assign(group=rfm2['group'].astype(str))

fig = px.scatter_3d(rfm2_plot, x='recency', y='frequency', z='monetary', color='group',
                    title = 'RFM original values',
                    category_orders={'group': group_order},
                    color_discrete_map=group_palette,
                    labels={'recency':'Recency',
                            'frequency':'Frequency',
                            'monetary':'Monetary',
                            'group':'Group',
                            'segment':'Segment'
                           }
                   )
fig.show()

In [27]:
# Share of customers in each group (proportions, largest first)
rfm2['group'].value_counts(normalize=True)

group
3    0.186462
7    0.177100
2    0.141831
1    0.129166
6    0.104381
5    0.102988
4    0.099261
8    0.058811
Name: proportion, dtype: float64

In [28]:
# Average recency, frequency and monetary value within each group
rfm_metrics = ["recency", "frequency", "monetary"]
rfm2.groupby("group")[rfm_metrics].agg(["mean"])

Unnamed: 0_level_0,recency,frequency,monetary
Unnamed: 0_level_1,mean,mean,mean
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,482.156697,1.127141,20.840758
2,311.182226,1.794061,33.485127
3,220.232044,2.349041,48.609898
4,175.769231,2.907814,62.85105
5,153.445131,3.360989,74.82103
6,136.881858,3.927141,91.897512
7,82.071514,4.597776,112.51031
8,28.524472,5.660999,145.637115


In [29]:
# Distribution of recency within each group, one box per group.
group_order = ['1', '2', '3', '4', '5', '6', '7', '8']
group_palette = {'1': '#684cf6', '2': '#90dde0', '3': '#447FF5', '4': '#78e591', '5': '#1e5274',
                 '6': '#FFE343', '7': '#9EFF43', '8': '#4AC3FF'}

# The order and palette above are keyed by strings. The dtype of rfm2['group'] is
# decided inside rfm_v2 (not visible here); casting to str for plotting guarantees
# that the keys match the column values and is a no-op for string labels.
rfm2_plot = rfm2.assign(group=rfm2['group'].astype(str))

fig = px.box(rfm2_plot, y='recency', color='group',
             title = 'RFM Recency',
             category_orders={'group': group_order},
             color_discrete_map=group_palette,
             labels={'recency':'Recency',
                     'frequency':'Frequency',
                     'monetary':'Monetary',
                     'group':'Group',
                     'segment':'Segment'
                    })
fig.show()

In [30]:
# Distribution of frequency within each group, one box per group.
group_order = ['1', '2', '3', '4', '5', '6', '7', '8']
group_palette = {'1': '#684cf6', '2': '#90dde0', '3': '#447FF5', '4': '#78e591', '5': '#1e5274',
                 '6': '#FFE343', '7': '#9EFF43', '8': '#4AC3FF'}

# The order and palette above are keyed by strings. The dtype of rfm2['group'] is
# decided inside rfm_v2 (not visible here); casting to str for plotting guarantees
# that the keys match the column values and is a no-op for string labels.
rfm2_plot = rfm2.assign(group=rfm2['group'].astype(str))

fig = px.box(rfm2_plot, y='frequency', color='group',
             title = 'RFM Frequency',
             category_orders={'group': group_order},
             color_discrete_map=group_palette,
             labels={'recency':'Recency',
                     'frequency':'Frequency',
                     'monetary':'Monetary',
                     'group':'Group',
                     'segment':'Segment'
                    })
fig.show()

In [31]:
# Distribution of monetary value within each group, one box per group.
group_order = ['1', '2', '3', '4', '5', '6', '7', '8']
group_palette = {'1': '#684cf6', '2': '#90dde0', '3': '#447FF5', '4': '#78e591', '5': '#1e5274',
                 '6': '#FFE343', '7': '#9EFF43', '8': '#4AC3FF'}

# The order and palette above are keyed by strings. The dtype of rfm2['group'] is
# decided inside rfm_v2 (not visible here); casting to str for plotting guarantees
# that the keys match the column values and is a no-op for string labels.
rfm2_plot = rfm2.assign(group=rfm2['group'].astype(str))

fig = px.box(rfm2_plot, y='monetary', color='group',
             title = 'RFM Monetary',
             category_orders={'group': group_order},
             color_discrete_map=group_palette,
             labels={'recency':'Recency',
                     'frequency':'Frequency',
                     'monetary':'Monetary',
                     'group':'Group',
                     'segment':'Segment'
                    })
fig.show()

## 1.4.0 Save

In [32]:
# Table to persist: the second RFM result with every column kept. The customer key
# is renamed back to `customer_id` (its name in the raw data) and the group number
# is stored under `rfm`. rename() returns a new frame, so rfm2 itself is untouched.
column_renames = {'id_customer': 'customer_id', 'group': 'rfm'}
save = rfm2.rename(columns=column_renames)
save.head()

Unnamed: 0,customer_id,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,rfm
0,a15600,153,3,101.05,3,3,4,3.333333,5
1,a15601,170,2,11.0,3,2,1,2.0,2
2,a15602,317,3,106.4,2,3,5,3.333333,5
3,a15603,22,3,43.79,5,3,2,3.333333,5
4,a15604,7,5,93.49,5,4,4,4.333333,7


In [33]:
# Write the table as a gzip-compressed parquet file under the `models` location
save.to_parquet(models / 'df_rfm.parquet.gzip', compression='gzip')