In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import plotly.express as px
import matplotlib.pyplot as plt

from Rfm import rfm_v1, rfm_v2

# Notebook-wide pandas display settings.
# `None` lifts the column/row truncation limits entirely.
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, None)

# Render every float with exactly two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# Load the transaction file (comma-separated, header on the first line, which are
# the read_csv defaults) and convert the `date` column to datetimes in one chain.
df = (
    pd.read_csv('sample.csv', encoding='utf-8')
      .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
df.head()

Unnamed: 0,invoice,date,customer_id,stockcode,price
0,1,2020-07-25,d18734,y7,50.45
1,2,2020-01-17,c21086,x7,25.3
2,3,2019-07-05,d18185,z5,18.4
3,4,2019-02-26,c18331,z2,5.5
4,5,2019-02-10,b16309,y7,18.4


In [3]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(100000, 5)

In [5]:
# Build the RFM table with the project helper `rfm_v1`.
# The call returns two items; only the first (the per-customer table) is kept.
# NOTE(review): `cut_f` is presumably the list of bin edges for the frequency
# score, and `None` presumably selects the helper's default cuts — confirm in Rfm.rfm_v1.
rfm_kwargs = dict(
    dataset=df,
    id_customer='customer_id',
    date='date',
    id_facture='invoice',
    money='price',
    cut_r=None,
    cut_f=[0, 1, 2, 3, 7, 14],
    cut_m=None,
)
rfm, _ = rfm_v1(**rfm_kwargs)
rfm.head()

Unnamed: 0,id_customer,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,segment,group
0,a15600,153,3,101.05,3,3,4,334,Potential higt,9
1,a15601,170,2,11.0,3,2,1,321,About to Sleep,2
2,a15602,317,3,106.4,2,3,5,235,Potential higt,9
3,a15603,22,3,43.79,5,3,2,532,About to Sleep,2
4,a15604,7,5,93.49,5,4,4,544,Need Attention,8


In [6]:
from sklearn.preprocessing import StandardScaler 
from sklearn.cluster import DBSCAN 

In [7]:
# Standardize the three quartile scores (zero mean, unit variance per column)
# so that they can be fed to a distance-based clusterer.
quartile_cols = ['r_quartile', 'f_quartile', 'm_quartile']
scaler = StandardScaler()
Clus_dataSet = scaler.fit_transform(rfm[quartile_cols])

In [99]:
# Peek at the standardized score matrix (one row per customer, columns r/f/m).
Clus_dataSet

array([[-0.00261329,  0.21824494,  0.72058615],
       [-0.00261329, -0.67954342, -1.39188724],
       [-0.70957256,  0.21824494,  1.42474395],
       ...,
       [-0.70957256,  0.21824494,  0.72058615],
       [ 1.41130525,  1.1160333 ,  1.42474395],
       [ 0.70434598, -0.67954342,  0.01642836]])

In [127]:
# Density-based clustering of the standardized quartile scores.
# NOTE(review): eps=0.705 and min_samples=500 are hand-tuned; judging by the scaled
# values displayed earlier, eps sits just above the gap between adjacent m_quartile
# levels (~0.704) and just below the r_quartile gap (~0.707) — confirm this is intended.
model = DBSCAN(eps=0.705, min_samples=500)
model.fit(Clus_dataSet)
labels = model.labels_

# Boolean mask flagging the samples DBSCAN identified as core points.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[model.core_sample_indices_] = True

# Attach the cluster label of every customer to the RFM table.
rfm["Clus_Db"] = labels

In [128]:
# Preview the RFM table, which now carries the `Clus_Db` label column.
rfm.head()

Unnamed: 0,id_customer,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,segment,group,Clus_Db
0,a15600,153,3,101.05,3,3,4,334,Potential higt,9,0
1,a15601,170,2,11.0,3,2,1,321,About to Sleep,2,1
2,a15602,317,3,106.4,2,3,5,235,Potential higt,9,2
3,a15603,22,3,43.79,5,3,2,532,About to Sleep,2,3
4,a15604,7,5,93.49,5,4,4,544,Need Attention,8,4


In [129]:
# Count the distinct labels; DBSCAN uses -1 for noise, so it is excluded
# from the "real" cluster count when present.
unique_labels = set(labels)
clusterNum = len(unique_labels)
realClusterNum = clusterNum - int(-1 in unique_labels)
realClusterNum, clusterNum

(17, 18)

In [130]:
# Distinct cluster labels in order of first appearance; -1 is DBSCAN's noise label.
rfm['Clus_Db'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, 10, 11, 12, 13, 14, 15,
       16], dtype=int64)

In [131]:
# Show just the first row of the RFM table.
rfm.head(1)

Unnamed: 0,id_customer,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,rfm_score,segment,group,Clus_Db
0,a15600,153,3,101.05,3,3,4,334,Potential higt,9,0


In [132]:
# 3-D scatter of the R/F/M quartile scores, colored by the DBSCAN label in `Clus_Db`.
# `labels` is keyed by column name: the color entry must be spelled exactly
# 'Clus_Db' (it was 'Clus_Dbcc', which matched no column, so the color axis
# kept its raw column name instead of 'Group').
fig = px.scatter_3d(rfm, x='r_quartile', y='f_quartile', z='m_quartile', color='Clus_Db',
                    title = 'RFM Rubik Cube',
                    labels={'r_quartile':'R Quantile', 
                            'f_quartile':'F Quantile', 
                            'm_quartile':'M Quantile', 
                            'Clus_Db':'Group',
                            'segment':'Segment'
                           }
                   )
fig.show()

In [133]:
# 3-D scatter of the raw recency / frequency / monetary values, colored by the
# DBSCAN label in `Clus_Db`.
# The segment-keyed `category_orders` / `color_discrete_map` arguments were dropped:
# the color column is the numeric `Clus_Db`, so those mappings never applied.
# `labels` now includes 'Clus_Db' so the color axis is titled 'Group' instead of
# showing the raw column name.
fig = px.scatter_3d(rfm, x='recency', y='frequency', z='monetary', color='Clus_Db',
                    title = 'RFM original values',
                    labels={'recency':'Recency', 
                            'frequency':'Frequency', 
                            'monetary':'Monetary', 
                            'Clus_Db':'Group',
                            'group':'Group',
                            'segment':'Segment'
                           }
                   )
fig.show()