## Exploring data

In [3]:
#import libraries
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objs as go

In [4]:
template_style = 'plotly_white'

In [6]:
df = pd.read_excel("new_customer_data.xlsx", index_col=[0])
df.head(10)

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,...,profit,Age,gender,job_industry_category,job_title,owns_car,past_3_years_bike_related_purchases,wealth_segment,postcode,state
0,1,2,2950,2017-02-25,0,Approved,Solex,Standard,medium,medium,...,17.87,68,Male,Financial Services,Software Engineer I,Yes,19,Mass Customer,3064,VIC
1,2,3,3120,2017-05-21,1,Approved,Trek Bicycles,Standard,medium,large,...,1702.55,44,Female,Health,Clinical Specialist,Yes,89,Mass Customer,2196,NSW
2,3,37,402,2017-10-16,0,Approved,OHM Cycles,Standard,low,medium,...,1544.61,45,Male,Retail,Desktop Support Technician,No,9,Affluent Customer,2835,NSW
3,4,88,3135,2017-08-31,0,Approved,Norco Bicycles,Standard,medium,medium,...,817.36,61,Male,Financial Services,Staff Scientist,No,83,Mass Customer,2096,NSW
5,6,25,2339,2017-03-08,1,Approved,Giant Bicycles,Road,medium,medium,...,709.34,64,Male,Property,Web Designer II,Yes,3,Affluent Customer,2153,NSW
6,7,22,1542,2017-04-21,1,Approved,WeareA2B,Standard,medium,medium,...,15.08,44,Male,Health,Food Chemist,Yes,56,Mass Customer,2155,NSW
7,8,15,2459,2017-07-15,0,Approved,WeareA2B,Standard,medium,medium,...,1279.4,37,Female,Manufacturing,Quality Engineer,No,67,High Net Worth,4034,QLD
8,9,67,1305,2017-08-10,0,Approved,Solex,Standard,medium,large,...,690.49,53,Male,Financial Services,Executive Secretary,Yes,97,High Net Worth,4124,QLD
9,10,12,3262,2017-08-30,1,Approved,WeareA2B,Standard,medium,medium,...,1069.55,63,Male,Entertainment,Product Engineer,No,65,Mass Customer,4221,QLD
11,12,61,2783,2017-01-05,1,Approved,OHM Cycles,Standard,low,medium,...,14.23,44,Male,Retail,Sales Representative,Yes,14,Affluent Customer,2171,NSW


In [102]:
df.columns

Index(['transaction_id', 'product_id', 'customer_id', 'transaction_date',
       'online_order', 'order_status', 'brand', 'product_line',
       'product_class', 'product_size', 'list_price', 'standard_cost',
       'profit', 'Age', 'gender', 'job_industry_category', 'job_title',
       'owns_car', 'past_3_years_bike_related_purchases', 'wealth_segment',
       'postcode', 'state'],
      dtype='object')

In [54]:
df['transaction_date'].describe()

  df['transaction_date'].describe()


count                   13636
unique                    364
top       2017-02-14 00:00:00
freq                       58
first     2017-01-01 00:00:00
last      2017-12-30 00:00:00
Name: transaction_date, dtype: object

## RFM Analysis
Below perform following operations:

- For Recency, Calculate the number of days between present date and date of last purchase each customer.
    In this data, present date is the last transaction date 2017-12-30.
- For Frequency, Calculate the number of orders for each customer.
    Frequency is count of customer_id across transactions.
- For Monetary, Calculate sum of purchase price for each customer.
    Monetary is sum of list_price for each customer_id.

In [131]:
#calculate Recency: since latest transaction date was 2017-12-31 => this date - transaction_date
present_date = pd.to_datetime('2017-12-30')
rfm = df.groupby('customer_id').agg({'transaction_date': lambda date: (present_date - date.max()),
                                    'customer_id': lambda num: len(num),
                                     'list_price': lambda price: price.sum(),
                                     'profit': lambda profit: profit.sum()
                                     })


In [132]:
#change name of columns
rfm.columns = ['Recency', 'Frequency', 'Monetary', 'Profit']

In [134]:
rfm.reset_index(inplace=True)

In [135]:
rfm

Unnamed: 0,customer_id,Recency,Frequency,Monetary,Profit
0,1,7 days,11,9084.45,3018.09
1,2,128 days,3,4149.07,2226.26
2,9,78 days,6,5357.55,2353.11
3,12,67 days,7,6394.21,3540.03
4,13,27 days,7,7734.74,4337.38
...,...,...,...,...,...
2442,3493,93 days,6,10053.82,3728.88
2443,3494,4 days,4,5122.71,2755.11
2444,3495,13 days,7,8626.65,3847.65
2445,3496,256 days,4,4725.38,2045.84


### Computing Quantile for RFM values
Customers with the lowest recency, highest frequency and monetary amounts considered as top customers.
qcut() is Quantile-based discretization function. qcut bins the data based on sample quantiles. 

In [136]:
rfm['r_score'] = pd.qcut(rfm['Recency'], 5, ['1','2','3','4', '5'])
rfm['f_score'] = pd.qcut(rfm['Frequency'], 5, ['5','4','3','2','1'])
rfm['m_score'] = pd.qcut(rfm['Monetary'], 5, ['5','4','3','2','1'])


In [137]:
rfm.head(10)

Unnamed: 0,customer_id,Recency,Frequency,Monetary,Profit,r_score,f_score,m_score
0,1,7 days,11,9084.45,3018.09,1,1,1
1,2,128 days,3,4149.07,2226.26,5,5,4
2,9,78 days,6,5357.55,2353.11,4,3,3
3,12,67 days,7,6394.21,3540.03,4,2,3
4,13,27 days,7,7734.74,4337.38,2,2,2
5,14,47 days,3,2509.39,1713.9,3,5,5
6,15,35 days,6,4923.34,1728.39,3,3,4
7,19,280 days,2,2948.34,1736.45,5,5,5
8,20,31 days,4,6618.86,3608.28,2,5,2
9,21,6 days,5,7446.21,4229.41,1,4,2


In [138]:
rfm.dtypes

customer_id              int64
Recency        timedelta64[ns]
Frequency                int64
Monetary               float64
Profit                 float64
r_score               category
f_score               category
m_score               category
dtype: object

### RFM Result Interpretation


In [139]:
#concate r_score, f_score and m_score to get RFM_score
rfm['RFM_Segment_concat'] = rfm.r_score.astype(str)+ rfm.f_score.astype(str) + rfm.m_score.astype(str)
rfm.head()

Unnamed: 0,customer_id,Recency,Frequency,Monetary,Profit,r_score,f_score,m_score,RFM_Segment_concat
0,1,7 days,11,9084.45,3018.09,1,1,1,111
1,2,128 days,3,4149.07,2226.26,5,5,4,554
2,9,78 days,6,5357.55,2353.11,4,3,3,433
3,12,67 days,7,6394.21,3540.03,4,2,3,423
4,13,27 days,7,7734.74,4337.38,2,2,2,222


In [140]:
# Count num of unique segments by RFM_score
rfm_count_unique = rfm.groupby('RFM_Segment_concat')['RFM_Segment_concat'].nunique()
print(rfm_count_unique.sum())

105


Although this way get as detail as we can for customer segmentation that up to 105 different segments, it is not practical if use in this case. Instead, another straightforward method is to sum all r, m, f scores to a single number and define segment base on each score range.
The cons of sum 3 categorical value of r_score, f_score and m_score is we can not differentiate between 1 + 2 + 3 and 3 + 2 + 1 because it all equals to 6 for example.

In [141]:
rfm['RFM_Score'] = rfm.r_score.astype(int)+ rfm.f_score.astype(int) + rfm.m_score.astype(int)
rfm.head()

Unnamed: 0,customer_id,Recency,Frequency,Monetary,Profit,r_score,f_score,m_score,RFM_Segment_concat,RFM_Score
0,1,7 days,11,9084.45,3018.09,1,1,1,111,3
1,2,128 days,3,4149.07,2226.26,5,5,4,554,14
2,9,78 days,6,5357.55,2353.11,4,3,3,433,10
3,12,67 days,7,6394.21,3540.03,4,2,3,423,9
4,13,27 days,7,7734.74,4337.38,2,2,2,222,6


### Rating User base on RFM score


In [142]:
#rfm_level_function
def rfm_level(rfm):
    if rfm['RFM_Score'] >= 9:
        return 'Can\'t Loose Them'
    elif ((rfm['RFM_Score'] >= 8) and (rfm['RFM_Score'] < 9)):
        return 'Champions'
    elif ((rfm['RFM_Score'] >= 7) and (rfm['RFM_Score'] < 8)):
        return 'Loyal'
    elif ((rfm['RFM_Score'] >= 6) and (rfm['RFM_Score'] < 7)):
        return 'Potential'
    elif ((rfm['RFM_Score'] >= 5) and (rfm['RFM_Score'] < 6)):
        return 'Promising'
    elif ((rfm['RFM_Score'] >= 4) and (rfm['RFM_Score'] < 5)):
        return 'Needs Attention'
    elif ((rfm['RFM_Score'] >= 3) and (rfm['RFM_Score'] < 4)):
        return 'Lost'
    else:
        return 'Require Activation'


In [143]:
# rfm = rfm.set_index('RFM_Score')
rfm['RFM_Level'] = rfm.apply(rfm_level, axis=1)
rfm.head()


Unnamed: 0,customer_id,Recency,Frequency,Monetary,Profit,r_score,f_score,m_score,RFM_Segment_concat,RFM_Score,RFM_Level
0,1,7 days,11,9084.45,3018.09,1,1,1,111,3,Lost
1,2,128 days,3,4149.07,2226.26,5,5,4,554,14,Can't Loose Them
2,9,78 days,6,5357.55,2353.11,4,3,3,433,10,Can't Loose Them
3,12,67 days,7,6394.21,3540.03,4,2,3,423,9,Can't Loose Them
4,13,27 days,7,7734.74,4337.38,2,2,2,222,6,Potential


In [144]:
rfm.columns

Index(['customer_id', 'Recency', 'Frequency', 'Monetary', 'Profit', 'r_score',
       'f_score', 'm_score', 'RFM_Segment_concat', 'RFM_Score', 'RFM_Level'],
      dtype='object')

In [180]:
#save file to excel for further analysis
rfm.to_excel('RFM_Analysis.xlsx')