# RFM Segmentation

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sn

import warnings
warnings.filterwarnings('ignore')

import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
# Load the raw retail transactions straight from the hosted CSV.
retail_df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/manaranjanp/ISB_MLUL/main/clustering/retail_txns.csv"
)

In [None]:
# Number of transaction rows recorded per country.
retail_df['Country'].value_counts()

In [None]:
# NOTE(review): this repeats the Country frequency table from the cell above.
retail_df.Country.value_counts()

In [None]:
# Dimensions (rows, columns) of the raw transactions table.
retail_df.shape

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
retail_df.info();

In [None]:
# Count the missing values in each column.
null_flags = retail_df.isnull()
null_flags.sum()

In [None]:
# Remove, in place, every transaction that has no CustomerID: the later
# per-customer aggregations cannot attribute those rows to anyone.
retail_df.dropna( subset = ['CustomerID'], inplace = True)

In [None]:
# Re-check dtypes and non-null counts after dropping rows without a CustomerID.
retail_df.info()

In [None]:
# Density estimate of the Quantity column.
sn.kdeplot(retail_df.Quantity);

In [None]:
# Show the first ten rows that carry a negative quantity.
negative_qty = retail_df['Quantity'] < 0
retail_df[negative_qty][:10]

In [None]:
# Density estimate of the UnitPrice column.
sn.kdeplot(retail_df.UnitPrice);

## Calculating the order value

In [None]:
# Value of each transaction row: units bought times the price per unit.
# Rows with a negative Quantity therefore get a negative amount.
retail_df['amount'] = retail_df['Quantity'] * retail_df['UnitPrice']

In [None]:
# Peek at the first five rows of the transactions table.
retail_df[0:5]

### RFM

In [None]:
# Collapse line items into one row per (invoice, date, customer) holding the
# invoice's total amount.
invoice_keys = ['InvoiceNo', 'InvoiceDate', 'CustomerID']
invoice_totals = retail_df.groupby(invoice_keys)['amount'].sum()
invoices_df = invoice_totals.reset_index()

In [None]:
# First ten invoice-level rows.
invoices_df.head(10)

In [None]:
# Dimensions (rows, columns) of the invoice-level table.
invoices_df.shape

In [None]:
# Parse InvoiceDate into datetimes so the month arithmetic further down
# can operate on real dates.
invoices_df['InvoiceDate'] = pd.to_datetime(invoices_df['InvoiceDate'])

In [None]:
# Ten randomly chosen invoices, as a sanity check on the parsed dates.
invoices_df.sample(10)

### How many months ago each invoice was created

In [None]:
from dateutil.relativedelta import relativedelta

In [None]:
def getDiffInMonths(now, since):
    """Return the number of whole months from ``since`` to ``now``.

    The gap is computed with ``relativedelta``; its year component is
    converted to months and added to its month component. Any leftover
    days are ignored.
    """
    elapsed = relativedelta(now, since)
    return 12 * elapsed.years + elapsed.months

In [None]:
# Most recent invoice timestamp in the data.
max(invoices_df['InvoiceDate'])

In [None]:
# Reference date for recency: the first day of the month that contains the
# most recent invoice.
latest_invoice = max(invoices_df['InvoiceDate'])
last_month = latest_invoice.date().replace(day=1)

In [None]:
# Display the reference date used for the month differences.
last_month

In [None]:
# Whole months between each invoice and the reference date `last_month`.
# Applying over the InvoiceDate column alone avoids building a full row
# object for every invoice just to read a single field from it.
invoices_df['monthsBefore'] = invoices_df['InvoiceDate'].apply(
    lambda invoice_date: getDiffInMonths(last_month, invoice_date)
)

In [None]:
# Display the invoice table with the new monthsBefore column.
invoices_df

In [None]:
# Recency of a customer = the smallest monthsBefore among their invoices,
# broadcast back onto every invoice row of that customer.
cust_grouping = invoices_df.groupby(by=['CustomerID'])['monthsBefore']
months_since_latest = cust_grouping.transform('min')
invoices_df['recency'] = months_since_latest

In [None]:
# Ten randomly chosen invoices, to eyeball the recency column.
invoices_df.sample(10)

In [None]:
# Dimensions of the invoice table after the recency columns were added.
invoices_df.shape

In [None]:
# One row per customer: recency is constant within a customer, so dropping
# duplicate (CustomerID, recency) pairs leaves a single row each.
customer_recency_cols = ['CustomerID', 'recency']
recency_df = invoices_df[customer_recency_cols].drop_duplicates()

In [None]:
# Dimensions of the per-customer recency table.
recency_df.shape

## Calculating Frequency and Monetary Value


In [None]:
# Frequency = number of invoice rows recorded for each customer.
invoices_per_customer = invoices_df['CustomerID'].value_counts()
frequency_df = invoices_per_customer.reset_index()

In [None]:
# Give the two columns explicit names, then display the table.
frequency_column_names = ['CustomerID', 'frequency']
frequency_df.columns = frequency_column_names
frequency_df

In [None]:
# Monetary value = total invoice amount per customer.
customer_spend = invoices_df.groupby(['CustomerID'])['amount'].sum()
mvalue_df = customer_spend.reset_index()
mvalue_df.columns = ['CustomerID', 'mvalue']
mvalue_df

In [None]:
# Join the three per-customer tables on CustomerID into one RFM table.
recency_frequency_df = recency_df.merge(frequency_df, on='CustomerID')
rfm_df = recency_frequency_df.merge(mvalue_df, on='CustomerID')

In [None]:
# Display the combined recency / frequency / monetary-value table.
rfm_df

In [None]:
# Relationship between how often a customer buys and how much they spend.
sn.scatterplot(x='frequency', y='mvalue', data=rfm_df);

In [None]:
# Summary statistics of frequency.
rfm_df['frequency'].describe()

In [None]:
# Summary statistics of recency.
rfm_df['recency'].describe()

In [None]:
# Bucket customers into three frequency bands and three recency bands.
# Intervals are right-closed: frequency (0, 3] / (3, 6] / (6, inf) and
# recency (-1, 1] / (1, 4] / (4, inf); a small recency value means a recent
# purchase, hence its 'High' label.
# The top edges are open-ended because pd.cut maps any value beyond the last
# edge to NaN, which would silently drop the most active (or longest
# dormant) customers from the segment table.
rfm_df['f_bin'] = pd.cut(rfm_df['frequency'],
                         [0, 3, 6, np.inf],
                         labels=['Low', 'Medium', 'High'])
rfm_df['r_bin'] = pd.cut(rfm_df['recency'],
                         [-1, 1, 4, np.inf],
                         labels=['High', 'Medium', 'Low'])

In [None]:
# Display the RFM table with the f_bin and r_bin labels.
rfm_df

In [None]:
# Share of all customers falling into each (frequency band, recency band)
# cell; normalize="all" divides every count by the grand total.
rfm_segments = pd.crosstab(index=rfm_df['f_bin'],
                           columns=rfm_df['r_bin'],
                           normalize="all")

In [None]:
# Order the frequency rows from High down to Low for display.
frequency_order = ['High', 'Medium', 'Low']
rfm_segments = rfm_segments.reindex(index=frequency_order)

In [None]:
# Heatmap of the segment shares, with recency columns ordered Low -> High.
recency_order = ['Low', 'Medium', 'High']
sn.heatmap(rfm_segments[recency_order], annot=True, fmt="0.3f", cmap="crest")
plt.xlabel("Recency")
plt.ylabel("Frequency");

In [None]:
# Density estimate of the raw frequency values.
sn.kdeplot(rfm_df.frequency);

In [None]:
# Box plot of frequency on a natural-log scale.
sn.boxplot(np.log(rfm_df.frequency));

In [None]:
# Density estimate of the raw monetary values.
sn.kdeplot(rfm_df.mvalue);

In [None]:
# Box plot of monetary value on a natural-log scale.
# NOTE(review): mvalue sums signed amounts (negative quantities exist in the
# data), so a non-positive total would become NaN/-inf here — confirm none exist.
sn.boxplot(np.log(rfm_df.mvalue));

In [None]:
# Natural log of frequency, stored as a clustering feature. Frequency is an
# invoice count for customers that appear in invoices_df, so it is at least 1.
rfm_df['log_frequency'] = np.log(rfm_df.frequency)

In [None]:
# Natural log of monetary value, stored as a clustering feature.
# mvalue is a sum of signed amounts, so a customer whose returns cancel or
# exceed their purchases has a zero or negative total. np.log turns those
# into -inf/NaN, which the scikit-learn scaler and K-Means steps do not
# accept, so only customers with a positive net spend are kept.
rfm_df = rfm_df[rfm_df.mvalue > 0].copy()
rfm_df['log_mvalue'] = np.log(rfm_df.mvalue)

In [None]:
# Summary statistics of the raw (untransformed) frequency.
rfm_df.frequency.describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the three clustering features so they share a common range
# before distance-based clustering.
clustering_features = ['recency', 'log_frequency', 'log_mvalue']
scaler = MinMaxScaler()
scaled_rfm_df = scaler.fit_transform(rfm_df[clustering_features])

In [None]:
# First ten rows of the scaled feature matrix.
scaled_rfm_df[0:10]

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow analysis: fit K-Means for k = 2..9 and record each model's inertia_.
cluster_range = range(2, 10)
cluster_errors = []

for num_clusters in cluster_range:
    # Fixed seed, matching the other K-Means fits in this notebook, so the
    # curve is reproducible from run to run.
    clusters = KMeans(n_clusters=num_clusters, random_state=42)
    clusters.fit(scaled_rfm_df)
    cluster_errors.append(clusters.inertia_)

plt.figure(figsize=(8, 4))
plt.plot(cluster_range, cluster_errors, marker="o")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia");

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# Silhouette plot for each candidate k, laid out on a 2x2 grid in reading
# order: k=5 and k=6 on the top row, k=7 and k=8 on the bottom row.
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
num_clusters = [5, 6, 7, 8]
for i, k in enumerate(num_clusters):
    km = KMeans(n_clusters=k,
                random_state=42)
    # i is zero-based, so divmod already yields the grid (row, column).
    # Using q - 1 would wrap to the last row for the first two plots and
    # swap the two rows of panels.
    q, mod = divmod(i, 2)
    visualizer = SilhouetteVisualizer(km,
                                      colors='yellowbrick',
                                      ax=ax[q][mod])
    visualizer.fit(scaled_rfm_df)

In [None]:
# Final model: five clusters with a fixed seed; each customer's cluster label
# is stored on the RFM table.
k = 5
clusters = KMeans(n_clusters=k, random_state=42).fit(scaled_rfm_df)
rfm_df["clusterid"] = clusters.labels_

In [None]:
# Clusters in the monetary-value / frequency plane; colour and marker shape
# both encode the cluster id.
plt.figure(figsize=(10, 6))
sn.scatterplot(x='mvalue', y='frequency',
               hue='clusterid', style='clusterid',
               data=rfm_df);

In [None]:
# Clusters in the monetary-value / recency plane; colour and marker shape
# both encode the cluster id.
plt.figure(figsize=(10, 6))
sn.scatterplot(x='mvalue', y='recency',
               hue='clusterid', style='clusterid',
               data=rfm_df);

In [None]:
# Mean monetary value per cluster.
sn.barplot(data = rfm_df,
           x = 'clusterid',
           y = 'mvalue',           
           estimator='mean');

In [None]:
# Frequency per cluster, aggregated with barplot's default estimator.
sn.barplot(data = rfm_df,
           x = 'clusterid',
           y = 'frequency');

In [None]:
# Recency per cluster, aggregated with barplot's default estimator.
sn.barplot(data = rfm_df,
           x = 'clusterid',
           y = 'recency');

In [None]:
# Display the RFM table with the assigned cluster ids.
rfm_df

In [None]:
# Cluster profile: average recency, frequency and monetary value per cluster.
profile_columns = ['recency', 'frequency', 'mvalue']
rfm_df.groupby("clusterid")[profile_columns].mean()

## Segmentation Interpretation:

- Cluster 0: New Customers

- Cluster 1: Loyal and Promising Customers

- Cluster 2: Star Customers

- Cluster 3: Dormant or Churned

- Cluster 4: On the Fence or Needs Attention