# RFM Analysis - Data Transformations

Overview
• Recency measures the number of days since the last purchase, so lower values indicate more recent activity.
• Frequency is the total number of purchases; higher values signal more engaged customers.
• Monetary sums all spending; higher values suggest higher-value customers.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt

# Load the raw Online Retail export; the file is decoded as ISO-8859-1.
raw_csv_path = r'C:\Users\mgaig\OneDrive\Data - portfolio\Jupyter Lab\RFM Analysis\Online Retail.csv'
df = pd.read_csv(raw_csv_path, encoding='ISO-8859-1')

# Trailing expression so the notebook renders the first five rows.
df.head()

# Clean and aggregate the data

In [None]:
# Keep only rows that carry a CustomerID; the aggregation below groups on it.
df.dropna(subset=['CustomerID'], inplace=True)

# Parse invoice timestamps so max() and date arithmetic work on real datetimes.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Amount of each row: Quantity times UnitPrice.
df['Amount'] = df['Quantity'] * df['UnitPrice']

# One row per customer:
#   LastPurchaseDate - latest InvoiceDate (input to Recency)
#   Frequency        - number of distinct invoices
#   Monetary         - total spend. This is a SUM: the overview defines
#                      Monetary as the sum of all spending, and a mean would
#                      measure the typical row amount instead.
rfm_df = df.groupby('CustomerID').agg(
    CustomerID=('CustomerID', 'first'),
    LastPurchaseDate=('InvoiceDate', 'max'),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('Amount', 'sum'),
).reset_index(drop=True)

print(rfm_df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Column names and dtypes. info() writes its report to stdout itself and
# returns None, so it is not wrapped in print() (that would add a stray "None").
rfm_df.info()

# Create RFM Data Model

In [None]:
# Calculate Recency: whole days between each customer's last purchase and a
# reference date one day after the latest purchase in the data, so the most
# recent customer gets Recency 1 rather than 0.
reference_date = rfm_df['LastPurchaseDate'].max() + pd.to_timedelta('1D')
rfm_df['Recency'] = (reference_date - rfm_df['LastPurchaseDate']).dt.days

# Quartile cut points (25%, 50%, 75%) for each metric.
quantiles = rfm_df[['Recency', 'Frequency', 'Monetary']].quantile([0.25, 0.5, 0.75])


def _quartile_bins(column):
    """Return the bin edges that split *column* at its quartiles.

    The outer edges are -inf and +inf so every value lands in a bin. A fixed
    lower edge of -1 would leave values below -1 unscored (NaN); Monetary can
    go below that if the data contains negative amounts such as
    returns/credits (NOTE(review): confirm against the dataset).

    NOTE: if two quartiles of a column coincide, pd.cut raises
    "Bin edges must be unique"; that is not handled here.
    """
    cuts = quantiles[column]
    return [float('-inf'), cuts[0.25], cuts[0.5], cuts[0.75], float('inf')]


# Assign RecencyScore: labels are reversed because a LOWER Recency (more
# recent purchase) should earn the HIGHER score.
rfm_df['RecencyScore'] = pd.cut(
    rfm_df['Recency'],
    bins=_quartile_bins('Recency'),
    labels=[4, 3, 2, 1],
    include_lowest=True,
)

# Assign FrequencyScore and MonetaryScore: higher value -> higher score.
rfm_df['FrequencyScore'] = pd.cut(
    rfm_df['Frequency'],
    bins=_quartile_bins('Frequency'),
    labels=[1, 2, 3, 4],
    include_lowest=True,
)
rfm_df['MonetaryScore'] = pd.cut(
    rfm_df['Monetary'],
    bins=_quartile_bins('Monetary'),
    labels=[1, 2, 3, 4],
    include_lowest=True,
)

# Concatenate the three scores into a three-character code such as "431".
rfm_df['RFM_Score'] = (
    rfm_df['RecencyScore'].astype(str)
    + rfm_df['FrequencyScore'].astype(str)
    + rfm_df['MonetaryScore'].astype(str)
)

# Display the first 5 rows
print(rfm_df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Column names and dtypes. info() prints its own report and returns None, so
# it is not wrapped in print() (that would add a stray "None").
rfm_df.info()

# Segment customers using RFM values

In [None]:
def rfm_level(df):
    """Map one customer's R/F/M scores to a segment label.

    Meant for row-wise use, e.g. ``frame.apply(rfm_level, axis=1)``. The rules
    are checked top to bottom and the first match wins, so their order is
    part of the definition.

    Args:
        df: A single row (despite the name): any object indexable by
            ``'RecencyScore'``, ``'FrequencyScore'`` and ``'MonetaryScore'``
            whose values compare against integers.

    Returns:
        str: One of 'Champions', 'Loyal Customers', 'Potential Loyalist',
        'Promising', 'Recent Customers', 'At Risk Big Spenders',
        'At Risk Moderate Spenders', 'Needs Attention', 'Hibernating' or
        'About to Sleep'.
    """
    r = df['RecencyScore']
    f = df['FrequencyScore']
    m = df['MonetaryScore']

    # Champions: top score on all three dimensions.
    if r == 4 and f == 4 and m == 4:
        return 'Champions'

    # Loyal Customers: top frequency with above-median spend, any recency.
    if f == 4 and m >= 3:
        return 'Loyal Customers'

    # Potential Loyalist: most recent quartile and more than minimal frequency.
    if r == 4 and f >= 2 and m >= 1:
        return 'Potential Loyalist'

    # Promising: fairly recent and strong on frequency or on spend.
    if (r >= 3 and f >= 3) or (r >= 3 and m >= 3):
        return 'Promising'

    # Recent Customers: fairly recent, nothing else notable yet.
    if r >= 3 and f >= 1 and m >= 1:
        return 'Recent Customers'

    # At Risk Big Spenders: not recent, top spend.
    if r <= 2 and f >= 1 and m == 4:
        return 'At Risk Big Spenders'

    # At Risk Moderate Spenders: not recent, above-median spend.
    if r <= 2 and f >= 1 and m >= 3:
        return 'At Risk Moderate Spenders'

    # Needs Attention: recency score of at least 2 and repeat purchases.
    if r >= 2 and f >= 2 and m >= 1:
        return 'Needs Attention'

    # Hibernating: least recent quartile with low frequency or low spend.
    if (r == 1 and f <= 2) or (r == 1 and m <= 2):
        return 'Hibernating'

    # Fall-through. With integer scores 1-4 this is reached only for
    # r == 2, f == 1, m <= 2.
    return 'About to Sleep'

  



# Apply the function row by row to assign each customer an RFM level
rfm_df['RFM_Level'] = rfm_df.apply(rfm_level, axis=1)

# Display the first 5 rows
print(rfm_df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Column names and dtypes. info() prints its own report and returns None, so
# it is not wrapped in print() (that would add a stray "None").
rfm_df.info()


In [None]:
# Tally customers per segment as a two-column table: RFM_Level, count.
rfm_level_counts = (
    rfm_df['RFM_Level']
    .value_counts()
    .rename_axis('RFM_Level')
    .reset_index(name='count')
)

# Show the per-segment table.
segment_table = rfm_level_counts.to_markdown(index=False, numalign="left", stralign="left")
print(segment_table)

# The per-segment counts add up to the total number of customers.
total_counts = rfm_level_counts['count'].sum()
print(f"Total number of customers: {total_counts}")

In [None]:
import altair as alt

# Bar chart with one bar per `RFM_Level`; bar height is the number of rows
# (customers) in that level.
level_axis = alt.X('RFM_Level')
customer_axis = alt.Y('count()', title='Number of Customers')
level_color = alt.Color('RFM_Level')
hover_fields = ['RFM_Level', alt.Tooltip('count()', title='Number of Customers')]

bars = alt.Chart(rfm_df).mark_bar().encode(
    x=level_axis,
    y=customer_axis,
    color=level_color,
    tooltip=hover_fields,
)
chart = bars.properties(title='Number of Customers per RFM Level').interactive()

# Write the chart specification to disk.
chart.save('rfm_level_bar_chart.json')

# Render the chart in the notebook.
chart.display()

In [None]:
# Filter the customers that matched none of the named segment rules.
# rfm_level() labels that fall-through group 'About to Sleep' and never
# returns 'Lost', so filtering on 'Lost' always produced an empty table.
# NOTE(review): confirm 'About to Sleep' is the segment intended here.
lost_customers = rfm_df[rfm_df['RFM_Level'] == 'About to Sleep']

# Group by RFM_Score and count the number of these customers in each group
lost_rfm_grouped = lost_customers.groupby('RFM_Score').size().reset_index(name='count')

# Display the result
print(lost_rfm_grouped.to_markdown(index=False, numalign="left", stralign="left"))

In [38]:
# Write the per-customer RFM table to the local output folder, without the
# DataFrame index.
output_path = r'C:\Users\mgaig\OneDrive\Data - portfolio\Jupyter Lab\RFM Analysis\output\RFM_Analysis.csv'
rfm_df.to_csv(path_or_buf=output_path, index=False)


from azure.storage.blob import BlobServiceClient
import pandas as pd
import io
import numpy as np 
from dotenv import load_dotenv
import os


# Load variables from a .env file (if one is found) into the process
# environment. Without this call the os.getenv lookups below only see
# variables that were already exported, and the credentials come back as None.
load_dotenv()

storage_account_name = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
storage_account_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")
container_name = "dataillustratedcontainer"
blob_name = "RFM_analysis.csv"

# Fail early with a clear message instead of building a client for
# "https://None.blob.core.windows.net".
if not storage_account_name or not storage_account_key:
    raise RuntimeError(
        "AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY must be set "
        "(in the environment or a .env file) before uploading."
    )

blob_service_client = BlobServiceClient(
    account_url=f"https://{storage_account_name}.blob.core.windows.net",
    credential=storage_account_key
)

# Serialize the RFM table to CSV text in memory (no index column).
csv_data = rfm_df.to_csv(index=False)

blob_client_transformed = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

# overwrite=True replaces any existing blob with the same name.
blob_client_transformed.upload_blob(csv_data, overwrite=True)

# The blob name is interpolated from a variable: nesting a double-quoted
# literal inside a double-quoted f-string is a SyntaxError before Python 3.12.
print(f"DataFrame successfully uploaded to {container_name}/{blob_name}")

In [None]:
# Attach each customer's RFM level to every row of the original dataframe.
# A left join keeps all rows of `df`; the lookup holds only the join key and
# the label.
segment_by_customer = rfm_df[['CustomerID', 'RFM_Level']]
df = pd.merge(df, segment_by_customer, on='CustomerID', how='left')

# Show the first few rows to verify the merge.
merged_preview = df.head().to_markdown(index=False, numalign="left", stralign="left")
print(merged_preview)

# Export the enriched data next to the RFM summary, without the index.
output_path = r'C:\Users\mgaig\OneDrive\Data - portfolio\Jupyter Lab\RFM Analysis\output\RFM_Analysis_all_sales.csv'
df.to_csv(path_or_buf=output_path, index=False)