In [None]:
# Import libraries
import pandas as pd
import numpy as np

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" theme for every figure drawn below
sns.set(style="whitegrid")

import warnings
# NOTE(review): this silences ALL warnings for the rest of the notebook,
# including pandas/seaborn deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# **1. Data Cleaning and Preparation**

In [None]:
# Read the raw transactions export and report its dimensions
data = pd.read_csv('../data/online_retail.csv')
print('Rows, Columns count', data.shape)
# Preview the first rows
data.head()

**1. Data Inspection**

In [None]:
# Data summary information: column dtypes and non-null counts
data.info()

Missing values are present in `Description` and `CustomerID`; `InvoiceDate` and `CustomerID` have incorrect datatypes.

In [None]:
# Statistical summary of the numeric columns
data.describe()

Our numerical columns are highly skewed. They also appear to contain invalid values: negative values are present.

**2. Data Cleaning**

In [None]:
# Data cleaning
# Report missing values before any rows are removed
print('Missing values per column\n', data.isnull().sum())

# Change InvoiceDate to datetime format FIRST: errors='coerce' turns any
# unparseable date into NaT, so the parse must happen before dropna for
# those rows to be removed together with the originally-missing ones.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')

# Drop rows missing either critical column (each column listed once)
data = data.dropna(subset=['InvoiceDate', 'CustomerID'])

# Convert CustomerID to str; cast through int first so a float-typed ID
# does not keep a trailing '.0' in its string form
data['CustomerID'] = data['CustomerID'].astype(int).astype(str)

# Keep only rows with a positive quantity and a unit price above 0.01.
# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice of the previous frame.
data = data[(data['Quantity'] > 0) & (data['UnitPrice'] > 0.01)].copy()

# Create TransactionAmount column (line total, rounded to 2 decimals)
data['TransactionAmount'] = (data['UnitPrice'] * data['Quantity']).round(2)

# Inspection check
print('Missing values after cleaning:\n', data.isna().sum(), '\n')
print('Updated Rows, Columns:', data.shape)
data.head()

**Checking Duplicate Entries**

In [None]:
# Number of rows that exactly repeat an earlier row (all columns equal)
print('Duplicate Entries: ', data.duplicated().sum())  # 5192 Duplicates
# keep=False flags every member of a duplicate group, not just the repeats
duplicates = data.loc[data.duplicated(keep=False)]
duplicates.head()

In [None]:
# Group the duplicated rows on every column to get the size of each
# set of identical rows
duplicate_groups = (
    data.loc[data.duplicated(keep=False)]
        .groupby(data.columns.tolist())
        .size()
)
print(len(duplicate_groups))
duplicate_groups.head()

In [None]:
# Drop duplicates, keeping the first occurrence of each duplicated entry
# (keep='first' matches this stated intent and the later duplicate check)
data = data.drop_duplicates(keep='first')
# Should now report zero
print('Duplicated Entries: ', data.duplicated().sum())

In [None]:
# Summary statistics for the numeric columns after cleaning/deduplication
data.describe(include=['number'])

In [None]:
# Skewness of the two amount-like columns before outliers are capped
print(data[['Quantity', 'TransactionAmount']].skew())

**Handling Outliers**

In [None]:
def cap_outliers(series, lower_quantile=0.01, upper_quantile=0.99):
    """Clip a numeric Series to the range spanned by two of its own quantiles.

    Parameters
    ----------
    series : pandas.Series
        Values to cap; the input Series is not modified.
    lower_quantile : float, default 0.01
        Quantile whose value becomes the floor.
    upper_quantile : float, default 0.99
        Quantile whose value becomes the ceiling.

    Returns
    -------
    pandas.Series
        A new Series with values below the floor raised to it and values
        above the ceiling lowered to it.
    """
    return series.clip(
        lower=series.quantile(lower_quantile),
        upper=series.quantile(upper_quantile),
    )

# Columns whose extreme values are capped
columns_to_cap = ['Quantity', 'TransactionAmount']

# Clip each listed column to its own default quantile range (1st-99th)
data[columns_to_cap] = data[columns_to_cap].apply(cap_outliers)

# Check summary statistics to confirm
data.describe(include='number')

In [None]:
# Quick look at the first rows after capping
data.head(3)

In [None]:
# Skewness of the capped columns (no log-transform has been applied yet)
print(
    'Skewness before Log-Transform after removing Outliers: \n',
    data[['Quantity', 'TransactionAmount']].skew(),
    '\n',
)

In [None]:
# Capping can make previously distinct rows identical, so count again
print('Number of Duplicates:', data.duplicated().sum(), '\n')
# Keep the first row of every identical group
data = data.drop_duplicates(keep='first')
# Inspect duplicate entries (should return zero rows)
data.loc[data.duplicated(keep=False)]

In [None]:
# Final dtype / non-null check before the cleaned frame is saved
data.info()

In [None]:
# Save cleaned data (pickle keeps the datetime and string dtypes set above)
data.to_pickle('../data/cleaned_data.pkl')

## **Exploratory Data Analysis**

In [None]:
# Reload the cleaned transactions written by the cleaning section.
# NOTE(review): read_pickle executes arbitrary code from the file; only
# load pickles produced by this notebook.
df = pd.read_pickle('../data/cleaned_data.pkl')
df.head()

---

### **Calculate RFM Metrics**

In [None]:
# Calculate reference date: one day after the latest InvoiceDate in the data.
# Recency below is measured in whole days back from this date.
reference_date = df['InvoiceDate'].max() + pd.DateOffset(1)
reference_date

In [None]:
# Build one row per customer with the three RFM metrics:
#   Recency   - whole days between reference_date and the last invoice
#   Frequency - number of distinct invoices
#   Monetary  - total TransactionAmount
rfm = (
    df.groupby('CustomerID')
      .agg(
          Recency=pd.NamedAgg(
              column='InvoiceDate',
              aggfunc=lambda dates: (reference_date - dates.max()).days,
          ),
          Frequency=pd.NamedAgg(column='InvoiceNo', aggfunc='nunique'),
          Monetary=pd.NamedAgg(column='TransactionAmount', aggfunc='sum'),
      )
      .reset_index()
)

# RFM table preview
rfm.head()

In [None]:
# Check skewness of the raw RFM metrics (before any transform or scoring)
print(rfm[["Recency", "Frequency", "Monetary"]].skew())

In [None]:
# Add log1p-transformed copies of the three metrics, rounded to 2 decimals.
# log1p (log(1 + x)) is used so a value of 0 maps to 0 instead of -inf.
rfm[["Recency_log", "Frequency_log", "Monetary_log"]] = (
    np.log1p(rfm[["Recency", "Frequency", "Monetary"]]).round(2)
)
rfm.head()

In [None]:
# Skewness of the log-transformed metrics, for comparison with the raw skew
rfm[["Recency_log", "Frequency_log", "Monetary_log"]].skew()

### **RFM Scoring**

In [None]:
# Convert each metric into a 1-5 quintile score.
# NOTE: this overwrites the raw Recency/Frequency/Monetary columns with
# their scores, so this cell must not be re-run without rebuilding `rfm`.
ascending_scores = [1, 2, 3, 4, 5]

# Recency is reversed: the most recent buyers (smallest value) score 5
rfm['Recency'] = pd.qcut(rfm['Recency'], 5, labels=ascending_scores[::-1])
# rank(method='first') breaks ties so the quintile edges are always unique
rfm['Frequency'] = pd.qcut(rfm['Frequency'].rank(method='first'), 5, labels=ascending_scores)
rfm['Monetary'] = pd.qcut(rfm['Monetary'].rank(method='first'), 5, labels=ascending_scores)

# Combine scores into one (range 3-15)
rfm['RFM_score'] = rfm[['Recency', 'Frequency', 'Monetary']].astype(int).sum(axis=1)

rfm.head()

### **Value Segmentation**

In [None]:
# Split customers into three equal-frequency bins (terciles) of RFM_score,
# labelled from lowest to highest score
value_labels = ['Low-value', 'Mid-value', 'High-value']
rfm['Value_segment'] = pd.qcut(rfm['RFM_score'], q=3, labels=value_labels)
# Preview
rfm.head()

### **Behavioral Segmentation**

In [None]:
def behaviour_segment(row):
    """Map a customer's combined RFM score to a behavioural segment name.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'RFM_score' (e.g. a DataFrame row).

    Returns
    -------
    str
        'VIP' (>= 13), 'Loyal Customers' (>= 10), 'Potential Loyalists'
        (>= 7), 'At Risk' (>= 5), otherwise 'Hibernating'.
    """
    score = row['RFM_score']
    # Ordered from the highest floor down; the first floor reached wins
    tiers = (
        (13, 'VIP'),
        (10, 'Loyal Customers'),
        (7, 'Potential Loyalists'),
        (5, 'At Risk'),
    )
    for floor, label in tiers:
        if score >= floor:
            return label
    return 'Hibernating'
    
# Apply the segmentation row by row (axis=1) and show how many customers
# land in each segment
rfm['Customer_segment'] = rfm.apply(behaviour_segment, axis=1)
print(rfm['Customer_segment'].value_counts())

rfm.head()

In [None]:
# Attach each customer's RFM columns to every one of their transactions.
# A left join keeps the transaction frame's row count; the customer-level
# values are repeated on each of that customer's rows.
df_combined = pd.merge(df, rfm, how='left', on='CustomerID')
print(df_combined.shape)
df_combined.head(3)

In [None]:
# Save combined DataFrame (transactions + per-customer RFM columns)
df_combined.to_pickle('../data/combined_data.pkl')

## **Overview of Segments (RFM Analysis)**

In [None]:
# Load combined DataFrame (one row per transaction, RFM columns repeated
# for each customer) and list its columns
rfm_df = pd.read_pickle('../data/combined_data.pkl')
rfm_df.columns

In [None]:
# Recency/Frequency/Monetary hold the 1-5 quintile scores produced by
# pd.qcut in the scoring step (categorical); cast them to plain integers
rfm_df[['Recency', 'Frequency', 'Monetary']] = rfm_df[['Recency', 'Frequency', 'Monetary']].astype(int)

How are the segments distributed?

In [None]:
# Value segments
# rfm_df has one row per transaction, so counting its rows would weight each
# customer by how many purchases they made. Collapse to one row per customer
# so the chart shows how many customers fall in each segment.
customer_level = rfm_df.drop_duplicates(subset='CustomerID')
value_counts = customer_level['Value_segment'].value_counts()
print(value_counts)

# Plot distribution (bars ordered from the largest segment to the smallest)
sns.countplot(data=customer_level, x='Value_segment', order=value_counts.index, palette="inferno")
plt.title('Distribution of Value Segments')
plt.xlabel('Value Segment')
plt.ylabel('Number of Customers')
plt.xticks(rotation=45)
plt.show()

**Investigating compositions of Value segments by customer segments**.

Group by Value_segment and calculate the percentage of Customer_segment within each value segment

In [None]:
# Work from the frame loaded in this section instead of the `rfm` variable
# left over from the earlier section. Composition is a per-customer
# question, so reduce the transaction-level frame to one row per customer.
customer_level = rfm_df.drop_duplicates(subset='CustomerID')

# Row-normalised crosstab: each value segment's rows sum to 100 %
composition = pd.crosstab(
    customer_level['Value_segment'],
    customer_level['Customer_segment'],
    normalize='index'
).mul(100).reset_index()

# Stacked bars: one bar per value segment, split by customer segment
pivot_data = composition.set_index('Value_segment')
pivot_data.plot(kind='bar', stacked=True, colormap='inferno', figsize=(10, 6))
plt.title('Composition of Value Segments by Customer Segments')
plt.xlabel('Value Segment')
plt.ylabel('Percentage')
plt.legend(title='Customer Segment', bbox_to_anchor=(1.05, 1), loc='upper left')
# Rotate before tight_layout so the layout accounts for the rotated labels
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

**Analyze Revenue contribution by value segment**

*Objective: Understand financial value of each segment*

In [None]:
# Total revenue (sum of TransactionAmount) per value segment
revenue_by_value_seg = rfm_df.groupby('Value_segment')['TransactionAmount'].sum()
print(revenue_by_value_seg)

# Pie chart of each segment's share of total revenue
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    revenue_by_value_seg,
    labels=revenue_by_value_seg.index,   # segment names on the wedges
    autopct='%1.1f%%',                   # percentage inside each wedge
    startangle=320,                      # rotation of the first wedge
    textprops={'fontsize': 14},
)
ax.legend(
    wedges,
    revenue_by_value_seg.index,
    title="Value Segments",
    loc="center left",
    bbox_to_anchor=(1, 0.5),             # place the legend right of the pie
)
ax.set_title('Revenue Contribution by Value Segment', fontsize=14)
fig.tight_layout()
plt.show()

**Customer Engagement Analysis within Value Segments**

In [None]:
# Frequency_log is a per-customer value repeated on every transaction row.
# Average it over one row per customer so customers with many purchases are
# not counted many times in their segment's mean.
engagement_by_value_seg = (
    rfm_df.drop_duplicates(subset='CustomerID')
          .groupby('Value_segment')['Frequency_log']
          .mean()
)
engagement_by_value_seg

In [None]:
# Turn the per-segment Series into a two-column frame with display names
engagement_df = engagement_by_value_seg.reset_index()
engagement_df.columns = ['Value Segment', 'Average Engagement']

# One-column heatmap: a row per value segment, annotated with the mean
heatmap_values = engagement_df.pivot_table(index='Value Segment', values='Average Engagement')
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(
    heatmap_values,
    annot=True,
    fmt=".2f",
    cmap='Blues',
    cbar_kws={'label': 'Average Engagement'},
    ax=ax,
)
ax.set_title('Customer Engagement Heatmap by Value Segment', fontsize=14)
fig.tight_layout()
plt.show()

**Trend Analysis of Value Segment Behavior**

*Objective: Understand how the purchasing behavior of each value segment changes over time.*

In [None]:
# Make sure InvoiceDate is a datetime column (a no-op if it already is)
rfm_df['InvoiceDate'] = pd.to_datetime(rfm_df['InvoiceDate'])

# Monthly revenue per value segment. The datetime index is set on a
# temporary copy instead of in place, so rfm_df keeps its InvoiceDate column
# and this cell can be re-run without a KeyError.
# NOTE(review): newer pandas releases deprecate the 'M' alias in favour of
# 'ME' - confirm against the pandas version in use.
trend_by_segment = (
    rfm_df.set_index('InvoiceDate')
          .groupby(['Value_segment'])
          .resample('M')['TransactionAmount']
          .sum()
          .unstack()
)
# trend_by_segment

In [None]:
# Reshape the wide (segment x month) table into long form for seaborn
trend_df = trend_by_segment.stack().reset_index()
trend_df.columns = ['Value Segment', 'Month', 'Transaction Amount']

# One line per value segment, one marker per month
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=trend_df,
    x='Month',
    y='Transaction Amount',
    hue='Value Segment',
    marker='o',
    ax=ax,
)

# Titles and axis labels
ax.set_title('Monthly Revenue Trend by Value Segment', fontsize=14)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Revenue (Transaction Amount)', fontsize=12)

# Tilt the date labels so they do not overlap
plt.xticks(rotation=45)
fig.tight_layout()

plt.show()

**Churn Risk and Retention Analysis**

*Objective: Evaluate the churn risk for each value segment*

In [None]:
# Longer recency = higher churn risk.
# Recency_log is a per-customer value repeated on every transaction row.
# Average it over one row per customer so frequent buyers do not dominate
# their segment's mean.
churn_by_value_segment = (
    rfm_df.drop_duplicates(subset='CustomerID')
          .groupby('Value_segment')['Recency_log']
          .mean()
)
churn_by_value_segment

In [None]:
# Dot plot: one coloured dot per value segment at its mean Recency_log
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(
    churn_by_value_segment.index,
    churn_by_value_segment,
    color=['red', 'orange', 'green'],
    s=100,
)

# Axis labels and title
ax.set_ylabel('Average Recency (Log Scale)', fontsize=12)
ax.set_xlabel('Value Segment', fontsize=12)
ax.set_title('Churn Risk by Value Segment', fontsize=14)

# Print each mean just above its dot (categories sit at x = 0, 1, 2, ...)
for position, mean_recency in enumerate(churn_by_value_segment):
    ax.text(position, mean_recency + 0.1, f'{mean_recency:.2f}', ha='center', fontsize=10)

fig.tight_layout()
plt.show()

**Cross-Sell and Upsell Potential by Value Segment**

*Objective*: Identify potential for cross-selling and upselling within value segments.

*Approach*:
For each value segment, examine which products are frequently purchased together or which higher-value products can be suggested

In [None]:
# Revenue per (value segment, product), pivoted so each row is a value
# segment and each column a StockCode
cross_sell_potential = rfm_df.groupby(['Value_segment', 'StockCode'])['TransactionAmount'].sum().unstack()
cross_sell_potential

In [None]:
# Top 10 products by revenue within each value segment.
# Each segment gets its own panel: drawing every segment on one categorical
# axis puts bars for a shared StockCode on top of each other and hides the
# earlier values.
segments = cross_sell_potential.index
fig, axes = plt.subplots(
    1, len(segments),
    figsize=(6 * len(segments), 6),
    squeeze=False,  # always a 2-D array, even with a single segment
)

for ax, segment in zip(axes[0], segments):
    # Ascending order so the largest bar ends up at the top of the panel
    top_products = cross_sell_potential.loc[segment].nlargest(10).sort_values()
    # Cast codes to str so every label is treated as a category
    ax.barh(top_products.index.astype(str), top_products.values)
    ax.set_title(str(segment), fontsize=12)
    ax.set_xlabel('Transaction Amount', fontsize=12)

# Shared labels and title
axes[0, 0].set_ylabel('Product (StockCode)', fontsize=12)
fig.suptitle('Top 10 Products by Value Segment', fontsize=14)
fig.tight_layout()

# Show the plot
plt.show()

**Geographic and Demographic Insights by Value Segment**

*Objective:* 

Examine how location (Country) and demographic factors (e.g., age, gender, etc.) influence the purchasing behavior of each value segment.

*Approach:*

Look at the geographical distribution of each segment.
For example, VIP customers may be more concentrated in certain regions or countries.

In [None]:
# Revenue per (country, value segment), pivoted so each row is a country
# and each column a value segment
geographic_by_segment = rfm_df.groupby(['Country', 'Value_segment'])['TransactionAmount'].sum().unstack()
geographic_by_segment.head()

In [None]:
# The ten countries with the highest total revenue across all segments
top_countries = geographic_by_segment.sum(axis=1).nlargest(10).index
stacked_data = geographic_by_segment.loc[top_countries]

# Horizontal stacked bars: one bar per country, split by value segment
ax = stacked_data.plot(kind='barh', stacked=True, figsize=(12, 8), colormap='viridis')

# Labels, title and a legend placed outside the plotting area
ax.set_xlabel('Total Revenue', fontsize=12)
ax.set_ylabel('Country', fontsize=12)
ax.set_title('Top 10 Countries by Revenue and Value Segment', fontsize=14)
ax.legend(title='Value Segment', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.figure.tight_layout()

plt.show()

9. **Marketing Effectiveness by Segment**

*Objective:* 

Evaluate how effective different marketing campaigns are for each segment.

*Approach*:

Compare conversion rates, response to promotions, and discount redemption rates across value segments.
Flow to Next: Allows the retailer to refine their marketing strategies for each value segment based on the campaign's performance.

**Behavioral Change and Predictive Modeling**

*Objective:* 

Predict how customers will move between value segments or within the same segment over time.
Use predictive models to understand which customers are likely to move between segments (e.g., from Potential Loyalists to Loyal Customers). For example:

- Predict if a Potential Loyalist is likely to become a Loyal Customer.
- Predict if an At Risk customer will churn or make a repeat purchase.

*Approach*:

Use machine learning models like logistic regression or decision trees to predict segment transitions based on recency, frequency, and monetary data. 

We'll start by setting the churn threshold.

In [None]:
# Define churn threshold
# The threshold is the 75th percentile of Recency_log across CUSTOMERS:
# customers whose Recency_log exceeds it are considered at risk of churn.
# rfm_df has one row per transaction, so taking the quantile over all rows
# would weight each customer by their number of purchases; collapse to one
# row per customer first.
customer_recency = rfm_df.drop_duplicates(subset='CustomerID')['Recency_log']
churn_threshold = customer_recency.quantile(0.75)

In [None]:
# Disabled draft of a multi-class transition label, kept for reference.
# Segment names below are spelled as the notebook actually produces them
# ('Potential Loyalists', 'High-value', 'Low-value'); other spellings would
# never match.
# # Multi-class transitions
# def label_transition(row):
#     if row['Customer_segment'] == 'At Risk' and row['Recency_log'] > churn_threshold:
#         return 'Churned'
#     elif row['Customer_segment'] == 'Potential Loyalists' and row['Value_segment'] == 'High-value':
#         return 'Upgraded'
#     elif row['Customer_segment'] == 'VIP' and row['Value_segment'] == 'Low-value':
#         return 'Downgraded'
#     else:
#         return 'No Change'

# rfm['Transition'] = rfm.apply(label_transition, axis=1)


# Binary classification: flag rows whose Recency_log is above the churn threshold
rfm_df['Churned'] = (rfm_df['Recency_log'] > churn_threshold).astype(int) # 1 is at risk of churning, 0 is active
# Validate column (counts are per transaction row of rfm_df)
rfm_df['Churned'].value_counts()

10. **Final Recommendations for Action**

*Objective:* 

Provide actionable insights based on the combined analysis.

*Approach:*

Use the findings to create a segment-based marketing strategy, targeting each value segment with appropriate strategies like loyalty rewards, re-engagement campaigns, or personalized offers.
Flow to Next: This can result in ongoing tracking of performance, where you'll reanalyze your value segments periodically to refine strategies.