In [None]:
import pandas as pd

# Load the Online Retail workbook and clean it in one method chain.
# Every step returns a new frame, so the derived columns are never written onto
# a filtered slice (the original pattern raised SettingWithCopyWarning on
# pandas < 3) and the cell gives the same `df` each time it is re-run.
df = (
    pd.read_excel('../data/Online Retail.xlsx')
    # Rows without a CustomerID cannot be attributed to any customer.
    .dropna(subset=['CustomerID'])
    # Drop invoices whose number starts with 'C' (presumably cancellations).
    .loc[lambda d: ~d['InvoiceNo'].astype(str).str.startswith('C')]
    .assign(
        # Line revenue: units sold times price per unit.
        TotalPrice=lambda d: d['Quantity'] * d['UnitPrice'],
        # Ensure a datetime dtype for the date arithmetic done in later cells.
        InvoiceDate=lambda d: pd.to_datetime(d['InvoiceDate']),
    )
)

df.head()

In [None]:
from datetime import timedelta

# Reference point for recency: one day after the latest invoice timestamp.
latest_invoice = df['InvoiceDate'].max()
snapshot_date = latest_invoice + timedelta(days=1)

# Build one row per customer:
#   Recency   - whole days between snapshot_date and the customer's last invoice
#   Frequency - number of distinct invoice numbers
#   Monetary  - sum of TotalPrice
per_customer = df.groupby('CustomerID')
rfm = per_customer.agg(
    Recency=('InvoiceDate', lambda dates: (snapshot_date - dates.max()).days),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('TotalPrice', 'sum'),
).reset_index()

rfm.head()


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardise the three RFM columns (zero mean, unit variance) before
# clustering, so that no single feature dominates purely because of its scale.
rfm_features = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
X = scaler.fit_transform(rfm[rfm_features])

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about it in 1.2-1.3), so leaving it unset
# makes the segment assignment depend on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Segment is the integer cluster label (0-3) assigned to each customer row.
rfm['Segment'] = kmeans.fit_predict(X)
rfm.head()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter every customer by Recency (x) against Monetary (y), coloured by the
# Segment label, and title the axes that seaborn drew on.
ax = sns.scatterplot(
    x='Recency',
    y='Monetary',
    hue='Segment',
    palette='Set2',
    data=rfm,
)
ax.set_title('Customer Segments by Recency and Monetary Value')
plt.show()


## 📊 Data Exploration
Let's understand the structure and key characteristics of the UCI Online Retail dataset.


In [None]:
# Exploratory summary of `df` as it stands when this cell runs.

# Number of rows and columns
print("📦 Shape of dataset:", df.shape)

# Display a random sample of up to 5 rows.
# Seeded so a re-run shows the same rows; capped at len(df) so a very small
# frame does not make sample() raise.
print("\n🔍 Sample rows:")
display(df.sample(min(5, len(df)), random_state=42))

# Number of unique customers
print("\n👥 Unique customers:", df['CustomerID'].nunique())

# Top 10 countries by order count.
# An order is one distinct InvoiceNo. value_counts() on Country counts rows,
# which overstates countries whose invoices span many rows, so count distinct
# invoice numbers per country instead.
print("\n🌍 Top 10 countries by order count:")
display(df.groupby('Country')['InvoiceNo'].nunique().nlargest(10))

# Quantity stats
print("\n📦 Quantity distribution:")
display(df['Quantity'].describe())

# Negative quantities (possibly returns).
# NOTE(review): if invoices starting with 'C' were already removed from `df`
# upstream, this may show few or no rows - confirm against the raw data.
print("\n❗ Negative quantities (returns or cancellations):")
display(df[df['Quantity'] < 0].head())

# Price stats
print("\n💷 Unit price distribution:")
display(df['UnitPrice'].describe())
