# 01 - Comprehensive Exploratory Data Analysis (EDA)

This notebook provides a comprehensive EDA of the retail transaction dataset to understand market behavior patterns.

## Objectives
- **Summary Statistics**: Descriptive statistics for all key variables
- **Distribution Analysis**: Histograms, KDE plots, and boxplots
- **Correlation Analysis**: Heatmaps showing relationships between variables
- **Missing Data Analysis**: Visual matrix of missing values
- **Outlier Detection**: IQR and Z-score methods
- **Time-Trend Analysis**: Temporal patterns and market volatility
- **Customer Analysis**: RFM-style customer segmentation
- **Product Analysis**: Category and product-level insights
- **Deep Insights**: Market volatility, seasonal patterns, geographical demand

## Required Components
1. Summary statistics
2. Distribution plots (Histogram, KDE, Boxplot)
3. Correlation heatmap
4. Missing data matrix
5. Outlier detection (IQR, Z-score)
6. Time-trend plots
7. Customer-level summaries (RFM-style)
8. Product/Category-level insights
9. Deep insights (volatility, seasonal, geographical)


In [1]:
# # Mount Google Drive (if using Drive to store data)
# from google.colab import drive
# drive.mount('/content/drive')

# # Alternatively, upload files directly in Colab
# # Go to Files -> Upload to upload the dataset


In [3]:
# Load and prepare dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
from scipy import stats
from scipy.stats import zscore

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Set the project root directory
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
data_path = os.path.join(project_root, 'data', 'raw', 'Online Retail.csv')

# Load raw data
df_raw = pd.read_csv(data_path, encoding='latin-1')
print("=" * 80)
print("COMPREHENSIVE EXPLORATORY DATA ANALYSIS")
print("=" * 80)
print(f"\nRaw Dataset Shape: {df_raw.shape}")
print(f"Columns: {df_raw.columns.tolist()}")
print(f"\nData Types:\n{df_raw.dtypes}")

# Basic cleaning for EDA (preserve raw for comparison)
df = df_raw.copy()

# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')

# Remove canceled orders
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]

# Remove invalid transactions
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
df = df[df['Description'].notna()]

# Create derived features
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')
df = df[df['InvoiceDate'].notna()]

# Temporal features
df['Year'] = df['InvoiceDate'].dt.year
df['Month'] = df['InvoiceDate'].dt.month
df['DayOfWeek'] = df['InvoiceDate'].dt.dayofweek
df['Hour'] = df['InvoiceDate'].dt.hour
df['MonthYear'] = df['InvoiceDate'].dt.to_period('M').astype(str)

print(f"\nCleaned Dataset Shape: {df.shape}")
print(f"Rows removed: {len(df_raw) - len(df):,} ({(len(df_raw) - len(df))/len(df_raw)*100:.2f}%)")
print("\n" + "=" * 80)


NameError: name 'os' is not defined

In [None]:
# 1.1 Summary Statistics for Numeric Variables
print("=" * 80)
print("1. SUMMARY STATISTICS")
print("=" * 80)

numeric_cols = ['Quantity', 'UnitPrice', 'TotalPrice']
print("\n1.1 Numeric Variables Summary:")
print(df[numeric_cols].describe().T)

# Additional statistics
print("\n1.2 Additional Statistics:")
stats_df = pd.DataFrame({
    'Variable': numeric_cols,
    'Mean': [df[col].mean() for col in numeric_cols],
    'Median': [df[col].median() for col in numeric_cols],
    'Std Dev': [df[col].std() for col in numeric_cols],
    'Skewness': [df[col].skew() for col in numeric_cols],
    'Kurtosis': [df[col].kurtosis() for col in numeric_cols],
    'Min': [df[col].min() for col in numeric_cols],
    'Max': [df[col].max() for col in numeric_cols],
    'Q1': [df[col].quantile(0.25) for col in numeric_cols],
    'Q3': [df[col].quantile(0.75) for col in numeric_cols],
    'IQR': [df[col].quantile(0.75) - df[col].quantile(0.25) for col in numeric_cols]
})
print(stats_df.to_string(index=False))

# Categorical summary
print("\n1.3 Categorical Variables Summary:")
print(f"\nUnique Countries: {df['Country'].nunique()}")
print(f"Top 10 Countries by Transaction Count:")
print(df['Country'].value_counts().head(10))

print(f"\nUnique Products: {df['StockCode'].nunique()}")
print(f"Unique Customers: {df['CustomerID'].nunique():.0f}")
print(f"Unique Invoices: {df['InvoiceNo'].nunique()}")

print(f"\nDate Range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")
print(f"Total Revenue: £{df['TotalPrice'].sum():,.2f}")
print(f"Average Transaction Value: £{df['TotalPrice'].mean():.2f}")
print(f"Median Transaction Value: £{df['TotalPrice'].median():.2f}")


## 2. Distribution Analysis

Histograms, KDE plots, and boxplots to understand the distribution of key variables.


In [None]:
# 2. Distribution Plots: Histogram, KDE, and Boxplot
print("=" * 80)
print("2. DISTRIBUTION ANALYSIS")
print("=" * 80)

fig, axes = plt.subplots(3, 3, figsize=(18, 15))
fig.suptitle('Distribution Analysis: Histograms, KDE, and Boxplots', fontsize=16, y=1.02)

variables = ['Quantity', 'UnitPrice', 'TotalPrice']
plot_types = ['Histogram', 'KDE', 'Boxplot']

for idx, var in enumerate(variables):
    # Histogram
    axes[0, idx].hist(df[var], bins=50, edgecolor='black', alpha=0.7, color='skyblue')
    axes[0, idx].set_title(f'{var} - Histogram', fontweight='bold')
    axes[0, idx].set_xlabel(var)
    axes[0, idx].set_ylabel('Frequency')
    axes[0, idx].grid(True, alpha=0.3)
    
    # KDE Plot
    df[var].plot(kind='kde', ax=axes[1, idx], color='darkblue', linewidth=2)
    axes[1, idx].set_title(f'{var} - Kernel Density Estimation', fontweight='bold')
    axes[1, idx].set_xlabel(var)
    axes[1, idx].set_ylabel('Density')
    axes[1, idx].grid(True, alpha=0.3)
    
    # Boxplot
    bp = axes[2, idx].boxplot(df[var], vert=True, patch_artist=True)
    bp['boxes'][0].set_facecolor('lightcoral')
    axes[2, idx].set_title(f'{var} - Boxplot', fontweight='bold')
    axes[2, idx].set_ylabel(var)
    axes[2, idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Log-transformed distributions for better visualization of skewed data
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Log-Transformed Distributions (for Skewed Variables)', fontsize=16)

for idx, var in enumerate(variables):
    log_data = np.log1p(df[var][df[var] > 0])  # log1p to handle zeros
    axes[idx].hist(log_data, bins=50, edgecolor='black', alpha=0.7, color='lightgreen')
    axes[idx].set_title(f'Log({var}) Distribution', fontweight='bold')
    axes[idx].set_xlabel(f'Log({var})')
    axes[idx].set_ylabel('Frequency')
    axes[idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 3. Correlation Analysis

Heatmap showing relationships between numeric variables.


In [None]:
# 3. Correlation Heatmap
print("=" * 80)
print("3. CORRELATION ANALYSIS")
print("=" * 80)

# Calculate correlation matrix
corr_vars = ['Quantity', 'UnitPrice', 'TotalPrice', 'Year', 'Month', 'DayOfWeek', 'Hour']
corr_matrix = df[corr_vars].corr()

# Create heatmap
fig, ax = plt.subplots(figsize=(10, 8))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # Mask upper triangle
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.3f', cmap='coolwarm', 
            center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Correlation Heatmap of Key Variables', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Print correlation values
print("\nCorrelation Matrix:")
print(corr_matrix.round(3))

# Identify strong correlations
print("\nStrong Correlations (|r| > 0.5):")
strong_corr = []
for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):
        corr_val = corr_matrix.iloc[i, j]
        if abs(corr_val) > 0.5:
            strong_corr.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_val))
            print(f"  {corr_matrix.columns[i]} ↔ {corr_matrix.columns[j]}: {corr_val:.3f}")


## 4. Missing Data Analysis

Visual matrix showing missing data patterns across the dataset.


In [None]:
# 4. Missing Data Matrix
print("=" * 80)
print("4. MISSING DATA ANALYSIS")
print("=" * 80)

# Calculate missing values
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing Count': df.isnull().sum(),
    'Missing %': (df.isnull().sum() / len(df)) * 100
})
missing_data = missing_data[missing_data['Missing Count'] > 0].sort_values('Missing %', ascending=False)

print("\nMissing Data Summary:")
if len(missing_data) > 0:
    print(missing_data.to_string(index=False))
else:
    print("  No missing values found in cleaned dataset!")

# Visual missing data matrix
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Missing data heatmap
if df.isnull().sum().sum() > 0:
    sns.heatmap(df.isnull(), yticklabels=False, cbar=True, cmap='viridis', ax=axes[0])
    axes[0].set_title('Missing Data Matrix (Yellow = Missing)', fontweight='bold')
else:
    axes[0].text(0.5, 0.5, 'No Missing Data', ha='center', va='center', 
                fontsize=16, transform=axes[0].transAxes)
    axes[0].set_title('Missing Data Matrix', fontweight='bold')

# Missing data bar chart
if len(missing_data) > 0:
    sns.barplot(data=missing_data, x='Missing %', y='Column', palette='Reds_r', ax=axes[1])
    axes[1].set_title('Missing Data by Column (%)', fontweight='bold')
    axes[1].set_xlabel('Missing Percentage')
else:
    axes[1].text(0.5, 0.5, 'No Missing Data', ha='center', va='center', 
                fontsize=16, transform=axes[1].transAxes)
    axes[1].set_title('Missing Data by Column', fontweight='bold')

plt.tight_layout()
plt.show()

# Check raw data for comparison
print("\nMissing Data in Raw Dataset:")
raw_missing = pd.DataFrame({
    'Column': df_raw.columns,
    'Missing Count': df_raw.isnull().sum(),
    'Missing %': (df_raw.isnull().sum() / len(df_raw)) * 100
})
print(raw_missing[raw_missing['Missing Count'] > 0].to_string(index=False))


## 5. Outlier Detection

Using IQR (Interquartile Range) and Z-score methods to identify outliers.


In [None]:
# 5. Outlier Detection: IQR and Z-score Methods
print("=" * 80)
print("5. OUTLIER DETECTION")
print("=" * 80)

def detect_outliers_iqr(data, column):
    """Detect outliers using IQR method"""
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

def detect_outliers_zscore(data, column, threshold=3):
    """Detect outliers using Z-score method"""
    z_scores = np.abs(zscore(data[column].dropna()))
    outliers = data[z_scores > threshold]
    return outliers

# Detect outliers for key variables
outlier_results = {}
variables = ['Quantity', 'UnitPrice', 'TotalPrice']

for var in variables:
    iqr_outliers, lower, upper = detect_outliers_iqr(df, var)
    zscore_outliers = detect_outliers_zscore(df, var, threshold=3)
    
    outlier_results[var] = {
        'IQR_count': len(iqr_outliers),
        'IQR_percent': (len(iqr_outliers) / len(df)) * 100,
        'Zscore_count': len(zscore_outliers),
        'Zscore_percent': (len(zscore_outliers) / len(df)) * 100,
        'IQR_lower': lower,
        'IQR_upper': upper
    }

# Create summary table
outlier_summary = pd.DataFrame({
    'Variable': variables,
    'IQR Outliers': [outlier_results[v]['IQR_count'] for v in variables],
    'IQR %': [outlier_results[v]['IQR_percent'] for v in variables],
    'Z-score Outliers': [outlier_results[v]['Zscore_count'] for v in variables],
    'Z-score %': [outlier_results[v]['Zscore_percent'] for v in variables],
    'IQR Lower Bound': [outlier_results[v]['IQR_lower'] for v in variables],
    'IQR Upper Bound': [outlier_results[v]['IQR_upper'] for v in variables]
})

print("\nOutlier Detection Summary:")
print(outlier_summary.to_string(index=False))

# Visualize outliers
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Outlier Detection: IQR and Z-score Methods', fontsize=16, y=1.02)

for idx, var in enumerate(variables):
    # IQR Method
    iqr_outliers, lower, upper = detect_outliers_iqr(df, var)
    axes[0, idx].scatter(df.index, df[var], alpha=0.5, s=10, color='blue', label='Normal')
    axes[0, idx].scatter(iqr_outliers.index, iqr_outliers[var], alpha=0.7, s=20, 
                         color='red', label='IQR Outliers')
    axes[0, idx].axhline(y=lower, color='orange', linestyle='--', label='Lower Bound')
    axes[0, idx].axhline(y=upper, color='orange', linestyle='--', label='Upper Bound')
    axes[0, idx].set_title(f'{var} - IQR Method\n({len(iqr_outliers)} outliers)', fontweight='bold')
    axes[0, idx].set_xlabel('Index')
    axes[0, idx].set_ylabel(var)
    axes[0, idx].legend()
    axes[0, idx].grid(True, alpha=0.3)
    
    # Z-score Method
    zscore_outliers = detect_outliers_zscore(df, var, threshold=3)
    z_scores = np.abs(zscore(df[var].dropna()))
    axes[1, idx].scatter(range(len(z_scores)), z_scores, alpha=0.5, s=10, color='blue', label='Normal')
    outlier_indices = np.where(z_scores > 3)[0]
    if len(outlier_indices) > 0:
        axes[1, idx].scatter(outlier_indices, z_scores[outlier_indices], alpha=0.7, s=20,
                            color='red', label='Z-score Outliers')
    axes[1, idx].axhline(y=3, color='orange', linestyle='--', label='Threshold (Z=3)')
    axes[1, idx].set_title(f'{var} - Z-score Method\n({len(zscore_outliers)} outliers)', fontweight='bold')
    axes[1, idx].set_xlabel('Index')
    axes[1, idx].set_ylabel('Absolute Z-score')
    axes[1, idx].legend()
    axes[1, idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# 6. Time-Trend Analysis and Market Volatility
print("=" * 80)
print("6. TIME-TREND ANALYSIS & MARKET VOLATILITY")
print("=" * 80)

# Aggregate by date
daily_sales = df.groupby(df['InvoiceDate'].dt.date).agg({
    'TotalPrice': ['sum', 'mean', 'count'],
    'Quantity': 'sum',
    'InvoiceNo': 'nunique'
}).reset_index()
daily_sales.columns = ['Date', 'DailyRevenue', 'AvgTransactionValue', 'TransactionCount', 'TotalQuantity', 'UniqueInvoices']
daily_sales['Date'] = pd.to_datetime(daily_sales['Date'])

# Calculate rolling statistics for volatility
daily_sales['Revenue_7d_MA'] = daily_sales['DailyRevenue'].rolling(window=7, center=True).mean()
daily_sales['Revenue_30d_MA'] = daily_sales['DailyRevenue'].rolling(window=30, center=True).mean()
daily_sales['Revenue_Std'] = daily_sales['DailyRevenue'].rolling(window=7, center=True).std()
daily_sales['Volatility'] = daily_sales['Revenue_Std'] / daily_sales['Revenue_7d_MA'] * 100  # Coefficient of variation

# Time series plots
fig, axes = plt.subplots(4, 1, figsize=(16, 16))
fig.suptitle('Time-Trend Analysis: Revenue, Transactions, and Volatility', fontsize=16, y=0.995)

# 1. Daily Revenue with moving averages
axes[0].plot(daily_sales['Date'], daily_sales['DailyRevenue'], alpha=0.6, color='steelblue', label='Daily Revenue', linewidth=1)
axes[0].plot(daily_sales['Date'], daily_sales['Revenue_7d_MA'], color='orange', label='7-day MA', linewidth=2)
axes[0].plot(daily_sales['Date'], daily_sales['Revenue_30d_MA'], color='red', label='30-day MA', linewidth=2)
axes[0].fill_between(daily_sales['Date'], 
                     daily_sales['Revenue_7d_MA'] - daily_sales['Revenue_Std'],
                     daily_sales['Revenue_7d_MA'] + daily_sales['Revenue_Std'],
                     alpha=0.2, color='orange', label='±1 Std Dev')
axes[0].set_title('Daily Revenue Trend with Moving Averages', fontweight='bold')
axes[0].set_ylabel('Revenue (£)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# 2. Transaction Count
axes[1].plot(daily_sales['Date'], daily_sales['TransactionCount'], alpha=0.7, color='green', linewidth=1.5)
axes[1].set_title('Daily Transaction Count', fontweight='bold')
axes[1].set_ylabel('Number of Transactions')
axes[1].grid(True, alpha=0.3)

# 3. Volatility (Coefficient of Variation)
axes[2].plot(daily_sales['Date'], daily_sales['Volatility'], alpha=0.7, color='purple', linewidth=1.5)
axes[2].axhline(y=daily_sales['Volatility'].mean(), color='red', linestyle='--', label=f'Mean: {daily_sales["Volatility"].mean():.2f}%')
axes[2].set_title('Market Volatility (7-day Rolling CV%)', fontweight='bold')
axes[2].set_ylabel('Volatility (%)')
axes[2].legend()
axes[2].grid(True, alpha=0.3)

# 4. Monthly aggregation
monthly_sales = df.groupby('MonthYear').agg({
    'TotalPrice': ['sum', 'mean'],
    'InvoiceNo': 'nunique',
    'Quantity': 'sum'
}).reset_index()
monthly_sales.columns = ['Month', 'MonthlyRevenue', 'AvgTransaction', 'UniqueInvoices', 'TotalQuantity']
monthly_sales['Month'] = pd.to_datetime(monthly_sales['Month'])

axes[3].bar(monthly_sales['Month'], monthly_sales['MonthlyRevenue'], alpha=0.7, color='teal', edgecolor='black')
axes[3].set_title('Monthly Revenue', fontweight='bold')
axes[3].set_ylabel('Revenue (£)')
axes[3].set_xlabel('Month')
axes[3].tick_params(axis='x', rotation=45)
axes[3].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Hourly and Day-of-Week patterns
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Temporal Patterns: Hourly and Weekly Trends', fontsize=16, y=0.995)

# Hourly revenue
hourly_revenue = df.groupby('Hour')['TotalPrice'].sum().sort_index()
axes[0, 0].bar(hourly_revenue.index, hourly_revenue.values, alpha=0.7, color='coral', edgecolor='black')
axes[0, 0].set_title('Revenue by Hour of Day', fontweight='bold')
axes[0, 0].set_xlabel('Hour')
axes[0, 0].set_ylabel('Revenue (£)')
axes[0, 0].grid(True, alpha=0.3, axis='y')

# Day of week revenue
dow_map = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
dow_revenue = df.groupby('DayOfWeek')['TotalPrice'].sum().sort_index()
axes[0, 1].bar(range(len(dow_revenue)), dow_revenue.values, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 1].set_xticks(range(len(dow_revenue)))
axes[0, 1].set_xticklabels([dow_map[i] for i in dow_revenue.index])
axes[0, 1].set_title('Revenue by Day of Week', fontweight='bold')
axes[0, 1].set_ylabel('Revenue (£)')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# Monthly revenue trend
axes[1, 0].plot(monthly_sales['Month'], monthly_sales['MonthlyRevenue'], marker='o', linewidth=2, markersize=8, color='darkgreen')
axes[1, 0].set_title('Monthly Revenue Trend', fontweight='bold')
axes[1, 0].set_ylabel('Revenue (£)')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(True, alpha=0.3)

# Transaction count by day of week
dow_transactions = df.groupby('DayOfWeek')['InvoiceNo'].nunique().sort_index()
axes[1, 1].bar(range(len(dow_transactions)), dow_transactions.values, alpha=0.7, color='gold', edgecolor='black')
axes[1, 1].set_xticks(range(len(dow_transactions)))
axes[1, 1].set_xticklabels([dow_map[i] for i in dow_transactions.index])
axes[1, 1].set_title('Unique Invoices by Day of Week', fontweight='bold')
axes[1, 1].set_ylabel('Number of Invoices')
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Volatility statistics
print("\nVolatility Statistics:")
print(f"  Mean Volatility: {daily_sales['Volatility'].mean():.2f}%")
print(f"  Max Volatility: {daily_sales['Volatility'].max():.2f}%")
print(f"  Min Volatility: {daily_sales['Volatility'].min():.2f}%")
print(f"  Std Dev of Volatility: {daily_sales['Volatility'].std():.2f}%")


In [None]:
# 7. Customer-Level Analysis: RFM Segmentation
print("=" * 80)
print("7. CUSTOMER-LEVEL ANALYSIS (RFM SEGMENTATION)")
print("=" * 80)

# Calculate RFM metrics
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

rfm = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,  # Recency
    'InvoiceNo': 'nunique',  # Frequency
    'TotalPrice': 'sum'  # Monetary
}).reset_index()

rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']

# Create RFM scores (1-5 scale, where 5 is best)
rfm['R_Score'] = pd.qcut(rfm['Recency'], q=5, labels=[5, 4, 3, 2, 1], duplicates='drop')
rfm['F_Score'] = pd.qcut(rfm['Frequency'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5], duplicates='drop')
rfm['M_Score'] = pd.qcut(rfm['Monetary'], q=5, labels=[1, 2, 3, 4, 5], duplicates='drop')

# Convert to numeric
rfm['R_Score'] = rfm['R_Score'].astype(int)
rfm['F_Score'] = rfm['F_Score'].astype(int)
rfm['M_Score'] = rfm['M_Score'].astype(int)

# Create RFM segment
rfm['RFM_Score'] = rfm['R_Score'].astype(str) + rfm['F_Score'].astype(str) + rfm['M_Score'].astype(str)

# Define customer segments
def segment_customer(row):
    if row['R_Score'] >= 4 and row['F_Score'] >= 4 and row['M_Score'] >= 4:
        return 'Champions'
    elif row['R_Score'] >= 3 and row['F_Score'] >= 3 and row['M_Score'] >= 3:
        return 'Loyal Customers'
    elif row['R_Score'] >= 4 and row['F_Score'] <= 2:
        return 'New Customers'
    elif row['R_Score'] <= 2 and row['F_Score'] >= 3:
        return 'At Risk'
    elif row['R_Score'] <= 2 and row['F_Score'] <= 2:
        return 'Lost Customers'
    else:
        return 'Regular'

rfm['Segment'] = rfm.apply(segment_customer, axis=1)

# RFM Summary Statistics
print("\nRFM Summary Statistics:")
print(rfm[['Recency', 'Frequency', 'Monetary']].describe())

# Customer segment distribution
print("\nCustomer Segment Distribution:")
segment_dist = rfm['Segment'].value_counts()
print(segment_dist)
print(f"\nSegment Revenue Contribution:")
segment_revenue = rfm.groupby('Segment')['Monetary'].sum().sort_values(ascending=False)
for seg, rev in segment_revenue.items():
    print(f"  {seg}: £{rev:,.2f} ({rev/rfm['Monetary'].sum()*100:.2f}%)")

# Visualizations
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Customer RFM Analysis', fontsize=16, y=0.995)

# RFM Score distributions
axes[0, 0].hist(rfm['R_Score'], bins=5, edgecolor='black', alpha=0.7, color='skyblue')
axes[0, 0].set_title('Recency Score Distribution', fontweight='bold')
axes[0, 0].set_xlabel('Recency Score')
axes[0, 0].set_ylabel('Count')
axes[0, 0].grid(True, alpha=0.3, axis='y')

axes[0, 1].hist(rfm['F_Score'], bins=5, edgecolor='black', alpha=0.7, color='lightgreen')
axes[0, 1].set_title('Frequency Score Distribution', fontweight='bold')
axes[0, 1].set_xlabel('Frequency Score')
axes[0, 1].set_ylabel('Count')
axes[0, 1].grid(True, alpha=0.3, axis='y')

axes[0, 2].hist(rfm['M_Score'], bins=5, edgecolor='black', alpha=0.7, color='coral')
axes[0, 2].set_title('Monetary Score Distribution', fontweight='bold')
axes[0, 2].set_xlabel('Monetary Score')
axes[0, 2].set_ylabel('Count')
axes[0, 2].grid(True, alpha=0.3, axis='y')

# Segment distribution
axes[1, 0].barh(segment_dist.index, segment_dist.values, alpha=0.7, color='teal', edgecolor='black')
axes[1, 0].set_title('Customer Segment Count', fontweight='bold')
axes[1, 0].set_xlabel('Number of Customers')
axes[1, 0].grid(True, alpha=0.3, axis='x')

# Segment revenue
axes[1, 1].barh(segment_revenue.index, segment_revenue.values, alpha=0.7, color='purple', edgecolor='black')
axes[1, 1].set_title('Revenue by Segment', fontweight='bold')
axes[1, 1].set_xlabel('Revenue (£)')
axes[1, 1].grid(True, alpha=0.3, axis='x')

# RFM scatter plot (Frequency vs Monetary, colored by Recency)
scatter = axes[1, 2].scatter(rfm['Frequency'], rfm['Monetary'], c=rfm['Recency'], 
                            cmap='RdYlGn_r', alpha=0.6, s=50, edgecolors='black', linewidth=0.5)
axes[1, 2].set_title('Frequency vs Monetary (colored by Recency)', fontweight='bold')
axes[1, 2].set_xlabel('Frequency')
axes[1, 2].set_ylabel('Monetary Value (£)')
axes[1, 2].set_xscale('log')
axes[1, 2].set_yscale('log')
plt.colorbar(scatter, ax=axes[1, 2], label='Recency (days)')
axes[1, 2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Additional customer statistics
print("\nCustomer Statistics:")
print(f"  Total Customers: {len(rfm):,}")
print(f"  Average Recency: {rfm['Recency'].mean():.1f} days")
print(f"  Average Frequency: {rfm['Frequency'].mean():.2f} transactions")
print(f"  Average Monetary Value: £{rfm['Monetary'].mean():,.2f}")
print(f"  Median Monetary Value: £{rfm['Monetary'].median():,.2f}")


In [None]:
# 8. Product/Category-Level Insights
print("=" * 80)
print("8. PRODUCT/CATEGORY-LEVEL INSIGHTS")
print("=" * 80)

# Product performance metrics
product_stats = df.groupby('StockCode').agg({
    'Quantity': ['sum', 'mean'],
    'TotalPrice': ['sum', 'mean'],
    'InvoiceNo': 'nunique',
    'CustomerID': 'nunique',
    'Description': 'first'
}).reset_index()
product_stats.columns = ['StockCode', 'TotalQuantity', 'AvgQuantity', 'TotalRevenue', 
                         'AvgPrice', 'TransactionCount', 'UniqueCustomers', 'Description']

# Top products
print("\nTop 10 Products by Revenue:")
top_revenue = product_stats.nlargest(10, 'TotalRevenue')[['Description', 'TotalRevenue', 'TotalQuantity', 'TransactionCount']]
print(top_revenue.to_string(index=False))

print("\nTop 10 Products by Quantity Sold:")
top_quantity = product_stats.nlargest(10, 'TotalQuantity')[['Description', 'TotalQuantity', 'TotalRevenue', 'TransactionCount']]
print(top_quantity.to_string(index=False))

print("\nTop 10 Products by Transaction Frequency:")
top_freq = product_stats.nlargest(10, 'TransactionCount')[['Description', 'TransactionCount', 'TotalRevenue', 'UniqueCustomers']]
print(top_freq.to_string(index=False))

# Product diversity metrics
print("\nProduct Diversity Metrics:")
print(f"  Total Unique Products: {df['StockCode'].nunique():,}")
print(f"  Products with >100 transactions: {len(product_stats[product_stats['TransactionCount'] > 100]):,}")
print(f"  Products with >£10,000 revenue: {len(product_stats[product_stats['TotalRevenue'] > 10000]):,}")
print(f"  Average revenue per product: £{product_stats['TotalRevenue'].mean():,.2f}")
print(f"  Median revenue per product: £{product_stats['TotalRevenue'].median():,.2f}")

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Product Performance Analysis', fontsize=16, y=0.995)

# Top 15 products by revenue
top15_revenue = product_stats.nlargest(15, 'TotalRevenue')
axes[0, 0].barh(range(len(top15_revenue)), top15_revenue['TotalRevenue'], alpha=0.7, color='steelblue', edgecolor='black')
axes[0, 0].set_yticks(range(len(top15_revenue)))
axes[0, 0].set_yticklabels([desc[:40] + '...' if len(desc) > 40 else desc for desc in top15_revenue['Description']], fontsize=8)
axes[0, 0].set_title('Top 15 Products by Revenue', fontweight='bold')
axes[0, 0].set_xlabel('Revenue (£)')
axes[0, 0].grid(True, alpha=0.3, axis='x')

# Revenue distribution (log scale)
axes[0, 1].hist(np.log1p(product_stats['TotalRevenue']), bins=50, edgecolor='black', alpha=0.7, color='lightgreen')
axes[0, 1].set_title('Product Revenue Distribution (Log Scale)', fontweight='bold')
axes[0, 1].set_xlabel('Log(Revenue)')
axes[0, 1].set_ylabel('Number of Products')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# Quantity vs Revenue scatter
axes[1, 0].scatter(product_stats['TotalQuantity'], product_stats['TotalRevenue'], 
                   alpha=0.5, s=30, color='coral', edgecolors='black', linewidth=0.5)
axes[1, 0].set_title('Quantity vs Revenue', fontweight='bold')
axes[1, 0].set_xlabel('Total Quantity Sold')
axes[1, 0].set_ylabel('Total Revenue (£)')
axes[1, 0].set_xscale('log')
axes[1, 0].set_yscale('log')
axes[1, 0].grid(True, alpha=0.3)

# Transaction frequency distribution
axes[1, 1].hist(product_stats['TransactionCount'], bins=50, edgecolor='black', alpha=0.7, color='purple')
axes[1, 1].set_title('Product Transaction Frequency Distribution', fontweight='bold')
axes[1, 1].set_xlabel('Number of Transactions')
axes[1, 1].set_ylabel('Number of Products')
axes[1, 1].set_xscale('log')
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Price analysis
print("\nPrice Analysis:")
print(f"  Average Unit Price: £{df['UnitPrice'].mean():.2f}")
print(f"  Median Unit Price: £{df['UnitPrice'].median():.2f}")
print(f"  Price Range: £{df['UnitPrice'].min():.2f} - £{df['UnitPrice'].max():,.2f}")
print(f"  Products under £10: {len(df[df['UnitPrice'] < 10])/len(df)*100:.1f}% of transactions")
print(f"  Products £10-50: {len(df[(df['UnitPrice'] >= 10) & (df['UnitPrice'] < 50)])/len(df)*100:.1f}% of transactions")
print(f"  Products over £50: {len(df[df['UnitPrice'] >= 50])/len(df)*100:.1f}% of transactions")


## 9. Deep Insights: Market Volatility, Seasonal Patterns, and Geographical Analysis

Comprehensive analysis of market behavior patterns including volatility observations, seasonal trends, and geographical demand patterns.


In [None]:
# 9. Deep Insights: Volatility, Seasonal, and Geographical Patterns
print("=" * 80)
print("9. DEEP INSIGHTS: VOLATILITY, SEASONAL, & GEOGRAPHICAL ANALYSIS")
print("=" * 80)

# 9.1 Market Volatility Deep Dive
print("\n9.1 MARKET VOLATILITY OBSERVATIONS")
print("-" * 80)

# Calculate volatility metrics
daily_sales['DailyChange'] = daily_sales['DailyRevenue'].pct_change() * 100
daily_sales['DailyChangeAbs'] = daily_sales['DailyChange'].abs()

# Identify high volatility periods
high_vol_threshold = daily_sales['Volatility'].quantile(0.75)
high_vol_days = daily_sales[daily_sales['Volatility'] > high_vol_threshold]

print(f"\nVolatility Metrics:")
print(f"  High Volatility Threshold (75th percentile): {high_vol_threshold:.2f}%")
print(f"  Days with High Volatility: {len(high_vol_days)} ({len(high_vol_days)/len(daily_sales)*100:.1f}%)")
print(f"  Average Daily Revenue Change: {daily_sales['DailyChange'].mean():.2f}%")
print(f"  Average Absolute Daily Change: {daily_sales['DailyChangeAbs'].mean():.2f}%")
print(f"  Max Daily Revenue Increase: {daily_sales['DailyChange'].max():.2f}%")
print(f"  Max Daily Revenue Decrease: {daily_sales['DailyChange'].min():.2f}%")

# 9.2 Seasonal Patterns
print("\n9.2 SEASONAL PATTERNS")
print("-" * 80)

# Monthly seasonality
monthly_agg = df.groupby('Month').agg({
    'TotalPrice': ['sum', 'mean', 'count'],
    'InvoiceNo': 'nunique'
}).reset_index()
monthly_agg.columns = ['Month', 'MonthlyRevenue', 'AvgTransaction', 'TransactionCount', 'UniqueInvoices']
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_agg['MonthName'] = monthly_agg['Month'].apply(lambda x: month_names[x-1] if x <= 12 else 'Unknown')

print("\nMonthly Revenue Pattern:")
for _, row in monthly_agg.iterrows():
    print(f"  {row['MonthName']}: £{row['MonthlyRevenue']:,.2f} ({row['TransactionCount']:,} transactions)")

# Identify peak months
peak_month = monthly_agg.loc[monthly_agg['MonthlyRevenue'].idxmax()]
print(f"\nPeak Sales Month: {peak_month['MonthName']} (£{peak_month['MonthlyRevenue']:,.2f})")
print(f"Lowest Sales Month: {monthly_agg.loc[monthly_agg['MonthlyRevenue'].idxmin(), 'MonthName']} (£{monthly_agg['MonthlyRevenue'].min():,.2f})")

# Day of week seasonality
dow_seasonality = df.groupby('DayOfWeek').agg({
    'TotalPrice': 'sum',
    'InvoiceNo': 'nunique'
}).reset_index()
dow_seasonality['DayName'] = dow_seasonality['DayOfWeek'].map(dow_map)
print("\nDay of Week Pattern:")
for _, row in dow_seasonality.iterrows():
    print(f"  {row['DayName']}: £{row['TotalPrice']:,.2f} ({row['InvoiceNo']:,} invoices)")

# 9.3 Geographical Demand Patterns
print("\n9.3 GEOGRAPHICAL DEMAND PATTERNS")
print("-" * 80)

country_stats = df.groupby('Country').agg({
    'TotalPrice': ['sum', 'mean'],
    'InvoiceNo': 'nunique',
    'CustomerID': 'nunique',
    'Quantity': 'sum'
}).reset_index()
country_stats.columns = ['Country', 'TotalRevenue', 'AvgTransaction', 'UniqueInvoices', 'UniqueCustomers', 'TotalQuantity']
country_stats = country_stats.sort_values('TotalRevenue', ascending=False)

print(f"\nTotal Countries: {len(country_stats)}")
print(f"\nTop 10 Countries by Revenue:")
print(country_stats.head(10)[['Country', 'TotalRevenue', 'UniqueCustomers', 'UniqueInvoices']].to_string(index=False))

print(f"\nGeographical Concentration:")
top5_revenue = country_stats.head(5)['TotalRevenue'].sum()
print(f"  Top 5 countries account for {top5_revenue/df['TotalPrice'].sum()*100:.1f}% of total revenue")
print(f"  Top country (UK) accounts for {country_stats.iloc[0]['TotalRevenue']/df['TotalPrice'].sum()*100:.1f}% of total revenue")

# Visualizations for deep insights
fig, axes = plt.subplots(3, 2, figsize=(16, 18))
fig.suptitle('Deep Insights: Volatility, Seasonal, and Geographical Patterns', fontsize=16, y=0.995)

# Volatility over time
axes[0, 0].plot(daily_sales['Date'], daily_sales['Volatility'], alpha=0.7, color='red', linewidth=1.5)
axes[0, 0].axhline(y=high_vol_threshold, color='orange', linestyle='--', label=f'High Vol Threshold ({high_vol_threshold:.1f}%)')
axes[0, 0].fill_between(daily_sales['Date'], 0, daily_sales['Volatility'], 
                        where=(daily_sales['Volatility'] > high_vol_threshold), 
                        alpha=0.3, color='red', label='High Volatility Periods')
axes[0, 0].set_title('Market Volatility Over Time', fontweight='bold')
axes[0, 0].set_ylabel('Volatility (%)')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Daily revenue changes
axes[0, 1].plot(daily_sales['Date'], daily_sales['DailyChange'], alpha=0.6, color='steelblue', linewidth=1)
axes[0, 1].axhline(y=0, color='black', linestyle='-', linewidth=0.5)
axes[0, 1].set_title('Daily Revenue Change (%)', fontweight='bold')
axes[0, 1].set_ylabel('Percentage Change')
axes[0, 1].grid(True, alpha=0.3)

# Monthly seasonality
axes[1, 0].bar(range(len(monthly_agg)), monthly_agg['MonthlyRevenue'], alpha=0.7, color='teal', edgecolor='black')
axes[1, 0].set_xticks(range(len(monthly_agg)))
axes[1, 0].set_xticklabels(monthly_agg['MonthName'], rotation=45)
axes[1, 0].set_title('Monthly Revenue Seasonality', fontweight='bold')
axes[1, 0].set_ylabel('Revenue (£)')
axes[1, 0].grid(True, alpha=0.3, axis='y')

# Day of week pattern
axes[1, 1].bar(range(len(dow_seasonality)), dow_seasonality['TotalPrice'], alpha=0.7, color='coral', edgecolor='black')
axes[1, 1].set_xticks(range(len(dow_seasonality)))
axes[1, 1].set_xticklabels(dow_seasonality['DayName'])
axes[1, 1].set_title('Day of Week Revenue Pattern', fontweight='bold')
axes[1, 1].set_ylabel('Revenue (£)')
axes[1, 1].grid(True, alpha=0.3, axis='y')

# Top 15 countries by revenue
top15_countries = country_stats.head(15)
axes[2, 0].barh(range(len(top15_countries)), top15_countries['TotalRevenue'], alpha=0.7, color='purple', edgecolor='black')
axes[2, 0].set_yticks(range(len(top15_countries)))
axes[2, 0].set_yticklabels(top15_countries['Country'], fontsize=9)
axes[2, 0].set_title('Top 15 Countries by Revenue', fontweight='bold')
axes[2, 0].set_xlabel('Revenue (£)')
axes[2, 0].grid(True, alpha=0.3, axis='x')

# Revenue concentration (Pareto chart)
country_stats_sorted = country_stats.sort_values('TotalRevenue', ascending=False)
cumulative_pct = (country_stats_sorted['TotalRevenue'].cumsum() / country_stats_sorted['TotalRevenue'].sum() * 100)
axes[2, 1].plot(range(1, min(21, len(country_stats_sorted)+1)), cumulative_pct[:20], 
               marker='o', linewidth=2, markersize=6, color='darkgreen')
axes[2, 1].axhline(y=80, color='red', linestyle='--', label='80% Line')
axes[2, 1].set_title('Revenue Concentration (Pareto)', fontweight='bold')
axes[2, 1].set_xlabel('Number of Countries')
axes[2, 1].set_ylabel('Cumulative Revenue %')
axes[2, 1].legend()
axes[2, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary insights
print("\n" + "=" * 80)
print("KEY INSIGHTS SUMMARY")
print("=" * 80)
print(f"\n1. Market Volatility:")
print(f"   - Average volatility: {daily_sales['Volatility'].mean():.2f}%")
print(f"   - {len(high_vol_days)} high volatility days identified")
print(f"   - Revenue shows {'high' if daily_sales['Volatility'].mean() > 30 else 'moderate' if daily_sales['Volatility'].mean() > 15 else 'low'} volatility")

print(f"\n2. Seasonal Patterns:")
print(f"   - Peak month: {peak_month['MonthName']}")
print(f"   - Strongest day: {dow_seasonality.loc[dow_seasonality['TotalPrice'].idxmax(), 'DayName']}")
print(f"   - Revenue variation across months: {(monthly_agg['MonthlyRevenue'].max() - monthly_agg['MonthlyRevenue'].min())/monthly_agg['MonthlyRevenue'].mean()*100:.1f}%")

print(f"\n3. Geographical Patterns:")
print(f"   - {len(country_stats)} countries represented")
print(f"   - Top country (UK) dominance: {country_stats.iloc[0]['TotalRevenue']/df['TotalPrice'].sum()*100:.1f}%")
print(f"   - Top 5 countries: {top5_revenue/df['TotalPrice'].sum()*100:.1f}% of revenue")
print("=" * 80)
