# RetailRocket Dataset - Exploratory Data Analysis

This notebook performs comprehensive EDA on the RetailRocket e-commerce dataset.

## Contents
1. Dataset Overview
2. Events Distribution
3. Conversion Funnel Analysis
4. Temporal Patterns
5. Long-tail Analysis
6. Session Analysis
7. User Behavior Analysis

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from src.data.loaders.retailrocket import RetailRocketLoader
from src.data.processors.session_builder import SessionBuilder

# Settings
# pandas display: show up to 50 columns and render floats with two decimals
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.2f}'.format)
# Plot styling for the matplotlib/seaborn figures below
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Render matplotlib figures inline in the notebook output.
# NOTE(review): kept after the style calls on purpose; the order is left untouched
# because both may affect matplotlib's rc settings.
%matplotlib inline

## 1. Dataset Overview

In [None]:
# Read the raw interaction log through the project loader
loader = RetailRocketLoader()
events = loader.load_events()

# Quick structural summary: shape, dtypes, then in-memory footprint in MB
BYTES_PER_MB = 1024**2
print(f"Dataset shape: {events.shape}")
print(f"\nColumn types:")
print(events.dtypes)
memory_mb = events.memory_usage(deep=True).sum() / BYTES_PER_MB
print(f"\nMemory usage: {memory_mb:.1f} MB")

In [None]:
# Headline counts and the time span covered by the event log
banner = "=" * 50
first_event_at = events['datetime'].min()
last_event_at = events['datetime'].max()
n_visitors = events['visitor_id'].nunique()
n_items = events['item_id'].nunique()
n_transactions = events['transaction_id'].nunique()

print(banner)
print("DATASET STATISTICS")
print(banner)
print(f"Total events: {len(events):,}")
print(f"Unique visitors: {n_visitors:,}")
print(f"Unique items: {n_items:,}")
print(f"Unique transactions: {n_transactions:,}")
print(f"\nDate range: {first_event_at} to {last_event_at}")
print(f"Duration: {(last_event_at - first_event_at).days} days")

In [None]:
# Sample data: preview the first 10 rows. The bare expression is the cell's
# last statement, so the notebook displays the resulting frame.
events.head(10)

In [None]:
# Count null entries per column
missing_per_column = events.isna().sum()

print("Missing values:")
print(missing_per_column)
print(f"\nNote: transaction_id is NaN for view and addtocart events (expected)")

## 2. Events Distribution

In [None]:
# Absolute and relative (percent) frequency of each event type;
# both series are reused by the plotting and funnel cells below.
event_counts = events['event'].value_counts()
event_percentages = events['event'].value_counts(normalize=True) * 100

print("Event Distribution:")
for name, n_events in event_counts.items():
    share = event_percentages[name]
    print(f"  {name}: {n_events:,} ({share:.2f}%)")

In [None]:
# Visualize event distribution: bar chart (counts) next to a pie chart (shares)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Colour by event *name* rather than by position, so the colours stay correct
# whatever order value_counts() returns, and match the colour map used by the
# plotly charts later in the notebook. Unknown event types fall back to grey.
event_palette = {'view': '#2ecc71', 'addtocart': '#3498db', 'transaction': '#e74c3c'}
colors = [event_palette.get(name, '#95a5a6') for name in event_counts.index]

# Bar chart, annotated with the absolute count and the percentage share
axes[0].bar(event_counts.index, event_counts.values, color=colors)
axes[0].set_title('Event Type Distribution', fontsize=14)
axes[0].set_xlabel('Event Type')
axes[0].set_ylabel('Count')
for i, (count, pct) in enumerate(zip(event_counts.values, event_percentages.values)):
    axes[0].annotate(f'{count:,}\n({pct:.1f}%)',
                     xy=(i, count),
                     ha='center', va='bottom', fontsize=10)

# Pie chart: each successive (smaller) wedge is pulled out a little further so
# thin slices remain visible. The offsets are derived from the number of event
# types because pie() requires len(explode) == len(x); for three event types
# this yields the original [0, 0.05, 0.1].
explode = [0.05 * i for i in range(len(event_counts))]
axes[1].pie(event_counts.values, labels=event_counts.index, autopct='%1.1f%%',
            colors=colors, explode=explode)
axes[1].set_title('Event Type Proportion', fontsize=14)

plt.tight_layout()
plt.show()

## 3. Conversion Funnel Analysis

In [None]:
# Event-level funnel: one row per stage with its raw event count.
# Stages missing from event_counts are reported as 0.
stage_by_event = {
    'view': 'View',
    'addtocart': 'Add to Cart',
    'transaction': 'Transaction',
}
funnel_data = {
    'stage': list(stage_by_event.values()),
    'events': [event_counts.get(event_key, 0) for event_key in stage_by_event],
}
funnel_df = pd.DataFrame(funnel_data)

# conversion_rate: share relative to the first stage;
# step_conversion: share relative to the immediately preceding stage (NaN for the first row)
top_of_funnel = funnel_df['events'].iloc[0]
previous_stage = funnel_df['events'].shift(1)
funnel_df['conversion_rate'] = funnel_df['events'] / top_of_funnel * 100
funnel_df['step_conversion'] = funnel_df['events'] / previous_stage * 100

print("Conversion Funnel:")
print(funnel_df.to_string(index=False))

In [None]:
# Interactive funnel chart of the event-level funnel table
funnel_trace = go.Funnel(
    x=funnel_df['events'],
    y=funnel_df['stage'],
    textinfo="value+percent initial",
    marker=dict(color=["#2ecc71", "#3498db", "#e74c3c"]),
)
fig = go.Figure(funnel_trace)
fig.update_layout(title="E-commerce Conversion Funnel", width=600, height=400)
fig.show()

In [None]:
# User-level funnel: number of distinct visitors with at least one event of each type
def _visitors_with(event_name):
    """Return the number of distinct visitor_ids having an event of type `event_name`."""
    is_event = events['event'] == event_name
    return events.loc[is_event, 'visitor_id'].nunique()


users_with_view = _visitors_with('view')
users_with_cart = _visitors_with('addtocart')
users_with_purchase = _visitors_with('transaction')

user_funnel = pd.DataFrame({
    'stage': ['Viewed', 'Added to Cart', 'Purchased'],
    'users': [users_with_view, users_with_cart, users_with_purchase],
})
# Rate is expressed relative to the first stage (visitors who viewed)
viewers = user_funnel['users'].iloc[0]
user_funnel['rate'] = user_funnel['users'] / viewers * 100

print("\nUser-level Funnel:")
print(user_funnel.to_string(index=False))

## 4. Temporal Patterns

In [None]:
# Derive calendar fields from the event timestamp.
# These columns are added to `events` itself and are used by the temporal plots below.
event_time = events['datetime'].dt
events['hour'] = event_time.hour
events['dayofweek'] = event_time.dayofweek
events['date'] = event_time.date

In [None]:
# Number of events per calendar day, plotted as a time series
events_per_day = events.groupby('date').size()
daily_events = events_per_day.reset_index(name='events')

fig = px.line(
    daily_events,
    x='date',
    y='events',
    title='Daily Events Over Time',
)
fig.update_layout(xaxis_title='Date', yaxis_title='Number of Events')
fig.show()

In [None]:
# Event volume per hour of day, stacked by event type
events_by_hour_and_type = events.groupby(['hour', 'event']).size()
hourly_events = events_by_hour_and_type.reset_index(name='count')

event_colors = {
    'view': '#2ecc71',
    'addtocart': '#3498db',
    'transaction': '#e74c3c',
}
fig = px.bar(
    hourly_events,
    x='hour',
    y='count',
    color='event',
    title='Events by Hour of Day',
    barmode='stack',
    color_discrete_map=event_colors,
)
fig.update_layout(xaxis_title='Hour', yaxis_title='Number of Events')
fig.show()

In [None]:
# Event volume per weekday, grouped by event type.
# dow_names is indexed by the dayofweek value (0 = Mon ... 6 = Sun) and is
# reused by the heatmap cell that follows.
dow_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
events_by_dow_and_type = events.groupby(['dayofweek', 'event']).size()
dow_events = events_by_dow_and_type.reset_index(name='count')
dow_events['day_name'] = [dow_names[day] for day in dow_events['dayofweek']]

event_colors = {
    'view': '#2ecc71',
    'addtocart': '#3498db',
    'transaction': '#e74c3c',
}
fig = px.bar(
    dow_events,
    x='day_name',
    y='count',
    color='event',
    title='Events by Day of Week',
    barmode='group',
    category_orders={'day_name': dow_names},  # keep Mon..Sun order on the axis
    color_discrete_map=event_colors,
)
fig.update_layout(xaxis_title='Day of Week', yaxis_title='Number of Events')
fig.show()

In [None]:
# Heatmap: Hour vs Day of Week
# Count events per (weekday, hour) cell. The pivot only contains weekdays/hours
# that actually occur in the data, while the tick labels below are applied by
# position, so the table is reindexed to the full 7 x 24 grid: row i is always
# weekday i (0 = Mon) and column j is always hour j. Combinations with no
# events are shown as 0 instead of being left out.
heatmap_data = (
    events.pivot_table(index='dayofweek', columns='hour',
                       values='visitor_id', aggfunc='count')
    .reindex(index=range(7), columns=range(24))
    .fillna(0)
)

fig, ax = plt.subplots(figsize=(14, 6))
# dow_names (Mon..Sun) is defined in the day-of-week cell above
sns.heatmap(heatmap_data, cmap='YlOrRd', ax=ax,
            yticklabels=dow_names, annot=False)
ax.set_title('Activity Heatmap: Hour vs Day of Week', fontsize=14)
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Day of Week')
plt.tight_layout()
plt.show()

## 5. Long-tail Analysis (Item Popularity)

In [None]:
# Number of events (of any type) recorded for each item
item_counts = events['item_id'].value_counts()

n_unique_items = len(item_counts)
most_interactions = item_counts.max()
fewest_interactions = item_counts.min()

print(f"Items statistics:")
print(f"  Total unique items: {n_unique_items:,}")
print(f"  Max interactions: {most_interactions:,}")
print(f"  Min interactions: {fewest_interactions:,}")
print(f"  Mean interactions: {item_counts.mean():.1f}")
print(f"  Median interactions: {item_counts.median():.1f}")

In [None]:
# Pareto analysis (80/20 rule)
# Items ordered from most to least popular, with the running total of interactions.
# item_counts_sorted, cumsum, total and pct_items_for_80 are reused by the
# long-tail plot in the next cell.
item_counts_sorted = item_counts.sort_values(ascending=False)
cumsum = item_counts_sorted.cumsum()
total = item_counts_sorted.sum()

# Find the smallest number of top items whose interactions reach 80% of the total.
# The comparison is strict: items whose running total is still *below* the
# threshold are not enough on their own, so one more item is needed. With `<=`
# an item that lands exactly on the threshold would be counted and then one
# extra item added on top (off by one).
threshold_80 = total * 0.8
n_items_total = len(item_counts_sorted)
items_for_80 = int((cumsum < threshold_80).sum()) + 1 if n_items_total else 0
# Guard the empty case so the cell does not divide by zero
pct_items_for_80 = items_for_80 / n_items_total * 100 if n_items_total else 0.0

print(f"\nPareto Analysis:")
print(f"  {pct_items_for_80:.1f}% of items account for 80% of interactions")
print(f"  ({items_for_80:,} out of {len(item_counts_sorted):,} items)")

In [None]:
# Long-tail visualization: rank/frequency curve (left) and Pareto curve (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
rank_ax, pareto_ax = axes

# Left panel: interaction count against popularity rank, both axes logarithmic
n_ranked_items = len(item_counts_sorted)
ranks = np.arange(1, n_ranked_items + 1)
rank_ax.loglog(ranks, item_counts_sorted.values, 'b-', alpha=0.7)
rank_ax.set_xlabel('Item Rank (log scale)')
rank_ax.set_ylabel('Interaction Count (log scale)')
rank_ax.set_title('Long-tail Distribution (Log-Log)')
rank_ax.grid(True, alpha=0.3)

# Right panel: cumulative share of interactions against cumulative share of items,
# with guide lines at 80% of interactions and at the item share computed in the
# Pareto cell above
pct_items = ranks / n_ranked_items * 100
pct_interactions = cumsum / total * 100

pareto_ax.plot(pct_items, pct_interactions, 'b-', linewidth=2)
pareto_ax.axhline(y=80, color='r', linestyle='--', label='80% of interactions')
pareto_ax.axvline(x=pct_items_for_80, color='r', linestyle='--')
pareto_ax.fill_between(pct_items, pct_interactions, alpha=0.3)
pareto_ax.set_xlabel('% of Items')
pareto_ax.set_ylabel('% of Interactions (Cumulative)')
pareto_ax.set_title('Cumulative Distribution (Pareto)')
pareto_ax.legend()
pareto_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Histogram of interactions per item on a logarithmic x-axis
fig, ax = plt.subplots(figsize=(10, 5))

# 50 logarithmically spaced bin edges from 1 up to the largest item count
largest_item_count = item_counts.max()
bins = np.logspace(0, np.log10(largest_item_count), 50)

ax.hist(item_counts.values, bins=bins, edgecolor='black', alpha=0.7)
ax.set_xscale('log')
ax.set_xlabel('Number of Interactions (log scale)')
ax.set_ylabel('Number of Items')
ax.set_title('Distribution of Item Popularity')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Session Analysis

In [None]:
# Build sessions
# NOTE(review): presumably a visitor's events are split into a new session after
# 30 minutes of inactivity — confirm against SessionBuilder.
# `events` at this point also carries the hour/dayofweek/date columns added in
# the temporal section. Later cells group the result by a 'session_id' column.
session_builder = SessionBuilder(timeout_minutes=30)
events_with_sessions = session_builder.build_sessions(events)

In [None]:
# Print every statistic returned by the session builder:
# floats with two decimals, everything else with thousands separators
session_stats = session_builder.get_session_stats(events_with_sessions)

print("Session Statistics:")
for stat_name, stat_value in session_stats.items():
    rendered = f"{stat_value:.2f}" if isinstance(stat_value, float) else f"{stat_value:,}"
    print(f"  {stat_name}: {rendered}")

In [None]:
# Session length distribution
# Number of events in each session
session_lengths = events_with_sessions.groupby('session_id').size()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of session lengths, limited to the 99th percentile.
# The bins are built for the displayed range only (one unit-wide bin centred on
# each integer length). Binning the full range and then cropping the axis would
# let a few very long sessions stretch the bins so far that the visible part of
# the plot collapses into one or two bars.
length_cap = max(int(session_lengths.quantile(0.99)), 1)
length_bins = np.arange(0.5, length_cap + 1.5)
axes[0].hist(session_lengths.values, bins=length_bins, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Session Length (events)')
axes[0].set_ylabel('Number of Sessions')
axes[0].set_title('Session Length Distribution')
axes[0].set_xlim(0, length_cap + 0.5)  # Limit to 99th percentile (incl. the last full bar)
# Mean and median are computed over ALL sessions, not only the displayed range
axes[0].axvline(session_lengths.mean(), color='r', linestyle='--', label=f'Mean: {session_lengths.mean():.1f}')
axes[0].axvline(session_lengths.median(), color='g', linestyle='--', label=f'Median: {session_lengths.median():.0f}')
axes[0].legend()

# Session length by event type: one row per session, one column per event type
session_events = events_with_sessions.groupby(['session_id', 'event']).size().unstack(fill_value=0)
session_events['total'] = session_events.sum(axis=1)

# Box plot of per-session counts for each event type.
# reindex() guarantees all three columns exist (filled with 0) even if an event
# type never occurs, where plain column selection would raise a KeyError.
event_types = ['view', 'addtocart', 'transaction']
session_events_melted = session_events.reindex(columns=event_types, fill_value=0).reset_index().melt(
    id_vars='session_id', var_name='event', value_name='count'
)
# Only sessions that contain the event type at least once contribute to its box
session_events_melted = session_events_melted[session_events_melted['count'] > 0]

sns.boxplot(data=session_events_melted, x='event', y='count', ax=axes[1])
axes[1].set_xlabel('Event Type')
axes[1].set_ylabel('Events per Session')
axes[1].set_title('Events per Session by Type')
axes[1].set_ylim(0, 20)  # crop extreme outliers so the boxes stay readable

plt.tight_layout()
plt.show()

In [None]:
# Number of distinct sessions for each visitor
sessions_per_user = events_with_sessions.groupby('visitor_id')['session_id'].nunique()

# Boolean mask of visitors with exactly one session
has_single_session = sessions_per_user == 1
n_single_session = has_single_session.sum()
pct_single_session = has_single_session.mean() * 100

print(f"Sessions per user:")
print(f"  Mean: {sessions_per_user.mean():.2f}")
print(f"  Median: {sessions_per_user.median():.0f}")
print(f"  Max: {sessions_per_user.max()}")
print(f"  Single-session users: {n_single_session:,} ({pct_single_session:.1f}%)")

## 7. User Behavior Analysis

In [None]:
# Number of events (of any type) recorded for each visitor
user_event_counts = events['visitor_id'].value_counts()

# Boolean mask of visitors with exactly one event
has_single_event = user_event_counts == 1
n_single_event = has_single_event.sum()
pct_single_event = has_single_event.mean() * 100

print(f"User activity statistics:")
print(f"  Mean events per user: {user_event_counts.mean():.2f}")
print(f"  Median events per user: {user_event_counts.median():.0f}")
print(f"  Max events per user: {user_event_counts.max():,}")
print(f"  Single-event users: {n_single_event:,} ({pct_single_event:.1f}%)")

In [None]:
# Histogram of events per visitor on a logarithmic x-axis
fig, ax = plt.subplots(figsize=(10, 5))

# 50 logarithmically spaced bin edges from 1 up to the most active visitor's count
largest_user_count = user_event_counts.max()
bins = np.logspace(0, np.log10(largest_user_count), 50)

ax.hist(user_event_counts.values, bins=bins, edgecolor='black', alpha=0.7)
ax.set_xscale('log')
ax.set_xlabel('Number of Events per User (log scale)')
ax.set_ylabel('Number of Users')
ax.set_title('User Activity Distribution')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Per-visitor event counts: one row per visitor, one column per event type
# (absent combinations become 0)
user_events = events.pivot_table(
    index='visitor_id',
    columns='event',
    values='item_id',
    aggfunc='count',
).fillna(0)

# Flag which funnel stages each visitor reached; an event type that has no
# column at all yields False for everyone
stage_flags = [
    ('has_view', 'view'),
    ('has_cart', 'addtocart'),
    ('has_purchase', 'transaction'),
]
for flag_name, event_name in stage_flags:
    user_events[flag_name] = user_events.get(event_name, 0) > 0

viewed = user_events['has_view']
carted = user_events['has_cart']
purchased = user_events['has_purchase']

# User segments (also consumed by the pie chart in the next cell).
# NOTE(review): these segments are not exhaustive. A visitor with cart events
# but no view and no purchase falls into none of them, so the percentages
# printed below need not sum to 100%.
segments = {
    'View only': (viewed & ~carted & ~purchased).sum(),
    'View + Cart': (viewed & carted & ~purchased).sum(),
    'Purchasers': (purchased).sum(),
}

print("User Segments:")
n_users = len(user_events)
for segment, count in segments.items():
    pct = count / n_users * 100
    print(f"  {segment}: {count:,} ({pct:.2f}%)")

In [None]:
# Pie chart of the behaviour segments computed in the previous cell
segment_names = list(segments)
segment_sizes = list(segments.values())

fig = px.pie(
    names=segment_names,
    values=segment_sizes,
    title='User Segments by Behavior',
    color_discrete_sequence=['#2ecc71', '#3498db', '#e74c3c'],
)
fig.show()

## Summary

### Key Findings:

1. **Dataset Size**: ~2.7M events from ~1.4M unique visitors and ~235K items

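2. **Conversion Funnel** (event-level ratios, i.e. event counts of one stage divided by event counts of another; the user-level funnel is computed separately in Section 3):
   - View → AddToCart: ~2.6% conversion
   - AddToCart → Transaction: ~30% conversion  
   - Overall View → Transaction: ~0.8%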

3. **Long-tail Distribution**:
   - Strong power-law distribution in item popularity
   - A small percentage of items receive most interactions

4. **Session Characteristics**:
   - High proportion of single-event sessions
   - Most users have only one session

5. **Temporal Patterns**:
   - Clear hourly and weekly patterns in user activity
   - Peak hours during daytime/evening

### Implications for Recommender System:

- Need to handle **cold-start** for the majority of users, who have only a single session
- **Popularity bias** is strong, so recommendations need to balance popular and long-tail items
- **Session-based** approaches are important given the short user histories
- **Multi-signal weighting** is valuable (signal strength: view < addtocart < transaction)