E-COMMERCE CUSTOMER PURCHASE PREDICTION

**Problem Statement** 
Can we predict customer purchase using browsing patterns, session characteristics, and product features?

Data source 
https://data.rees46.com/datasets/marketplace/2019-Nov.csv.gz 

About the dataset
* The dataset contains user browsing and purchasing activity from a multi-category online store. It includes over 67.5 million events (views, cart additions, and purchases) across various product categories.

Please read the [Readme.md](Readme.md) file for more details.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Fetch the complete 2019-Nov event log (gzip-compressed CSV) into memory
data_url = "https://data.rees46.com/datasets/marketplace/2019-Nov.csv.gz"
df = pd.read_csv(data_url, compression='gzip')

# Structural overview: dimensions, column names and the dtypes pandas inferred
column_names = list(df.columns)
column_types = df.dtypes
print(f"Shape: {df.shape}")
print(f"Columns: {column_names}")
print(f"Data types:\n{column_types}")

# Last expression of the cell: renders the first five rows
df.head()

#### Data Quality Assessment

In [None]:
# Null count per column, and the same figure as a percentage of all rows
missing_data = df.isnull().sum()
missing_percent = missing_data / len(df) * 100

missing_df = pd.DataFrame({
    'Column': missing_data.index,
    'Missing Count': missing_data.values,
    'Missing Percentage': missing_percent.values,
})
# Keep only columns that actually have gaps, worst offenders first
has_gaps = missing_df['Missing Count'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("No missing data found!")
else:
    print("Missing Data Summary:")
    display(missing_df)

# Data quality checks
summary_lines = (
    f"Total events: {len(df):,}",
    f"Unique users: {df['user_id'].nunique():,}",
    f"Unique products: {df['product_id'].nunique():,}",
    f"Date range: {df['event_time'].min()} to {df['event_time'].max()}",
    f"Zero prices: {(df['price'] == 0).sum():,} ({(df['price'] == 0).mean()*100:.1f}%)",
)
for line in summary_lines:
    print(line)

# Event type distribution: absolute counts plus share of all events
event_dist = df['event_type'].value_counts()
event_pct = (event_dist / len(df) * 100).round(2)
for event, count in event_dist.items():
    print(f"{event}: {count:,} ({event_pct[event]}%)")

#### Target Variable Creation
We aggregate the event log into one row per user so that purchase behavior can be modeled as a classification problem.
The target variable is 1 if the user made at least one purchase and 0 otherwise.

In [None]:
def count_by_type(event_series, type):
    """Count the entries of ``event_series`` that equal ``type``.

    Args:
        event_series: pandas Series of event labels (anything whose ``==``
            comparison returns an object with ``.sum()`` works).
        type: the label to match, e.g. ``'view'``.

    Returns:
        The number of matching entries.
    """
    is_match = event_series == type
    return is_match.sum()

# Build the per-user features from NON-purchase events only.
# The target below is "did this user purchase?". If purchase rows were allowed
# to feed total_events, total_price, last_event, ... the features would encode
# the label itself (target leakage) and any model score would be inflated.
# Consequence: a user whose only recorded events are purchases has no browsing
# history to describe and is therefore absent from user_features.
is_purchase = df['event_type'] == 'purchase'
browsing_events = df[~is_purchase]

user_features = browsing_events.groupby('user_id').agg(
    total_events=('event_type', 'count'),
    views=('event_type', lambda x: count_by_type(x, 'view')),
    unique_products=('product_id', 'nunique'),
    unique_categories=('category_id', 'nunique'),
    avg_price=('price', 'mean'),
    max_price=('price', 'max'),
    total_price=('price', 'sum'),
    sessions=('user_session', 'nunique'),
    first_event=('event_time', 'min'),
    last_event=('event_time', 'max')
).round(2)
# The filtered frame is a temporary copy of most of the event log; release it.
del browsing_events

# Create binary target: 1 if user made any purchase, 0 otherwise
purchase_users = set(df.loc[is_purchase, 'user_id'].unique())
user_features['made_purchase'] = user_features.index.isin(purchase_users).astype(int)

print(f"Total # of users: {len(user_features)}")
# reindex keeps both classes addressable even when one of them has no users,
# where a plain value_counts() lookup would raise KeyError.
target_dist = user_features['made_purchase'].value_counts().reindex([0, 1], fill_value=0)
print(f"No Purchase (0): {target_dist[0]:,} ({target_dist[0]/len(user_features)*100:.1f}%)")
print(f"Made Purchase (1): {target_dist[1]:,} ({target_dist[1]/len(user_features)*100:.1f}%)")
print(f"Conversion Rate: {user_features['made_purchase'].mean()*100:.2f}%")

user_features.head()

#### Feature Engineering
In this step, we add the following groups of features to the dataset:
1. Session-level features
2. User interaction features
3. Product-level features

In [None]:
df['event_time'] = pd.to_datetime(df['event_time'])

# 1. Session features
# NOTE(review): despite the column name, this is the span in minutes between a
# user's first and last event across ALL of their sessions, not the duration
# of an individual session.
# Built-in min/max aggregations are used instead of a Python lambda that would
# be invoked once per user; the resulting values are the same.
event_span = df.groupby('user_id')['event_time'].agg(['min', 'max'])
session_features = (
    (event_span['max'] - event_span['min']).dt.total_seconds() / 60
).to_frame('session_duration_minutes')

# 2. user interaction pattern features
# 'sessions' is a nunique() count, so it is 0 for a user whose events all lack
# a user_session value. Dividing by NaN instead of 0 keeps the ratio missing
# rather than +inf, which scikit-learn estimators refuse to accept.
session_counts = user_features['sessions'].replace(0, np.nan)
user_features['events_per_session'] = user_features['total_events'] / session_counts

# 3. Product exploration behavior
user_features['category_exploration'] = user_features['unique_categories'] / user_features['unique_products']
user_features['product_focus'] = user_features['total_events'] / user_features['unique_products']
# NOTE(review): this is max minus mean, not max minus min - confirm the name is intended.
user_features['price_range'] = user_features['max_price'] - user_features['avg_price']

# Index-aligned left join on user_id
user_features = user_features.join(session_features)

print(f"Total features: {len(user_features.columns) - 1} (excluding target)")

pd.set_option('display.float_format', '{:.2f}'.format)
display(user_features.describe())

#### Exploratory Data Analysis
Here we explore how the conversion rate varies with key user-level features.
The four plots below show:
1. Conversion by number of views
2. Conversion by average price viewed
3. Conversion by session duration
4. Conversion by number of products explored

In [None]:
def plot_conversion_by_feature(data, feature_col, bins, labels, title, ax, color='skyblue'):
    """Bar-plot the mean of ``made_purchase`` per bucket of ``feature_col``.

    Args:
        data: DataFrame containing ``feature_col`` and a ``made_purchase`` column.
        feature_col: name of the numeric column to bucket.
        bins: bin edges or a bin count, forwarded to ``pd.cut``.
        labels: one label per bucket, forwarded to ``pd.cut``.
        title: title of the plot.
        ax: matplotlib axes to draw on.
        color: bar colour.

    Returns:
        Series of conversion rates indexed by bucket label. A bucket that
        received no rows is kept and holds NaN.

    Note:
        Rows whose value falls outside ``bins`` get no bucket from ``pd.cut``
        and are therefore left out of every bar.
    """
    groups = pd.cut(data[feature_col], bins=bins, labels=labels)
    # Pin observed=False: the grouper is categorical and pandas is changing the
    # implicit default, which would silently drop empty buckets from the
    # result (and from the x-axis) on newer versions.
    conversion_by_feature = data.groupby(groups, observed=False)['made_purchase'].mean()

    conversion_by_feature.plot(kind='bar', ax=ax, title=title, color=color)
    ax.set_ylabel('Conversion Rate')
    ax.tick_params(axis='x', rotation=45)

    return conversion_by_feature

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# The last bucket of each hand-written binning is labelled as open-ended
# ('20+', '60min+', '10+'), so its upper edge is np.inf. A finite cap would
# silently drop the heaviest users from the plot.

# Plot 1: Conversion by number of views
# Users with zero views fall outside the first bucket (0, 1] and are not plotted.
conversion_views = plot_conversion_by_feature(
    data=user_features,
    feature_col='views',
    bins=[0, 1, 5, 10, 20, np.inf],
    labels=['1', '2-5', '6-10', '11-20', '20+'],
    title='Conversion Rate by Number of Views',
    ax=axes[0,0],
    color='skyblue'
)

# Plot 2: Conversion by average price (five equal-width buckets chosen by pd.cut)
conversion_price = plot_conversion_by_feature(
    data=user_features,
    feature_col='avg_price',
    bins=5,
    labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'],
    title='Conversion Rate by Average Price Viewed',
    ax=axes[0,1],
    color='skyblue'
)

# Plot 3: Conversion by session duration
# The lowest edge is -np.inf so that a duration of exactly 0 (a user with a
# single event) lands in '<1min'; with an edge of 0 the right-closed first
# bucket (0, 1] would exclude those users.
conversion_duration = plot_conversion_by_feature(
    data=user_features,
    feature_col='session_duration_minutes',
    bins=[-np.inf, 1, 5, 15, 60, np.inf],
    labels=['<1min', '1-5min', '5-15min', '15-60min', '60min+'],
    title='Conversion Rate by Session Duration',
    ax=axes[1,0],
    color='skyblue'
)

# Plot 4: Conversion by unique products viewed
conversion_exploration = plot_conversion_by_feature(
    data=user_features,
    feature_col='unique_products',
    bins=[0, 1, 3, 5, 10, np.inf],
    labels=['1', '2-3', '4-5', '6-10', '10+'],
    title='Conversion Rate by Products Explored',
    ax=axes[1,1],
    color='skyblue'
)

plt.tight_layout()
plt.show()

print(f"Overall conversion rate: {user_features['made_purchase'].mean()*100:.2f}%")
print(f"Highest converting view group: {conversion_views.idxmax()}")
print(f"Highest converting price group: {conversion_price.idxmax()}")
print(f"Optimal session duration: {conversion_duration.idxmax()}")

#### Category & Brand Analysis

In [None]:
# Work on a copy of the purchase rows so the helper 'hour' column stays off df
purchase_events = df.loc[df['event_type'] == 'purchase'].copy()

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
(ax_categories, ax_brands), (ax_prices, ax_hours) = axes

# Ten most frequently purchased categories and brands, drawn the same way
top_categories = purchase_events['category_code'].value_counts().head(10)
top_brands = purchase_events['brand'].value_counts().head(10)
ranked_panels = (
    (ax_categories, top_categories, 'Top 10 Categories by Purchases'),
    (ax_brands, top_brands, 'Top 10 Brands by Purchases'),
)
for panel, counts, heading in ranked_panels:
    counts.plot(kind='barh', ax=panel, title=heading)
    panel.set_xlabel('Number of Purchases')

# Price distribution of purchases
ax_prices.hist(purchase_events['price'], bins=30, alpha=0.7, color='skyblue')
ax_prices.set_title('Distribution of Purchase Prices')
ax_prices.set_xlabel('Price')
ax_prices.set_ylabel('Frequency')

# Purchase timing: number of purchases per hour of day, in hour order
# (uses the .dt accessor, so event_time must already be a datetime column)
purchase_events['hour'] = purchase_events['event_time'].dt.hour
hourly_purchases = purchase_events['hour'].value_counts().sort_index()
hourly_purchases.plot(kind='line', ax=ax_hours, title='Purchases by Hour of Day', marker='o')
ax_hours.set_xlabel('Hour')
ax_hours.set_ylabel('Number of Purchases')

plt.tight_layout()
plt.show()

print(f"Total purchases: {len(purchase_events):,}")
print(f"Average purchase price: ${purchase_events['price'].mean():.2f}")
print(f"Most popular category: {top_categories.index[0]}")
print(f"Most popular brand: {top_brands.index[0]}")


#### Correlation Analysis & Feature Selection

In [None]:
# Correlate numeric columns only; missing values are treated as 0
features_for_corr = user_features.select_dtypes(include=[np.number]).fillna(0)
corr_matrix = features_for_corr.corr()

# Plot correlation heatmap
# The mask hides the diagonal and the upper triangle, which only mirror the
# lower triangle of a symmetric matrix.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    mask=mask,
    cbar_kws={"shrink": .8},
)
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Rank features by absolute correlation with the target; the target's own
# entry is dropped before the ten strongest are taken.
target_corr_clean = corr_matrix['made_purchase'].abs().sort_values(ascending=False)
strongest = target_corr_clean.drop('made_purchase').head(10)

print("Features most correlated with purchase behavior:")
for feature, corr in strongest.items():
    print(f"{feature}: {corr:.3f}")

important_clean_features = strongest.index.tolist()
print(f"Top 10 clean features for modeling: {important_clean_features}")

#### Baseline Modeling
Below we train Logistic Regression and Random Forest classifiers as baseline models.

In [None]:
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, precision_score,\
    accuracy_score, recall_score, f1_score

# Model inputs are the correlation-ranked columns; the label is the purchase flag
X = user_features.loc[:, important_clean_features]
y = user_features['made_purchase']

print(f"Features: {X.shape[1]}")
print(f"Samples/Users: {len(X)}")
print(f"Target distribution: {y.value_counts().to_dict()}")

# Train-test split: 80/20, stratified so both parts keep the class ratio
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training: {len(X_train)} samples")
print(f"Testing: {len(X_test)} samples")

# Pipeline 1: Logistic Regression (needs scaling)
lr_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)),
]
lr_pipeline = Pipeline(steps=lr_steps)

# Pipeline 2: Random Forest (no scaling step)
rf_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=100)),
]
rf_pipeline = Pipeline(steps=rf_steps)

# Display name -> unfitted pipeline
models = {
    'Logistic Regression': lr_pipeline,
    'Random Forest': rf_pipeline,
}

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: feature matrix to score.
        y_test: true labels for ``X_test``.

    Returns:
        Dict with the keys ``Accuracy``, ``Precision``, ``Recall``,
        ``F1_Score`` and ``AUC_ROC``, in that order.
    """
    predicted_labels = model.predict(X_test)
    # Second column of predict_proba is used as the score for the AUC
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics computed from hard label predictions
    label_metrics = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1_Score', f1_score),
    )
    scores = {
        metric_name: metric(y_test, predicted_labels)
        for metric_name, metric in label_metrics
    }
    # AUC is computed from the probability scores, not the hard labels
    scores['AUC_ROC'] = roc_auc_score(y_test, positive_scores)
    return scores

# Fit each pipeline on the training split and score it on the test split
results = {}
for name, pipeline in models.items():
    print(f"Running {name}...")
    pipeline.fit(X_train, y_train)
    metrics = evaluate_model(pipeline, X_test, y_test)
    results[name] = metrics

# One row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
display(results_df)
