## echoFace - Product Recommendation Model
Multimodal User Authentication and Product Recommendation System

This notebook implements the Product Recommendation component that predicts
which product category a customer is likely to purchase based on their
social media engagement and transaction history.

In [1]:
# ============================================================================
# 1. IMPORT LIBRARIES
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, classification_report,
                           confusion_matrix, log_loss)
from xgboost import XGBClassifier
import joblib

# Report the installed scikit-learn version so the run is traceable
import sklearn
print(f"scikit-learn version: {sklearn.__version__}")

# Plot styling: matplotlib style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Run banner (title + timestamp between rule lines), emitted with one print call
print("\n".join([
    "\n" + "=" * 80,
    "echoFace - Product Recommendation Model",
    "=" * 80,
    f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 80,
]))

Matplotlib is building the font cache; this may take a moment.


scikit-learn version: 1.6.1

echoFace - Product Recommendation Model
Timestamp: 2025-11-13 17:37:51


In [10]:
# ============================================================================
# 2. HELPER FUNCTIONS
# ============================================================================

def load_data(filepath):
    """Read a CSV file into a DataFrame and report what was loaded.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(filepath)
    # Echo the path and dimensions so the cell output records what was read.
    print(f"✓ Loaded {filepath}\n  Shape: {frame.shape}")
    return frame

def display_categorical(df):
    """Show value counts for the categorical columns of ``df``.

    Prints a heading for each of ``product_category``,
    ``social_media_platform`` and ``review_sentiment`` (in that order) and
    renders the column's ``value_counts()`` with the notebook's ``display``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns listed above.
    """
    print("\n--- Categorical Variable Distribution ---")

    # (heading, column) pairs, rendered in this order.
    sections = (
        ("\nProduct Category Distribution:", 'product_category'),
        ("\nSocial Media Platform Distribution:", 'social_media_platform'),
        ("\nReview Sentiment Distribution:", 'review_sentiment'),
    )
    for heading, column in sections:
        print(heading)
        display(df[column].value_counts())

def data_preprocessing(df, le):
    """
    Preprocess the merged dataset without modifying the caller's frame.

    Steps:
    - Derive month, day, weekday and isweekend from purchase_date
    - Drop purchase_date, customer_id and transaction_id
    - Label-encode product_category with ``le`` (fitted here)
    - Map review_sentiment to 0/1/2 and one-hot encode social_media_platform

    Parameters
    ----------
    df : pandas.DataFrame
        Merged dataset. Must contain purchase_date, customer_id,
        transaction_id, product_category, review_sentiment and
        social_media_platform. It is left untouched, so the calling cell can
        be re-run safely.
    le : encoder object
        Must provide ``fit_transform`` and, after fitting, ``classes_``
        (e.g. a LabelEncoder). It is fitted on product_category as a side
        effect.

    Returns
    -------
    tuple
        ``(processed_df, numeric_cols)`` where ``numeric_cols`` is the Index
        of columns that were numeric *before* encoding, excluding isweekend.
    """
    print("\n--- Data Preprocessing ---")

    # Work on a copy: the previous version added/dropped columns on the
    # caller's frame, so a second call failed with KeyError: 'purchase_date'.
    df = df.copy()

    # Create temporal features from purchase_date
    purchase_date = pd.to_datetime(df['purchase_date'])
    df['month'] = purchase_date.dt.month
    df['day'] = purchase_date.dt.day
    df['weekday'] = purchase_date.dt.day_of_week
    df['isweekend'] = (df['weekday'] >= 5).astype(int)  # weekday 5/6 = Sat/Sun

    # Drop columns not needed for prediction. A 'year' feature is never
    # produced (it used to be computed and immediately discarded); an incoming
    # 'year' column is still dropped to keep the output schema unchanged.
    drop_cols = ['purchase_date', 'customer_id', 'transaction_id']
    if 'year' in df.columns:
        drop_cols.append('year')
    df = df.drop(columns=drop_cols)

    # Get numerical columns (before encoding), so encoded categoricals and the
    # binary isweekend flag are not included.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.drop('isweekend')

    # Label encode target variable (product_category)
    df['product_category'] = le.fit_transform(df['product_category'])

    # Ordinal encoding for review sentiment; labels outside this mapping
    # become NaN.
    sentiment_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
    df['review_sentiment'] = df['review_sentiment'].map(sentiment_mapping)

    # One-hot encode social media platform
    df = pd.get_dummies(df, columns=['social_media_platform'], dtype=int)

    # Print label mapping
    print("\n✓ Product Category Encoding:")
    for idx, category in enumerate(le.classes_):
        print(f"  {category} = {idx}")

    return df, numeric_cols

def prep_data(df, test_size=0.2, random_state=42):
    """Prepare features and target, split into train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed frame containing the ``product_category`` target column;
        every other column is used as a feature.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)``.
    """
    X = df.drop(columns='product_category')
    y = df['product_category']

    # Stratify on the target so both splits keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    print('\n--- Train-Test Split ---')
    print(f'X_train shape: {X_train.shape}')
    print(f'X_test shape: {X_test.shape}')
    print(f'y_train shape: {y_train.shape}')
    print(f'y_test shape: {y_test.shape}')

    return X, y, X_train, X_test, y_train, y_test

def save_model_artifacts(model, scaler, le, final_columns, numeric_cols, output_dir='../models/product/'):
    """Persist the five objects needed at prediction time with ``joblib.dump``.

    Parameters
    ----------
    model, scaler, le, final_columns, numeric_cols
        Objects to serialise; each is written to its own ``.pkl`` file.
    output_dir : str, default '../models/product/'
        Destination directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # File name -> object, in the order they are written and reported.
    artifacts = (
        ('product_recommendation_model.pkl', model),
        ('product_model_scaler.pkl', scaler),
        ('product_model_encoder.pkl', le),
        ('product_model_columns.pkl', final_columns),
        ('product_model_numeric_cols.pkl', numeric_cols),
    )
    for filename, payload in artifacts:
        joblib.dump(payload, os.path.join(output_dir, filename))

    print(f"\n✓ All model artifacts saved to '{output_dir}'")
    for filename, _ in artifacts:
        print(f"  - {filename}")

In [8]:
# ============================================================================
# 3. DATA LOADING
# ============================================================================

print("\n" + "=" * 80)
print("STEP 1: DATA LOADING AND MERGING")
print("=" * 80)

# Locate the input CSVs without hard-coding a user-specific absolute path.
# Override with the ECHOFACE_DATA_DIR environment variable when needed.
# NOTE(review): the relative default assumes the notebook runs one level below
# the project root (consistent with the '../models/product/' default used by
# save_model_artifacts) — confirm against the repository layout.
DATA_DIR = os.environ.get(
    'ECHOFACE_DATA_DIR',
    os.path.join('..', 'data', 'customer-info'),
)

# Load individual datasets
df_social = load_data(
    os.path.join(DATA_DIR, 'customer_social_profiles - customer_social_profiles.csv')
)
df_transactions = load_data(
    os.path.join(DATA_DIR, 'customer_transactions - customer_transactions.csv')
)

print("\n--- Social Profiles Sample ---")
display(df_social.head())

print("\n--- Transactions Sample ---")
display(df_transactions.head())


STEP 1: DATA LOADING AND MERGING
✓ Loaded /Users/edine.mugisha/Documents/School/echoFace/data/customer-info/customer_social_profiles - customer_social_profiles.csv
  Shape: (155, 5)
✓ Loaded /Users/edine.mugisha/Documents/School/echoFace/data/customer-info/customer_transactions - customer_transactions.csv
  Shape: (150, 6)

--- Social Profiles Sample ---


Unnamed: 0,customer_id_new,social_media_platform,engagement_score,purchase_interest_score,review_sentiment
0,A178,LinkedIn,74,4.9,Positive
1,A190,Twitter,82,4.8,Neutral
2,A150,Facebook,96,1.6,Positive
3,A162,Twitter,89,2.6,Positive
4,A197,Twitter,92,2.3,Neutral



--- Transactions Sample ---


Unnamed: 0,customer_id_legacy,transaction_id,purchase_amount,purchase_date,product_category,customer_rating
0,151,1001,408,2024-01-01,Sports,2.3
1,192,1002,332,2024-01-02,Electronics,4.2
2,114,1003,442,2024-01-03,Electronics,2.1
3,171,1004,256,2024-01-04,Clothing,2.8
4,160,1005,64,2024-01-05,Clothing,1.3


In [14]:
# ============================================================================
# 4. DATA MERGING
# ============================================================================

print("\n--- Merging Datasets ---")
print("\nMerging strategy:")
print("  1. Use customer_id_new from social profiles")
print("  2. Use customer_id_legacy from transactions")
print("  3. Match customer identifiers after cleaning")

# Rename columns for consistency (rename returns new frames, so df_social and
# df_transactions stay untouched and this cell can be re-run).
df_social_renamed = df_social.rename(columns={'customer_id_new': 'customer_id'})
df_transactions_renamed = df_transactions.rename(columns={'customer_id_legacy': 'customer_id'})

# Check ID formats before merging
print("\n--- Investigating ID Formats ---")
print(f"Social Profile ID sample: {df_social_renamed['customer_id'].head(5).tolist()}")
print(f"Transaction ID sample: {df_transactions_renamed['customer_id'].head(5).tolist()}")

# Ensure both keys are strings so they compare equal after cleaning
df_social_renamed['customer_id'] = df_social_renamed['customer_id'].astype(str)
df_transactions_renamed['customer_id'] = df_transactions_renamed['customer_id'].astype(str)

# Clean IDs - remove only a *leading* 'A' from social profile IDs.
# (An unanchored replace would strip every 'A' in an ID, not just the prefix.)
if df_social_renamed['customer_id'].str.startswith('A').any():
    print("\n⚠ Detected 'A' prefix in social profile IDs - removing for merge")
    df_social_renamed['customer_id'] = df_social_renamed['customer_id'].str.replace(
        r'^A', '', regex=True
    )

# Check overlap before merge
social_ids = set(df_social_renamed['customer_id'])
transaction_ids = set(df_transactions_renamed['customer_id'])
overlap = social_ids.intersection(transaction_ids)
print(f"\n✓ Found {len(overlap)} matching customer IDs")

# Merge datasets
# NOTE(review): this is a many-to-many join — a customer with several social
# profile rows repeats each of their transactions once per profile row, which
# inflates the dataset and can place copies of one transaction in both the
# train and test splits. Decide whether social profiles should be aggregated
# to one row per customer before merging.
df_merged = pd.merge(
    df_transactions_renamed,
    df_social_renamed,
    on='customer_id',
    how='inner'
)

print("\n✓ Datasets merged successfully!")
print(f"  Merged dataset shape: {df_merged.shape}")
print(f"  Unique customers: {df_merged['customer_id'].nunique()}")

# Surface the row duplication described above instead of leaving it silent.
n_repeated_tx = int(df_merged['transaction_id'].duplicated().sum())
if n_repeated_tx:
    print(f"\n⚠ {n_repeated_tx} merged rows repeat a transaction_id "
          "(customers with multiple social profile rows)")

if df_merged.shape[0] == 0:
    print("\n⚠ WARNING: Merge resulted in 0 rows!")
    print("  This means customer IDs don't match between datasets.")
    print("  Please check the ID format in both files.")
else:
    # Save merged dataset
    df_merged.to_csv('merged_dataset.csv', index=False)
    print("\n✓ Merged dataset saved as 'merged_dataset.csv'")

print("\n--- Merged Dataset Sample ---")
display(df_merged.head())


--- Merging Datasets ---

Merging strategy:
  1. Use customer_id_new from social profiles
  2. Use customer_id_legacy from transactions
  3. Match customer identifiers after cleaning

--- Investigating ID Formats ---
Social Profile ID sample: ['A178', 'A190', 'A150', 'A162', 'A197']
Transaction ID sample: [151, 192, 114, 171, 160]

⚠ Detected 'A' prefix in social profile IDs - removing for merge

✓ Found 61 matching customer IDs

✓ Datasets merged successfully!
  Merged dataset shape: (219, 10)
  Unique customers: 61

✓ Merged dataset saved as 'merged_dataset.csv'

--- Merged Dataset Sample ---


Unnamed: 0,customer_id,transaction_id,purchase_amount,purchase_date,product_category,customer_rating,social_media_platform,engagement_score,purchase_interest_score,review_sentiment
0,151,1001,408,2024-01-01,Sports,2.3,TikTok,61,1.3,Neutral
1,151,1001,408,2024-01-01,Sports,2.3,Twitter,72,1.6,Neutral
2,151,1001,408,2024-01-01,Sports,2.3,Twitter,82,3.6,Negative
3,192,1002,332,2024-01-02,Electronics,4.2,Instagram,60,4.3,Positive
4,114,1003,442,2024-01-03,Electronics,2.1,Facebook,87,4.8,Negative
