## echoFace - Product Recommendation Model
Multimodal User Authentication and Product Recommendation System

This notebook implements the Product Recommendation component that predicts
which product category a customer is likely to purchase based on their
social media engagement and transaction history.

In [1]:
# ============================================================================
# 1. IMPORT LIBRARIES
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, classification_report,
                           confusion_matrix, log_loss)
from xgboost import XGBClassifier
import joblib

# Record the scikit-learn version used for this run
import sklearn
print(f"scikit-learn version: {sklearn.__version__}")

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (same order as before, since both touch the colour cycle)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Run banner: title framed by 80-character rules, followed by a timestamp
print("\n" + "=" * 80, "echoFace - Product Recommendation Model", "=" * 80, sep="\n")
print(f"Timestamp: {datetime.now():%Y-%m-%d %H:%M:%S}", "=" * 80, sep="\n")

scikit-learn version: 1.6.1

echoFace - Product Recommendation Model
Timestamp: 2025-11-12 13:31:58


In [2]:
# ============================================================================
# 2. HELPER FUNCTIONS
# ============================================================================

def load_data(filepath):
    """Read a CSV file into a DataFrame, report its path and shape, and return it."""
    frame = pd.read_csv(filepath)
    # Two-line status report: what was loaded, then (rows, columns)
    print(f"✓ Loaded {filepath}", f"  Shape: {frame.shape}", sep="\n")
    return frame

def display_categorical(df):
    """Show the value counts of each categorical column, one titled section per column."""
    print("\n--- Categorical Variable Distribution ---")

    # (section title, column name) in the order they are reported
    sections = (
        ("Product Category", 'product_category'),
        ("Social Media Platform", 'social_media_platform'),
        ("Review Sentiment", 'review_sentiment'),
    )
    for title, column in sections:
        print(f"\n{title} Distribution:")
        # `display` is the notebook's rich-output hook
        display(df[column].value_counts())

def data_preprocessing(df, le):
    """
    Preprocess the merged dataset:
    - Extract date features
    - Drop irrelevant columns
    - Encode categorical variables

    The input frame is copied first, so the caller's DataFrame is left
    untouched and the function can safely be re-run on the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns referenced below: 'purchase_date',
        'customer_id', 'transaction_id', 'product_category',
        'review_sentiment' and 'social_media_platform'.
    le : object
        Encoder exposing ``fit_transform`` and ``classes_`` (presumably a
        scikit-learn LabelEncoder - confirm against the caller). It is
        fitted on 'product_category' as a side effect.

    Returns
    -------
    tuple
        ``(df, numeric_cols)``: the processed copy of the data, and the
        Index of columns that were numeric before encoding, excluding
        'isweekend'.
    """
    print("\n--- Data Preprocessing ---")

    # Work on a copy: adding and dropping columns directly on the argument
    # would alter the caller's frame and make a second call fail with a
    # KeyError on 'purchase_date'.
    df = df.copy()

    # Create temporal features from purchase_date
    purchase_ts = pd.to_datetime(df['purchase_date'])
    df['month'] = purchase_ts.dt.month
    df['day'] = purchase_ts.dt.day
    df['weekday'] = purchase_ts.dt.day_of_week
    # day_of_week counts Monday as 0, so 5 and 6 are Saturday and Sunday
    df['isweekend'] = (df['weekday'] >= 5).astype(int)

    # Drop columns not needed for prediction
    df = df.drop(columns=['purchase_date', 'customer_id', 'transaction_id'])

    # Get numerical columns (before encoding, so the soon-to-be-encoded
    # categorical columns are not picked up); the binary flag is excluded
    numeric_cols = df.select_dtypes(include=[np.number]).columns.drop('isweekend')

    # Label encode target variable (product_category)
    df['product_category'] = le.fit_transform(df['product_category'])

    # Ordinal encoding for review sentiment.
    # NOTE: any label missing from the mapping becomes NaN after .map()
    sentiment_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
    df['review_sentiment'] = df['review_sentiment'].map(sentiment_mapping)

    # One-hot encode social media platform
    df = pd.get_dummies(df, columns=['social_media_platform'], dtype=int)

    # Print label mapping
    print("\n✓ Product Category Encoding:")
    for idx, category in enumerate(le.classes_):
        print(f"  {category} = {idx}")

    return df, numeric_cols

def prep_data(df):
    """Separate the target from the features and make a stratified 80/20 train/test split.

    Returns the full feature matrix and target followed by the four split
    parts: ``(X, y, X_train, X_test, y_train, y_test)``.
    """
    target = 'product_category'
    y = df[target]
    X = df.drop(columns=[target])

    # Fixed seed keeps the split reproducible; stratifying on y keeps the
    # class proportions the same in both parts
    parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = parts

    print(f'\n--- Train-Test Split ---')
    labelled = zip(('X_train', 'X_test', 'y_train', 'y_test'), parts)
    for label, part in labelled:
        print(f'{label} shape: {part.shape}')

    return X, y, X_train, X_test, y_train, y_test

def save_model_artifacts(model, scaler, le, final_columns, numeric_cols, output_dir='../models/product/'):
    """Write the model and its companion preprocessing objects to ``output_dir``.

    The directory is created if it does not exist. Each object is dumped
    with joblib under a fixed file name, and the file names are listed
    afterwards.
    """
    os.makedirs(output_dir, exist_ok=True)

    # (file name, object) pairs, in the order they are written and reported
    artifacts = (
        ('product_recommendation_model.pkl', model),
        ('product_model_scaler.pkl', scaler),
        ('product_model_encoder.pkl', le),
        ('product_model_columns.pkl', final_columns),
        ('product_model_numeric_cols.pkl', numeric_cols),
    )
    for filename, payload in artifacts:
        joblib.dump(payload, os.path.join(output_dir, filename))

    print(f"\n✓ All model artifacts saved to '{output_dir}'")
    for filename, _ in artifacts:
        print(f"  - {filename}")