In [None]:
## Case Study: Building a Personalized Recommendation Model

This notebook presents the process of developing a personalized recommendation model based on user behaviour data from a shopping website.



In [None]:
## Data Processing Module

#Imports
from collections import Counter

import numpy as np
import pandas as pd

#Loading Data
def load_data(filepath):
    """
    Read a Parquet file into a pandas DataFrame.

    Args:
    filepath (str): Location of the Parquet file to read.

    Returns:
    pd.DataFrame: The frame produced by pd.read_parquet for that file.
    """
    frame = pd.read_parquet(filepath)
    return frame
    

#Session-based aggregation
def aggregate_session_data(data):
    """
    Aggregates event-level data into one row per session.

    The input frame is not modified; timestamps are parsed on a copy.

    Args:
    data (pd.DataFrame): Raw data. Must contain the columns 'sessionId',
        'userId', 'date', 'pageType', 'itemId', 'productPrice',
        'oldProductPrice' and 'category'.

    Returns:
    pd.DataFrame: One row per session with the columns 'sessionId', 'userId',
        'session_start', 'session_end', 'pageTypes', 'itemIds',
        'productPrices', 'oldProductPrices', 'categories',
        'session_duration' (seconds), 'num_events', 'interaction'
        (1 if any event in the session has pageType 'success', else 0)
        and 'avg_product_price'.
    """
    # Parse timestamps on a copy so the caller's DataFrame is left untouched
    events = data.assign(timestamp=pd.to_datetime(data['date']))

    # Named aggregation ties every output column to its source column and
    # function explicitly instead of relying on positional renaming
    session_data = (
        events.groupby('sessionId')
        .agg(
            userId=('userId', 'first'),
            session_start=('timestamp', 'min'),
            session_end=('timestamp', 'max'),
            pageTypes=('pageType', list),
            itemIds=('itemId', list),
            productPrices=('productPrice', list),
            oldProductPrices=('oldProductPrice', list),
            categories=('category', list),
        )
        .reset_index()
    )

    # Session duration in seconds
    session_data['session_duration'] = (
        session_data['session_end'] - session_data['session_start']
    ).dt.total_seconds()

    # Number of events per session
    session_data['num_events'] = session_data['pageTypes'].apply(len)

    # Additional labels for recommendation:
    # 1. Interaction label: 1 when the session contains a 'success' page
    session_data['interaction'] = session_data['pageTypes'].apply(
        lambda page_types: 1 if 'success' in page_types else 0
    )

    # 2. Average product price per session (0 for an empty price list)
    session_data['avg_product_price'] = session_data['productPrices'].apply(
        lambda prices: sum(prices) / len(prices) if len(prices) > 0 else 0
    )

    return session_data


# Location of the raw data (Parquet)
DATA_FILE_PATH = "path/to/data/file.parquet"

# Read the raw data, then collapse it to one row per session
data = load_data(filepath=DATA_FILE_PATH)
session_data = aggregate_session_data(data=data)

# Preview the aggregated sessions
session_data.head(5)



In [None]:
## Feature Engineering
# This section creates integer-index mappings for user and item IDs
# and uses those mappings to generate the features for model training.

#creating user and item mappings
def create_user_item_mappings(data):
    """
    Create mappings for user and item IDs to integer indices.

    Args:
    data (pd.DataFrame): Session-based aggregated data with a 'userId' column
        and an 'itemIds' column holding one list of item IDs per session.
        A frame with an 'itemId' column is accepted as well.

    Returns:
    tuple: (user_mapping, item_mapping), two DataFrames with the columns
        ('userId', 'userId_encoded') and ('itemId', 'itemId_encoded').
        Indices start at 0 and follow the order in which IDs first appear.
    """
    # aggregate_session_data() stores the per-session item list as 'itemIds';
    # fall back to 'itemId' so frames using that column name still work
    item_col = 'itemIds' if 'itemIds' in data.columns else 'itemId'

    user_ids = data['userId'].unique()
    # explode() puts every list element on its own row; scalar cells pass through
    item_ids = data[item_col].explode().unique()

    # userId -> userId_encoded mapping
    user_mapping = pd.DataFrame({
        'userId': user_ids,
        'userId_encoded': np.arange(len(user_ids))
    })

    # itemId -> itemId_encoded mapping
    item_mapping = pd.DataFrame({
        'itemId': item_ids,
        'itemId_encoded': np.arange(len(item_ids))
    })

    return user_mapping, item_mapping

#generating features
def generate_features(data, user_mapping, item_mapping):
    """
    Generate features for model training.

    Every (session, item) pair becomes one training row that carries the
    session's 'interaction' label.

    Args:
    data (pd.DataFrame): Session-based aggregated data with the columns
        'userId', 'interaction' and 'itemIds' (one list of item IDs per
        session). A frame with an 'itemId' column is accepted as well.
    user_mapping (pd.DataFrame): Mapping of user IDs to integer indices
        (columns 'userId', 'userId_encoded').
    item_mapping (pd.DataFrame): Mapping of item IDs to integer indices
        (columns 'itemId', 'itemId_encoded').

    Returns:
    tuple: (X, y) where X has the two columns (userId_encoded, itemId_encoded),
        one row per (session, item) pair, and y holds the matching
        'interaction' labels.
    """
    # aggregate_session_data() stores the per-session item list as 'itemIds';
    # fall back to 'itemId' so frames using that column name still work
    item_col = 'itemIds' if 'itemIds' in data.columns else 'itemId'

    pairs = (
        # Keep only the needed columns so the other list columns are not
        # duplicated for every exploded row
        data[['userId', item_col, 'interaction']]
        # One row per item
        .explode(item_col)
        .rename(columns={item_col: 'itemId'})
        # Left merges keep every pair; IDs missing from a mapping yield NaN
        .merge(user_mapping, on='userId', how='left')
        .merge(item_mapping, on='itemId', how='left')
    )

    # Select features and target
    X = pairs[['userId_encoded', 'itemId_encoded']].values
    y = pairs['interaction'].values

    return X, y


# Create mappings
user_mapping, item_mapping = create_user_item_mappings(session_data)

# Number of distinct users/items; the training call below passes these
# as num_users / num_items
num_users = len(user_mapping)
num_items = len(item_mapping)

# Generate features
X, y = generate_features(session_data, user_mapping, item_mapping)




In [None]:
##Build and Train Model Module

#Imports
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from tensorflow.keras.optimizers import Adam

#Define the build_model function to construct a recommendation model using TensorFlow. 
#This function sets up embedding layers for users and items, concatenates them, and adds dense layers for learning interactions.
def build_model(num_users, num_items, embedding_dim=50, hidden_units=128, learning_rate=0.001):
    """
    Build the recommendation model using TensorFlow.

    The model embeds a user index and an item index, concatenates the two
    embedding vectors, passes them through one ReLU dense layer and outputs a
    single sigmoid unit. It is compiled with binary cross-entropy loss and
    the Adam optimizer, and its summary is printed before it is returned.

    Args:
    num_users (int): Number of unique users (size of the user embedding table).
    num_items (int): Number of unique items (size of the item embedding table).
    embedding_dim (int): Dimension of the embedding vectors.
    hidden_units (int): Width of the hidden dense layer.
    learning_rate (float): Learning rate passed to the Adam optimizer.

    Returns:
    Model: Compiled TensorFlow model taking [user_input, item_input].
    """
    # One integer index per sample for each input
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')

    # Embedding lookups for users and items
    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_embedding')(user_input)
    item_embedding = Embedding(input_dim=num_items, output_dim=embedding_dim, name='item_embedding')(item_input)

    # Flatten the embeddings
    user_vecs = Flatten()(user_embedding)
    item_vecs = Flatten()(item_embedding)

    # Concatenate the user and item vectors
    concat = Concatenate()([user_vecs, item_vecs])

    # Hidden dense layer
    dense = Dense(hidden_units, activation='relu')(concat)

    # Sigmoid output for the binary interaction label
    output = Dense(1, activation='sigmoid')(dense)

    # Build and compile the model
    model = Model(inputs=[user_input, item_input], outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    model.summary()

    return model


#Implement train_and_save_model function to train the model using the provided feature matrix X and target vector y, and save the trained model.

def train_and_save_model(X, y, num_users, num_items, model_path,
                         epochs=5, batch_size=64, validation_split=0.2):
    """
    Train and save the recommendation model.

    Args:
    X (np.ndarray): Feature matrix; column 0 holds encoded user indices and
        column 1 holds encoded item indices.
    y (np.ndarray): Target vector.
    num_users (int): Number of unique users.
    num_items (int): Number of unique items.
    model_path (str): Path to save the trained model.
    epochs (int): Number of training epochs.
    batch_size (int): Number of samples per gradient update.
    validation_split (float): Fraction of the data held out for validation.

    Raises:
    ValueError: If X contains missing values or indices outside
        [0, num_users) / [0, num_items).
    """
    X = np.asarray(X)

    # Casting NaN to int32 silently produces garbage indices, so reject
    # missing values before the cast
    if pd.isna(X).any():
        raise ValueError("X contains missing user/item indices")

    # Prepare input data
    X_user = X[:, 0].astype(np.int32)
    X_item = X[:, 1].astype(np.int32)
    y = y.astype(np.float32)

    # Embedding lookups need every index to fall inside its table
    if X_user.size and (X_user.min() < 0 or X_user.max() >= num_users):
        raise ValueError("user indices in X must lie in [0, num_users)")
    if X_item.size and (X_item.min() < 0 or X_item.max() >= num_items):
        raise ValueError("item indices in X must lie in [0, num_items)")

    # Build the model
    model = build_model(num_users, num_items)

    # Train the model
    model.fit(
        [X_user, X_item],
        y,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
    )

    # Save the trained model
    model.save(model_path)
    

# Where the trained model is written
MODEL_PATH = "path/to/save/model"

# Fit the recommender on the prepared features and persist it
train_and_save_model(
    X=X,
    y=y,
    num_users=num_users,
    num_items=num_items,
    model_path=MODEL_PATH,
)



In [None]:

# Conclusion
"""
In this notebook, I have implemented a recommendation system using TensorFlow. Here's a summary of what I accomplished:

1. Data Processing: I aggregated session-based data and engineered features like session duration, number of events, and interaction labels.

2. Feature Engineering: Created mappings for user and item IDs, encoded them for model training, and generated feature vectors for users and items.

3. Model Building: Constructed a recommendation model using TensorFlow, incorporating user and item embeddings, dense layers for learning interactions, and compiled it with appropriate loss and optimizer functions.

4. Model Training: Trained the recommendation model using the prepared feature matrix and target vector, and saved the trained model to disk.

This notebook focused on setting up the foundational components of the recommendation system, emphasizing data processing, feature engineering, and model construction. Future steps could include hyperparameter tuning, evaluation on test datasets, and deployment in real-world applications.
"""

# Save the notebook and prepare it for submission
# Ensure all necessary libraries, functions, and explanations are included for clarity and completeness.
