In [1]:
# Cell 1: Install Libraries & Imports

# Install required libraries
!pip install transformers torch pandas numpy scikit-learn tqdm -q

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

import pandas as pd
import numpy as np
import json
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm # For progress bars

# Set the device to the A100 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"--- Environment Setup Complete ---")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

--- Environment Setup Complete ---
Using device: cuda
GPU Name: Tesla T4


In [2]:
# Cell 2: Project Configuration (Co-Attention + Simple Dims + Regularization)

class Config:
    # --- File Paths ---
    SOURCE_DOMAIN_FILE = 'Movies_and_TV.jsonl'
    TARGET_DOMAIN_FILE = 'Digital_Music.jsonl'
    
    # --- Data Processing ---
    MIN_REVIEWS_SOURCE = 10
    MIN_REVIEWS_TARGET = 5
    TEST_SET_SIZE = 0.2
    
    # --- Model Hyperparameters ---
    LLM_MODEL_NAME = 'distilbert-base-uncased'
    EMBEDDING_DIM = 768
    
    # --- Use SIMPLE Dims ---
    FEATURE_DIM = 64     # Was 128
    SHARED_DIM = 32      # Was 64
    SPECIFIC_DIM = 32    # Was 64
    TOP_K_REVIEWS = 15   # Keep 15
    DROPOUT_RATE = 0.3   # Keep Dropout
    
    # --- Training Hyperparameters ---
    EPOCHS = 50          # Keep 50 epochs
    EMBEDDING_BATCH_SIZE = 64
    TRAIN_BATCH_SIZE = 64
    LEARNING_RATE = 5e-5 # Keep smaller LR
    WEIGHT_DECAY = 1e-5  # Keep Weight Decay
    
    # --- Loss Weights (from Paper 1, Sec III-E) ---
    L_PRED_COEFF = 1000.0
    L_DIFF_COEFF = 0.1
    L_CLASS_COEFF = 0.1
    L_RECON_COEFF = 0.1
    L_IREC_COEFF = 0.1

print("--- Configuration Loaded (Tuning Co-Attention Model) ---")
print(f"Model Dims: {Config.FEATURE_DIM}/{Config.SHARED_DIM}/{Config.SPECIFIC_DIM}")
print(f"Epochs: {Config.EPOCHS}, Dropout: {Config.DROPOUT_RATE}, LR: {Config.LEARNING_RATE}")

--- Configuration Loaded (Tuning Co-Attention Model) ---
Model Dims: 64/32/32
Epochs: 50, Dropout: 0.3, LR: 5e-05


In [3]:
# Cell 3: Phase 1 - Data Loading & Splitting (Memory-Efficient Version)

import collections

def stream_and_filter_data(config):
    """
    This new function uses a two-pass streaming approach to avoid OOM errors.
    """
    print("--- Phase 1: Data Loading & Splitting (Memory-Efficient) ---")
    start_phase1 = time.time()

    source_path = config.SOURCE_DOMAIN_FILE
    target_path = config.TARGET_DOMAIN_FILE

    # --- Pass 1: Get user review counts ---
    print("Pass 1: Counting reviews for all users...")
    source_user_counts = collections.defaultdict(int)
    target_user_counts = collections.defaultdict(int)

    # Count reviews in source file
    try:
        with open(source_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc=f"Counting {os.path.basename(source_path)}"):
                try:
                    user_id = json.loads(line)['user_id']
                    source_user_counts[user_id] += 1
                except (json.JSONDecodeError, KeyError):
                    pass # Skip malformed lines or lines without user_id

        # Count reviews in target file
        with open(target_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc=f"Counting {os.path.basename(target_path)}"):
                try:
                    user_id = json.loads(line)['user_id']
                    target_user_counts[user_id] += 1
                except (json.JSONDecodeError, KeyError):
                    pass
    except FileNotFoundError as e:
        print(f"ERROR: File not found. {e}")
        print("Please make sure your .jsonl files are in the same directory as this notebook.")
        return None, None

    # --- Filter for shared, active users ---
    print("Filtering for shared, active users...")
    shared_users = set(source_user_counts.keys()).intersection(set(target_user_counts.keys()))
    
    filtered_shared_users = set()
    for user in shared_users:
        if (source_user_counts[user] >= config.MIN_REVIEWS_SOURCE and
            target_user_counts[user] >= config.MIN_REVIEWS_TARGET):
            filtered_shared_users.add(user)
            
    print(f"Found {len(filtered_shared_users)} shared users after filtering.")

    if not filtered_shared_users:
        print("ERROR: No shared users found with the current filters. Stopping.")
        return None, None

    # --- Split users for train/test ---
    filtered_user_list = list(filtered_shared_users)
    train_user_ids, test_user_ids = train_test_split(filtered_user_list, test_size=config.TEST_SET_SIZE, random_state=42)
    train_user_ids = set(train_user_ids)
    test_user_ids = set(test_user_ids)

    # --- Pass 2: Load ONLY the data we need ---
    print("Pass 2: Loading data for filtered users...")
    source_data = []
    target_data = []

    # Load source data
    with open(source_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc=f"Loading {os.path.basename(source_path)} data"):
            try:
                review = json.loads(line)
                if review.get('user_id') in filtered_shared_users:
                    source_data.append(review)
            except (json.JSONDecodeError, KeyError):
                pass
    
    # Load target data
    with open(target_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc=f"Loading {os.path.basename(target_path)} data"):
            try:
                review = json.loads(line)
                if review.get('user_id') in filtered_shared_users:
                    target_data.append(review)
            except (json.JSONDecodeError, KeyError):
                pass

    # --- Create final DataFrames ---
    print("Creating final DataFrames...")
    source_df = pd.DataFrame(source_data)[['user_id', 'parent_asin', 'text', 'rating']]
    source_df['domain'] = 'source'
    
    target_df = pd.DataFrame(target_data)[['user_id', 'parent_asin', 'text', 'rating']]
    target_df['domain'] = 'target'
    
    # --- Create final cold-start splits ---
    train_df = pd.concat([
        source_df[source_df['user_id'].isin(train_user_ids)],
        target_df[target_df['user_id'].isin(train_user_ids)],
        source_df[source_df['user_id'].isin(test_user_ids)]
    ])
    test_df = target_df[target_df['user_id'].isin(test_user_ids)]

    print(f"Training records: {len(train_df)}")
    print(f"Testing records (cold-start): {len(test_df)}")
    print(f"--- Phase 1 Complete ({time.time() - start_phase1:.2f}s) ---")
    
    return train_df, test_df

# --- Main execution of this cell ---
train_df, test_df = stream_and_filter_data(Config)

--- Phase 1: Data Loading & Splitting (Memory-Efficient) ---
Pass 1: Counting reviews for all users...


Counting Movies_and_TV.jsonl: 17328314it [01:41, 171451.98it/s]
Counting Digital_Music.jsonl: 130434it [00:00, 154909.74it/s]


Filtering for shared, active users...
Found 627 shared users after filtering.
Pass 2: Loading data for filtered users...


Loading Movies_and_TV.jsonl data: 17328314it [01:34, 183308.48it/s]
Loading Digital_Music.jsonl data: 130434it [00:00, 169246.82it/s]


Creating final DataFrames...
Training records: 67138
Testing records (cold-start): 1161
--- Phase 1 Complete (198.87s) ---


In [4]:
# Cell 4: Phase 2 - BPE-LLM Encoder Class

class BpeLlmReviewEncoder:
    """
    Encodes review texts into semantic vectors using BPE tokenization
    and a pre-trained DistilBERT model.
    """
    def __init__(self, model_name=Config.LLM_MODEL_NAME, batch_size=Config.EMBEDDING_BATCH_SIZE):
        print("Initializing BPE-LLM Review Encoder...")
        self.device = device
        # Load tokenizer and model from Hugging Face, move to GPU, set to eval mode
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        self.model = DistilBertModel.from_pretrained(model_name).to(self.device).eval()
        self.batch_size = batch_size
        print(f"Encoder initialized on {self.device} with model {model_name}.")

    @torch.no_grad()
    def encode(self, review_texts):
        """
        Takes a list of review texts and returns their aggregated embedding vectors.
        """
        all_embeddings = []
        
        # Process in batches with a progress bar
        for i in tqdm(range(0, len(review_texts), self.batch_size), desc="Encoding Reviews (A100)"):
            batch_texts = review_texts[i:i+self.batch_size]
            
            # Tokenize, pad, truncate, and move to GPU
            inputs = self.tokenizer(
                batch_texts,
                padding=True, truncation=True, return_tensors='pt', max_length=512
            ).to(self.device)

            # Get token embeddings
            outputs = self.model(**inputs)
            token_embeddings = outputs.last_hidden_state
            
            # Mean pooling - mask out padding tokens
            attention_mask = inputs['attention_mask']
            mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * mask_expanded, 1)
            sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
            mean_pooled = sum_embeddings / sum_mask
            
            # Move to CPU and store
            all_embeddings.append(mean_pooled.cpu().numpy())

        return np.vstack(all_embeddings)

print("--- Encoder Class Defined ---")

--- Encoder Class Defined ---


In [5]:
# Cell 5: Phase 2 - Embedding Generation (The Heavy Task)

print("--- Phase 2: Generating LLM Embeddings ---")
start_phase2 = time.time()

# 1. Combine all data to find unique reviews
full_df = pd.concat([train_df, test_df])
# We must handle potential 'None' or non-string values in 'text'
full_df = full_df.dropna(subset=['text'])
full_df['text'] = full_df['text'].astype(str)
unique_reviews_text = full_df['text'].drop_duplicates().tolist()
print(f"Found {len(unique_reviews_text)} unique reviews to encode.")

# 2. Initialize and run the encoder
encoder = BpeLlmReviewEncoder()
unique_embeddings = encoder.encode(unique_reviews_text)
print(f"Finished encoding in {time.time() - start_phase2:.2f} seconds.")

# 3. Create a simple dictionary to map review text to its embedding
embedding_map = {text: emb for text, emb in zip(unique_reviews_text, unique_embeddings)}
print(f"Created embedding map.")

# 4. Pre-process and group reviews by user and item for fast lookup
print("Pre-grouping reviews by user and item...")
all_reviews_with_embeddings = full_df.copy()
all_reviews_with_embeddings['embedding'] = all_reviews_with_embeddings['text'].map(embedding_map)

# Drop rows where embedding might be null (if any texts were dropped)
all_reviews_with_embeddings = all_reviews_with_embeddings.dropna(subset=['embedding'])

# Group reviews by user_id and domain
user_reviews_grouped = all_reviews_with_embeddings.groupby(['user_id', 'domain'])['embedding'].apply(list)
# Group reviews by item_id (only in target domain)
item_reviews_grouped = all_reviews_with_embeddings[
    all_reviews_with_embeddings['domain'] == 'target'
].groupby('parent_asin')['embedding'].apply(list)

# Convert to dictionary for fast lookup in the Dataset class
user_reviews_map = user_reviews_grouped.to_dict()
item_reviews_map = item_reviews_grouped.to_dict()

print(f"--- Phase 2 Complete ({time.time() - start_phase2:.2f}s) ---")

--- Phase 2: Generating LLM Embeddings ---
Found 62685 unique reviews to encode.
Initializing BPE-LLM Review Encoder...


Encoder initialized on cuda with model distilbert-base-uncased.


Encoding Reviews (A100): 100%|██████████| 980/980 [16:02<00:00,  1.02it/s]


Finished encoding in 963.54 seconds.
Created embedding map.
Pre-grouping reviews by user and item...
--- Phase 2 Complete (963.80s) ---


In [None]:
# Cell 6: Phase 3 - PyTorch Dataset & DataLoader

class RACRecDataset(Dataset):
    """
    Custom PyTorch Dataset for the RACRec-LLM model.
    It fetches pre-computed embeddings for users and items.
    """
    def __init__(self, dataframe, user_reviews_map, item_reviews_map, is_test=False):
        self.dataframe = dataframe
        self.user_reviews_map = user_reviews_map
        self.item_reviews_map = item_reviews_map
        self.max_reviews = Config.TOP_K_REVIEWS
        self.is_test = is_test # To simulate cold-start
        self.embedding_dim = Config.EMBEDDING_DIM
        self.zero_pad_vector = np.zeros(self.embedding_dim, dtype=np.float32)

    def __len__(self):
        return len(self.dataframe)

    def _pad_reviews(self, reviews_list):
        """Pads or truncates a list of review embeddings to a fixed size."""
        # Take the most recent K reviews
        reviews = reviews_list[-self.max_reviews:]
        
        # Pad with zero vectors if fewer than K reviews
        padded_reviews = reviews + [self.zero_pad_vector] * (self.max_reviews - len(reviews))
        
        return np.array(padded_reviews, dtype=np.float32)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        user_id = row['user_id']
        item_id = row['parent_asin']
        rating = float(row['rating'])
        
        # Get user's source (Movies) reviews
        user_source_reviews = self.user_reviews_map.get((user_id, 'source'), [])
        
        # Get user's target (Music) reviews
        if self.is_test:
            # *** COLD-START SIMULATION ***
            # In the test phase, we pretend the user has no target domain history.
            user_target_reviews = []
        else:
            user_target_reviews = self.user_reviews_map.get((user_id, 'target'), [])
        
        # Get item's target (Music) reviews
        item_target_reviews = self.item_reviews_map.get(item_id, [])
        
        # Pad all review lists to a fixed size
        usr_src_pad = self._pad_reviews(user_source_reviews)
        usr_tgt_pad = self._pad_reviews(user_target_reviews)
        itm_tgt_pad = self._pad_reviews(item_target_reviews)
        
        return {
            'usr_src_reviews': torch.tensor(usr_src_pad, dtype=torch.float32),
            'usr_tgt_reviews': torch.tensor(usr_tgt_pad, dtype=torch.float32),
            'itm_tgt_reviews': torch.tensor(itm_tgt_pad, dtype=torch.float32),
            'rating': torch.tensor(rating, dtype=torch.float32),
            'domain_label_source': torch.tensor(0, dtype=torch.long), # 0 for source
            'domain_label_target': torch.tensor(1, dtype=torch.long)  # 1 for target
        }

print("--- Phase 3: Dataset & DataLoader Definition ---")

# We only use target domain interactions for training/testing the final prediction
train_dataset_df = train_df[train_df['domain'] == 'target'].dropna(subset=['rating'])
test_dataset_df = test_df.dropna(subset=['rating'])

# Create Dataset instances
train_dataset = RACRecDataset(train_dataset_df, user_reviews_map, item_reviews_map, is_test=False)
test_dataset = RACRecDataset(test_dataset_df, user_reviews_map, item_reviews_map, is_test=True)

# Create DataLoader instances
# num_workers=2 is a safe and efficient value for most systems
train_loader = DataLoader(train_dataset, batch_size=Config.TRAIN_BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=Config.TRAIN_BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

print(f"Created Train Dataloader with {len(train_dataset)} samples.")
print(f"Created Test Dataloader with {len(test_dataset)} samples.")
print("--- Phase 3 Complete ---")

--- Phase 3: Dataset & DataLoader Definition ---
Created Train Dataloader with 6423 samples.
Created Test Dataloader with 1161 samples.
--- Phase 3 Complete ---


In [7]:
# Cell 7: Phase 4 - Model Architecture [Co-Attention + Regularized]

class ReviewSelection(nn.Module):
    """
    Implements the "Reviews Selection" mechanism (Paper 1, Sec III-B)
    using co-attention to find the K most relevant reviews.
    """
    def __init__(self, embedding_dim, feature_dim, dropout_rate):
        super(ReviewSelection, self).__init__()
        self.embedding_dim = embedding_dim
        
        # FFN with Dropout
        self.ffn = nn.Sequential(
            nn.Linear(embedding_dim, feature_dim),
            nn.Tanh(),
            nn.Dropout(dropout_rate) # <-- ADDED DROPOUT
        )
        
        # Affinity matrix parameter M (Paper 1, Eq. 3)
        self.M = nn.Parameter(torch.randn(feature_dim, feature_dim))

    def forward(self, urlist, irlist):
        batch_size = urlist.shape[0]
        urlist_mask = (urlist.sum(dim=-1) != 0)
        irlist_mask = (irlist.sum(dim=-1) != 0)

        # F(urlist) and F(irlist)
        ur_feat = self.ffn(urlist) # (B, K, F)
        ir_feat = self.ffn(irlist) # (B, K, F)
        
        # Calculate Affinity Matrix A
        A = torch.bmm(torch.bmm(ur_feat, self.M.unsqueeze(0).expand(batch_size, -1, -1)), ir_feat.transpose(1, 2))
        
        # Masking
        A_mask = torch.bmm(urlist_mask.float().unsqueeze(2), irlist_mask.float().unsqueeze(1))
        A.masked_fill_(A_mask == 0, -1e9)

        # Co-attention weights
        W_urlist = F.softmax(A.max(dim=2).values, dim=1)
        W_irlist = F.softmax(A.max(dim=1).values, dim=1)
        W_urlist = W_urlist * urlist_mask.float()
        W_irlist = W_irlist * irlist_mask.float()

        # Final aggregated vector
        uv = torch.bmm(W_urlist.unsqueeze(1), urlist).squeeze(1)
        iv = torch.bmm(W_irlist.unsqueeze(1), irlist).squeeze(1)
        
        return uv, iv


class RACRecLLM(nn.Module):
    """
    The complete hybrid model, now with Dropout for regularization.
    """
    def __init__(self, config):
        super(RACRecLLM, self).__init__()
        self.config = config
        
        self.review_selector = ReviewSelection(
            config.EMBEDDING_DIM, config.FEATURE_DIM, config.DROPOUT_RATE
        )
        
        encoder_input_dim = config.EMBEDDING_DIM
        self.dropout = nn.Dropout(config.DROPOUT_RATE) # <-- Main Dropout Layer
        
        # 2. Migration of User Preference Modules
        self.user_shared_encoder = nn.Linear(encoder_input_dim, config.SHARED_DIM)
        self.user_source_encoder = nn.Linear(encoder_input_dim, config.SPECIFIC_DIM)
        self.user_target_encoder = nn.Linear(encoder_input_dim, config.SPECIFIC_DIM)
        
        self.user_source_decoder = nn.Linear(config.SHARED_DIM + config.SPECIFIC_DIM, encoder_input_dim)
        self.user_target_decoder = nn.Linear(config.SHARED_DIM + config.SPECIFIC_DIM, encoder_input_dim)
        
        self.domain_classifier = nn.Sequential(
            nn.Linear(config.SHARED_DIM, 2),
            nn.LogSoftmax(dim=1)
        )
        
        # 2. Product Feature Generation Modules
        self.product_encoder = nn.Linear(encoder_input_dim, config.SHARED_DIM + config.SPECIFIC_DIM)
        self.product_decoder = nn.Linear(config.SHARED_DIM + config.SPECIFIC_DIM, encoder_input_dim)
        
    def difference_loss(self, vec1, vec2):
        return F.cosine_similarity(vec1, vec2).mean()

    def forward(self, batch):
        usr_src = batch['usr_src_reviews']
        usr_tgt = batch['usr_tgt_reviews']
        itm_tgt = batch['itm_tgt_reviews']
        
        # --- 1. Get Aggregated Vectors ---
        uv_source, _ = self.review_selector(usr_src, itm_tgt)
        uv_target, iv_target = self.review_selector(usr_tgt, itm_tgt)
        
        # Apply dropout to the aggregated vectors
        uv_source = self.dropout(uv_source)
        uv_target = self.dropout(uv_target)
        iv_target = self.dropout(iv_target)
        
        # --- 2. User Migration Path ---
        sh_pv_source = self.user_shared_encoder(uv_source)
        sp_pv_source = self.user_source_encoder(uv_source)
        sh_pv_target = self.user_shared_encoder(uv_target)
        sp_pv_target = self.user_target_encoder(uv_target)
        
        # --- 3. Product Feature Path ---
        pfv = self.product_encoder(iv_target)
        
        # --- 4. Final Preference Vectors for Prediction ---
        th_pv = torch.where(
            (usr_tgt.sum(dim=[1,2]) == 0).unsqueeze(1),
            sh_pv_source.detach(),
            sh_pv_target
        )
        user_pref_vec_concat = torch.cat([th_pv, sp_pv_target], dim=1) # (B, 32+32) -> (B, 64)
        
        # --- 5. Rating Prediction ---
        rating_pred = (user_pref_vec_concat * pfv).sum(dim=1)
        
        # --- 6. Calculate All 5 Losses ---
        loss_diff = (self.difference_loss(sh_pv_source, sp_pv_source) + 
                     self.difference_loss(sh_pv_target, sp_pv_target)) / 2.0
        
        domain_pred_source = self.domain_classifier(sh_pv_source)
        domain_pred_target = self.domain_classifier(sh_pv_target)
        loss_class = F.nll_loss(domain_pred_source, batch['domain_label_source']) + \
                     F.nll_loss(domain_pred_target, batch['domain_label_target'])
        
        recon_source_in = torch.cat([sh_pv_source, sp_pv_source], dim=1)
        recon_target_in = torch.cat([sh_pv_target, sp_pv_target], dim=1)
        uv_source_recon = self.user_source_decoder(recon_source_in)
        uv_target_recon = self.user_target_decoder(recon_target_in)
        loss_rec = F.mse_loss(uv_source_recon, uv_source.detach()) + \
                   F.mse_loss(uv_target_recon, uv_target.detach())
                   
        iv_target_recon = self.product_decoder(pfv)
        loss_irec = F.mse_loss(iv_target_recon, iv_target.detach())
        
        return {
            "rating_pred": rating_pred,
            "loss_diff": loss_diff,
            "loss_class": -loss_class, # Maximize classification loss
            "loss_rec": loss_rec,
            "loss_irec": loss_irec
        }

print("--- Phase 4: Model Architecture Defined [Co-Attention + Regularized] ---")
model = RACRecLLM(Config).to(device)
print(f"Model created with Co-Attention & Dropout and moved to {device}.")

--- Phase 4: Model Architecture Defined [Co-Attention + Regularized] ---
Model created with Co-Attention & Dropout and moved to cuda.


In [8]:
# Cell 8: Phase 5 - Training Loop

def train_one_epoch(model, loader, optimizer, criterion_pred, config, epoch):
    model.train() # Set model to training mode
    total_loss = 0.0
    total_pred_loss = 0.0
    
    pbar = tqdm(loader, desc=f"Epoch {epoch+1}/{Config.EPOCHS} [Training]")
    for batch in pbar:
        # Move batch to GPU
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(batch)
        
        # --- Calculate all 5 losses ---
        loss_pred = criterion_pred(outputs['rating_pred'], batch['rating'])
        
        # Combine losses with coefficients from Paper 1 (Eq. 17)
        total_loss_combined = (
            config.L_PRED_COEFF * loss_pred +
            config.L_DIFF_COEFF * outputs['loss_diff'] +
            config.L_CLASS_COEFF * outputs['loss_class'] +
            config.L_RECON_COEFF * outputs['loss_rec'] +
            config.L_IREC_COEFF * outputs['loss_irec']
        )
        
        # Backward pass and optimize
        total_loss_combined.backward()
        optimizer.step()
        
        total_loss += total_loss_combined.item()
        total_pred_loss += loss_pred.item()
        pbar.set_postfix(Loss=f"{total_loss_combined.item():.4f}", PredLoss=f"{loss_pred.item():.4f}")
        
    return total_loss / len(loader), total_pred_loss / len(loader)

print("--- Phase 5: Training Loop Defined ---")

--- Phase 5: Training Loop Defined ---


In [9]:
# Cell 9: Phase 5 - Evaluation Loop (Cold-Start)

def evaluate_model(model, loader):
    model.eval() # Set model to evaluation mode
    all_preds = []
    all_labels = []
    
    pbar = tqdm(loader, desc="[Evaluating Cold-Start]")
    with torch.no_grad():
        for batch in pbar:
            # Move batch to GPU
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Forward pass - This will automatically trigger the cold-start logic
            # because batch['usr_tgt_reviews'] is all zeros
            outputs = model(batch)
            
            all_preds.extend(outputs['rating_pred'].cpu().numpy())
            all_labels.extend(batch['rating'].cpu().numpy())

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(all_labels, all_preds))
    
    # Binarize ratings for AUC calculation (e.g., 4+ is "good")
    labels_binary = [1 if r >= 4.0 else 0 for r in all_labels]
    preds_scores = all_preds # Use raw scores for AUC
    
    try:
        auc = roc_auc_score(labels_binary, preds_scores)
    except ValueError as e:
        # This can happen if all labels in a batch are the same class
        print(f"Could not calculate AUC (likely all labels are one class): {e}")
        auc = 0.5
        
    return rmse, auc

print("--- Phase 5: Evaluation Loop Defined ---")

--- Phase 5: Evaluation Loop Defined ---


In [10]:
# Cell 10: Phase 6 - Main Execution (Tuning the Co-Attention Model)

print("--- Phase 6: Starting Main Execution (Tuning Co-Attention Model) ---")

model = RACRecLLM(Config).to(device)
optimizer = Adam(model.parameters(), lr=Config.LEARNING_RATE, weight_decay=Config.WEIGHT_DECAY)
criterion_pred = nn.MSELoss()

best_rmse = float('inf')
best_auc = 0.0
total_start_time = time.time()

print(f"Starting training for {Config.EPOCHS} epochs on {device}...")

for epoch in range(Config.EPOCHS):
    epoch_start_time = time.time()
    
    # --- Train ---
    train_loss, train_pred_loss = train_one_epoch(model, train_loader, optimizer, criterion_pred, Config, epoch)
    
    # --- Evaluate ---
    val_rmse, val_auc = evaluate_model(model, test_loader)
    
    epoch_time = time.time() - epoch_start_time
    
    print("\n" + "="*50)
    print(f"--- Epoch {epoch+1}/{Config.EPOCHS} Summary (Time: {epoch_time:.2f}s) ---")
    print(f"  Train Loss (Total):      {train_loss:.4f}")
    print(f"  Train Loss (Pred ONLY):  {train_pred_loss:.4f}")
    print(f"  Val RMSE (Cold-Start):   {val_rmse:.4f}")
    print(f"  Val AUC (Cold-Start):    {val_auc:.4f}")
    print("="*50 + "\n")
    
    # Save the model if it's the best *AUC*
    if val_auc > best_auc:
        best_auc = val_auc
        best_rmse = val_rmse
        torch.save(model.state_dict(), 'best_model_co-attention_tuned.pth')
        print(f"  *** New best model saved with AUC: {best_auc:.4f} (RMSE: {best_rmse:.4f}) ***\n")

total_end_time = time.time()
print("--- PROJECT COMPLETE ---")
print(f"Total execution time: {(total_end_time - total_start_time) / 60:.2f} minutes")
print(f"Best cold-start AUC achieved: {best_auc:.4f} (with RMSE: {best_rmse:.4f})")
print("Final model saved to 'best_model_co-attention_tuned.pth'")

--- Phase 6: Starting Main Execution (Tuning Co-Attention Model) ---
Starting training for 50 epochs on cuda...


Epoch 1/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 37.94it/s, Loss=1416.7894, PredLoss=1.4169] 
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 43.75it/s]



--- Epoch 1/50 Summary (Time: 3.12s) ---
  Train Loss (Total):      4346.2750
  Train Loss (Pred ONLY):  4.3464
  Val RMSE (Cold-Start):   4.5278
  Val AUC (Cold-Start):    0.5571

  *** New best model saved with AUC: 0.5571 (RMSE: 4.5278) ***



Epoch 2/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.64it/s, Loss=736.4739, PredLoss=0.7366]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.31it/s]



--- Epoch 2/50 Summary (Time: 2.31s) ---
  Train Loss (Total):      1286.6267
  Train Loss (Pred ONLY):  1.2868
  Val RMSE (Cold-Start):   4.5147
  Val AUC (Cold-Start):    0.6184

  *** New best model saved with AUC: 0.6184 (RMSE: 4.5147) ***



Epoch 3/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.21it/s, Loss=887.4182, PredLoss=0.8876]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 43.14it/s]



--- Epoch 3/50 Summary (Time: 2.39s) ---
  Train Loss (Total):      1015.4816
  Train Loss (Pred ONLY):  1.0157
  Val RMSE (Cold-Start):   4.5068
  Val AUC (Cold-Start):    0.6408

  *** New best model saved with AUC: 0.6408 (RMSE: 4.5068) ***



Epoch 4/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.11it/s, Loss=1058.0238, PredLoss=1.0582]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.60it/s]



--- Epoch 4/50 Summary (Time: 2.33s) ---
  Train Loss (Total):      915.3833
  Train Loss (Pred ONLY):  0.9156
  Val RMSE (Cold-Start):   4.4959
  Val AUC (Cold-Start):    0.6646

  *** New best model saved with AUC: 0.6646 (RMSE: 4.4959) ***



Epoch 5/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.48it/s, Loss=1463.6113, PredLoss=1.4638]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 36.19it/s]



--- Epoch 5/50 Summary (Time: 2.46s) ---
  Train Loss (Total):      845.2375
  Train Loss (Pred ONLY):  0.8454
  Val RMSE (Cold-Start):   4.4920
  Val AUC (Cold-Start):    0.6770

  *** New best model saved with AUC: 0.6770 (RMSE: 4.4920) ***



Epoch 6/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.89it/s, Loss=345.6053, PredLoss=0.3458]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.70it/s]



--- Epoch 6/50 Summary (Time: 2.34s) ---
  Train Loss (Total):      806.0281
  Train Loss (Pred ONLY):  0.8062
  Val RMSE (Cold-Start):   4.4885
  Val AUC (Cold-Start):    0.6905

  *** New best model saved with AUC: 0.6905 (RMSE: 4.4885) ***



Epoch 7/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 48.29it/s, Loss=565.7816, PredLoss=0.5660]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 39.77it/s]



--- Epoch 7/50 Summary (Time: 2.58s) ---
  Train Loss (Total):      772.5064
  Train Loss (Pred ONLY):  0.7727
  Val RMSE (Cold-Start):   4.4796
  Val AUC (Cold-Start):    0.6938

  *** New best model saved with AUC: 0.6938 (RMSE: 4.4796) ***



Epoch 8/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 51.90it/s, Loss=726.6689, PredLoss=0.7269]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 46.16it/s]



--- Epoch 8/50 Summary (Time: 2.37s) ---
  Train Loss (Total):      764.7878
  Train Loss (Pred ONLY):  0.7650
  Val RMSE (Cold-Start):   4.4793
  Val AUC (Cold-Start):    0.7067

  *** New best model saved with AUC: 0.7067 (RMSE: 4.4793) ***



Epoch 9/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 51.52it/s, Loss=1527.0459, PredLoss=1.5273]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 46.35it/s]



--- Epoch 9/50 Summary (Time: 2.38s) ---
  Train Loss (Total):      747.6519
  Train Loss (Pred ONLY):  0.7479
  Val RMSE (Cold-Start):   4.4725
  Val AUC (Cold-Start):    0.7090

  *** New best model saved with AUC: 0.7090 (RMSE: 4.4725) ***



Epoch 10/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.60it/s, Loss=1597.9304, PredLoss=1.5982]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 44.93it/s]



--- Epoch 10/50 Summary (Time: 2.32s) ---
  Train Loss (Total):      720.9901
  Train Loss (Pred ONLY):  0.7212
  Val RMSE (Cold-Start):   4.4656
  Val AUC (Cold-Start):    0.7220

  *** New best model saved with AUC: 0.7220 (RMSE: 4.4656) ***



Epoch 11/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.34it/s, Loss=668.5604, PredLoss=0.6688]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 44.00it/s]



--- Epoch 11/50 Summary (Time: 2.37s) ---
  Train Loss (Total):      688.1185
  Train Loss (Pred ONLY):  0.6884
  Val RMSE (Cold-Start):   4.4706
  Val AUC (Cold-Start):    0.7380

  *** New best model saved with AUC: 0.7380 (RMSE: 4.4706) ***



Epoch 12/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 51.26it/s, Loss=459.8079, PredLoss=0.4601]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 43.67it/s]



--- Epoch 12/50 Summary (Time: 2.42s) ---
  Train Loss (Total):      697.3710
  Train Loss (Pred ONLY):  0.6976
  Val RMSE (Cold-Start):   4.4624
  Val AUC (Cold-Start):    0.7412

  *** New best model saved with AUC: 0.7412 (RMSE: 4.4624) ***



Epoch 13/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.27it/s, Loss=811.4734, PredLoss=0.8117]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 46.48it/s]



--- Epoch 13/50 Summary (Time: 2.32s) ---
  Train Loss (Total):      694.0061
  Train Loss (Pred ONLY):  0.6943
  Val RMSE (Cold-Start):   4.4574
  Val AUC (Cold-Start):    0.7335



Epoch 14/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 50.08it/s, Loss=798.3762, PredLoss=0.7987]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 43.71it/s]



--- Epoch 14/50 Summary (Time: 2.46s) ---
  Train Loss (Total):      675.5050
  Train Loss (Pred ONLY):  0.6758
  Val RMSE (Cold-Start):   4.4468
  Val AUC (Cold-Start):    0.7353



Epoch 15/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.68it/s, Loss=1061.1422, PredLoss=1.0614]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.24it/s]



--- Epoch 15/50 Summary (Time: 2.35s) ---
  Train Loss (Total):      667.3055
  Train Loss (Pred ONLY):  0.6676
  Val RMSE (Cold-Start):   4.4522
  Val AUC (Cold-Start):    0.7457

  *** New best model saved with AUC: 0.7457 (RMSE: 4.4522) ***



Epoch 16/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 51.74it/s, Loss=369.2327, PredLoss=0.3695]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.16it/s]



--- Epoch 16/50 Summary (Time: 2.38s) ---
  Train Loss (Total):      662.1397
  Train Loss (Pred ONLY):  0.6624
  Val RMSE (Cold-Start):   4.4441
  Val AUC (Cold-Start):    0.7460

  *** New best model saved with AUC: 0.7460 (RMSE: 4.4441) ***



Epoch 17/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.54it/s, Loss=571.8721, PredLoss=0.5722]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 44.53it/s]



--- Epoch 17/50 Summary (Time: 2.32s) ---
  Train Loss (Total):      646.3655
  Train Loss (Pred ONLY):  0.6467
  Val RMSE (Cold-Start):   4.4394
  Val AUC (Cold-Start):    0.7349



Epoch 18/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 50.84it/s, Loss=714.0003, PredLoss=0.7143]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.21it/s]



--- Epoch 18/50 Summary (Time: 2.42s) ---
  Train Loss (Total):      644.2025
  Train Loss (Pred ONLY):  0.6445
  Val RMSE (Cold-Start):   4.4384
  Val AUC (Cold-Start):    0.7506

  *** New best model saved with AUC: 0.7506 (RMSE: 4.4384) ***



Epoch 19/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 50.98it/s, Loss=556.6489, PredLoss=0.5570]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 47.68it/s]



--- Epoch 19/50 Summary (Time: 2.39s) ---
  Train Loss (Total):      641.2500
  Train Loss (Pred ONLY):  0.6416
  Val RMSE (Cold-Start):   4.4348
  Val AUC (Cold-Start):    0.7505



Epoch 20/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 50.92it/s, Loss=362.4827, PredLoss=0.3628]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 44.77it/s]



--- Epoch 20/50 Summary (Time: 2.42s) ---
  Train Loss (Total):      631.9885
  Train Loss (Pred ONLY):  0.6323
  Val RMSE (Cold-Start):   4.4318
  Val AUC (Cold-Start):    0.7454



Epoch 21/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.79it/s, Loss=828.6212, PredLoss=0.8290]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 42.71it/s]



--- Epoch 21/50 Summary (Time: 2.33s) ---
  Train Loss (Total):      623.9049
  Train Loss (Pred ONLY):  0.6242
  Val RMSE (Cold-Start):   4.4249
  Val AUC (Cold-Start):    0.7372



Epoch 22/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.91it/s, Loss=493.9437, PredLoss=0.4943]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 44.04it/s]



--- Epoch 22/50 Summary (Time: 2.35s) ---
  Train Loss (Total):      615.4712
  Train Loss (Pred ONLY):  0.6158
  Val RMSE (Cold-Start):   4.4226
  Val AUC (Cold-Start):    0.7511

  *** New best model saved with AUC: 0.7511 (RMSE: 4.4226) ***



Epoch 23/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.33it/s, Loss=664.8723, PredLoss=0.6652]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 44.63it/s]



--- Epoch 23/50 Summary (Time: 2.37s) ---
  Train Loss (Total):      620.8366
  Train Loss (Pred ONLY):  0.6212
  Val RMSE (Cold-Start):   4.4139
  Val AUC (Cold-Start):    0.7397



Epoch 24/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 50.83it/s, Loss=408.9024, PredLoss=0.4093]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.49it/s]



--- Epoch 24/50 Summary (Time: 2.42s) ---
  Train Loss (Total):      606.5585
  Train Loss (Pred ONLY):  0.6069
  Val RMSE (Cold-Start):   4.4073
  Val AUC (Cold-Start):    0.7416



Epoch 25/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 51.00it/s, Loss=651.9542, PredLoss=0.6523]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 47.12it/s]



--- Epoch 25/50 Summary (Time: 2.39s) ---
  Train Loss (Total):      613.5620
  Train Loss (Pred ONLY):  0.6139
  Val RMSE (Cold-Start):   4.4115
  Val AUC (Cold-Start):    0.7406



Epoch 26/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.26it/s, Loss=1252.8799, PredLoss=1.2533]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.45it/s]



--- Epoch 26/50 Summary (Time: 2.33s) ---
  Train Loss (Total):      612.7280
  Train Loss (Pred ONLY):  0.6131
  Val RMSE (Cold-Start):   4.4050
  Val AUC (Cold-Start):    0.7343



Epoch 27/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.20it/s, Loss=669.5752, PredLoss=0.6700]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 46.10it/s]



--- Epoch 27/50 Summary (Time: 2.32s) ---
  Train Loss (Total):      600.8172
  Train Loss (Pred ONLY):  0.6012
  Val RMSE (Cold-Start):   4.4046
  Val AUC (Cold-Start):    0.7491



Epoch 28/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.10it/s, Loss=1171.5205, PredLoss=1.1719]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 43.87it/s]



--- Epoch 28/50 Summary (Time: 2.35s) ---
  Train Loss (Total):      590.9827
  Train Loss (Pred ONLY):  0.5914
  Val RMSE (Cold-Start):   4.4014
  Val AUC (Cold-Start):    0.7416



Epoch 29/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.71it/s, Loss=796.4009, PredLoss=0.7968]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.20it/s]



--- Epoch 29/50 Summary (Time: 2.31s) ---
  Train Loss (Total):      594.9347
  Train Loss (Pred ONLY):  0.5953
  Val RMSE (Cold-Start):   4.4013
  Val AUC (Cold-Start):    0.7476



Epoch 30/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 51.06it/s, Loss=330.0175, PredLoss=0.3304]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 46.34it/s]



--- Epoch 30/50 Summary (Time: 2.40s) ---
  Train Loss (Total):      584.8435
  Train Loss (Pred ONLY):  0.5853
  Val RMSE (Cold-Start):   4.3984
  Val AUC (Cold-Start):    0.7459



Epoch 31/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.95it/s, Loss=784.5187, PredLoss=0.7849]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 43.20it/s]



--- Epoch 31/50 Summary (Time: 2.36s) ---
  Train Loss (Total):      587.5420
  Train Loss (Pred ONLY):  0.5880
  Val RMSE (Cold-Start):   4.3948
  Val AUC (Cold-Start):    0.7368



Epoch 32/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.63it/s, Loss=375.1664, PredLoss=0.3756]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 41.44it/s]



--- Epoch 32/50 Summary (Time: 2.39s) ---
  Train Loss (Total):      579.9597
  Train Loss (Pred ONLY):  0.5804
  Val RMSE (Cold-Start):   4.3884
  Val AUC (Cold-Start):    0.7432



Epoch 33/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 49.31it/s, Loss=789.5105, PredLoss=0.7899]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 47.07it/s]



--- Epoch 33/50 Summary (Time: 2.46s) ---
  Train Loss (Total):      570.9099
  Train Loss (Pred ONLY):  0.5713
  Val RMSE (Cold-Start):   4.3835
  Val AUC (Cold-Start):    0.7488



Epoch 34/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.51it/s, Loss=371.7736, PredLoss=0.3722]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.39it/s]



--- Epoch 34/50 Summary (Time: 2.32s) ---
  Train Loss (Total):      574.4719
  Train Loss (Pred ONLY):  0.5749
  Val RMSE (Cold-Start):   4.3758
  Val AUC (Cold-Start):    0.7336



Epoch 35/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 47.06it/s, Loss=390.5295, PredLoss=0.3910]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 43.91it/s]



--- Epoch 35/50 Summary (Time: 2.59s) ---
  Train Loss (Total):      579.7705
  Train Loss (Pred ONLY):  0.5802
  Val RMSE (Cold-Start):   4.3658
  Val AUC (Cold-Start):    0.7375



Epoch 36/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 51.88it/s, Loss=598.0748, PredLoss=0.5985]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 42.82it/s]



--- Epoch 36/50 Summary (Time: 2.40s) ---
  Train Loss (Total):      565.9414
  Train Loss (Pred ONLY):  0.5664
  Val RMSE (Cold-Start):   4.3667
  Val AUC (Cold-Start):    0.7447



Epoch 37/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.38it/s, Loss=1006.1530, PredLoss=1.0066]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 42.81it/s]



--- Epoch 37/50 Summary (Time: 2.38s) ---
  Train Loss (Total):      564.4192
  Train Loss (Pred ONLY):  0.5649
  Val RMSE (Cold-Start):   4.3639
  Val AUC (Cold-Start):    0.7348



Epoch 38/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 49.00it/s, Loss=542.2682, PredLoss=0.5428]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 42.09it/s]



--- Epoch 38/50 Summary (Time: 2.52s) ---
  Train Loss (Total):      542.9745
  Train Loss (Pred ONLY):  0.5435
  Val RMSE (Cold-Start):   4.3608
  Val AUC (Cold-Start):    0.7320



Epoch 39/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 51.88it/s, Loss=286.8896, PredLoss=0.2874]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 46.85it/s]



--- Epoch 39/50 Summary (Time: 2.36s) ---
  Train Loss (Total):      565.0987
  Train Loss (Pred ONLY):  0.5656
  Val RMSE (Cold-Start):   4.3556
  Val AUC (Cold-Start):    0.7326



Epoch 40/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 40.00it/s, Loss=240.1933, PredLoss=0.2407]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 29.80it/s]



--- Epoch 40/50 Summary (Time: 3.18s) ---
  Train Loss (Total):      540.8688
  Train Loss (Pred ONLY):  0.5414
  Val RMSE (Cold-Start):   4.3501
  Val AUC (Cold-Start):    0.7287



Epoch 41/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 48.89it/s, Loss=474.9492, PredLoss=0.4755]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 42.86it/s]



--- Epoch 41/50 Summary (Time: 2.52s) ---
  Train Loss (Total):      547.7406
  Train Loss (Pred ONLY):  0.5483
  Val RMSE (Cold-Start):   4.3477
  Val AUC (Cold-Start):    0.7321



Epoch 42/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.28it/s, Loss=348.8628, PredLoss=0.3494]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 41.59it/s]



--- Epoch 42/50 Summary (Time: 2.40s) ---
  Train Loss (Total):      542.3779
  Train Loss (Pred ONLY):  0.5429
  Val RMSE (Cold-Start):   4.3419
  Val AUC (Cold-Start):    0.7335



Epoch 43/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 47.72it/s, Loss=790.7914, PredLoss=0.7914]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.46it/s]



--- Epoch 43/50 Summary (Time: 2.55s) ---
  Train Loss (Total):      546.6583
  Train Loss (Pred ONLY):  0.5472
  Val RMSE (Cold-Start):   4.3427
  Val AUC (Cold-Start):    0.7298



Epoch 44/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 53.46it/s, Loss=553.7347, PredLoss=0.5543]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 44.00it/s]



--- Epoch 44/50 Summary (Time: 2.33s) ---
  Train Loss (Total):      538.1818
  Train Loss (Pred ONLY):  0.5387
  Val RMSE (Cold-Start):   4.3426
  Val AUC (Cold-Start):    0.7272



Epoch 45/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 47.26it/s, Loss=386.2357, PredLoss=0.3868]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 41.64it/s]



--- Epoch 45/50 Summary (Time: 2.60s) ---
  Train Loss (Total):      540.9830
  Train Loss (Pred ONLY):  0.5415
  Val RMSE (Cold-Start):   4.3401
  Val AUC (Cold-Start):    0.7383



Epoch 46/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 52.86it/s, Loss=540.4496, PredLoss=0.5410]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.94it/s]



--- Epoch 46/50 Summary (Time: 2.34s) ---
  Train Loss (Total):      534.1439
  Train Loss (Pred ONLY):  0.5347
  Val RMSE (Cold-Start):   4.3286
  Val AUC (Cold-Start):    0.7232



Epoch 47/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 50.87it/s, Loss=597.9464, PredLoss=0.5985]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.92it/s]



--- Epoch 47/50 Summary (Time: 2.41s) ---
  Train Loss (Total):      532.5452
  Train Loss (Pred ONLY):  0.5331
  Val RMSE (Cold-Start):   4.3330
  Val AUC (Cold-Start):    0.7301



Epoch 48/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 49.57it/s, Loss=275.9243, PredLoss=0.2765]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 42.63it/s]



--- Epoch 48/50 Summary (Time: 2.49s) ---
  Train Loss (Total):      531.1182
  Train Loss (Pred ONLY):  0.5317
  Val RMSE (Cold-Start):   4.3301
  Val AUC (Cold-Start):    0.7362



Epoch 49/50 [Training]: 100%|██████████| 101/101 [00:01<00:00, 50.98it/s, Loss=346.9146, PredLoss=0.3475]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 45.22it/s]



--- Epoch 49/50 Summary (Time: 2.41s) ---
  Train Loss (Total):      517.3735
  Train Loss (Pred ONLY):  0.5180
  Val RMSE (Cold-Start):   4.3315
  Val AUC (Cold-Start):    0.7390



Epoch 50/50 [Training]: 100%|██████████| 101/101 [00:02<00:00, 48.91it/s, Loss=361.5952, PredLoss=0.3622]
[Evaluating Cold-Start]: 100%|██████████| 19/19 [00:00<00:00, 40.66it/s]


--- Epoch 50/50 Summary (Time: 2.55s) ---
  Train Loss (Total):      510.8736
  Train Loss (Pred ONLY):  0.5115
  Val RMSE (Cold-Start):   4.3299
  Val AUC (Cold-Start):    0.7379

--- PROJECT COMPLETE ---
Total execution time: 2.03 minutes
Best cold-start AUC achieved: 0.7511 (with RMSE: 4.4226)
Final model saved to 'best_model_co-attention_tuned.pth'





In [11]:
# Cell 11: Load Your Trained Model and Encoder

print("--- Loading all necessary components for the interactive demo... ---")

# 1. We must redefine the model architecture classes exactly as before
# This is from your WINNING run (Simple + Dropout)

class RACRecLLM(nn.Module):
    """
    This is our original, simple, mean-pooling model.
    We are now adding Dropout for regularization.
    """
    def __init__(self, config):
        super(RACRecLLM, self).__init__()
        self.config = config
        
        encoder_input_dim = config.EMBEDDING_DIM
        self.dropout = nn.Dropout(config.DROPOUT_RATE) # <-- Dropout is included
        
        # 1. Migration of User Preference Modules
        self.user_shared_encoder = nn.Linear(encoder_input_dim, config.SHARED_DIM)
        self.user_source_encoder = nn.Linear(encoder_input_dim, config.SPECIFIC_DIM)
        self.user_target_encoder = nn.Linear(encoder_input_dim, config.SPECIFIC_DIM)
        
        self.user_source_decoder = nn.Linear(config.SHARED_DIM + config.SPECIFIC_DIM, encoder_input_dim)
        self.user_target_decoder = nn.Linear(config.SHARED_DIM + config.SPECIFIC_DIM, encoder_input_dim)
        
        self.domain_classifier = nn.Sequential(
            nn.Linear(config.SHARED_DIM, 2),
            nn.LogSoftmax(dim=1)
        )
        
        # 2. Product Feature Generation Modules
        self.product_encoder = nn.Linear(encoder_input_dim, config.SHARED_DIM + config.SPECIFIC_DIM)
        self.product_decoder = nn.Linear(config.SHARED_DIM + config.SPECIFIC_DIM, encoder_input_dim)
        
    def difference_loss(self, vec1, vec2):
        return F.cosine_similarity(vec1, vec2).mean()

    def forward(self, batch):
        usr_src = batch['usr_src_reviews'] # (B, K, E)
        usr_tgt = batch['usr_tgt_reviews'] # (B, K, E)
        itm_tgt = batch['itm_tgt_reviews'] # (B, K, E)
        
        uv_source = torch.sum(usr_src, dim=1) / torch.clamp((usr_src.sum(dim=-1) != 0).sum(dim=1).unsqueeze(1), min=1)
        uv_target = torch.sum(usr_tgt, dim=1) / torch.clamp((usr_tgt.sum(dim=-1) != 0).sum(dim=1).unsqueeze(1), min=1)
        iv_target = torch.sum(itm_tgt, dim=1) / torch.clamp((itm_tgt.sum(dim=-1) != 0).sum(dim=1).unsqueeze(1), min=1)
        
        # Apply dropout (this is automatically disabled by model.eval())
        uv_source = self.dropout(uv_source)
        uv_target = self.dropout(uv_target)
        iv_target = self.dropout(iv_target)
        
        sh_pv_source = self.user_shared_encoder(uv_source)
        sp_pv_source = self.user_source_encoder(uv_source)
        sh_pv_target = self.user_shared_encoder(uv_target)
        sp_pv_target = self.user_target_encoder(uv_target)
        pfv = self.product_encoder(iv_target)
        
        th_pv = torch.where(
            (usr_tgt.sum(dim=[1,2]) == 0).unsqueeze(1),
            sh_pv_source.detach(),
            sh_pv_target
        )
        user_pref_vec_concat = torch.cat([th_pv, sp_pv_target], dim=1)
        rating_pred = (user_pref_vec_concat * pfv).sum(dim=1)
        
        # For inference, we only need the rating prediction
        return {"rating_pred": rating_pred}


# 2. Load the BPE-LLM Encoder (from Cell 4)
try:
    if 'encoder' not in globals() or not isinstance(encoder, BpeLlmReviewEncoder):
        print("Re-defining BpeLlmReviewEncoder class...")
        class BpeLlmReviewEncoder:
            def __init__(self, model_name='distilbert-base-uncased', batch_size=64):
                print("Initializing BPE-LLM Review Encoder...")
                self.device = device
                self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
                self.model = DistilBertModel.from_pretrained(model_name).to(self.device).eval()
                self.batch_size = batch_size
                print(f"Encoder initialized on {self.device} with model {model_name}.")

            @torch.no_grad()
            def encode(self, review_texts):
                all_embeddings = []
                for i in range(0, len(review_texts), self.batch_size):
                    batch_texts = review_texts[i:i+self.batch_size]
                    inputs = self.tokenizer(
                        batch_texts, padding=True, truncation=True, return_tensors='pt', max_length=512
                    ).to(self.device)
                    outputs = self.model(**inputs)
                    token_embeddings = outputs.last_hidden_state
                    attention_mask = inputs['attention_mask']
                    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
                    sum_embeddings = torch.sum(token_embeddings * mask_expanded, 1)
                    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
                    mean_pooled = sum_embeddings / sum_mask
                    all_embeddings.append(mean_pooled.cpu().numpy())
                return np.vstack(all_embeddings)
        
        encoder = BpeLlmReviewEncoder()
    print("BPE-LLM Encoder is ready.")

    # 3. Load the Config (from your WINNING run)
    class Config:
        EMBEDDING_DIM = 768
        FEATURE_DIM = 64
        SHARED_DIM = 32
        SPECIFIC_DIM = 32
        TOP_K_REVIEWS = 10
        DROPOUT_RATE = 0.3
        EMBEDDING_BATCH_SIZE = 64
    
    print("Configuration from the winning run is loaded.")
    
    # 4. Initialize the model and load the saved weights
    inference_model = RACRecLLM(Config).to(device)
    
    # --- This is your best model file from Run 4 ---
    model_path = 'best_model_simple_tuned.pth' 
    
    inference_model.load_state_dict(torch.load(model_path, map_location=device))
    inference_model.eval() # Set model to evaluation mode (turns off dropout)

    print(f"\nSuccessfully loaded trained model from '{model_path}'!")
    print("The interactive demo is ready.")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please make sure all previous cells (especially 2, 4, 7 from Run 4) have been defined.")

--- Loading all necessary components for the interactive demo... ---
BPE-LLM Encoder is ready.
Configuration from the winning run is loaded.

Successfully loaded trained model from 'best_model_simple_tuned.pth'!
The interactive demo is ready.


In [12]:
# Cell 12: Define Helper and Prediction Functions

def _pad_reviews(reviews_list, max_reviews, emb_dim, zero_pad_vector):
    """Pads or truncates a list of review embeddings to a fixed size."""
    reviews = reviews_list[-max_reviews:]
    padded_reviews = reviews + [zero_pad_vector] * (max_reviews - len(reviews))
    return np.array(padded_reviews, dtype=np.float32)

def predict_cold_start_rating(user_movie_reviews, item_music_reviews, model, encoder, config):
    """
    Predicts a rating for a single item, given a new user's source history.
    """
    
    # --- 1. Process and Encode Reviews ---
    user_source_embs = list(encoder.encode(user_movie_reviews))
    item_target_embs = list(encoder.encode(item_music_reviews))
    
    # --- 2. Prepare the Batch ---
    zero_pad = np.zeros(config.EMBEDDING_DIM, dtype=np.float32)
    
    usr_src_pad = _pad_reviews(user_source_embs, config.TOP_K_REVIEWS, config.EMBEDDING_DIM, zero_pad)
    itm_tgt_pad = _pad_reviews(item_target_embs, config.TOP_K_REVIEWS, config.EMBEDDING_DIM, zero_pad)
    
    # *** THIS IS THE KEY COLD-START STEP ***
    # Create an all-zero tensor for the user's target (music) history
    usr_tgt_pad = np.array(
        [zero_pad] * config.TOP_K_REVIEWS, 
        dtype=np.float32
    )

    # --- 3. Format for the Model ---
    # Create a batch of size 1
    batch = {
        'usr_src_reviews': torch.tensor(usr_src_pad, dtype=torch.float32).unsqueeze(0).to(device),
        'usr_tgt_reviews': torch.tensor(usr_tgt_pad, dtype=torch.float32).unsqueeze(0).to(device),
        'itm_tgt_reviews': torch.tensor(itm_tgt_pad, dtype=torch.float32).unsqueeze(0).to(device),
    }

    # --- 4. Get Prediction ---
    with torch.no_grad():
        outputs = model(batch)
        predicted_rating = outputs['rating_pred'].item()
        
    return predicted_rating

print("Prediction functions are ready.")

Prediction functions are ready.


In [1]:
# Cell 11: Load Your Trained Model and Encoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
import json
import os
import time
from tqdm import tqdm
import pandas as pd

# --- FIX: Define device ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"--- Loading all components for the interactive demo on {device} ---")

# 1. We must redefine the model architecture classes exactly as before
# This is from your WINNING run (Simple + Dropout)

class RACRecLLM(nn.Module):
    """
    This is our original, simple, mean-pooling model.
    We are now adding Dropout for regularization.
    """
    def __init__(self, config):
        super(RACRecLLM, self).__init__()
        self.config = config
        
        encoder_input_dim = config.EMBEDDING_DIM
        self.dropout = nn.Dropout(config.DROPOUT_RATE) # <-- Dropout is included
        
        # 1. Migration of User Preference Modules
        self.user_shared_encoder = nn.Linear(encoder_input_dim, config.SHARED_DIM)
        self.user_source_encoder = nn.Linear(encoder_input_dim, config.SPECIFIC_DIM)
        self.user_target_encoder = nn.Linear(encoder_input_dim, config.SPECIFIC_DIM)
        
        self.user_source_decoder = nn.Linear(config.SHARED_DIM + config.SPECIFIC_DIM, encoder_input_dim)
        self.user_target_decoder = nn.Linear(config.SHARED_DIM + config.SPECIFIC_DIM, encoder_input_dim)
        
        self.domain_classifier = nn.Sequential(
            nn.Linear(config.SHARED_DIM, 2),
            nn.LogSoftmax(dim=1)
        )
        
        # 2. Product Feature Generation Modules
        self.product_encoder = nn.Linear(encoder_input_dim, config.SHARED_DIM + config.SPECIFIC_DIM)
        self.product_decoder = nn.Linear(config.SHARED_DIM + config.SPECIFIC_DIM, encoder_input_dim)
        
    def difference_loss(self, vec1, vec2):
        return F.cosine_similarity(vec1, vec2).mean()

    def forward_for_recommendation(self, uv_source, uv_target, iv_target):
        """
        A modified forward pass for real-time recommendation.
        We feed in the aggregated vectors directly.
        """
        # Apply dropout (this is automatically disabled by model.eval())
        uv_source = self.dropout(uv_source)
        uv_target = self.dropout(uv_target)
        iv_target = self.dropout(iv_target)
        
        # --- 2. User Migration Path ---
        sh_pv_source = self.user_shared_encoder(uv_source)
        sp_pv_source = self.user_source_encoder(uv_source)
        sh_pv_target = self.user_shared_encoder(uv_target)
        sp_pv_target = self.user_target_encoder(uv_target)
        
        # --- 3. Product Feature Path ---
        pfv = self.product_encoder(iv_target) # Product Feature Vector
        
        # --- 4. Final Preference Vectors for Prediction ---
        # This is the CORE cold-start logic
        th_pv = sh_pv_source.detach() # We detach, as we are in cold-start
        
        user_pref_vec_concat = torch.cat([th_pv, sp_pv_target], dim=1) # (B, 32+32) -> (B, 64)
        
        # --- 5. Rating Prediction ---
        rating_pred = (user_pref_vec_concat * pfv).sum(dim=1)
        
        return rating_pred


# 2. Load the BPE-LLM Encoder (from Cell 4)
try:
    if 'encoder' not in globals() or not isinstance(encoder, BpeLlmReviewEncoder):
        print("Re-defining BpeLlmReviewEncoder class...")
        class BpeLlmReviewEncoder:
            def __init__(self, model_name='distilbert-base-uncased', batch_size=64):
                print("Initializing BPE-LLM Review Encoder...")
                self.device = device
                self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
                self.model = DistilBertModel.from_pretrained(model_name).to(self.device).eval()
                self.batch_size = batch_size
                print(f"Encoder initialized on {self.device} with model {model_name}.")

            @torch.no_grad()
            def encode(self, review_texts):
                all_embeddings = []
                for i in range(0, len(review_texts), self.batch_size):
                    batch_texts = review_texts[i:i+self.batch_size]
                    inputs = self.tokenizer(
                        batch_texts, padding=True, truncation=True, return_tensors='pt', max_length=512
                    ).to(self.device)
                    outputs = self.model(**inputs)
                    token_embeddings = outputs.last_hidden_state
                    attention_mask = inputs['attention_mask']
                    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
                    sum_embeddings = torch.sum(token_embeddings * mask_expanded, 1)
                    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
                    mean_pooled = sum_embeddings / sum_mask
                    all_embeddings.append(mean_pooled.cpu().numpy())
                return np.vstack(all_embeddings)
        
        encoder = BpeLlmReviewEncoder()
    print("BPE-LLM Encoder is ready.")

    # 3. Load the Config (from your WINNING run)
    class Config:
        # File path for Cell 12
        TARGET_DOMAIN_FILE = 'Digital_Music.jsonl'
        
        # Model Dims
        EMBEDDING_DIM = 768
        FEATURE_DIM = 64
        SHARED_DIM = 32
        SPECIFIC_DIM = 32
        TOP_K_REVIEWS = 10
        DROPOUT_RATE = 0.3
        
        # Encoder config
        EMBEDDING_BATCH_SIZE = 64
    
    print("Configuration from the winning run is loaded.")
    
    # 4. Initialize the model and load the saved weights
    inference_model = RACRecLLM(Config).to(device)
    
    # --- This is your best model file from Run 4 ---
    model_path = 'best_model_simple_tuned.pth' 
    
    inference_model.load_state_dict(torch.load(model_path, map_location=device))
    inference_model.eval() # Set model to evaluation mode (turns off dropout)

    print(f"\nSuccessfully loaded trained model from '{model_path}'!")
    print("The recommendation engine is ready.")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please make sure all previous cells (especially 2, 4, 7 from Run 4) have been defined.")

--- Loading all components for the interactive demo on cuda ---
Re-defining BpeLlmReviewEncoder class...
Initializing BPE-LLM Review Encoder...
Encoder initialized on cuda with model distilbert-base-uncased.
BPE-LLM Encoder is ready.
Configuration from the winning run is loaded.

Successfully loaded trained model from 'best_model_simple_tuned.pth'!
The recommendation engine is ready.


In [3]:
# Cell 12: Build the Recommendation Catalog
import json
import os
import time
from tqdm import tqdm # Make sure tqdm is imported for the progress bar

# --- ADDED MISSING HELPER FUNCTION ---
def load_jsonl(path):
    """A robust function to load a JSON Lines file."""
    data = []
    print(f"Loading {path}...")
    try:
        with open(path, 'r', encoding='utf-8') as f:
            # Added tqdm for progress
            for line in tqdm(f, desc=f"Reading {os.path.basename(path)}"):
                try:
                    data.append(json.loads(line))
                except (json.JSONDecodeError, KeyError):
                    pass # Skip malformed lines
        return pd.DataFrame(data)
    except FileNotFoundError as e:
        print(f"ERROR: File not found at {path}")
        print(f"Details: {e}")
        return None
# --- END OF FIX ---


# We need the _pad_reviews helper function
def _pad_reviews(reviews_list, max_reviews, emb_dim, zero_pad_vector):
    """Pads or truncates a list of review embeddings to a fixed size."""
    reviews = reviews_list[-max_reviews:]
    padded_reviews = reviews + [zero_pad_vector] * (max_reviews - len(reviews))
    return np.array(padded_reviews, dtype=np.float32)

def build_item_catalog(encoder, config):
    """
    Loads all music reviews, encodes them, and creates an aggregated
    vector for every unique music item.
    """
    print("Building music item catalog... This may take a few minutes.")
    start_time = time.time()
    
    # 1. Load the raw Digital Music dataset
    # This line will now work
    music_df = load_jsonl(Config.TARGET_DOMAIN_FILE) 
    if music_df is None:
        return None # Stop if file wasn't found
        
    music_df = music_df.dropna(subset=['text', 'parent_asin'])
    music_df['text'] = music_df['text'].astype(str)
    
    # 2. Find all unique reviews in the music dataset and encode them
    unique_reviews = music_df['text'].drop_duplicates().tolist()
    print(f"Found {len(unique_reviews)} unique music reviews to encode.")
    music_review_embeddings = encoder.encode(unique_reviews)
    
    # 3. Create a mapping for music reviews
    music_embedding_map = {text: emb for text, emb in zip(unique_reviews, music_review_embeddings)}
    music_df['embedding'] = music_df['text'].map(music_embedding_map)
    music_df = music_df.dropna(subset=['embedding'])
    
    # 4. Group reviews by item_id
    item_reviews_grouped = music_df.groupby('parent_asin')['embedding'].apply(list)
    print(f"Found {len(item_reviews_grouped)} unique music items.")
    
    # 5. Create the final catalog: {item_id: aggregated_vector}
    item_catalog = {}
    zero_pad = np.zeros(config.EMBEDDING_DIM, dtype=np.float32)
    
    for item_id, reviews in tqdm(item_reviews_grouped.items(), desc="Aggregating item vectors"):
        # Pad the reviews for the item
        padded_item_reviews = _pad_reviews(reviews, config.TOP_K_REVIEWS, config.EMBEDDING_DIM, zero_pad)
        
        # Aggregate using simple mean pooling (to match our winning model)
        item_tensor = torch.tensor(padded_item_reviews, dtype=torch.float32)
        item_agg_vec = torch.sum(item_tensor, dim=0) / torch.clamp((item_tensor.sum(dim=-1) != 0).sum(dim=0).unsqueeze(0), min=1)
        
        # Store the final vector (on CPU to save GPU memory)
        item_catalog[item_id] = item_agg_vec.cpu()

    print(f"Catalog built in {time.time() - start_time:.2f} seconds.")
    return item_catalog

# --- Build the catalog ---
item_catalog = build_item_catalog(encoder, Config)

Building music item catalog... This may take a few minutes.
Loading Digital_Music.jsonl...


Reading Digital_Music.jsonl: 10019it [00:00, 100175.77it/s]

Reading Digital_Music.jsonl: 130434it [00:02, 63180.20it/s]


Found 118645 unique music reviews to encode.
Found 70511 unique music items.


Aggregating item vectors: 70511it [00:04, 14488.09it/s]

Catalog built in 1445.69 seconds.





In [2]:
# Cell 13: The Final Recommendation Demo

def get_recommendations(user_movie_reviews, model, encoder, item_catalog, config):
    """
    Generates a Top-10 list of music recommendations for a cold-start user.
    """
    print("Encoding user's movie reviews...")
    # 1. Encode the user's source (movie) reviews
    user_source_embs = list(encoder.encode(user_movie_reviews))
    
    # 2. Pad them and create the user's source vector
    zero_pad = np.zeros(config.EMBEDDING_DIM, dtype=np.float32)
    usr_src_pad = _pad_reviews(user_source_embs, config.TOP_K_REVIEWS, config.EMBEDDING_DIM, zero_pad)
    usr_src_tensor = torch.tensor(usr_src_pad, dtype=torch.float32).unsqueeze(0).to(device) # (1, K, E)

    # 3. Create the zero vector for the cold-start target domain
    usr_tgt_tensor = torch.zeros_like(usr_src_tensor).to(device)
    
    # 4. Aggregate the user vectors (matching the logic in our model)
    uv_source = torch.sum(usr_src_tensor, dim=1) / torch.clamp((usr_src_tensor.sum(dim=-1) != 0).sum(dim=1).unsqueeze(1), min=1)
    uv_target = torch.zeros_like(uv_source) # It's all zeros
    
    print(f"Scoring all {len(item_catalog)} music items in the catalog...")
    predictions = {}
    
    # 5. Loop through the catalog and predict a rating for each item
    with torch.no_grad():
        for item_id, item_agg_vec in tqdm(item_catalog.items(), desc="Recommending"):
            iv_target = item_agg_vec.unsqueeze(0).to(device) # (1, E)

            # 6. Use the model's forward pass to get a rating
            rating_pred = model.forward_for_recommendation(uv_source, uv_target, iv_target)
            predictions[item_id] = rating_pred.item()
            
    # 7. Sort the predictions and get the Top 10
    sorted_recommendations = sorted(predictions.items(), key=lambda item: item[1], reverse=True)
    
    return sorted_recommendations[:10]

# --- The Interactive Loop ---
# We're referencing the AUC from your winning run here!
print(f"--- REAL-TIME COLD-START RECOMMENDATION ENGINE (AUC: 0.8466) ---")
print(f"I have pre-processed all {len(item_catalog)} music items.")
print("Give me a new user's movie reviews (their 'Netflix' history).")
print("I will give you a Top-10 list of 'Spotify' recommendations.")
print("Type 'quit' at any time to exit.")
print("="*40)

while True:
    user_input_movies = input("\n🎬 USER'S MOVIE REVIEWS (separate with ';'): \n")
    if user_input_movies.lower() == 'quit':
        break
        
    movie_reviews_list = [r.strip() for r in user_input_movies.split(';') if r.strip()]

    if not movie_reviews_list:
        print("\n[Error] Please provide at least one movie review.")
        continue

    # Get the recommendations
    try:
        recommendations = get_recommendations(
            movie_reviews_list, 
            inference_model, 
            encoder, 
            item_catalog, 
            Config
        )
        
        print("\n" + "-"*40)
        print("💬 TOP-10 MUSIC RECOMMENDATIONS FOR THIS USER:")
        print("-" * 40)
        for i, (item_id, rating) in enumerate(recommendations):
            print(f"  {i+1}. Item (ASIN): {item_id} (Predicted Rating: {rating:.2f})")
        print("-"*40)

    except Exception as e:
        print(f"\nAn error occurred during prediction: {e}")

print("\n--- Demo finished. ---")

--- REAL-TIME COLD-START RECOMMENDATION ENGINE (AUC: 0.8466) ---
I have pre-processed all 70511 music items.
Give me a new user's movie reviews (their 'Netflix' history).
I will give you a Top-10 list of 'Spotify' recommendations.
Type 'quit' at any time to exit.
Encoding user's movie reviews...
Scoring all 70511 music items in the catalog...


Recommending: 100%|██████████| 70511/70511 [00:19<00:00, 3595.98it/s]



----------------------------------------
💬 TOP-10 MUSIC RECOMMENDATIONS FOR THIS USER:
----------------------------------------
  1. Item (ASIN): B01K8O1NEY (Predicted Rating: 3.75)
  2. Item (ASIN): B01K8Q6W20 (Predicted Rating: 3.68)
  3. Item (ASIN): B000024SNU (Predicted Rating: 3.67)
  4. Item (ASIN): B004OMR7EM (Predicted Rating: 3.65)
  5. Item (ASIN): B00CKZKWU6 (Predicted Rating: 3.65)
  6. Item (ASIN): B013GVMA4I (Predicted Rating: 3.64)
  7. Item (ASIN): B01G65CPSC (Predicted Rating: 3.61)
  8. Item (ASIN): B007P8LJ5K (Predicted Rating: 3.59)
  9. Item (ASIN): B00824I8M8 (Predicted Rating: 3.59)
  10. Item (ASIN): B00004SOF4 (Predicted Rating: 3.56)
----------------------------------------
Encoding user's movie reviews...
Scoring all 70511 music items in the catalog...


Recommending: 100%|██████████| 70511/70511 [00:19<00:00, 3616.84it/s]



----------------------------------------
💬 TOP-10 MUSIC RECOMMENDATIONS FOR THIS USER:
----------------------------------------
  1. Item (ASIN): B01K8Q6W20 (Predicted Rating: 3.55)
  2. Item (ASIN): B004OMR7EM (Predicted Rating: 3.52)
  3. Item (ASIN): B007P8LJ5K (Predicted Rating: 3.51)
  4. Item (ASIN): B00824I8M8 (Predicted Rating: 3.51)
  5. Item (ASIN): B00CKZKWU6 (Predicted Rating: 3.49)
  6. Item (ASIN): B000024SNU (Predicted Rating: 3.47)
  7. Item (ASIN): B006VFOM2K (Predicted Rating: 3.47)
  8. Item (ASIN): B01K8O1NEY (Predicted Rating: 3.47)
  9. Item (ASIN): B013GVMA4I (Predicted Rating: 3.46)
  10. Item (ASIN): B00413ELMA (Predicted Rating: 3.41)
----------------------------------------

[Error] Please provide at least one movie review.

[Error] Please provide at least one movie review.

[Error] Please provide at least one movie review.

[Error] Please provide at least one movie review.

[Error] Please provide at least one movie review.

[Error] Please provide at least o

In [None]:
# Cell 13: The Interactive "Rating Prediction" Demo

print("--- Cold-Start Rating Prediction Tester ---")
print(f"I am your best trained model (AUC: 0.8466).")
print("Give me a user's movie reviews (source history).")
print("Then, give me a specific music item's reviews (target item).")
print("I will predict the single rating that user would give that item.")
print("Type 'quit' at any time to exit.")
print("="*40)

while True:
    # 1. Get user's movie reviews
    user_input_movies = input("\n🎬 USER'S MOVIE REVIEWS (separate with ';'): \n")
    if user_input_movies.lower() == 'quit':
        break
    
    # 2. Get item's music reviews
    user_input_music_item = input("\n🎵 MUSIC ITEM'S REVIEWS (separate with ';'): \n")
    if user_input_music_item.lower() == 'quit':
        break
        
    # Split the input strings into lists
    movie_reviews_list = [r.strip() for r in user_input_movies.split(';') if r.strip()]
    music_item_reviews_list = [r.strip() for r in user_input_music_item.split(';') if r.strip()]

    if not movie_reviews_list:
        print("\n[Error] Please provide at least one movie review.")
        continue
    if not music_item_reviews_list:
        print("\n[Error] Please provide at least one review for the music item.")
        continue

    # 3. Get the prediction
    try:
        rating = predict_cold_start_rating(
            movie_reviews_list, 
            music_item_reviews_list, 
            inference_model, 
            encoder, 
            Config
        )
        
        # 4. Print the result
        print("\n" + "-"*40)
        print("💬 MODEL PREDICTION:")
        print(f"   Based on the user's movie taste, I predict they would rate this music item:")
        print(f"\n   >>>>> {rating:.2f} out of 5 stars <<<<<")
        print("-"*40)

    except Exception as e:
        print(f"\nAn error occurred during prediction: {e}")

print("\n--- Demo finished. ---")