In [12]:
from typing import Tuple, Dict, Any
from preprocessing_utils import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
from torch.optim import Adam
from sklearn.metrics import mean_squared_error

device = torch.device("cuda:0" if torch.cuda.is_available() else "mps")

user_item_path = '/Volumes/DeepLearner/Search & Recommendation System/Data/australian_users_items_clean.json'
review_path = '/Volumes/DeepLearner/Search & Recommendation System/Data/steam_reviews_clean.json'

device

device(type='mps')

In [5]:
class SteamDataset(Dataset):
    def __init__(self, X, y, user_ids, item_ids):
        self.X = X
        self.y = y
        self.user_ids = user_ids
        self.item_ids = item_ids
        
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.user_ids[idx], self.item_ids[idx]

In [15]:
def load_and_merge_data(user_item_path: str, review_path: str) -> Tuple[pd.DataFrame, int, int]:
    """
    Load and merge user-item and review data.
    
    Parameters:
    - user_item_path (str): Path to user-item data file.
    - review_path (str): Path to review data file.

    Returns:
    - Tuple[pd.DataFrame, int, int]: Merged DataFrame, number of unique users, and number of unique items.
    """
    user_item_df = load_json_to_df(user_item_path)
    review_df = load_review_json_to_df(review_path)
    merged_df = pd.merge(user_item_df, review_df, how='inner', on=['user_id', 'item_id'])
    
    num_users = merged_df['user_id'].nunique()
    num_items = merged_df['item_id'].nunique()
    
    return merged_df, num_users, num_items


def feature_engineering(merged_df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """
    Perform feature engineering on the merged DataFrame.
    
    Parameters:
    - merged_df (pd.DataFrame): The DataFrame to be processed.

    Returns:
    - Tuple[np.ndarray, np.ndarray]: TF-IDF features and scaled target variable.
    """
    # TF-IDF Vectorization for review text
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_features = tfidf_vectorizer.fit_transform(merged_df['review']).toarray()
    
    # Target variable: 'playtime_forever'
    y = merged_df['playtime_forever'].values
    
    # Scale the target variable
    scaler = StandardScaler()
    y_scaled = scaler.fit_transform(y.reshape(-1, 1)).flatten()

    return tfidf_features, y_scaled



def prepare_dataloader(X, y, user_ids_array, item_ids_array, batch_size=32):
    # Split 70% for training, 15% for validation, and 15% for testing
    train_size = int(0.7 * len(X))
    val_size = int(0.15 * len(X))
    test_size = len(X) - train_size - val_size
    
    # Shuffle and split the data
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    
    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]
    test_indices = indices[train_size + val_size:]
    
    # Create train, val, test datasets
    X_train, y_train = X[train_indices], y[train_indices]
    X_val, y_val = X[val_indices], y[val_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    
    user_ids_train, item_ids_train = user_ids_array[train_indices], item_ids_array[train_indices]
    user_ids_val, item_ids_val = user_ids_array[val_indices], item_ids_array[val_indices]
    user_ids_test, item_ids_test = user_ids_array[test_indices], item_ids_array[test_indices]
    
    X_train_tensor = torch.FloatTensor(X_train)
    y_train_tensor = torch.FloatTensor(y_train)
    X_val_tensor = torch.FloatTensor(X_val)
    y_val_tensor = torch.FloatTensor(y_val)
    X_test_tensor = torch.FloatTensor(X_test)
    y_test_tensor = torch.FloatTensor(y_test)
    
    # Create datasets using the custom SteamDataset class
    train_dataset = SteamDataset(X_train_tensor, y_train_tensor, user_ids_train, item_ids_train)
    val_dataset = SteamDataset(X_val_tensor, y_val_tensor, user_ids_val, item_ids_val)
    test_dataset = SteamDataset(X_test_tensor, y_test_tensor, user_ids_test, item_ids_test)
    
    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader

def create_item_mapping(df: pd.DataFrame) -> Dict[str, str]:
    """
    Create a mapping between item IDs and item names.
    
    Parameters:
    - df (pd.DataFrame): DataFrame containing item information.

    Returns:
    - Dict[str, str]: Mapping from item_id to item_name.
    """
    return dict(zip(df['item_id'], df['item_name']))


In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class HybridModel(nn.Module):
    def __init__(self, num_users, num_items, num_text_features, embedding_dim):
        super(HybridModel, self).__init__()
        
        # User and Item Embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Text feature layers
        self.text_layers = nn.Sequential(
            nn.Linear(num_text_features, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU()
        )
        
        # Combined layers
        self.combined_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2 + 32, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
        
    def forward(self, user_ids, item_ids, text_features):
        user_embedding = self.user_embedding(user_ids)
        item_embedding = self.item_embedding(item_ids)
        
        text_output = self.text_layers(text_features)
        
        # Concatenate the embeddings and text features
        combined_input = torch.cat([user_embedding, item_embedding, text_output], dim=1)
        
        output = self.combined_layers(combined_input)

        output = torch.sigmoid(output)
        
        return output


import torch
import torch.nn as nn
import torch.nn.functional as F

class HybridModel(nn.Module):
    def __init__(self, num_users, num_items, num_text_features, num_numerical_features, embedding_dim):
        super(HybridModel, self).__init__()
        
        # Text features layer
        self.text_layer = nn.Linear(num_text_features, 64)
        
        # Numerical features layer
        self.numerical_layer = nn.Linear(num_numerical_features, 64)
        
        # User and Item Embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Dense layers
        self.fc1 = nn.Linear(64 + 64 + 2 * embedding_dim, 128)
        self.fc2 = nn.Linear(128, 1)
        
    def forward(self, text_data, numerical_data, user_id, item_id):
        # Text features
        text_out = F.relu(self.text_layer(text_data))
        
        # Numerical features
        numerical_out = F.relu(self.numerical_layer(numerical_data))
        
        # User and Item Embedding
        user_embedding = self.user_embedding(user_id)
        item_embedding = self.item_embedding(item_id)
        
        # Concatenate all features
        concat_features = torch.cat([text_out, numerical_out, user_embedding, item_embedding], dim=1)
        
        # Fully connected layers
        x = F.relu(self.fc1(concat_features))
        out = self.fc2(x)
        
        return out


import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

def train_model(model, train_loader, val_loader, num_epochs, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Define loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    train_losses = []
    val_losses = []
    
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        
        train_loss = 0.0
        for batch_data in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            # Get batch data
            X, y, user_id, item_id = batch_data
            X, y, user_id, item_id = (
                X.to(device),
                y.to(device),
                user_id.to(device),
                item_id.to(device),
                
            )
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(X, numerical_data, user_id, item_id)
            
            # Compute loss
            loss = criterion(outputs.squeeze(), targets)
            
            # Backpropagation
            loss.backward()
            
            # Update weights
            optimizer.step()
            
            train_loss += loss.item()
        
        # Calculate average training loss for this epoch
        avg_train_loss = train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        
        # Validation loop
        model.eval()  # Set the model to evaluation mode
        val_loss = 0.0
        with torch.no_grad():
            for batch_data in val_loader:
                X, y, user_id, item_id = batch_data
                X, y, user_id, item_id = (
                    X.to(device),
                    y.to(device),
                    user_id.to(device),
                    item_id.to(device),
                    
                )
                
                # Forward pass
                outputs = model(text_data, numerical_data, user_id, item_id)
                
                # Compute loss
                loss = criterion(outputs.squeeze(), targets)
                
                val_loss += loss.item()
        
        # Calculate average validation loss for this epoch
        avg_val_loss = val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
    
    return train_losses, val_losses


In [24]:
def get_actual_top_k_games(user_id, user_item_df, item_mapping, k=5):
    """
    Get the actual top k games for a given user based on playtime.
    
    Parameters:
    - user_id (int): The ID of the user
    - user_item_df (pd.DataFrame): DataFrame containing user-item interactions
    - item_mapping (dict): Mapping from item_id to item_name
    - k (int): Number of top items to return
    
    Returns:
    - list: Top k items for the user based on actual playtime
    """
    user_data = user_item_df[user_item_df['user_id'] == user_id]
    top_k_items = user_data.nlargest(k, 'playtime_forever')['item_id'].tolist()
    top_k_item_names = [item_mapping[item_id] for item_id in top_k_items]
    
    return top_k_item_names


In [26]:
def predict_top_k_games_for_user(model, user_id, user_item_matrix, item_mapping, k=5):
    model.eval()
    with torch.no_grad():
        user_id_tensor = torch.tensor([user_id] * len(item_mapping), dtype=torch.long)
        item_id_tensor = torch.tensor(list(item_mapping.keys()), dtype=torch.long)
        
        # Dummy text_features tensor. Replace this with actual data if available.
        text_features = torch.zeros((len(item_mapping), 5000))
        
        predictions = model(user_id_tensor, item_id_tensor, text_features)
        
    top_k_indices = predictions.argsort(descending=True)[:k]
    top_k_item_ids = item_id_tensor[top_k_indices].tolist()
    top_k_item_names = [item_mapping[item_id] for item_id in top_k_item_ids]
    
    actual_top_k_item_names = get_actual_top_k_games(user_id, user_item_matrix, item_mapping, k)
    
    return actual_top_k_item_names, top_k_item_names


In [27]:
def precision_at_k(predicted_top_k, actual_top_k, k):
    relevant_items = set(predicted_top_k) & set(actual_top_k)
    precision = len(relevant_items) / k
    return precision


In [22]:
from tqdm import tqdm
from torch.optim import Adam
from sklearn.metrics import mean_squared_error

def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    
    # Wrap your loader with tqdm to display the progress bar
    for i, (X_batch, y_batch, user_ids, item_ids) in tqdm(enumerate(train_loader), desc="Training", total=len(train_loader)):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        user_ids, item_ids = user_ids.long().to(device), item_ids.long().to(device)


        optimizer.zero_grad()
        
        output = model(user_ids, item_ids, X_batch)
        loss = criterion(output, y_batch)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    return total_loss / len(train_loader)

def evaluate_model(model, val_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    
    # Wrap your loader with tqdm to display the progress bar
    with torch.no_grad():
        for i, (X_batch, y_batch, user_ids, item_ids) in tqdm(enumerate(val_loader), desc="Evaluating", total=len(val_loader)):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            user_ids, item_ids = user_ids.long().to(device), item_ids.long().to(device)

            
            output = model(user_ids, item_ids, X_batch)
            
            y_true += list(y_batch.cpu().numpy())
            y_pred += list(output.cpu().numpy())
    
    mse = mean_squared_error(y_true, y_pred)
    
    return mse


In [None]:
merged_df, num_users, num_items = load_and_merge_data(user_item_path, review_path)

X, y = feature_engineering(merged_df)

user_ids_array = merged_df['user_id'].astype('category').cat.codes.values
item_ids_array = merged_df['item_id'].astype('category').cat.codes.values

item_mapping = create_item_mapping(merged_df)

In [18]:
merged_df, num_users, num_items = load_and_merge_data(user_item_path, review_path)

X, y = feature_engineering(merged_df)

user_ids_array = merged_df['user_id'].astype('category').cat.codes.values
item_ids_array = merged_df['item_id'].astype('category').cat.codes.values
# getting train, test, val data
train_loader, val_loader, test_loader = prepare_dataloader(X, y, user_ids_array, item_ids_array, batch_size=32)

In [23]:

num_text_features = 5000
embedding_dim = 50
learning_rate = 0.001
num_epochs = 10

device = torch.device("cuda:0" if torch.cuda.is_available() else "mps")

# Initialize model, optimizer, and loss function
model = HybridModel(num_users, num_items, num_text_features, embedding_dim).to(device)
optimizer = Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Load data (assuming train_loader and val_loader are prepared)
# train_loader, val_loader = prepare_dataloader(...)

# Training loop
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    val_mse = evaluate_model(model, val_loader, device)
    
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation MSE: {val_mse:.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:18<00:00, 54.32it/s]
Evaluating: 100%|██████████| 218/218 [00:01<00:00, 187.38it/s]


Epoch 1/10
Train Loss: 1.0259
Validation MSE: 0.9290


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:16<00:00, 61.41it/s]
Evaluating: 100%|██████████| 218/218 [00:00<00:00, 262.76it/s]


Epoch 2/10
Train Loss: 1.0215
Validation MSE: 0.9187


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:16<00:00, 62.91it/s]
Evaluating: 100%|██████████| 218/218 [00:00<00:00, 268.86it/s]


Epoch 3/10
Train Loss: 1.0201
Validation MSE: 0.9308


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:15<00:00, 63.53it/s]
Evaluating: 100%|██████████| 218/218 [00:00<00:00, 273.98it/s]


Epoch 4/10
Train Loss: 1.0228
Validation MSE: 0.9288


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:15<00:00, 63.68it/s]
Evaluating: 100%|██████████| 218/218 [00:00<00:00, 258.31it/s]


Epoch 5/10
Train Loss: 1.0200
Validation MSE: 0.9178


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:17<00:00, 57.87it/s]
Evaluating: 100%|██████████| 218/218 [00:00<00:00, 267.30it/s]


Epoch 6/10
Train Loss: 1.0200
Validation MSE: 0.9245


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:15<00:00, 63.75it/s]
Evaluating: 100%|██████████| 218/218 [00:00<00:00, 267.63it/s]


Epoch 7/10
Train Loss: 1.0200
Validation MSE: 0.9278


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:16<00:00, 60.69it/s]
Evaluating: 100%|██████████| 218/218 [00:00<00:00, 274.80it/s]


Epoch 8/10
Train Loss: 1.0205
Validation MSE: 0.9173


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:16<00:00, 63.35it/s]
Evaluating: 100%|██████████| 218/218 [00:00<00:00, 267.02it/s]


Epoch 9/10
Train Loss: 1.0199
Validation MSE: 0.9232


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 1014/1014 [00:15<00:00, 63.75it/s]
Evaluating: 100%|██████████| 218/218 [00:00<00:00, 273.20it/s]

Epoch 10/10
Train Loss: 1.0217
Validation MSE: 0.9208





In [11]:
# check first batch of train_loader

for batch_data in train_loader:
    X, y, user_id, item_id = batch_data
    print(X.shape)
    print(y.shape)
    print(user_id.shape)
    print(item_id.shape)
    break

torch.Size([32, 5000])
torch.Size([32])
torch.Size([32])
torch.Size([32])


In [28]:
specific_user_id = 10  # Replace with an actual user ID from your data
specific_user_code = merged_df[merged_df['user_id'] == specific_user_id]['user_id'].astype('category').cat.codes.values[0]

actual_top_5, predicted_top_5 = predict_top_k_games_for_user(model, specific_user_code, merged_df, item_mapping, k=5)
precision = precision_at_k(predicted_top_5, actual_top_5, k=5)

print(f"Actual Top 5 games: {actual_top_5}")
print(f"Predicted Top 5 games: {predicted_top_5}")
print(f"Precision@5: {precision}")


IndexError: index 0 is out of bounds for axis 0 with size 0