In [1]:
from preprocessing_utils import *

In [2]:
user_item_path = '/Volumes/DeepLearner/Search & Recommendation System/Data/australian_users_items_clean.json'
review_path = '/Volumes/DeepLearner/Search & Recommendation System/Data/steam_reviews_clean.json'

user_item_df = load_json_to_df(user_item_path)
review_df = load_review_json_to_df(review_path)

user_item_df.shape, review_df.shape

((5153209, 8), (59305, 8))

In [3]:
# Merge the two datasets on 'user_id' and 'item_id'
merged_df = pd.merge(user_item_df, review_df, how='inner', on=['user_id', 'item_id'])

# Display the first few rows of the merged DataFrame
merged_df.head()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url,funny,posted,last_edited,helpful,recommend,review
0,22200,Zeno Clash,271,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,No ratings yet,True,It's unique and worth a playthrough.
1,1250,Killing Floor,10006,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,No ratings yet,True,Simple yet with great replayability. In my opi...
2,43110,Metro 2033,834,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,227300,Euro Truck Simulator 2,551,0,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
4,239030,"Papers, Please",349,0,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,,"Posted November 29, 2013.",,1 of 4 people (25%) found this review helpful,True,Very fun little game to play when your bored o...


In [4]:
merged_df.shape

(46317, 14)

In [5]:
# Check for missing values in the merged DataFrame
missing_values_merged = merged_df.isnull().sum()
missing_values_merged


item_id             0
item_name           0
playtime_forever    0
playtime_2weeks     0
user_id             0
items_count         0
steam_id            0
user_url            0
funny               0
posted              0
last_edited         0
helpful             0
recommend           0
review              0
dtype: int64

## Feature Engineering

1. Converting text into TF-IDF vectors
2. Normalizing playtime_forever

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to 5000 most frequent words for demonstration

# Fit and transform the review text
tfidf_features = tfidf_vectorizer.fit_transform(merged_df['review']).toarray()

# Display the shape to confirm the transformation
tfidf_features.shape


(46317, 5000)

In [5]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the 'playtime_forever' feature
playtime_scaled = scaler.fit_transform(merged_df[['playtime_forever']])

# Display the shape and first few values to confirm the transformation
playtime_scaled.shape, playtime_scaled[:5]


((46317, 1),
 array([[-0.40206923],
        [ 0.04469064],
        [-0.37623196],
        [-0.38921944],
        [-0.39848965]]))

## Preparing the data for training

In [10]:
from sklearn.model_selection import train_test_split

# Combine the features into a single array
import numpy as np
X_combined = np.hstack([tfidf_features, playtime_scaled])

# Target variable: 'recommend' column
y = merged_df['recommend'].values

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_combined, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display the shape of each set to confirm the split
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape


((32421, 5001), (6948, 5001), (6948, 5001), (32421,), (6948,), (6948,))

In [7]:
from torch.utils.data import Dataset

class SteamDataset(Dataset):
    def __init__(self, X, y, user_ids, item_ids):
        self.X = X
        self.y = y
        self.user_ids = user_ids
        self.item_ids = item_ids
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.user_ids[idx], self.item_ids[idx]


In [11]:
from torch.utils.data import DataLoader, TensorDataset
import torch

# Create arrays for user and item IDs corresponding to each row in the feature array
user_ids_array = merged_df['user_id'].astype('category').cat.codes.values
item_ids_array = merged_df['item_id'].astype('category').cat.codes.values

# Split these arrays into training, validation, and test sets
user_ids_train, user_ids_temp = train_test_split(user_ids_array, test_size=0.3, random_state=42)
item_ids_train, item_ids_temp = train_test_split(item_ids_array, test_size=0.3, random_state=42)

user_ids_val, user_ids_test = train_test_split(user_ids_temp, test_size=0.5, random_state=42)
item_ids_val, item_ids_test = train_test_split(item_ids_temp, test_size=0.5, random_state=42)

# Display the shape of each set to confirm the split
#user_ids_train.shape, user_ids_val.shape, user_ids_test.shape, item_ids_train.shape, item_ids_val.shape, item_ids_test.shape

# Convert the numpy arrays to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train)

X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val)

X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.FloatTensor(y_test)

user_ids_train_tensor = torch.LongTensor(user_ids_train)
item_ids_train_tensor = torch.LongTensor(item_ids_train)

# Create a TensorDataset that includes user and item IDs
train_dataset = TensorDataset(X_train_tensor, y_train_tensor, user_ids_train_tensor, item_ids_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor, user_ids_val, item_ids_val)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor, user_ids_test, item_ids_test)

# Create a TensorDataset from the tensors
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders for batching
batch_size = 32  # You can change this value based on your system's capabilities

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Confirm the DataLoader setup
len(train_loader), len(val_loader), len(test_loader)


(1014, 218, 218)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class HybridModel(nn.Module):
    def __init__(self, num_text_features, num_numerical_features, num_users, num_items, embedding_dim):
        super(HybridModel, self).__init__()
        
        # Text features
        self.text_layer = nn.Linear(num_text_features, 128)
        
        # Numerical features
        self.numerical_layer = nn.Linear(num_numerical_features, 64)
        
        # User and Item Embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Dense layers
        self.fc1 = nn.Linear(128 + 64 + 2 * embedding_dim, 256)
        self.fc2 = nn.Linear(256, 64)
        
        # Output layer
        self.output = nn.Linear(64, 1)
        
    def forward(self, text_data, numerical_data, user_id, item_id):
        text_out = F.relu(self.text_layer(text_data))
        num_out = F.relu(self.numerical_layer(numerical_data))
        
        user_embedded = self.user_embedding(user_id)
        item_embedded = self.item_embedding(item_id)
        
        # Concatenating all the features
        concatenated = torch.cat([text_out, num_out, user_embedded, item_embedded], dim=1)
        
        # Passing through dense layers
        x = F.relu(self.fc1(concatenated))
        x = F.relu(self.fc2(x))
        
        # Output layer
        out = torch.sigmoid(self.output(x))
        
        return out


In [15]:
# Initialize the model
model = HybridModel(
    num_text_features=3950,
    num_numerical_features=1,
    num_users=242,  # Total unique users
    num_items=291,  # Total unique items
    embedding_dim=50
)

# Calculate the number of unique users and items in the merged DataFrame
total_unique_users = merged_df['user_id'].nunique()
total_unique_items = merged_df['item_id'].nunique()


# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):  # 10 epochs for demonstration
    for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
        # Forward pass
        outputs = model(text_data=X_batch[:, :3950], numerical_data=X_batch[:, 3950:],
                        user_id=user_ids, item_id=item_ids)
        loss = criterion(outputs, y_batch)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # Print loss every epoch
    print(f'Epoch [{epoch+1}/10], Loss: {loss.item():.4f}')


NameError: name 'user_ids' is not defined