<a href="https://colab.research.google.com/github/lucarenz1997/recommender_systems/blob/main/Hybrid-NCF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hybrid Neural Collaborative Filtering (NCF) Model
** Authors **: Rafaella and Luca
[Linktext](https://)

## Setup

In [57]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading and Data Preparation

In [None]:
#Load data
data = pd.read_csv("/content/drive/MyDrive/Recommender/preprocessed.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

data_corr = data.drop(['ts_listen', 'release_date', 'time_of_day'], axis=1)

plt.figure(figsize=(18, 12))
sns.heatmap(data_corr.corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix (excluding ts_listen and release_date)")
plt.show()

In [69]:
#Encoding
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
gender_encoder = LabelEncoder()
platform_encoder = LabelEncoder()

data["user_id"] = user_encoder.fit_transform(data["user_id"])
data["media_id"] = item_encoder.fit_transform(data["media_id"])
data['user_gender_enc'] = gender_encoder.fit_transform(data['user_gender'])
data['platform_name_enc'] = platform_encoder.fit_transform(data['platform_name'])

# MinMax scaling to bring it to a 0-1 range
scaler = MinMaxScaler()
data['user_age_scaled'] = scaler.fit_transform(data[['user_age']])
data['song_popularity_7d_scaled'] = scaler.fit_transform(data[['song_popularity_7d']])
data['last_listen_scaled'] = scaler.fit_transform(data[['last_listen']])
data.head(100)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d,user_gender_enc,platform_name_enc,user_age_scaled,song_popularity_7d_scaled,last_listen_scaled
0,1175,2016-11-28 16:40:24,23564,1437691,0,2003-05-19,0,0,306,0,...,40,28,1,79,5,0,0,0.083333,0.009709,0.0
1,0,2016-11-17 23:19:07,61787,13632884,0,2016-07-22,1,0,239,0,...,152,52,22,63,22,0,1,0.250000,0.213592,0.0
2,2692,2016-11-23 14:20:52,19290,919739,2,2008-09-16,1,0,228,0,...,5,56,0,40,0,1,1,0.083333,0.000000,0.0
3,14,2016-11-24 14:19:09,42166,8980343,4,2014-10-27,0,0,222,1,...,29,10,0,51,3,0,0,0.333333,0.000000,0.0
4,0,2016-11-02 15:07:25,62038,13680778,0,2016-07-29,0,0,183,0,...,73,53,0,32,0,0,0,0.416667,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,25,2016-11-12 17:54:14,12618,338342,1,2009-07-07,2,1,306,1,...,72,16,11,18,14,1,2,0.833333,0.106796,0.0
96,0,2016-11-20 01:10:11,58568,13082992,2,2016-05-06,0,0,253,0,...,1074,55,0,353,121,1,0,0.166667,0.000000,0.0
97,0,2016-11-17 12:36:05,65109,14103674,0,2016-09-24,0,0,248,0,...,62,40,9,9,9,0,0,0.500000,0.087379,0.0
98,7,2016-11-20 07:49:36,12302,324222,13,1995-12-31,0,0,247,1,...,21,75,1,40,3,0,0,0.750000,0.009709,0.0


##Train/Test Split

In [60]:
train, test = train_test_split(data, test_size=0.2, random_state=42)
train.head(100)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d,user_gender_enc,platform_name_enc,user_age_scaled,song_popularity_7d_scaled,last_listen_scaled
8558,0,2016-11-17 23:56:27,48426,10770214,0,2015-07-17,0,0,255,0,...,2,53,0,6,0,0,0,0.583333,0.000000,0.180767
234700,7,2016-11-24 15:06:50,3606,102427,1,2007-11-26,2,1,197,1,...,102,65,13,22,20,1,2,0.666667,0.126214,0.451659
65135,0,2016-11-06 16:08:14,58648,13091982,0,2016-05-13,0,0,230,0,...,11,259,1,1,1,1,0,0.083333,0.009709,0.256394
237176,0,2016-11-18 13:07:07,60660,13457671,0,2016-07-08,1,0,197,0,...,973,26,74,154,132,0,1,0.083333,0.718447,0.231037
312190,0,2016-11-23 06:37:13,64837,14079078,0,2016-09-20,0,0,194,0,...,930,68,97,992,97,0,0,0.000000,0.941748,0.474622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261620,0,2016-11-12 21:03:47,59373,13246617,4,2016-07-04,0,0,251,1,...,2125,32,20,992,275,1,0,0.750000,0.194175,0.159735
327784,0,2016-11-22 23:34:19,52341,11674708,0,2015-11-13,0,0,233,0,...,500,71,23,96,69,0,0,0.916667,0.223301,0.000002
119518,0,2016-11-12 09:40:16,68991,14530576,2,2016-11-11,1,0,194,0,...,3111,34,27,843,642,0,1,0.166667,0.262136,0.047819
23657,7054,2016-11-14 15:27:59,36786,7476147,1,2014-03-03,0,0,187,1,...,3,70,0,1,1,0,0,0.583333,0.000000,0.000000


## Create PyTorch dataset

In [61]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

class ListenDataset(Dataset):
    def __init__(self, data):
        self.users = torch.tensor(data['user_id'].values, dtype=torch.long)
        self.items = torch.tensor(data['media_id'].values, dtype=torch.long)
        self.ages = torch.tensor(data['user_age_scaled'].values, dtype=torch.float32)
        self.genders = torch.tensor(data['user_gender_enc'].values, dtype=torch.long)
        self.platforms = torch.tensor(data['platform_name_enc'].values, dtype=torch.long)
        self.song_popularity_7d = torch.tensor(data['song_popularity_7d_scaled'].values, dtype=torch.float32)
        self.last_listen = torch.tensor(data['last_listen_scaled'].values, dtype=torch.float32)
        self.labels = torch.tensor(data['is_listened'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (self.users[idx], self.items[idx], self.ages[idx], self.genders[idx], self.platforms[idx], self.song_popularity_7d[idx], self.last_listen[idx]), self.labels[idx]

# Batch-Processing
train_data = ListenDataset(train)
test_data = ListenDataset(test)

train_loader = DataLoader(train_data, batch_size=512, shuffle=True)
test_loader = DataLoader(test_data, batch_size=512, shuffle=False)


## Model Definition

In [66]:
import torch
import torch.nn as nn

class NCF(nn.Module):
    def __init__(self, num_users, num_items, num_genders, num_platforms, emb_size=64):
        super(NCF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.gender_emb = nn.Embedding(num_genders, emb_size // 2)  # Smaller embedding size for gender
        self.platform_emb = nn.Embedding(num_platforms, emb_size // 2)

        # Update the input dimension: +3 for the scalar features: age, song_popularity_7d, and last_listen
        input_dim = emb_size * 2 + (emb_size // 2) * 2 + 3

        self.fc_layers = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.5),  # Adding dropout for regularization
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, user, item, age, gender, platform, song_popularity_7d, last_listen):
        user_embedded = self.user_emb(user)
        item_embedded = self.item_emb(item)
        gender_embedded = self.gender_emb(gender)
        platform_embedded = self.platform_emb(platform)

        # Ensure scalar features are 2D: (batch_size, 1)
        age = age.unsqueeze(1)
        song_popularity_7d = song_popularity_7d.unsqueeze(1)
        last_listen = last_listen.unsqueeze(1)

        # Concatenate all features along the last dimension
        x = torch.cat([
            user_embedded,
            item_embedded,
            gender_embedded,
            platform_embedded,
            age,
            song_popularity_7d,
            last_listen
        ], dim=-1)

        return self.fc_layers(x)


## Initiate Model

In [67]:
import torch.optim as optim

# Count unique values for all entities
num_users = data['user_id'].nunique()
num_items = data['media_id'].nunique()
num_genders = data['user_gender_enc'].nunique()
num_platforms = data['platform_name_enc'].nunique()

# Create the model instance with the new parameters
model = NCF(num_users, num_items, num_genders, num_platforms)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss, suitable for binary labels

## Train Model

In [68]:
# Move the model to the appropriate device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Unpack all six features and the label
        (user, item, age, gender, platform, song_popularity_7d, last_listen), label = batch

        # Move all features and the label to the device
        user = user.to(device)
        item = item.to(device)
        age = age.to(device)
        gender = gender.to(device)
        platform = platform.to(device)
        song_popularity_7d = song_popularity_7d.to(device)  # Corrected line
        last_listen=last_listen.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        # Pass the additional feature song_popularity_7d to the model
        preds = model(user, item, age, gender, platform, song_popularity_7d, last_listen).squeeze()
        loss = criterion(preds, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.6004
Epoch 2, Loss: 0.5614
Epoch 3, Loss: 0.5247
Epoch 4, Loss: 0.5006
Epoch 5, Loss: 0.4819
Epoch 6, Loss: 0.4662
Epoch 7, Loss: 0.4511
Epoch 8, Loss: 0.4364
Epoch 9, Loss: 0.4228
Epoch 10, Loss: 0.4098


## Evaluate Model

In [73]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Loop over threshold values from 0.1 to 0.9
for threshold in np.arange(0.1, 1.0, 0.1):
    all_preds, all_labels = [], []

    # Evaluate the model with the current threshold
    model.eval()
    with torch.no_grad():
      for batch in test_loader:
          # Unpack the batch into features and labels
          (user, item, age, gender, platform, song_popularity_7d, last_listen), label = batch

          # Move each tensor to the device
          user = user.to(device)
          item = item.to(device)
          age = age.to(device)
          gender = gender.to(device)
          platform = platform.to(device)
          song_popularity_7d = song_popularity_7d.to(device)
          last_listen = last_listen.to(device)
          label = label.to(device)

          # Forward pass through the model with all features
          preds = model(user, item, age, gender, platform, song_popularity_7d, last_listen).squeeze()

          # Convert predictions and labels to numpy arrays for metric calculation
          predicted = (preds >= threshold).float()  # Binary classification threshold at 0.5
          all_preds.extend(predicted.cpu().numpy())
          all_labels.extend(label.cpu().numpy())

    # Calculate evaluation metrics
    f1 = f1_score(all_labels, all_preds)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    # Print the metrics for the current threshold
    print(f"Threshold: {threshold:.1f}")
    print(f"  F1-Score : {f1:.4f}")
    print(f"  Accuracy : {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall   : {recall:.4f}")
    print("-" * 30)


Threshold: 0.1
  F1-Score : 0.8150
  Accuracy : 0.6902
  Precision: 0.6891
  Recall   : 0.9972
------------------------------
Threshold: 0.2
  F1-Score : 0.8218
  Accuracy : 0.7114
  Precision: 0.7116
  Recall   : 0.9724
------------------------------
Threshold: 0.3
  F1-Score : 0.8227
  Accuracy : 0.7223
  Precision: 0.7305
  Recall   : 0.9415
------------------------------
Threshold: 0.4
  F1-Score : 0.8181
  Accuracy : 0.7255
  Precision: 0.7485
  Recall   : 0.9019
------------------------------
Threshold: 0.5
  F1-Score : 0.8078
  Accuracy : 0.7226
  Precision: 0.7681
  Recall   : 0.8518
------------------------------
Threshold: 0.6
  F1-Score : 0.7848
  Accuracy : 0.7081
  Precision: 0.7919
  Recall   : 0.7778
------------------------------
Threshold: 0.7
  F1-Score : 0.7303
  Accuracy : 0.6691
  Precision: 0.8255
  Recall   : 0.6548
------------------------------
Threshold: 0.8
  F1-Score : 0.6303
  Accuracy : 0.6011
  Precision: 0.8612
  Recall   : 0.4971
-----------------------