<a href="https://colab.research.google.com/github/lucarenz1997/recommender_systems/blob/main/Hybrid-NCF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hybrid Neural Collaborative Filtering (NCF) Model
** Authors **: Rafaella and Luca
[Linktext](https://)

## 1. Setup

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

Mounted at /content/drive


## 2. Load & data prep

In [None]:
#Load data
data = pd.read_csv("/content/drive/MyDrive/Recommender/preprocessed_train.csv")

#Encoding für `user_id` und `media_id` (nur Collaborative Filtering)**
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
data["user_id"] = user_encoder.fit_transform(data["user_id"])
data["media_id"] = item_encoder.fit_transform(data["media_id"])

data.head(100)


Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,last_listen,days_since_release,genre_popularity,media_popularity,artist_popularity,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d
0,0,2016-10-15 18:47:17,56752,33140,0.127203,2016-09-16,0.175726,0.804247,0.456897,0.686655,...,0.517236,0.002501,1.000000,0.294941,0.970137,1.000000,0.108974,0.281553,0.829535,0.701774
1,169,2016-10-17 06:18:54,16894,9660,0.434585,2011-02-03,0.682990,0.804247,0.508621,0.686655,...,0.047615,0.050515,0.038209,0.022605,0.089588,0.003717,0.118590,0.019417,0.117593,0.002217
2,0,2016-10-17 10:37:26,56759,33140,0.127203,2016-09-16,0.175726,0.804247,0.728448,0.686655,...,0.479170,0.002548,1.000000,0.262648,0.970137,1.000000,0.108974,0.300971,0.829535,0.701774
3,0,2016-10-17 11:57:41,47891,28075,0.434585,2016-01-29,0.682990,0.804247,0.693966,0.686655,...,0.682868,0.007948,1.000000,0.043057,0.009646,0.007612,0.022436,0.019417,0.004558,0.003326
4,0,2016-10-17 12:35:36,51159,30028,0.434585,2016-05-06,0.682990,0.804247,0.219828,0.686655,...,0.000625,0.005657,1.000000,0.023681,0.007532,0.004249,0.022436,0.048544,0.009116,0.005543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,2016-10-21 13:13:40,43223,25592,0.434585,2015-08-28,0.682990,0.804247,0.672414,0.686655,...,0.360447,0.011641,1.000000,0.048439,0.300608,0.118428,0.019231,0.067961,0.314494,0.119734
96,0,2016-10-21 13:18:33,54635,31997,0.434585,2016-10-14,0.175726,0.054469,0.400862,0.686655,...,0.196690,0.001987,1.000000,0.025834,0.032637,0.015932,0.099359,0.009709,0.031905,0.015521
97,0,2016-10-21 14:01:26,50739,29749,0.434585,2016-04-22,0.682990,0.804247,0.219828,0.686655,...,0.000788,0.006078,1.000000,0.205597,0.045190,0.033811,0.019231,0.281553,0.050137,0.032151
98,56,2016-10-21 14:39:29,2273,1286,0.434585,1995-01-01,0.175726,0.054469,0.129310,0.686655,...,0.575761,0.187989,0.005186,0.000000,0.000000,0.000000,0.099359,0.000000,0.000000,0.000000


### Train-Test-Split

In [None]:
#Train-Test-Split
train, test = train_test_split(data, test_size=0.2, random_state=42)
train.head(100)


Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,last_listen,days_since_release,genre_popularity,media_popularity,artist_popularity,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d
141061,22,2016-11-11 10:39:29,28324,16590,0.220868,2012-11-19,0.175726,0.804247,0.336207,0.313345,...,0.091299,0.035789,0.072560,0.031216,0.032770,0.005134,0.394231,0.058252,0.033728,0.006652
101826,6,2016-11-08 09:35:14,5558,2976,0.009166,1993-12-31,0.141284,0.141284,0.556034,0.686655,...,0.220489,0.196966,0.252084,0.000000,0.009910,0.012569,0.099359,0.000000,0.007293,0.008869
39223,9,2016-11-03 16:51:50,14267,8058,0.127203,2010-05-07,0.682990,0.804247,0.590517,0.686655,...,0.043068,0.057271,0.074606,0.000000,0.002246,0.000885,0.022436,0.000000,0.003646,0.002217
42170,12,2016-11-03 20:10:01,33064,19297,0.434585,1994-04-12,0.175726,0.804247,0.788793,0.686655,...,0.115495,0.194465,0.048558,0.022605,0.026427,0.004957,0.016026,0.009709,0.021878,0.001109
104449,0,2016-11-08 13:28:38,40145,23651,0.434585,2015-03-30,0.682990,0.054469,0.418103,0.686655,...,0.004206,0.015592,1.000000,0.027987,0.037923,0.015401,0.336538,0.058252,0.041933,0.013304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22933,0,2016-11-02 15:25:59,48566,28510,0.127203,2016-04-22,0.682990,0.804247,0.780172,0.686655,...,0.070007,0.006358,1.000000,0.059203,0.130814,0.049212,0.214744,0.097087,0.160438,0.052106
20482,169,2016-11-02 12:41:42,9442,5350,0.054653,2006-06-23,0.682990,0.804247,0.452586,0.686655,...,0.162471,0.090301,0.038209,0.000000,0.019688,0.000177,0.278846,0.000000,0.020966,0.000000
238103,3,2016-11-19 17:00:36,19510,11327,0.029810,2011-10-17,0.682990,0.804247,0.366379,0.313345,...,0.095882,0.045303,0.019046,0.022605,0.017178,0.003717,0.044872,0.077670,0.030994,0.008869
74249,0,2016-11-05 23:25:50,40727,23953,0.434585,2015-04-22,0.682990,0.804247,0.534483,0.686655,...,0.075809,0.014984,1.000000,0.023681,0.193050,0.003894,0.035256,0.038835,0.176846,0.004435


##  3. Create PyTorch dataset

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

class ListenDataset(Dataset):
    def __init__(self, data):
        self.users = torch.tensor(data["user_id"].values, dtype=torch.long)
        self.items = torch.tensor(data["media_id"].values, dtype=torch.long)
        self.labels = torch.tensor(data["is_listened"].values, dtype=torch.float32)  # Boolean zu Float

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

# Dataset in DataLoader packen (Batch-Verarbeitung)
train_data = ListenDataset(train)
test_data = ListenDataset(test)

train_loader = DataLoader(train_data, batch_size=512, shuffle=True)
test_loader = DataLoader(test_data, batch_size=512, shuffle=False)


## 4. Define the Neural Collaborative Filtering (NCF) model

In [None]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=64):
        super(NCF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)

        self.fc_layers = nn.Sequential(
            nn.Linear(emb_size * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()  # Sigmoid für binäre Klassifikation
        )

    def forward(self, user, item):
        user_embedded = self.user_emb(user)
        item_embedded = self.item_emb(item)
        x = torch.cat([user_embedded, item_embedded], dim=-1)  # Kombinierte Embeddings
        return self.fc_layers(x)


## 5. Initiate model

In [None]:
num_users = data["user_id"].nunique()
num_items = data["media_id"].nunique()

model = NCF(num_users, num_items)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()  # Perfekt für Boolean-Labels!


## 6. Train the Model

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):
    model.train()
    total_loss = 0
    for user, item, label in train_loader:
        user, item, label = user.to(device), item.to(device), label.to(device)

        optimizer.zero_grad()
        preds = model(user, item).squeeze()  # 1D Output
        loss = criterion(preds, label)  # Binary Cross-Entropy Loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.6113
Epoch 2, Loss: 0.5619
Epoch 3, Loss: 0.5114
Epoch 4, Loss: 0.4737
Epoch 5, Loss: 0.4382
Epoch 6, Loss: 0.4017
Epoch 7, Loss: 0.3632
Epoch 8, Loss: 0.3231
Epoch 9, Loss: 0.2839
Epoch 10, Loss: 0.2459


## 7. Test model

In [None]:
from torch.utils.data import Dataset, DataLoader

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for user, item, label in test_loader:
        # Correctly unpack and move to device
        user, item, label = user.to(device), item.to(device), label.to(device)
        preds = model(user, item).squeeze()
        predicted = (preds >= 0.5).float()  # Schwelle bei 0.5 für binäre Klassifikation
        correct += (predicted == label).sum().item()
        total += label.size(0)

print(f"Test Accuracy: {correct / total:.4f}")

Test Accuracy: 0.6907


# New approach

In [5]:
#Load data
data = pd.read_csv("/content/drive/MyDrive/Recommender/preprocessed.csv")

#Encoding für `user_id` und `media_id` (nur Collaborative Filtering)**
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
data["user_id"] = user_encoder.fit_transform(data["user_id"])
data["media_id"] = item_encoder.fit_transform(data["media_id"])

data.head(100)


Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,last_listen,days_since_release,genre_popularity,media_popularity,artist_popularity,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d
0,1175,2016-11-28 16:40:24,23564,1437691,0,2003-05-19,0,0,306,0,...,0.0,4942,726,5,580,40,28,1,79,5
1,0,2016-11-17 23:19:07,61787,13632884,0,2016-07-22,1,0,239,0,...,0.0,118,170067,152,536,152,52,22,63,22
2,2692,2016-11-23 14:20:52,19290,919739,2,2008-09-16,1,0,228,0,...,0.0,2990,18,1,238,5,56,0,40,0
3,14,2016-11-24 14:19:09,42166,8980343,4,2014-10-27,0,0,222,1,...,0.0,759,8259,3,318,29,10,0,51,3
4,0,2016-11-02 15:07:25,62038,13680778,0,2016-07-29,0,0,183,0,...,0.0,96,170067,73,295,73,53,0,32,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,25,2016-11-12 17:54:14,12618,338342,1,2009-07-07,2,1,306,1,...,0.0,2685,12341,64,84,72,16,11,18,14
96,0,2016-11-20 01:10:11,58568,13082992,2,2016-05-06,0,0,253,0,...,0.0,198,170067,8,2347,1074,55,0,353,121
97,0,2016-11-17 12:36:05,65109,14103674,0,2016-09-24,0,0,248,0,...,0.0,54,170067,62,62,62,40,9,9,9
98,7,2016-11-20 07:49:36,12302,324222,13,1995-12-31,0,0,247,1,...,0.0,7630,42872,19,238,21,75,1,40,3


In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Initialize encoders
gender_encoder = LabelEncoder()
platform_encoder = LabelEncoder()

# Fit and transform with label encoding
data['user_gender_enc'] = gender_encoder.fit_transform(data['user_gender'])
data['platform_name_enc'] = platform_encoder.fit_transform(data['platform_name'])

# MinMax scaling for 'user_age' to bring it to a 0-1 range
scaler = MinMaxScaler()
data['user_age_scaled'] = scaler.fit_transform(data[['user_age']])


In [15]:
#Train-Test-Split
train, test = train_test_split(data, test_size=0.2, random_state=42)
train.head(100)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,media_popularity,artist_popularity,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d,user_gender_enc,platform_name_enc,user_age_scaled
8558,0,2016-11-17 23:56:27,48426,10770214,0,2015-07-17,0,0,255,0,...,2,30,2,53,0,6,0,0,0,0.583333
234700,7,2016-11-24 15:06:50,3606,102427,1,2007-11-26,2,1,197,1,...,64,113,102,65,13,22,20,1,2,0.666667
65135,0,2016-11-06 16:08:14,58648,13091982,0,2016-05-13,0,0,230,0,...,11,11,11,259,1,1,1,1,0,0.083333
237176,0,2016-11-18 13:07:07,60660,13457671,0,2016-07-08,1,0,197,0,...,536,1114,973,26,74,154,132,0,1,0.083333
312190,0,2016-11-23 06:37:13,64837,14079078,0,2016-09-20,0,0,194,0,...,930,7569,930,68,97,992,97,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261620,0,2016-11-12 21:03:47,59373,13246617,4,2016-07-04,0,0,251,1,...,150,7569,2125,32,20,992,275,1,0,0.750000
327784,0,2016-11-22 23:34:19,52341,11674708,0,2015-11-13,0,0,233,0,...,148,653,500,71,23,96,69,0,0,0.916667
119518,0,2016-11-12 09:40:16,68991,14530576,2,2016-11-11,1,0,194,0,...,181,5083,3111,34,27,843,642,0,1,0.166667
23657,7054,2016-11-14 15:27:59,36786,7476147,1,2014-03-03,0,0,187,1,...,2,5,3,70,0,1,1,0,0,0.583333


In [16]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

class ListenDataset(Dataset):
    def __init__(self, data):
        self.users = torch.tensor(data['user_id'].values, dtype=torch.long)
        self.items = torch.tensor(data['media_id'].values, dtype=torch.long)
        self.ages = torch.tensor(data['user_age_scaled'].values, dtype=torch.float32)
        self.genders = torch.tensor(data['user_gender_enc'].values, dtype=torch.long)
        self.platforms = torch.tensor(data['platform_name_enc'].values, dtype=torch.long)
        self.labels = torch.tensor(data['is_listened'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (self.users[idx], self.items[idx], self.ages[idx], self.genders[idx], self.platforms[idx]), self.labels[idx]

# Dataset in DataLoader packen (Batch-Verarbeitung)
train_data = ListenDataset(train)
test_data = ListenDataset(test)

train_loader = DataLoader(train_data, batch_size=512, shuffle=True)
test_loader = DataLoader(test_data, batch_size=512, shuffle=False)


In [17]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, num_genders, num_platforms, emb_size=64):
        super(NCF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.gender_emb = nn.Embedding(num_genders, emb_size // 2)  # Smaller embedding size for gender
        self.platform_emb = nn.Embedding(num_platforms, emb_size // 2)

        self.fc_layers = nn.Sequential(
            nn.Linear(emb_size * 2 + emb_size // 2 * 2 + 1, 128),  # +1 for age
            nn.ReLU(),
            nn.Dropout(0.5),  # Adding dropout for regularization
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, user, item, age, gender, platform):
        user_embedded = self.user_emb(user)
        item_embedded = self.item_emb(item)
        gender_embedded = self.gender_emb(gender)
        platform_embedded = self.platform_emb(platform)

        x = torch.cat([user_embedded, item_embedded, gender_embedded, platform_embedded, age.unsqueeze(1)], dim=-1)
        return self.fc_layers(x)


In [18]:
import torch.optim as optim

# Count unique values for all entities
num_users = data['user_id'].nunique()
num_items = data['media_id'].nunique()
num_genders = data['user_gender_enc'].nunique()  # Assuming 'user_gender_enc' has been preprocessed
num_platforms = data['platform_name_enc'].nunique()  # Assuming 'platform_name_enc' has been preprocessed

# Create the model instance with the new parameters
model = NCF(num_users, num_items, num_genders, num_platforms)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss, suitable for binary labels

# Move the model to the appropriate device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


NCF(
  (user_emb): Embedding(14745, 64)
  (item_emb): Embedding(69606, 64)
  (gender_emb): Embedding(2, 32)
  (platform_emb): Embedding(3, 32)
  (fc_layers): Sequential(
    (0): Linear(in_features=193, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Linear(in_features=64, out_features=32, bias=True)
    (6): ReLU()
    (7): Linear(in_features=32, out_features=1, bias=True)
    (8): Sigmoid()
  )
)

In [19]:
for epoch in range(10):
    model.train()
    total_loss = 0
    for batch in train_loader:
        (user, item, age, gender, platform), label = batch
        user, item, age, gender, platform, label = user.to(device), item.to(device), age.to(device), gender.to(device), platform.to(device), label.to(device)
        optimizer.zero_grad()
        preds = model(user, item, age, gender, platform).squeeze()
        loss = criterion(preds, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.6005
Epoch 2, Loss: 0.5578
Epoch 3, Loss: 0.5219
Epoch 4, Loss: 0.4983
Epoch 5, Loss: 0.4805
Epoch 6, Loss: 0.4642
Epoch 7, Loss: 0.4500
Epoch 8, Loss: 0.4352
Epoch 9, Loss: 0.4218
Epoch 10, Loss: 0.4087


In [20]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Evaluation mit F1-Score
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:  # Sichere Zuweisung
        user, item, label = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        preds = model(user, item).squeeze()
        predicted = (preds >= 0.5).float()  # Schwelle bei 0.5 für binäre Klassifikation
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

# Berechnung der Evaluationsmetriken
f1 = f1_score(all_labels, all_preds)
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)

# Ergebnisse ausgeben
print(f"F1-Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

AttributeError: 'list' object has no attribute 'to'