<a href="https://colab.research.google.com/github/lucarenz1997/recommender_systems/blob/main/NCF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/lucarenz1997/recommender_systems/blob/main/NCF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Collaborative Filtering (NCF) Model
**Authors**: Rafaella and Luca

## 1. Setup

In [20]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset is later read
# from a path below this mount point.
drive.mount('/content/drive')
import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# are hidden as well; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. Load & prepare data

In [21]:
# Read the preprocessed listening sample from the mounted Drive folder.
data = pd.read_csv("/content/drive/MyDrive/Recommender/sample_preprocessed.csv")

# Encode `user_id` and `media_id` as integer codes (the only two inputs used for
# collaborative filtering). One encoder per column is kept so that the codes
# can be mapped back to the original IDs later.
user_encoder = LabelEncoder().fit(data["user_id"])
item_encoder = LabelEncoder().fit(data["media_id"])
data = data.assign(
    user_id=user_encoder.transform(data["user_id"]),
    media_id=item_encoder.transform(data["media_id"]),
)

data.head(100)


Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,days_since_release,genre_popularity,media_popularity,artist_popularity,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d,month
0,10,2016-11-12 22:01:41,10134,299421,1,2002-12-31,2,1,198,1,...,5065,12408,1,45,17,24,0,6,1,11
1,1129,2016-11-10 02:28:23,7568,224543,0,2005-12-05,0,0,223,0,...,3993,249,11,309,16,66,2,46,2,11
2,10,2016-11-02 07:41:53,3867,103376,0,2005-08-22,0,0,201,0,...,4090,12408,13,21,17,87,1,1,1,11
3,0,2016-11-24 17:23:28,63304,14101012,0,2016-09-23,0,0,187,0,...,62,168707,324,584,465,10,35,81,69,11
4,7,2016-11-11 11:55:23,3932,103798,0,1998-01-07,1,0,264,0,...,6883,42397,32,138,32,11,7,25,7,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7,2016-11-01 16:42:12,7424,222779,22,2004-12-31,2,1,262,0,...,4323,42397,1,1,1,7,0,0,0,11
96,723,2016-11-10 04:25:50,16720,623660,0,2010-06-14,0,0,173,0,...,2341,730,1,3,1,104,0,0,0,11
97,7,2016-11-21 10:03:31,27928,6197720,16,2011-06-13,2,1,265,0,...,1988,42397,2,148,32,5,1,26,8,11
98,297,2016-11-03 12:02:38,4877,114005,3,1995-04-25,0,0,224,0,...,7863,6518,145,222,176,36,34,46,38,11


### Train-Test-Split

In [22]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
train.head(100)


Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,days_since_release,genre_popularity,media_popularity,artist_popularity,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d,month
223976,0,2016-11-04 20:25:48,45673,10236282,0,2015-05-15,0,0,214,0,...,539,168707,328,1415,727,21,37,178,89,11
114265,0,2016-11-21 19:55:26,57945,13252983,0,2016-06-02,0,0,212,0,...,172,168707,39,75,39,40,8,14,8,11
208183,27,2016-11-02 08:39:58,32329,6893935,5,2013-09-16,0,0,255,1,...,1143,8487,76,927,525,94,10,175,98,11
337457,0,2016-11-15 18:52:31,57844,13234773,4,2016-04-08,0,0,189,1,...,221,168707,75,101,75,15,2,7,2,11
25572,10,2016-11-04 16:00:33,38432,8015598,1,2003-12-31,0,0,195,1,...,4692,12408,9,147,126,3,0,18,17,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271908,25481,2016-11-09 17:51:17,42698,9424202,3,2014-12-22,0,0,221,0,...,688,1596,4,630,27,26,2,100,5,11
257866,25521,2016-11-11 20:16:33,22019,1323378,1,2007-01-01,0,0,247,1,...,3602,392,37,93,37,57,5,16,5,11
222527,0,2016-11-07 07:52:41,48290,11128892,0,2015-09-04,0,0,205,0,...,430,168707,238,3613,238,20,44,581,44,11
173853,0,2016-11-15 09:06:13,64642,14270978,1,2016-10-11,0,0,223,1,...,35,168707,90,120,90,195,9,17,9,11


## 3. Create PyTorch dataset

In [23]:
class ListenDataset(Dataset):
    """Map-style dataset of (user, item, label) triples.

    The ``user_id``, ``media_id`` and ``is_listened`` columns of ``data`` are
    converted to tensors once, at construction time; indexing then returns the
    three values stored at the requested position.
    """

    def __init__(self, data):
        def column(name, dtype):
            # Convert one DataFrame column into a tensor of the given dtype.
            return torch.tensor(data[name].values, dtype=dtype)

        self.users = column("user_id", torch.long)
        self.items = column("media_id", torch.long)
        # The listened flag is stored as float32 (boolean -> float).
        self.labels = column("is_listened", torch.float32)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        return (self.users[idx], self.items[idx], self.labels[idx])

# Wrap each split in a dataset and serve it in mini-batches of 512;
# only the training batches are shuffled.
train_data, test_data = ListenDataset(train), ListenDataset(test)

train_loader = DataLoader(train_data, shuffle=True, batch_size=512)
test_loader = DataLoader(test_data, shuffle=False, batch_size=512)


## 4. NCF Model Definition

In [24]:
class NCF(nn.Module):
    """Neural collaborative filtering model.

    A user embedding and an item embedding are concatenated and passed through
    an MLP (2*emb_size -> 128 -> 64 -> 32 -> 1) that ends in a sigmoid, so the
    output is a value in (0, 1) with a trailing dimension of size 1.

    :param num_users: Number of rows in the user embedding table.
    :param num_items: Number of rows in the item embedding table.
    :param emb_size: Dimension of each embedding vector.
    """

    def __init__(self, num_users, num_items, emb_size=64):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)

        # Hidden stack: every consecutive pair of widths becomes Linear + ReLU.
        widths = [emb_size * 2, 128, 64, 32]
        hidden = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            hidden += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        # Output head: a single unit squashed by a sigmoid for binary classification.
        self.fc_layers = nn.Sequential(*hidden, nn.Linear(widths[-1], 1), nn.Sigmoid())

    def forward(self, user, item):
        # Look up both embeddings and join them along the feature dimension.
        pair = torch.cat((self.user_emb(user), self.item_emb(item)), dim=-1)
        return self.fc_layers(pair)


## 5. Initialize model

In [25]:
# Seed PyTorch's random number generator before any weights are created so
# that weight initialisation (and the subsequent batch shuffling on a fresh
# run) is reproducible.
torch.manual_seed(42)

# Size the embedding tables from the number of distinct encoded IDs.
num_users = data["user_id"].nunique()
num_items = data["media_id"].nunique()

model = NCF(num_users, num_items)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Binary cross-entropy on probabilities: the model already ends in a sigmoid
# and the labels are 0/1 floats.
criterion = nn.BCELoss()


## 6. Train the Model

In [26]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):
    model.train()
    total_loss = 0.0
    for user, item, label in train_loader:
        user, item, label = user.to(device), item.to(device), label.to(device)

        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing output dimension. A bare
        # squeeze() would also drop the batch dimension when the last batch
        # holds a single sample, and the loss would then reject the
        # mismatching prediction/label shapes.
        preds = model(user, item).squeeze(-1)
        loss = criterion(preds, label)  # Binary cross-entropy loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean batch loss of this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.6123
Epoch 2, Loss: 0.5542
Epoch 3, Loss: 0.5096
Epoch 4, Loss: 0.4765
Epoch 5, Loss: 0.4444
Epoch 6, Loss: 0.4102
Epoch 7, Loss: 0.3734
Epoch 8, Loss: 0.3357
Epoch 9, Loss: 0.2991
Epoch 10, Loss: 0.2640


## 7. Test model

In [29]:
import numpy as np
import torch
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Set model to evaluation mode
model.eval()

# Score the test set exactly once. The predicted probabilities do not depend
# on the decision threshold, so only the cheap binarisation step is repeated
# in the search loop below.
prob_batches, label_batches = [], []
with torch.no_grad():
    for user, item, label in test_loader:
        user, item = user.to(device), item.to(device)
        # squeeze(-1) drops only the trailing output dimension, so a final
        # batch holding a single sample still yields a 1-D array.
        probs = model(user, item).squeeze(-1)
        prob_batches.append(probs.cpu().numpy())
        label_batches.append(label.cpu().numpy())

all_probs = np.concatenate(prob_batches)
all_labels = np.concatenate(label_batches).astype(int)

# Define threshold search range
thresholds = np.arange(0.1, 1.0, 0.1)

# Variables to store the best threshold and F1-score
best_f1 = 0
best_threshold = 0
results = []

# NOTE(review): the threshold is selected on the same test set that is used to
# report the metrics, so the reported best F1 is optimistic. Consider tuning on
# a separate validation split.
for threshold in thresholds:
    # Convert probabilities to binary predictions for this threshold
    all_preds = (all_probs >= threshold).astype(int)

    # Compute metrics
    f1 = f1_score(all_labels, all_preds)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    # Store results
    results.append((threshold, f1, accuracy, precision, recall))

    # Check if this threshold gives the best F1-score
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

    # Print results
    print(f"Threshold: {threshold:.1f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("-" * 30)

# Print the best threshold
print(f"\n✅ Best Threshold: {best_threshold:.1f} with F1-Score: {best_f1:.4f}")

# Use the best threshold for final predictions. This must be applied to the raw
# probabilities: `all_preds` only holds the 0/1 predictions of the last
# threshold tried.
final_predictions = (all_probs >= best_threshold).astype(int)


Threshold: 0.1
F1-Score: 0.8092
Accuracy: 0.7017
Precision: 0.7172
Recall: 0.9283
------------------------------
Threshold: 0.2
F1-Score: 0.8028
Accuracy: 0.7016
Precision: 0.7303
Recall: 0.8914
------------------------------
Threshold: 0.3
F1-Score: 0.7954
Accuracy: 0.6988
Precision: 0.7404
Recall: 0.8593
------------------------------
Threshold: 0.4
F1-Score: 0.7872
Accuracy: 0.6948
Precision: 0.7499
Recall: 0.8285
------------------------------
Threshold: 0.5
F1-Score: 0.7770
Accuracy: 0.6891
Precision: 0.7599
Recall: 0.7949
------------------------------
Threshold: 0.6
F1-Score: 0.7624
Accuracy: 0.6793
Precision: 0.7699
Recall: 0.7549
------------------------------
Threshold: 0.7
F1-Score: 0.7410
Accuracy: 0.6650
Precision: 0.7831
Recall: 0.7032
------------------------------
Threshold: 0.8
F1-Score: 0.7065
Accuracy: 0.6419
Precision: 0.8001
Recall: 0.6326
------------------------------
Threshold: 0.9
F1-Score: 0.6299
Accuracy: 0.5930
Precision: 0.8281
Recall: 0.5083
--------------

In [33]:
def recommend_songs(user_id, model, item_encoder, top_k=10):
    """
    Returns the top-K recommended songs for a given user along with their predicted scores.

    Every item known to ``item_encoder`` is scored for the user; the function
    does not filter out any items.

    :param user_id: Encoded (integer) user index that is passed to the model as-is.
    :param model: The trained recommendation model, called as ``model(users, items)``.
    :param item_encoder: The fitted LabelEncoder for `media_id`; its ``classes_`` define the
        candidate items and it is used to convert encoded IDs back to original values.
        NOTE(review): assumes the encoder was fitted on the same items the model was built for.
    :param top_k: Number of songs to recommend (at most the number of known items).
    :return: DataFrame containing the recommended `media_id`s and their predicted scores,
        sorted by descending score.
    :raises ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    model.eval()  # Set model to evaluation mode

    # Run on whatever device the model lives on instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Candidate items are the encoded IDs 0..n-1 known to the encoder.
    num_items = len(item_encoder.classes_)
    all_items = torch.arange(num_items, dtype=torch.long, device=target_device)

    # Repeat the user index once per candidate item.
    user_tensor = torch.full((num_items,), int(user_id), dtype=torch.long, device=target_device)

    # Compute predictions; squeeze(-1) keeps the result 1-D even for a single item.
    with torch.no_grad():
        scores = model(user_tensor, all_items).squeeze(-1)

    # Sort scores in descending order and select the top-K items
    top_items = torch.argsort(scores, descending=True)[:top_k]

    # Convert predicted `media_id`s back to original values
    recommended_songs = item_encoder.inverse_transform(all_items[top_items].cpu().numpy())
    predicted_scores = scores[top_items].cpu().numpy()

    # Output as DataFrame
    recommendations = pd.DataFrame({"Recommended Media_IDs": recommended_songs, "Predicted Score": predicted_scores})

    return recommendations

# Demo: recommendations for one user (the value is an example; adjust as needed).
user_id_example = 123
recommended_songs = recommend_songs(
    user_id=user_id_example,
    model=model,
    item_encoder=item_encoder,
)

# Show the ten highest-scoring songs together with their scores.
print(recommended_songs.head(10))


   Recommended Media_IDs  Predicted Score
0              134945408         0.999649
1               64332453         0.999474
2                7667285         0.999381
3               67238739         0.999371
4              122439656         0.999337
5              125970763         0.999336
6              133200140         0.999183
7               65245764         0.999056
8              128956378         0.999017
9              135261310         0.999002
