<a href="https://colab.research.google.com/github/lucarenz1997/recommender_systems/blob/main/Hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hybrid Neural Collaborative Filtering (NCF) Model

In [43]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from google.colab import drive
drive.mount('/content/drive')
import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load data

In [44]:
#Load data
data = pd.read_csv("/content/drive/MyDrive/Recommender/sample_preprocessed.csv")

# Encoding categorical variables
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
genre_encoder = LabelEncoder()
gender_encoder = LabelEncoder()
platform_encoder = LabelEncoder()
artist_encoder = LabelEncoder()
age_scaler = MinMaxScaler()
song_popularity_7d_scaler = MinMaxScaler()

data.head(100)



Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,days_since_release,genre_popularity,media_popularity,artist_popularity,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d,month
0,10,2016-11-12 22:01:41,3092645,299421,1,2002-12-31,2,1,198,1,...,5065,12408,1,45,17,24,0,6,1,11
1,1129,2016-11-10 02:28:23,2247915,224543,0,2005-12-05,0,0,223,0,...,3993,249,11,309,16,66,2,46,2,11
2,10,2016-11-02 07:41:53,917717,103376,0,2005-08-22,0,0,201,0,...,4090,12408,13,21,17,87,1,1,1,11
3,0,2016-11-24 17:23:28,132625720,14101012,0,2016-09-23,0,0,187,0,...,62,168707,324,584,465,10,35,81,69,11
4,7,2016-11-11 11:55:23,921901,103798,0,1998-01-07,1,0,264,0,...,6883,42397,32,138,32,11,7,25,7,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7,2016-11-01 16:42:12,2225892,222779,22,2004-12-31,2,1,262,0,...,4323,42397,1,1,1,7,0,0,0,11
96,723,2016-11-10 04:25:50,6744852,623660,0,2010-06-14,0,0,173,0,...,2341,730,1,3,1,104,0,0,0,11
97,7,2016-11-21 10:03:31,63103296,6197720,16,2011-06-13,2,1,265,0,...,1988,42397,2,148,32,5,1,26,8,11
98,297,2016-11-03 12:02:38,1044131,114005,3,1995-04-25,0,0,224,0,...,7863,6518,145,222,176,36,34,46,38,11


In [45]:
data["user_id"] = user_encoder.fit_transform(data["user_id"])
data["media_id"] = item_encoder.fit_transform(data["media_id"])
data["genre_id"] = genre_encoder.fit_transform(data["genre_id"])
data["user_gender"] = gender_encoder.fit_transform(data["user_gender"])
data["platform_name"] = platform_encoder.fit_transform(data["platform_name"])
data["artist_id"] = artist_encoder.fit_transform(data["artist_id"])
data["user_age"] = age_scaler.fit_transform(data[["user_age"]])
data["song_popularity_7d"] = song_popularity_7d_scaler.fit_transform(data[["song_popularity_7d"]])

In [46]:
# Train-Test-Split
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Custom Dataset class
class HybridListenDataset(Dataset):
    def __init__(self, data):
        self.users = torch.tensor(data["user_id"].values, dtype=torch.long)
        self.items = torch.tensor(data["media_id"].values, dtype=torch.long)
        self.genres = torch.tensor(data["genre_id"].values, dtype=torch.long)
        self.genders = torch.tensor(data["user_gender"].values, dtype=torch.long)
        self.platforms = torch.tensor(data["platform_name"].values, dtype=torch.long)
        self.artists = torch.tensor(data["artist_id"].values, dtype=torch.long)
        self.ages = torch.tensor(data["user_age"].values, dtype=torch.float32)
        self.song_popularity_7d = torch.tensor(data["song_popularity_7d"].values, dtype=torch.float32)
        self.labels = torch.tensor(data["is_listened"].values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (self.users[idx], self.items[idx], self.genres[idx], self.genders[idx],
                self.platforms[idx], self.artists[idx], self.ages[idx], self.song_popularity_7d[idx], self.labels[idx])

# Dataset & DataLoader
train_data = HybridListenDataset(train)
test_data = HybridListenDataset(test)

train_loader = DataLoader(train_data, batch_size=512, shuffle=True)
test_loader = DataLoader(test_data, batch_size=512, shuffle=False)

In [47]:
# NCF Hybrid Model
class NCFHybrid(nn.Module):
    def __init__(self, num_users, num_items, num_genres, num_genders, num_platforms, num_artists, emb_size=64):
        super(NCFHybrid, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.genre_emb = nn.Embedding(num_genres, emb_size // 2)
        self.gender_emb = nn.Embedding(num_genders, emb_size // 2)
        self.platform_emb = nn.Embedding(num_platforms, emb_size // 2)
        self.artist_emb = nn.Embedding(num_artists, emb_size // 2)
        self.age_fc = nn.Linear(1, emb_size // 2)
        self.popularity_fc = nn.Linear(1, emb_size // 2)

        input_size = emb_size * 2 + (emb_size // 2) * 6
        self.fc_layers = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, user, item, genre, gender, platform, artist, age, popularity):
        user_embedded = self.user_emb(user)
        item_embedded = self.item_emb(item)
        genre_embedded = self.genre_emb(genre)
        gender_embedded = self.gender_emb(gender)
        platform_embedded = self.platform_emb(platform)
        artist_embedded = self.artist_emb(artist)
        age_embedded = self.age_fc(age.unsqueeze(1))
        popularity_embedded = self.popularity_fc(popularity.unsqueeze(1))

        x = torch.cat([user_embedded, item_embedded, genre_embedded,
                       gender_embedded, platform_embedded, artist_embedded, age_embedded, popularity_embedded], dim=-1)
        return self.fc_layers(x)

# Model Training
num_users = data["user_id"].nunique()
num_items = data["media_id"].nunique()
num_genres = data["genre_id"].nunique()
num_genders = data["user_gender"].nunique()
num_platforms = data["platform_name"].nunique()
num_artists = data["artist_id"].nunique()

model = NCFHybrid(num_users, num_items, num_genres, num_genders, num_platforms, num_artists)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

for epoch in range(10):
    model.train()
    total_loss = 0
    for user, item, genre, gender, platform, artist, age, popularity, label in train_loader:
        user, item, genre, gender, platform, artist, age, popularity, label = (user.to(device), item.to(device), genre.to(device),
                                                                               gender.to(device), platform.to(device), artist.to(device),
                                                                               age.to(device), popularity.to(device), label.to(device))
        optimizer.zero_grad()
        preds = model(user, item, genre, gender, platform, artist, age, popularity).squeeze()
        loss = criterion(preds, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.5915
Epoch 2, Loss: 0.5432
Epoch 3, Loss: 0.5057
Epoch 4, Loss: 0.4765
Epoch 5, Loss: 0.4485
Epoch 6, Loss: 0.4186
Epoch 7, Loss: 0.3872
Epoch 8, Loss: 0.3546
Epoch 9, Loss: 0.3216
Epoch 10, Loss: 0.2897


In [48]:
# Model Evaluation
model.eval()
thresholds = np.arange(0.1, 1.0, 0.1)
best_f1 = 0
best_threshold = 0
results = []

for threshold in thresholds:
    all_preds, all_labels = [], []
    with torch.no_grad():
        for user, item, genre, gender, platform, artist, age, popularity, label in test_loader:
            user, item, genre, gender, platform, artist, age, popularity, label = (
                user.to(device), item.to(device), genre.to(device), gender.to(device),
                platform.to(device), artist.to(device), age.to(device), popularity.to(device), label.to(device)
            )
            preds = model(user, item, genre, gender, platform, artist, age, popularity).squeeze()
            predicted = (preds >= threshold).float()
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(label.cpu().numpy())

    f1 = f1_score(all_labels, all_preds)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    results.append((threshold, f1, accuracy, precision, recall))
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

    print(f"Threshold: {threshold:.1f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("-" * 30)

print(f"\n✅ Best Threshold: {best_threshold:.1f} with F1-Score: {best_f1:.4f}")

Threshold: 0.1
F1-Score: 0.8126
Accuracy: 0.7057
Precision: 0.7179
Recall: 0.9360
------------------------------
Threshold: 0.2
F1-Score: 0.8090
Accuracy: 0.7090
Precision: 0.7319
Recall: 0.9041
------------------------------
Threshold: 0.3
F1-Score: 0.8043
Accuracy: 0.7091
Precision: 0.7427
Recall: 0.8770
------------------------------
Threshold: 0.4
F1-Score: 0.7979
Accuracy: 0.7067
Precision: 0.7520
Recall: 0.8498
------------------------------
Threshold: 0.5
F1-Score: 0.7899
Accuracy: 0.7028
Precision: 0.7620
Recall: 0.8200
------------------------------
Threshold: 0.6
F1-Score: 0.7766
Accuracy: 0.6943
Precision: 0.7734
Recall: 0.7799
------------------------------
Threshold: 0.7
F1-Score: 0.7541
Accuracy: 0.6785
Precision: 0.7874
Recall: 0.7236
------------------------------
Threshold: 0.8
F1-Score: 0.7149
Accuracy: 0.6508
Precision: 0.8055
Recall: 0.6427
------------------------------
Threshold: 0.9
F1-Score: 0.6395
Accuracy: 0.6009
Precision: 0.8316
Recall: 0.5195
--------------

In [49]:
# Function to recommend songs with star ratings
def recommend_songs_with_star_ratings(user_id, model, item_encoder, best_threshold, top_k=10):
    model.eval()
    all_items = torch.tensor(data["media_id"].unique(), dtype=torch.long).to(device)
    user_tensor = torch.tensor([user_id] * len(all_items), dtype=torch.long).to(device)

    # Get user features
    user_data = data[data["user_id"] == user_id].iloc[0]
    user_genre = torch.tensor([user_data["genre_id"]] * len(all_items), dtype=torch.long).to(device)
    user_gender = torch.tensor([user_data["user_gender"]] * len(all_items), dtype=torch.long).to(device)
    user_platform = torch.tensor([user_data["platform_name"]] * len(all_items), dtype=torch.long).to(device)
    user_artist = torch.tensor([user_data["artist_id"]] * len(all_items), dtype=torch.long).to(device)
    user_age = torch.tensor([user_data["user_age"]] * len(all_items), dtype=torch.float32).to(device)
    user_popularity = torch.tensor([user_data["song_popularity_7d"]] * len(all_items), dtype=torch.float32).to(device)

    with torch.no_grad():
        scores = model(user_tensor, all_items, user_genre, user_gender, user_platform, user_artist, user_age, user_popularity).squeeze()

    top_items = torch.argsort(scores, descending=True)[:top_k]
    top_scores = scores[top_items].cpu().numpy()
    scaler = MinMaxScaler()
    top_scores_normalized = scaler.fit_transform(top_scores.reshape(-1, 1)).flatten()

    percentile_33 = np.percentile(top_scores_normalized, 33)
    percentile_66 = np.percentile(top_scores_normalized, 66)

    def score_to_star_rating(score, raw_score):
        if raw_score < best_threshold:
            return ""
        elif score < percentile_33:
            return "⭐"
        elif score < percentile_66:
            return "⭐⭐"
        else:
            return "⭐⭐⭐"

    star_ratings = [score_to_star_rating(score, raw_score) for score, raw_score in zip(top_scores_normalized, top_scores)]
    recommended_songs = item_encoder.inverse_transform(all_items[top_items].cpu().numpy())

    recommendations_df = pd.DataFrame({
        "Recommended Media_IDs": recommended_songs,
        "Predicted Score": top_scores,
        "Normalized Score": top_scores_normalized,
        "Star Rating": star_ratings
    })

    return recommendations_df

user_id_example = 123
recommended_songs_df = recommend_songs_with_star_ratings(user_id_example, model, item_encoder, best_threshold)
from IPython.display import display
display(recommended_songs_df)

Unnamed: 0,Recommended Media_IDs,Predicted Score,Normalized Score,Star Rating
0,68935758,0.999912,1.0,⭐⭐⭐
1,3821960,0.999904,0.933594,⭐⭐⭐
2,123495130,0.999896,0.87207,⭐⭐⭐
3,1090517,0.999877,0.712891,⭐⭐⭐
4,90305555,0.99986,0.572266,⭐⭐
5,133895044,0.999858,0.554688,⭐⭐
6,99976960,0.999848,0.480469,⭐⭐
7,10214505,0.999819,0.239746,⭐
8,124262680,0.999795,0.044922,⭐
9,122047354,0.99979,0.0,⭐
