In [1]:
import pandas as pd
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


In [2]:
# Load the datasets
football_news_df = pd.read_csv('../data/news_test.csv', encoding='latin1')
matches_df = pd.read_csv('../data/matches.csv', encoding='latin1')
users_df = pd.read_csv('../data/users.csv')
bets_df = pd.read_csv('../data/bets.csv')
liked_clubs_df = pd.read_csv('../data/liked_clubs.csv')
clubs_df = pd.read_csv('../data/clubs.csv')
feedback_file_path = '../data/feedback.csv'

try:
    feedback_df = pd.read_csv(feedback_file_path)
except FileNotFoundError:
    feedback_df = pd.DataFrame(columns=['user_id', 'news_id', 'action', 'rating'])

## Algo 2: Neural recommendation 

In [3]:
def prepare_data(feedback_df):
    feedback_df = feedback_df[feedback_df['action'] == 'rated'].dropna(subset=['rating'])
    feedback_df['rating'] = feedback_df['rating'].astype(float)
    user_ids = feedback_df['user_id'].astype('category').cat.codes
    news_ids = feedback_df['news_id'].astype('category').cat.codes
    feedback_df['user_id'] = user_ids
    feedback_df['news_id'] = news_ids

    num_users = user_ids.max() + 1
    num_items = news_ids.max() + 1
    return feedback_df, num_users, num_items

# Prepare betting and liked_clubs data
def get_user_preferences(user_id, bets_df, liked_clubs_df, clubs_df):
    # Teams from bets
    user_teams = bets_df[bets_df['user_id'] == user_id]['selected_team'].unique().tolist()

    # Clubs from liked_clubs
    liked_club_ids = liked_clubs_df[liked_clubs_df['user_id'] == user_id]['club_id'].unique()
    liked_clubs = clubs_df[clubs_df['id'].isin(liked_club_ids)]['club'].unique().tolist()

    return user_teams, liked_clubs

feedback_df, num_users, num_items = prepare_data(feedback_df)

In [4]:
class FeedbackDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['user_id'].values, dtype=torch.long)
        self.items = torch.tensor(df['news_id'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]
    
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=50):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user_ids, item_ids):
        user_embed = self.user_embedding(user_ids)
        item_embed = self.item_embedding(item_ids)
        x = torch.cat([user_embed, item_embed], dim=-1)
        return self.fc_layers(x).squeeze()

In [5]:
# Prepare feedback_df
feedback_df = feedback_df[feedback_df['action'] == 'rated'].dropna(subset=['rating'])
feedback_df['rating'] = feedback_df['rating'].astype(float)
feedback_df['user_id'] = feedback_df['user_id'].astype('category').cat.codes
feedback_df['news_id'] = feedback_df['news_id'].astype('category').cat.codes

num_users = feedback_df['user_id'].nunique()
num_items = feedback_df['news_id'].nunique()

# Split into train and test sets
train_df, test_df = train_test_split(feedback_df, test_size=0.2, random_state=42)
train_dataset = FeedbackDataset(train_df)
test_dataset = FeedbackDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize model, loss, and optimizer
model = NCF(num_users, num_items)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
model = NCF(num_users, num_items)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
# step 1 : train model
def train(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for user_ids, item_ids, ratings in train_loader:
            optimizer.zero_grad()
            predictions = model(user_ids, item_ids)
            loss = criterion(predictions, ratings)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")
        
# step 2 : evaluate model
def evaluate_recommendations(model, test_loader, top_k=10):
    model.eval()
    precision_list = []
    recall_list = []
    ndcg_list = []

    with torch.no_grad():
        for user_ids, item_ids, ratings in test_loader:
            for user_id in user_ids.unique():
                user_item_ids = item_ids[user_ids == user_id]

                # Get predictions for all items for the current user
                all_items = torch.arange(num_items, dtype=torch.long)
                user_ids_full = torch.full((num_items,), user_id.item(), dtype=torch.long)
                predictions = model(user_ids_full, all_items)
                
                # Get top K recommendations
                top_k_items = predictions.argsort(descending=True)[:top_k]

                # True items 
                true_items = user_item_ids.numpy()
                
                #precision, recall, and NDCG
                recommended_items = top_k_items.numpy()
                relevant = np.isin(recommended_items, true_items).astype(int)

                precision = np.sum(relevant) / top_k
                recall = np.sum(relevant) / len(true_items)
                
                # NDCG calculation
                dcg = np.sum(relevant / np.log2(np.arange(2, len(relevant) + 2)))
                idcg = np.sum(1 / np.log2(np.arange(2, len(true_items) + 2))) if len(true_items) > 0 else 1
                ndcg = dcg / idcg

                precision_list.append(precision)
                recall_list.append(recall)
                ndcg_list.append(ndcg)
                
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_ndcg = np.mean(ndcg_list)

    print(f"Average Precision@{top_k}: {avg_precision:.4f}")
    print(f"Average Recall@{top_k}: {avg_recall:.4f}")
    print(f"Average NDCG@{top_k}: {avg_ndcg:.4f}")

    return avg_precision, avg_recall, avg_ndcg

# step 3 : get recommendation of user
def recommend_for_user(model, user_id, num_recommendations=10):
    model.eval()
    all_items = torch.arange(num_items, dtype=torch.long)
    user_ids = torch.full((num_items,), user_id, dtype=torch.long)
    with torch.no_grad():
        predictions = model(user_ids, all_items)
    top_items = predictions.argsort(descending=True)[:num_recommendations]
    return top_items.numpy()

# step 4 : generate personalized recommendations
def personalized_recommendations(user_id, model, articles_df, num_recommendations=10):
    # Get top-N recommendations from the model
    recommended_items = recommend_for_user(model, user_id, num_recommendations)
    recommended_articles = articles_df.iloc[recommended_items][['News ID', 'Title']]

    # Add preferred teams and liked clubs
    user_teams, liked_clubs = get_user_preferences(user_id, bets_df, liked_clubs_df, clubs_df)
    additional_recommendations = articles_df[
        articles_df['Content'].str.contains('|'.join(user_teams + liked_clubs), case=False, na=False)
    ][['News ID', 'Title']]

    combined_recommendations = pd.concat([recommended_articles, additional_recommendations]).drop_duplicates()

    return combined_recommendations


In [11]:
train(model, train_loader, criterion, optimizer, epochs=10)

# Evaluate the model
precision, recall, ndcg = evaluate_recommendations(model, test_loader, top_k=10)

user_id = 0  # Example user ID
recommendations = personalized_recommendations(user_id, model, football_news_df, num_recommendations=10)
# print("Recommendations for User:")
# print(recommendations)


Epoch 1, Loss: 0.29445644840598106
Epoch 2, Loss: 0.24500812962651253
Epoch 3, Loss: 0.22364230826497078
Epoch 4, Loss: 0.20027923583984375
Epoch 5, Loss: 0.18289390951395035
Epoch 6, Loss: 0.16001443192362785
Epoch 7, Loss: 0.13900696486234665
Epoch 8, Loss: 0.12377487495541573
Epoch 9, Loss: 0.11949182488024235
Epoch 10, Loss: 0.10745514929294586
Average Precision@10: 0.0375
Average Recall@10: 0.0699
Average NDCG@10: 0.0358
