In [1]:
import numpy as np
import pandas as pd
from math import log2

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
football_news_df = pd.read_csv("news_test.csv", encoding="latin1")
matches_df = pd.read_csv("matches.csv", encoding="latin1")
users_df = pd.read_csv("users.csv")
bets_df = pd.read_csv("bets.csv")
feedback_df = pd.read_csv('feedback.csv')

# Part 1: Recommendation system

In [3]:
def preprocess_data(news_df):
    news_df = news_df.dropna(subset=["Content", "Title"])
    news_df["Content"] = news_df["Content"].fillna("")
    return news_df


def recommend_articles(user_id, football_news_df, bets_df, feedback_df, top_n=20):
    user_bets = bets_df[bets_df['user_id'] == user_id]
    preferred_teams = user_bets['selected_team'].unique()

    relevant_articles = football_news_df[
        football_news_df['Content'].str.contains('|'.join(preferred_teams), case=False, na=False)
    ]['News ID'].tolist()

    not_interested = feedback_df[
        (feedback_df['user_id'] == user_id) & (feedback_df['action'] == 'not_interested')
    ]['news_id'].tolist()

    filtered_articles = football_news_df[~football_news_df['News ID'].isin(not_interested)]

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_articles['Content'])

    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    user_feedback = feedback_df[feedback_df['user_id'] == user_id]
    for _, feedback in user_feedback.iterrows():
        if feedback['action'] == 'rated':
            news_index = filtered_articles[filtered_articles['News ID'] == feedback['news_id']].index[0]
            similarity_matrix[:, news_index] *= (1 + feedback['rating'] / 5.0)

    recommended_indices = similarity_matrix.sum(axis=1).argsort()[-top_n:][::-1]
    recommendations = filtered_articles.iloc[recommended_indices]
    
    return recommendations['News ID'].tolist()


## 1. Algo: tf-idf

In [4]:
class RecommendationSystem:
    def __init__(self, football_news_df, bets_df, feedback_df):
        self.football_news_df = football_news_df
        self.bets_df = bets_df
        self.feedback_df = feedback_df

    def recommend(self, user_id, k=10):
        return recommend_articles(
            user_id=user_id,
            football_news_df=self.football_news_df,
            bets_df=self.bets_df,
            feedback_df=self.feedback_df,
            top_n=k
        )


In [6]:
# system_instance
system_instance = RecommendationSystem(football_news_df, bets_df, feedback_df)

# content_embeddings
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
content_embeddings = tfidf_vectorizer.fit_transform(football_news_df['Content']).toarray()

news_id_to_index = {news_id: idx for idx, news_id in enumerate(football_news_df['News ID'])}

# global_popularity

global_popularity = feedback_df['news_id'].value_counts().to_dict()

max_interactions = max(global_popularity.values(), default=1)
global_popularity = {news_id: count / max_interactions for news_id, count in global_popularity.items()}





In [7]:
def recommend_articles(user_id, football_news_df, bets_df, feedback_df, top_n=20):
    user_bets = bets_df[bets_df['user_id'] == user_id]
    preferred_teams = user_bets['selected_team'].unique()

    # Filter articles related to the user's preferred teams
    relevant_articles = football_news_df[
        football_news_df['Content'].str.contains('|'.join(preferred_teams), case=False, na=False)
    ]['News ID'].tolist()

    # Handle case where no articles are relevant
    if not relevant_articles:
        return []  # Return an empty list if no articles are relevant

    not_interested = feedback_df[
        (feedback_df['user_id'] == user_id) & (feedback_df['action'] == 'not_interested')
    ]['news_id'].tolist()

    filtered_articles = football_news_df[~football_news_df['News ID'].isin(not_interested)]

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_articles['Content'])

    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    user_feedback = feedback_df[feedback_df['user_id'] == user_id]
    for _, feedback in user_feedback.iterrows():
        if feedback['action'] == 'rated':
            news_index = filtered_articles[filtered_articles['News ID'] == feedback['news_id']].index[0]
            similarity_matrix[:, news_index] *= (1 + feedback['rating'] / 5.0)

    if similarity_matrix.shape[0] == 0:
        return []  # Return empty if no recommendations are possible

    recommended_indices = similarity_matrix.sum(axis=1).argsort()[-top_n:][::-1]
    recommendations = filtered_articles.iloc[recommended_indices]

    return recommendations['News ID'].tolist()

# Part 2: Evaluation of recommendation system

## === Evaluation functions ===

### 1. Predictive quality metrics

In [8]:
def precision_at_k(recommended, relevant, k):
    recommended_at_k = recommended[:k]
    hits = len(set(recommended_at_k) & set(relevant))
    return hits / k


def recall_at_k(recommended, relevant, k):
    recommended_at_k = recommended[:k]
    hits = len(set(recommended_at_k) & set(relevant))
    return hits / len(relevant) if len(relevant) > 0 else 0


def f_score_at_k(precision, recall):
    return (
        2 * (precision * recall) / (precision + recall)
        if (precision + recall) > 0
        else 0
    )

### 2. Ranking quality metrics

In [9]:
def mrr(recommended, relevant):
    for i, item in enumerate(recommended):
        if item in relevant:
            return 1 / (i + 1)
    return 0


def average_precision(recommended, relevant):
    hits, precision_sum = 0, 0
    for i, item in enumerate(recommended):
        if item in relevant:
            hits += 1
            precision_sum += hits / (i + 1)
    return precision_sum / len(relevant) if relevant else 0


def ndcg(recommended, relevant, k):
    recommended_at_k = recommended[:k]
    dcg = sum(
        [1 / log2(i + 2) if recommended_at_k[i] in relevant else 0 for i in range(k)]
    )
    idcg = sum([1 / log2(i + 2) for i in range(min(len(relevant), k))])
    return dcg / idcg if idcg > 0 else 0

### 3. Behavioral metrics

In [10]:
def diversity(recommended, content_embeddings):
    similarities = cosine_similarity(content_embeddings[recommended])
    return 1 - np.mean(similarities[np.triu_indices(len(similarities), k=1)])


def novelty(recommended, global_popularity):
    return np.mean([1 - global_popularity[item] for item in recommended])


def serendipity(recommended, relevant, global_popularity):
    unexpected_items = [item for item in recommended if item not in relevant]
    return np.mean([1 - global_popularity[item] for item in unexpected_items])

## === Evaluation pipeline ===

In [11]:
def evaluate_system(system, test_data, content_embeddings, global_popularity, k=10):
    precision_scores, recall_scores, f_scores = [], [], []
    mrr_scores, map_scores, ndcg_scores = [], [], []
    diversity_scores, novelty_scores, serendipity_scores = [], [], []

    for user_id, user_data in test_data.groupby("user_id"):
        relevant = set(user_data["relevant_items"].values[0])
        recommended = system.recommend(user_id, k=k)

        # Skip evaluation if no recommendations are generated
        if not recommended:
            continue

        # Predictive metrics
        precision = precision_at_k(recommended, relevant, k)
        recall = recall_at_k(recommended, relevant, k)
        f_score = f_score_at_k(precision, recall)

        # Ranking metrics
        mrr_score = mrr(recommended, relevant)
        map_score = average_precision(recommended, relevant)
        ndcg_score = ndcg(recommended, relevant, k)

        # Behavioral metrics
        diversity_score = diversity(recommended, content_embeddings)
        novelty_score = novelty(recommended, global_popularity)
        serendipity_score = serendipity(recommended, relevant, global_popularity)

        # Append scores
        precision_scores.append(precision)
        recall_scores.append(recall)
        f_scores.append(f_score)
        mrr_scores.append(mrr_score)
        map_scores.append(map_score)
        ndcg_scores.append(ndcg_score)
        diversity_scores.append(diversity_score)
        novelty_scores.append(novelty_score)
        serendipity_scores.append(serendipity_score)

    return {
        "Precision@K": np.mean(precision_scores) if precision_scores else 0,
        "Recall@K": np.mean(recall_scores) if recall_scores else 0,
        "F-Score@K": np.mean(f_scores) if f_scores else 0,
        "MRR": np.mean(mrr_scores) if mrr_scores else 0,
        "MAP": np.mean(map_scores) if map_scores else 0,
        "NDCG": np.mean(ndcg_scores) if ndcg_scores else 0,
        "Diversity": np.mean(diversity_scores) if diversity_scores else 0,
        "Novelty": np.mean(novelty_scores) if novelty_scores else 0,
        "Serendipity": np.mean(serendipity_scores) if serendipity_scores else 0,
    }

### 3. Testing

In [12]:
def generate_test_data(users_df, bets_df, feedback_df, football_news_df, top_n=5):
    test_data = []

    for user_id in users_df['id']:
        user_bets = bets_df[bets_df['user_id'] == user_id]
        preferred_teams = user_bets['selected_team'].unique()

        relevant_articles = football_news_df[
            football_news_df['Content'].str.contains('|'.join(preferred_teams), case=False, na=False)
        ]['News ID'].tolist()

        positive_feedback = feedback_df[
            (feedback_df['user_id'] == user_id) & 
            (feedback_df['action'] == 'rated') & 
            (feedback_df['rating'] > 3)
        ]['news_id'].tolist()

        all_relevant = list(set(relevant_articles + positive_feedback))

        np.random.shuffle(all_relevant)
        relevant_items = all_relevant[:top_n]

        if relevant_items:
            test_data.append({'user_id': user_id, 'relevant_items': relevant_items})

    return pd.DataFrame(test_data)


In [13]:
test_data = generate_test_data(users_df, bets_df, feedback_df, football_news_df, top_n=5)
print(test_data)
# Evaluate the recommendation system
results = evaluate_system(
    system=system_instance,
    test_data=test_data,
    content_embeddings=content_embeddings,
    global_popularity=global_popularity,
    k=10
)

print("Evaluation Results:")
print(results)

  user_id                                     relevant_items
0   user1  [F8A13A98CF86B50D309D351C29548DFB, 94C4B539EF0...
1   user2  [A22F8CCDD1E49ADC580862AA628391F5, 1C66D7CABC6...
2   user3  [8E8E0613343445244062F87A9D7EBBBD, A041C0AE64E...


IndexError: index 0 is out of bounds for axis 0 with size 0