In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import heapq

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
census_df = pd.read_csv('data/webpage_rec_data/census.csv')
content_df = pd.read_csv('data/webpage_rec_data/content.csv')
test = pd.read_csv('data/webpage_rec_data/test.csv')
train = pd.read_csv('data/webpage_rec_data/train.csv')

In [3]:
test['URL_PATH'] = test['URL_PATH'].apply(lambda x: '/'+ x.lstrip('/en'))
test = test.reindex(index=test.index[::-1])

In [4]:
print(train.shape, test.shape)

(344914, 4) (147820, 4)


In [5]:
from abc import ABC, abstractmethod


class AbstractRecommender(ABC):
    @abstractmethod
    def observe(self, user_interaction):
        """Observe user interaction event as a Pandas Series."""
        pass

    @abstractmethod
    def recommend(self, user_id, n):
        """Return a list of n recommendations for a given user."""
        pass


TFIDF Accuracy: 8.60%

Random Accuracy: 28.92%

Popularity Accuracy: 30.94%


## TFIDF
Reccomend using top n similar content to the user's latest viewed page

In [6]:
class TFIDFRecommender(AbstractRecommender):
    def __init__(self):
        self.last_page_viewed = None
        self.vectorizer = None
        self.fitter = None
        self.contents = None
        self.top_pages = None
        
    def observe(self, new_page):
        self.last_page_viewed[new_page['USER_ID']] = new_page['URL_PATH']
        
    def recommend(self, user_id, n):
        if user_id in self.last_page_viewed:
            last_viewed_page = self.last_page_viewed[user_id]
            if last_viewed_page in self.contents.index:
                data = self.contents.loc[last_viewed_page]
                trans_text = self.vectorizer.transform([data])
                cosine_similarities = linear_kernel(trans_text, self.fitter).flatten()
                doc_index = self.contents.index.get_loc(last_viewed_page)
                cosine_similarities[doc_index] = -1
                related_docs_indices = cosine_similarities.argsort()[:-n-1:-1]
                return self.contents.iloc[related_docs_indices].index.values
            else:
                return np.random.choice(self.top_pages,n)    
        else:
            return np.random.choice(self.top_pages,n)

    def train(self, content, train):
        self.vectorizer = TfidfVectorizer()
        content= content[['url', 'title', 'article_content']].dropna()
        self.contents = content['title'] + ' ' + content['article_content']
        self.contents.index = content['url']
        self.fitter = self.vectorizer.fit_transform(self.contents)

        train['URL_PATH'] = train['URL_PATH'].apply(lambda x: '/' + x.lstrip('/en'))
        self.top_pages = train['URL_PATH'].value_counts().index[:10]
        first_encounter_df = train.drop_duplicates(subset='USER_ID', keep='first')
        self.last_page_viewed = first_encounter_df.set_index('USER_ID')['URL_PATH'].to_dict()
        


## Random user history
Reccomend using from randoming n pages from user's history

In [7]:
class RandomHistoryRecommender(AbstractRecommender):
    def __init__(self):
        self.user_interactions = None

    def observe(self, new_user_interaction):
        self.user_interactions.loc[len(self.user_interactions)] = new_user_interaction

    def recommend(self, user_id, n):
        user_interaction = self.user_interactions[self.user_interactions['USER_ID'] == str(user_id)]['URL_PATH']
        if user_interaction.shape[0] == 0:
            available_items = self.user_interactions['URL_PATH'].nunique()
            n = min(n, available_items)  # Ensure n doesn't exceed the number of available items
            return self.user_interactions['URL_PATH'].sample(n).values
        
        probabilities = user_interaction.value_counts(normalize=True)
        
        n = min(n, len(probabilities))
        
        return np.random.choice(probabilities.index, size=n, p=probabilities.values, replace=False)


    def train(self, train_df):
        self.user_interactions = train_df.copy()
        self.user_interactions['URL_PATH'] = self.user_interactions['URL_PATH'].apply(lambda x: '/' + x.lstrip('/en'))
        self.user_interactions = self.user_interactions.reset_index(drop=True)


## Popularity with time decay
popularity = $\sum\frac{1}{1+ \alpha(T-t)}$; page popularity diminishes over time

In [8]:
def time_popularity(T, t, alpha):
    return 1 / (1 + alpha * (T - t))

In [9]:
class Recommender(AbstractRecommender):
    def __init__(self):
        # Track the last page viewed per user
        self.last_page_viewed = {}

        # Popularity data per user
        # 'user_id: {url_path: [popularity, last_row_num]}'
        self.user_time_popularity = {}

        # Popularity heap per user
        # 'user_id: [(popularity, url_path)]'
        self.user_popularity_heap = {}

        self.alpha = 0.1

    def observe(self, user_interaction):
        # Extract user_id and URL path from the interaction
        user_id = user_interaction["USER_ID"]
        url_path = user_interaction["URL_PATH"]
        row_number = user_interaction['ROW_NUM']

        # Initialize data structures for the user if they don't exist
        if user_id not in self.user_time_popularity:
            self.user_time_popularity[user_id] = {}
            self.user_popularity_heap[user_id] = []

        user_popularity = self.user_time_popularity[user_id]

        # Update popularity for the URL path for the specific user
        if url_path in user_popularity:
            tmp = user_popularity[url_path]
            t = tmp[1]
            popularity = time_popularity(row_number, t, self.alpha)
            tmp[0] += popularity
            tmp[1] = row_number
            heapq.heapify(self.user_popularity_heap[user_id])
        else:
            popularity = time_popularity(row_number, row_number, self.alpha)
            tmp = [popularity, row_number, url_path]
            user_popularity[url_path] = tmp
            heapq.heappush(self.user_popularity_heap[user_id], tmp)

    def recommend(self, user_id, n):
        # Check if the user has any interaction data
        if user_id not in self.user_popularity_heap:
            return []

        # Get the top 'n' most popular items for the specific user
        result = heapq.nlargest(n, self.user_popularity_heap[user_id])
        return [i[2] for i in result]

    def train(self, user_interactions):
        for _, s in user_interactions.iterrows():
            self.observe(s)


In [10]:
n = 3
tfidf_recommender = TFIDFRecommender()
random_recommender = RandomHistoryRecommender()
popularity_recommender = Recommender()

train
print('Train...')
tfidf_recommender.train(content_df, train)
random_recommender.train(train)
popularity_recommender.train(train.reindex(index=train.index[::-1]))

total_num = 0
tfidf_right_prediction = 0
random_right_prediction = 0
popularity_right_prediction = 0


print('Test...')
for _, r in test.iterrows():
    user_id_str = str(r['USER_ID'])
    tfidf_rec = tfidf_recommender.recommend(user_id_str, 3)
    rand_rec = random_recommender.recommend(user_id_str, 3)
    pop_rec = popularity_recommender.recommend(user_id_str, 3)

    true_path = r['URL_PATH']
    tfidf_right_prediction += int(true_path in tfidf_rec)
    random_right_prediction += int(true_path in rand_rec)
    popularity_right_prediction += int(true_path in pop_rec)

    tfidf_recommender.observe(r)
    random_recommender.observe(r)
    popularity_recommender.observe(r)
    total_num += 1
    # if total_num % 50000 == 0:
    #     tfidf_accuracy = tfidf_right_prediction / total_num
    #     random_accuracy = random_right_prediction / total_num
    #     popularity_accuracy = popularity_right_prediction / total_num

    #     print(f"TFIDF Accuracy: {tfidf_accuracy:.2%}")
    #     print(f"Random Accuracy: {random_accuracy:.2%}")
    #     print(f"Popularity Accuracy: {popularity_accuracy:.2%}")


Train...
Test...


In [11]:
tfidf_accuracy = tfidf_right_prediction / total_num
random_accuracy = random_right_prediction / total_num
popularity_accuracy = popularity_right_prediction / total_num

print(f"TFIDF Accuracy: {tfidf_accuracy:.2%}")
print(f"Random Accuracy: {random_accuracy:.2%}")
print(f"Popularity Accuracy: {popularity_accuracy:.2%}")


TFIDF Accuracy: 8.60%
Random Accuracy: 28.92%
Popularity Accuracy: 30.94%
