In [47]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import scipy.sparse as sp
import numpy as np

In [48]:
def load_data(path):
    """
        Load events from files and convert to dataframe.
    """
    map_lst=[]
    for f in os.listdir(path):
        file_name=os.path.join(path,f)
        if os.path.isfile(file_name):
            for line in open(file_name):
                obj = json.loads(line.strip())
                if not obj is None:
                    map_lst.append(obj)
    return pd.DataFrame(map_lst)

In [49]:
#Functions used to process input data
def make_lower_case(text):
    """
        Process text into lower case.
    """
    try:
        return text.lower()
    except:
        return ""
    
def remove_stop_words(text):
    """
        Remove stop words from text.
    """
    text = text.split()
    stops = set(stopwords.words("norwegian"))
    text = [w for w in text if not w in stops]
    texts = [w for w in text if w.isalpha()]
    texts = " ".join(texts)
    return texts

def remove_punctuation(text):
    """
        Remove punctuation from text
    """
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

def remove_html(text):
    """
        Remove HTML tags from text
    """
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [50]:
def content_processing(df):
    """
        Applying text processing to the text contained in each title
    """
    df = df.loc[(df["url"] != "http://adressa.no")]
    df = df.dropna(subset=['title'])
    df = df.drop_duplicates(subset='title', keep="first")

    df['cleaned_title'] = df['title'].apply(func = make_lower_case)
    df['cleaned_title'] = df.cleaned_title.apply(func = remove_stop_words)
    df['cleaned_title'] = df.cleaned_title.apply(func = remove_punctuation)
    df['cleaned_title'] = df.cleaned_title.apply(func = remove_html)

    return df

In [51]:
#Load data into df
df = load_data("active1000")

#Processing of data to remove stop words, punctuation, and other irrelevant information
df_updated = content_processing(df)

#Split data into a traing and testing set 
train_data, test_data = train_test_split(df_updated, test_size=0.2, random_state=42)

#Vectorize the training data using the processed title
tfidf = TfidfVectorizer()
train_tfidf_matrix = tfidf.fit_transform(train_data['cleaned_title'])

In [52]:
def recommend_articles(article, index, number):
    """
        Returns a number of recommended articles to the article provided
    """
    # vectorize the article keywords
    article_vec = tfidf.transform([article])
    # calculate the similarity
    sim_scores = cosine_similarity(article_vec, train_tfidf_matrix)
    # get the most similar articles
    sim_scores = sim_scores[index]
    article_indices = sim_scores.argsort()[::-1][:number]
    article_indices = np.array(article_indices).reshape(-1)
    return train_data['title'].iloc[article_indices]

In [53]:
#Example user
user_id = "cx:ib1vo01vq38f2mqc:20lut6o1pv35i"  
user_data = df[df['userId'] == user_id]
last_article = user_data['title'].iloc[-1]

#Recommend articles to a user based on their last read artivle
print(recommend_articles(last_article, 0, 15))

2207202    Her fortviler Jarstein etter scoringen som kan...
2099171    «Vi skal til Champions League med Molde før vi...
1936005                           Her er laget ditt, Nilsen!
1345646    Ble oppdaget av landslagsstjerne – nå er sørle...
76743                               Dette kan koste deg dyrt
1854612               Se Champions League-trekningen direkte
1788643    Leicester-eventyret fortsetter: Til kvartfinal...
195540                      Vil ha kun de beste med på laget
1152243                      Mathallens stamkunder fortviler
1154       Her er scoringen som holder Liverpool inne i g...
2031225      Jarstein var uaktuell som kaptein for Lagerbäck
477585            Nå er Elabdellaoui klar for Premier League
426886     Kjøreturen mellom Trondheim og Steinkjer kan k...
1964063    Denne superbussveien på 750 meter kan koste 50...
1142043    Toppdommeren bytter ut Premier League med Saud...
Name: title, dtype: object


In [54]:
def eval(df): 
    """
        Evaluation function that prints average precision, recall and f1 score to a given data set
    """
    # Split the data into a training set and a test set
    holdout = test_data
    train = train_data

    # Evaluate the recommender system on the holdout set
    precision_list = []
    recall_list = []

    for user, article_title in holdout[['userId', 'title']].values:

        # Get the ground-truth similar articles for the current user and article
        ground_truth = df[(df['userId'] == user) & (df['title'] != article_title)]['title'].unique()
        
        # Get the index of the current article in the training set
        if article_title in train['title'].values:
            idx = train[train['title'] == article_title].index[0]
        else:
            idx = None
            
        top_titles = recommend_articles(article_title, idx)
        
        # Compute the precision and recall for the recommended articles
        relevant = set(ground_truth)
        retrieved = set(top_titles)
        intersection = relevant.intersection(retrieved)
        precision = len(intersection) / len(retrieved)
        if len(relevant) > 0:
            recall = len(intersection) / len(relevant)
        else:
            recall = 0.0
        precision_list.append(precision)
        recall_list.append(recall)

    # Compute the average precision, recall, and F1 score
    avg_precision = sum(precision_list) / len(precision_list)
    avg_recall = sum(recall_list) / len(recall_list)
    f1_score = 2 * ((avg_precision * avg_recall) / (avg_precision + avg_recall))

    print("Average precision: " +str(avg_precision))
    print("Average recall: " +str(avg_recall))
    print("Average f1 score: " +str(f1_score))

In [55]:
eval(df=df_updated)

Average precision: 0.10387735780290482
Average recall: 0.7991647944733982
Average f1 score: 0.1838565942680337
