In [1]:
import os
import json
import numpy as np
import pandas as pd
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import re
import string
import random
import scipy.sparse as sp

In [2]:
def load_data(path):
    """
        Load events from files and convert to dataframe.

        Every regular file directly inside `path` is read as JSON Lines (one
        JSON document per line); sub-directories are skipped.

        Parameters:
            path (str): Directory that contains the event files.

        Returns:
            pandas.DataFrame: One row per decoded, non-null JSON value, in
            sorted file-name order and then in line order.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and everything derived from it, e.g. "keep first" de-duplication
    # and the train/test split) stable between runs and machines.
    for entry in sorted(os.listdir(path)):
        file_name = os.path.join(path, entry)
        if not os.path.isfile(file_name):
            continue
        # The context manager closes every handle promptly instead of leaving
        # it to the garbage collector; the explicit encoding avoids depending
        # on the platform's locale default.
        with open(file_name, encoding="utf-8") as events:
            for line in events:
                line = line.strip()
                if not line:
                    # A blank line (e.g. a trailing newline) is not JSON.
                    continue
                obj = json.loads(line)
                if obj is not None:
                    records.append(obj)
    return pd.DataFrame(records)

In [3]:
def make_lower_case(text):
    """
        Return `text` converted to lower case.

        Values without a `lower()` method (None, NaN floats, numbers, ...)
        yield an empty string so the function can be applied to a column
        that contains missing values.

        Parameters:
            text: Usually a str; any object is accepted.

        Returns:
            str: The lower-cased text, or "" for non-text input.
    """
    try:
        return text.lower()
    except AttributeError:
        # Only "this is not text" is treated as empty; anything else
        # (e.g. KeyboardInterrupt) must keep propagating.
        return ""
    
# Stop-word sets keyed by language name, filled lazily on first use.
_STOP_WORD_CACHE = {}


def _stop_words_for(language):
    """Return the NLTK stop words for `language` as a set, loading them once."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def remove_stop_words(text, language="norwegian"):
    """
        Drop stop words and non-alphabetic tokens from `text`.

        The text is split on whitespace. A token is kept only if it is not in
        the stop-word list for `language` and consists solely of alphabetic
        characters (`str.isalpha`).

        Parameters:
            text (str): Text to filter.
            language (str): Name of the NLTK stop-word list to use.

        Returns:
            str: The surviving tokens joined by single spaces.
    """
    # The stop-word list is cached because this function is applied once per
    # row; re-reading the corpus on every call dominated the run time.
    stops = _stop_words_for(language)
    kept = [w for w in text.split() if w not in stops and w.isalpha()]
    return " ".join(kept)

def remove_punctuation(text):
    """
        Keep only the word-character runs of `text`.

        The text is tokenized with the pattern `\\w+`, so everything that is
        not a letter, digit or underscore acts as a separator.

        Returns:
            str: The tokens joined by single spaces.
    """
    word_tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(word_tokens)

def remove_html(text):
    """
        Delete every `<...>` span from `text`.

        The match is non-greedy and does not cross line breaks, so a tag that
        is split over several lines is left untouched.

        Returns:
            str: `text` without the matched spans.
    """
    return re.sub('<.*?>', '', text)

In [5]:
def content_processing(df):
    """
        Filter the raw events and add a normalised `cleaned_title` column.

        Rows whose `url` equals "http://adressa.no" are dropped, as are rows
        without a title and every repeat of an already seen title (the first
        occurrence is kept).

        `cleaned_title` is the title lower-cased, stripped of punctuation and
        then stripped of stop words / non-alphabetic tokens.

        Parameters:
            df (pandas.DataFrame): Events with at least `url` and `title`
                columns. It is not modified.

        Returns:
            pandas.DataFrame: A new frame with the surviving rows and the
            extra `cleaned_title` column.
    """
    # The trailing copy() makes the result an independent frame, so adding a
    # column below neither touches the caller's data nor triggers pandas'
    # SettingWithCopyWarning.
    articles = (
        df.loc[df["url"] != "http://adressa.no"]
        .dropna(subset=['title'])
        .drop_duplicates(subset='title', keep="first")
        .copy()
    )

    cleaned = articles['title'].apply(make_lower_case)
    # Punctuation has to go before the stop-word pass: that pass discards any
    # token that is not purely alphabetic, so "erna," or "solberg:" would
    # otherwise be thrown away together with their punctuation.
    cleaned = cleaned.apply(remove_punctuation)
    cleaned = cleaned.apply(remove_stop_words)
    articles['cleaned_title'] = cleaned

    return articles

In [6]:
# Load the raw events from the "active1000" directory into one DataFrame.
df = load_data("active1000")

# Filter and de-duplicate the rows and add the `cleaned_title` column.
df_updated = content_processing(df)

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(df_updated, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training titles only, then project the test
# titles into that same feature space.
# NOTE(review): the raw `title` column is vectorized here, not the
# `cleaned_title` column that content_processing() adds - confirm this is intended.
tfidf = TfidfVectorizer()
train_tfidf_matrix = tfidf.fit_transform(train_data['title'])
test_tfidf_matrix = tfidf.transform(test_data['title'])

# Pairwise cosine similarity: one row per test title, one column per train title.
# NOTE(review): `cosine_sim` is only referenced from commented-out code in the
# cells visible here; if nothing else reads it, this (potentially large)
# matrix could be dropped - verify before removing.
cosine_sim = cosine_similarity(test_tfidf_matrix, train_tfidf_matrix)

In [7]:
def recommend_articles(article, top_n=10, threshold=None):
    """
        Recommend the training titles most similar to a piece of text.

        The text is projected with the fitted module-level `tfidf` vectorizer
        and compared against `train_tfidf_matrix` by cosine similarity.

        Parameters:
            article (str): Query text, e.g. a title or a few keywords.
            top_n (int): Maximum number of titles to return.
            threshold (float or None): When given, only titles whose
                similarity is strictly greater than this value are returned.
                None (the default) disables the filter, so a query that
                matches nothing still yields `top_n` zero-score titles.

        Returns:
            pandas.Series: Titles from `train_data`, most similar first.

        Raises:
            ValueError: If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    # vectorize the article keywords
    article_vec = tfidf.transform([article])
    # similarity of the single query row against every training title
    sim_scores = cosine_similarity(article_vec, train_tfidf_matrix)[0]
    # positions of the highest scores, best first
    article_indices = sim_scores.argsort()[::-1][:top_n]
    if threshold is not None:
        article_indices = article_indices[sim_scores[article_indices] > threshold]
    return train_data['title'].iloc[article_indices]

In [8]:
def evaluate_system(test_data, recommend_articles):
    """
        Print macro-averaged precision, recall and F1 for a recommender.

        For every row the `title` is used as the query. The relevant set is
        that same title split on '|', and the recommended set is whatever
        `recommend_articles(title)` yields. Rows for which nothing is
        recommended contribute zero to both averages.

        NOTE(review): the relevant set is derived from the query title
        itself, so a recommender that never returns the query scores zero -
        confirm that this is the intended ground truth.

        Parameters:
            test_data (pandas.DataFrame): Frame with a `title` column.
            recommend_articles (callable): Maps a title to an iterable of
                recommended titles.

        Returns:
            None. The three metrics are printed.
    """
    n_queries = len(test_data)
    precision = 0.0
    recall = 0.0
    for title in test_data['title']:
        actual_article = set(title.split('|'))
        recommended_articles = set(recommend_articles(title))
        # calculate precision and recall
        if recommended_articles:
            hits = len(actual_article & recommended_articles)
            precision += hits / len(recommended_articles)
            recall += hits / len(actual_article)
    if n_queries:
        # An empty frame keeps both averages at 0.0 instead of dividing by zero.
        precision /= n_queries
        recall /= n_queries
    denominator = precision + recall
    # F1 is defined as 0 when there are no hits at all.
    f1_score = 2 * precision * recall / denominator if denominator else 0.0
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1_score)

In [10]:
def evaluate_system_alt(recommender_function, input_data, true_data, **kwargs):
    """
    Evaluate a recommender system based on precision and recall.

    Counts are pooled over all input items (micro-averaging) before the two
    ratios are computed. Both metrics are also printed with two decimals.

    Parameters:
        recommender_function (function): A function that takes in an input item and generates a set of recommended items.
        input_data (list): A list of input items to test the recommender system.
        true_data (dict): A dictionary that maps each input item to a set of true recommended items.
        **kwargs: Additional keyword arguments to pass to the recommender function.

    Returns:
        precision (float): The precision of the recommender system
            (0.0 when nothing was recommended at all).
        recall (float): The recall of the recommender system
            (0.0 when there were no true items at all).
    """
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    for input_item in input_data:
        # Materialise the recommendations as a list so the membership test
        # below compares values; `x in series` on a pandas Series would
        # check the index labels instead.
        recommended_items = list(recommender_function(input_item, **kwargs))
        true_items = true_data[input_item]
        for recommended_item in recommended_items:
            if recommended_item in true_items:
                true_positives += 1
            else:
                false_positives += 1
        # With no recommendations every true item is counted as missed.
        for true_item in true_items:
            if true_item not in recommended_items:
                false_negatives += 1
    recommended_total = true_positives + false_positives
    relevant_total = true_positives + false_negatives
    # Guard both ratios: an empty run must report 0.0, not raise.
    precision = true_positives / recommended_total if recommended_total else 0.0
    recall = true_positives / relevant_total if relevant_total else 0.0
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    return precision, recall

In [11]:
# Leftover experiment (references names that are not defined in the visible cells):
#train_df = train_df[:10000]

#recommend_articles("Erna", train_df)

# Smoke test: show the training titles most similar to the query "Erna".
print(recommend_articles("Erna"))

#evaluate_system(test_df, recommend_articles("Erna"))

# Visual separator between the two printed results.
print(" ")

# evaluate_system_alt() prints precision/recall itself and also returns them,
# so wrapping it in print() shows the pair a second time as a tuple.
# NOTE(review): its docstring asks for a list of input items and a dict
# mapping each item to its true items, but two DataFrames are passed here.
# Iterating a DataFrame yields its column labels, so the recommender is
# presumably queried with column names rather than titles, which would
# explain the 0.00 scores - confirm and build proper inputs.
print(evaluate_system_alt(recommend_articles, train_data, test_data))

#print(recommend_articles("Erna", cosine_sim, df_updated))

1663878                                    Erna slår tilbake
1921534      Her kommer Erna Solberg med en budsjettlekkasje
359372                       Erna Solberg mintes trafikkofre
786810                    Erna sender lærerne på skolebenken
984091     Erna Solberg kjørte bil for første gang på tre år
1725906    Erna Solberg vil prioritere vekst og arbeidspl...
22526        Lysbakken kritisk til Erna Solbergs nyttårstale
113307     Trøndersk Høyre-topp mener ulveopprøret kan fe...
1328870    Statsminister Erna Solberg til Kaci Kullmann F...
1693139    Erna Solberg tror ikke ja til eggdonasjon vill...
Name: title, dtype: object
 
Precision: 0.00
Recall: 0.00
(0.0, 0.0)


In [None]:
#Not used
def cosine(df):
    """
        Compute the pairwise cosine similarity of all cleaned titles.

        Titles are vectorized with TF-IDF over word unigrams and bigrams.

        Parameters:
            df (pandas.DataFrame): Frame with a `cleaned_title` column.

        Returns:
            The square similarity matrix returned by `cosine_similarity`,
            with one row and one column per row of `df`. Its size grows
            quadratically with the number of rows.
    """
    # min_df=1 keeps every term that occurs in at least one document, which
    # is what the former integer value 0 meant as well; recent scikit-learn
    # releases reject an integer min_df below 1 during parameter validation.
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1)

    # Compute the TF-IDF matrix. fit_transform() already returns a sparse
    # matrix, so no extra conversion is needed before the similarity call.
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_title'])

    # Compute the cosine similarity matrix
    return cosine_similarity(tfidf_matrix)