# Lab 4 - recommendations for news services

## Preparation

 * download and unpack the dataset: https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip
   * read more here: https://learn.microsoft.com/en-us/azure/open-datasets/dataset-microsoft-news
 * [optinal] create a python virtual enviroment
 `python3 -m venv ./recsyslab4`
 * install needed libraries:
 `pip install nltk sklearn`

## Part 1 - data preparation and preprocessing

In [1]:
# import needed packages and libraries

import codecs
from collections import defaultdict
import math
import re
from string import punctuation

import nltk
nltk.download('stopwords')
nltk.download('rslp')

from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# Chance to improve the performance time with knn algorithms
from sklearn.neighbors import NearestNeighbors

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\piotr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\piotr\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [2]:
# defining constants

PATH = './MINDsmall_train'
STOPWORDS = set(stopwords.words('english'))

In [3]:
# loading text metadata

def parse_news_entry(entry):
    news_id, category, subcategory, title, abstract = entry.split('\t')[:5]
    return {
        'news_id': news_id,
        'category': category,
        'subcategory': subcategory,
        'title': title,
        'abstract': abstract
    }

def get_news_metadata():
    with codecs.open(f'{PATH}/news.tsv', 'r', 'UTF-8') as f:
        raw = [x for x in f.read().split('\n') if x]
        parsed_entries = [parse_news_entry(entry) for entry in raw]
        return {x['news_id']: x for x in parsed_entries}

news = get_news_metadata()
news_ids = sorted(list(news.keys()))
news_indices = {x[1]: x[0] for x in enumerate(news_ids)}
print(len(news))

51282


## Part 2 - TF-IDF

In [4]:
# text normalization

def preprocess_text(text):
    # removing punctuation and digits. Converting to lowercase
    text = text.translate(str.maketrans("", "", punctuation+"0123456789")).lower()   
    # tokenization
    text = text.split()
    # removing stopwords
    text = [word for word in text if word not in stopwords.words('english')]
    return text

def stem_texts(corpus):
#     stemmer = RSLPStemmer()
#     stemmer = WordNetLemmatizer.()
    stemmer = LancasterStemmer()
#     stemmer = PorterStemmer()
#     stemmer = SnowballStemmer()
    return [[stemmer.stem(word) for word in preprocess_text(text)] for text in corpus]

texts = [news[news_id]['abstract'] for news_id in news_ids]
stemmed_texts = stem_texts(texts)

In [5]:
# Let's compare some text before and after processing

print(texts[2] + '\n')
print(' '.join(stemmed_texts[2]))

"I think we have a really good team, and a team that can really do some special, good things because that group is very close in there." - Brian Schmetzer

think real good team team real spec good thing group clos bri schmetzer


In [6]:
# create a list of all words in the corpus

def get_all_words_sorted(corpus):
    return sorted({word for text in corpus for word in text})

wordlist = get_all_words_sorted(stemmed_texts)
word_indices = {x[1]: x[0] for x in enumerate(wordlist)}
print(len(wordlist))

37021


In [13]:
# calculate the number of texts in which each of the words appeard
# If it appeared multiple times in one text we still count it only once

def get_document_frequencies(corpus, wordlist):
    freqs = {}
    for text in corpus:
        for word in set(text):
            if word in freqs:
                freqs[word] += 1
            else: freqs[word] = 1

    return freqs

document_frequency = get_document_frequencies(stemmed_texts, wordlist)

In [14]:
# check the results

document_frequency

{'fartlek': 1,
 'justin': 178,
 'capit': 212,
 'vet': 712,
 'nat': 2070,
 'gam': 2874,
 'emphas': 29,
 'season': 2563,
 'washington': 812,
 'somewh': 74,
 'astro': 404,
 'regul': 227,
 'meet': 581,
 'confer': 1,
 'concern': 328,
 'within': 232,
 'conundr': 1,
 'eastern': 167,
 'wednesday': 2140,
 'loss': 797,
 'wretch': 1,
 'airpl': 42,
 'altuv': 35,
 'crop': 43,
 'jos': 85,
 'verland': 55,
 'prestig': 4,
 'ski': 88,
 'remind': 107,
 'on': 4125,
 'remark': 80,
 'team': 2485,
 'run': 1128,
 'assembl': 71,
 'enco': 81,
 'two': 3113,
 'calm': 49,
 'playerson': 2,
 'bri': 191,
 'spec': 629,
 'clos': 1223,
 'thing': 840,
 'schmetzer': 4,
 'good': 902,
 'real': 1041,
 'think': 656,
 'group': 724,
 'sport': 519,
 'octob': 523,
 'day': 2502,
 'battery': 66,
 'say': 3094,
 'beach': 420,
 'charg': 1065,
 'vero': 2,
 'deputy': 356,
 'spit': 14,
 'americ': 282,
 'hat': 163,
 'man': 2542,
 'gre': 621,
 'mak': 2231,
 'slap': 21,
 'red': 513,
 'sonom': 67,
 'put': 709,
 'surg': 113,
 'throughout': 24

In [15]:
# calculate the number of word occurrences in each text

def get_term_frequencies(corpus, news_indices):
    freqs = {news : {} for news in news_indices}
    for news,idx in news_indices.items():
        text = corpus[idx]
        for word in text:
            if word in freqs[news]: 
                freqs[news][word] += 1
            else: freqs[news][word] = 1
    return freqs

term_frequency = get_term_frequencies(stemmed_texts, news_indices)

In [16]:
# check the results

term_frequency[news_ids[2]]

{'think': 1,
 'real': 2,
 'good': 2,
 'team': 2,
 'spec': 1,
 'thing': 1,
 'group': 1,
 'clos': 1,
 'bri': 1,
 'schmetzer': 1}

In [21]:
# calculate the tf_idf metric

def calculate_tf_idf(term_frequency, document_frequency, corpus_size):
    tf_idf = {}
    for news, word_freqs in term_frequency.items():
        tf_idf[news] = {}
        for word, freq in word_freqs.items():
            tf = freq/len(word_freqs)
            idf = math.log(corpus_size/document_frequency[word])
            tf_idf[news][word] = tf*idf
    return tf_idf

tf_idf = calculate_tf_idf(term_frequency, document_frequency, len(news_ids))

In [22]:
# check the results

tf_idf[news_ids[2]]

{'think': 0.4358934303449984,
 'real': 0.7794316047558209,
 'good': 0.8080961144662899,
 'team': 0.6054134307726688,
 'spec': 0.44009638356936326,
 'thing': 0.41116932005567136,
 'group': 0.42603037000083566,
 'clos': 0.37360329567069006,
 'bri': 0.5592821664347443,
 'schmetzer': 0.9458800731274183}

## Part 3 - text similarity

In [45]:
# calculate the distance between 2 elements/texts
# test different metrics and chose one

def cosine_dist(tf_idf, id1, id2): 
    num = sum([tf_idf[id1][word]*tf_idf[id1][word] for word in tf_idf[id1] if word in tf_idf[id2]])
    if num == 0: return 0
    den = math.sqrt(sum(v**2 for v in tf_idf[id1].values())*sum(v**2 for v in tf_idf[id2].values()))
    return num/den

def euclidean_dist(tf_idf, id1, id2):
    all_elems = list(set(tf_idf[id1].keys()) | set(tf_idf[id2].keys()))
    vec1 = [0 if word not in tf_idf[id1] else tf_idf[id1][word] for word in all_elems]
    vec2 = [0 if word not in tf_idf[id2] else tf_idf[id2][word] for word in all_elems]
    return math.sqrt(sum([(vec1[i]-vec2[i])**2 for i in range(len(all_elems))]))

def fake_jaccard(tf_idf, id1, id2):
    set1 = set(tf_idf[id1].keys())
    set2 = set(tf_idf[id2].keys())
    overlap = set1 & set2
    set1 -= overlap
    set2 -= overlap
    
    num = sum([tf_idf[id1][word] for word in overlap]) + sum([tf_idf[id2][word] for word in overlap])
    den = num + 2*sum([tf_idf[id1][word] for word in set1]) + 2*sum([tf_idf[id2][word] for word in set2])
    return num/den

def weighted_jaccard(tf_idf, id1, id2):
    set1 = set(tf_idf[id1].keys())
    set2 = set(tf_idf[id2].keys())
    overlap = set1 & set2
    set1 -= overlap
    set2 -= overlap
    
    num = sum([min(tf_idf[id1][word],tf_idf[id2][word]) for word in overlap])
    den = sum([min(tf_idf[id1][word],tf_idf[id2][word]) for word in overlap]) + sum([tf_idf[id1][word] for word in set1]) + sum([tf_idf[id2][word] for word in set2])
    return num/den

def jaccard(tf_idf, id1, id2):
    set1 = set(tf_idf[id1].keys())
    set2 = set(tf_idf[id2].keys())
    overlap = set1 & set2
    all_elements = set1 | set2
    return len(overlap)/len(all_elements)
    
def calculate_distance(tf_idf, id1, id2):  
    return cosine_dist(tf_idf, id1, id2)

calculate_distance(tf_idf, news_ids[2], news_ids[1])

0.18075826340883294

In [46]:
# for a given text find k most similar ones
# remember about the proper sorting order for used metric
# remember to not include the currently checked text. 
# It would always be the most similar because it's identical

def get_k_most_similar_news(tf_idf, n_id, k):
    similarities = [(id2,calculate_distance(tf_idf,n_id, id2)) for id2 in tf_idf if id2 != n_id]
    similarities = sorted(similarities,key= lambda x: x[1],reverse=True)
    return [x[0] for x in similarities[:k]]

def print_k_most_similar_news(tf_idf, n_id, k, corpus, news_indices):
    similar = get_k_most_similar_news(tf_idf, n_id, k)
    print(f'id: {n_id}, text: {corpus[news_indices[n_id]]}')
    print(f'\n{k} most similar:')
    for s_id in similar:
        print(f'\nid: {s_id}, text: {corpus[news_indices[s_id]]}')

print_k_most_similar_news(tf_idf, news_ids[42337], 5, texts, news_indices)

id: N58544, text: A MAN claims he has created a car that might solve the world's traffic congestion problems. Rick Woodbury from Spokane, Washington USA, is the president, founder and sole employee of 'Commuter Cars.' The carmaker's flagship model is the 2005 super slim two-seater Tango T600, a high-performance electric car that preceded Tesla. Rick told BTV: "I started this company 21 years ago   it was based on an idea that I came up with in 1982." He was inspired by the shocking traffic congestion he had to face on a daily basis. "I used to drive a Porsche from Beverly Hills to Hermosa Beach every day and the traffic was horrendous," explained Rick. What really made Rick think about a solution was the fact that in most of the cars he would see in his commute were occupied by lone drivers. "I noticed that everybody around me was a single occupant in a car, taking up the whole lane," Rick said. Living and working in Los Angeles also helped inspire Rick's unique creation. "I thought, e