# Laboratorium 4 - rekomendacje dla portali informacyjnych

## Przygotowanie

 * pobierz i wypakuj dataset: https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip
   * więcej możesz poczytać tutaj: https://learn.microsoft.com/en-us/azure/open-datasets/dataset-microsoft-news
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab4`
 * zainstaluj potrzebne biblioteki:
 `pip install nltk scikit-learn numpy`

## Część 1. - przygotowanie danych

In [6]:
# importujemy wszystkie potrzebne pakiety

import codecs
from collections import defaultdict # mozesz uzyc zamiast zwyklego slownika, rozwaz wplyw na czas obliczen
import math
import re
from string import punctuation

import nltk
nltk.download('stopwords')
nltk.download('rslp')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# mozesz uzyc do obliczania najbardziej podobnych tekstow zamiast liczenia "na piechote"
# ale pamietaj o dostosowaniu formatu danych
from sklearn.neighbors import NearestNeighbors

[nltk_data] Downloading package stopwords to /home/michal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/michal/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package punkt to /home/michal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# definiujemy potrzebne zmienne

PATH = './MINDsmall_train'
STOPWORDS = set(stopwords.words('english'))

In [8]:
# wczytujemy metadane artykułów

def parse_news_entry(entry) -> dict[str, str]:
    """Turn one raw line of ``news.tsv`` into a metadata record.

    Only the first five tab-separated columns are used; any further columns
    on the line are ignored.  A line with fewer than five columns raises
    ``ValueError`` from the tuple unpacking.
    """
    columns = entry.split('\t')
    news_id, category, subcategory, title, abstract = columns[:5]
    return dict(
        news_id=news_id,
        category=category,
        subcategory=subcategory,
        title=title,
        abstract=abstract,
    )

def get_news_metadata() -> dict[str, dict]:
    """Load ``news.tsv`` from ``PATH`` and index its records by news id.

    Returns:
        Mapping ``news_id -> record`` where each record is the dict built by
        ``parse_news_entry``.  If an id occurs more than once, the last
        occurrence wins.
    """
    # Built-in open() replaces codecs.open(), which is deprecated since
    # Python 3.14.  newline='\n' turns newline translation off, so records are
    # still split on '\n' only, exactly like the former read().split('\n').
    with open(f'{PATH}/news.tsv', 'r', encoding='UTF-8', newline='\n') as f:
        # Stream the file line by line (skipping empty lines) instead of
        # holding the raw text, the list of lines and the list of parsed
        # entries in memory at the same time.
        entries = (
            parse_news_entry(line.rstrip('\n'))
            for line in f
            if line != '\n'
        )
        return {entry['news_id']: entry for entry in entries}

news = get_news_metadata()
# iterating a dict yields its keys, so this is the sorted list of news ids
news_ids = sorted(news)
# position of every news id inside news_ids
news_indices = {news_id: index for index, news_id in enumerate(news_ids)}
print(news_indices)
print(len(news))

{'N10': 0, 'N100': 1, 'N1000': 2, 'N10000': 3, 'N10001': 4, 'N10002': 5, 'N10003': 6, 'N10004': 7, 'N10005': 8, 'N10007': 9, 'N10009': 10, 'N1001': 11, 'N10010': 12, 'N10011': 13, 'N10013': 14, 'N10014': 15, 'N10015': 16, 'N10016': 17, 'N1002': 18, 'N10021': 19, 'N10022': 20, 'N10023': 21, 'N10024': 22, 'N10025': 23, 'N10026': 24, 'N10027': 25, 'N10029': 26, 'N1003': 27, 'N10031': 28, 'N10032': 29, 'N10033': 30, 'N10034': 31, 'N10035': 32, 'N10037': 33, 'N10038': 34, 'N10039': 35, 'N1004': 36, 'N10040': 37, 'N10041': 38, 'N10042': 39, 'N10044': 40, 'N10046': 41, 'N10047': 42, 'N10048': 43, 'N10049': 44, 'N10051': 45, 'N10052': 46, 'N10053': 47, 'N10055': 48, 'N10056': 49, 'N10057': 50, 'N10058': 51, 'N10059': 52, 'N1006': 53, 'N10060': 54, 'N10061': 55, 'N10062': 56, 'N10063': 57, 'N10064': 58, 'N10065': 59, 'N10066': 60, 'N10067': 61, 'N10068': 62, 'N1007': 63, 'N10070': 64, 'N10072': 65, 'N10073': 66, 'N10074': 67, 'N10075': 68, 'N10076': 69, 'N10077': 70, 'N10078': 71, 'N10079': 72,

## Część 2. - TF-IDF

In [9]:
import string
from typing import List
# normalizujemy teksty na potrzeby dalszego przetwarzania

def preprocess_text(text) -> List[str]:
    """Normalize raw text into a list of lowercase tokens without stop words.

    Steps, in order: collapse whitespace runs into single spaces, delete
    ASCII punctuation, delete digit characters, lowercase, tokenize with
    ``word_tokenize`` and drop every token found in ``STOPWORDS``.

    Punctuation is deleted rather than replaced by a space, so words that
    are separated only by punctuation get merged into one token.
    """
    text = ' '.join(text.split())
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join(char for char in text if not char.isdigit())
    tokens = word_tokenize(text.lower())
    # Reuse the module-level STOPWORDS set instead of re-reading the NLTK
    # stop word list on every call (this function runs once per text).
    return [token for token in tokens if token not in STOPWORDS]

def stem_texts(corpus) -> List[List[str]]:
    """Preprocess every text of ``corpus`` and stem each remaining token.

    Returns one list of stems per input text, in the order of ``corpus``.
    """
    stem = SnowballStemmer('english').stem
    return [list(map(stem, preprocess_text(text))) for text in corpus]

# abstracts listed in news_ids order, so texts[i] belongs to news_ids[i]
texts = [news[n_id]['abstract'] for n_id in news_ids]
stemmed_texts = stem_texts(texts)

In [10]:
# porownajmy teksty przed i po przetworzeniu

# raw abstract followed by an empty line, then its stemmed tokens
print(texts[2], end='\n\n')
print(*stemmed_texts[2])

"I think we have a really good team, and a team that can really do some special, good things because that group is very close in there." - Brian Schmetzer

think realli good team team realli special good thing group close brian schmetzer


In [11]:
# tworzymy liste wszystkich slow w korpusie

def get_all_words_sorted(corpus: List[List[str]]):
    """Return every distinct token of ``corpus`` as an ascending sorted list."""
    vocabulary = {token for text in corpus for token in text}
    return sorted(vocabulary)


wordlist = get_all_words_sorted(stemmed_texts)
# position of every word inside wordlist
word_indices = dict(zip(wordlist, range(len(wordlist))))
print(len(wordlist))

41846


In [12]:
# obliczamy liczbe tekstow, w ktorych wystapilo kazde ze slow
# pamietaj, ze jesli slowo wystapilo w danym tekscie wielokrotnie, to liczymy je tylko raz

def get_document_frequencies(corpus, wordlist):
    """Count, for every word, the number of texts it occurs in.

    A word repeated inside one text is counted once for that text.
    ``wordlist`` is accepted but not used.

    Returns:
        ``defaultdict`` mapping word -> number of texts containing it;
        looking up an unseen word yields 0 (and stores that 0).
    """
    document_counts = defaultdict(lambda: 0)
    for text in corpus:
        # dict.fromkeys de-duplicates while keeping first-occurrence order
        for word in dict.fromkeys(text):
            document_counts[word] += 1
    return document_counts

# wordlist is passed along, but get_document_frequencies does not read it
document_frequency = get_document_frequencies(stemmed_texts, wordlist)
# prints the entire word -> document count mapping
print(document_frequency)

defaultdict(<function get_document_frequencies.<locals>.<lambda> at 0x7fc7dc0200d0>, {'fartlek': 1, 'washington': 810, 'somewher': 33, 'eastern': 167, 'sky': 146, 'astro': 404, 'assembl': 71, 'one': 4098, 'airplan': 42, 'nation': 1629, 'capit': 196, 'team': 2360, 'confer': 384, 'conundrum': 2, 'playerson': 2, 'meet': 581, 'wednesday': 2140, 'wretch': 1, 'loss': 797, 'game': 2865, 'run': 1096, 'two': 3113, 'veteran': 686, 'calm': 46, 'concern': 328, 'crop': 43, 'justin': 177, 'verland': 55, 'jose': 83, 'altuv': 35, 'emphas': 19, 'encourag': 81, 'remind': 107, 'teammat': 143, 'remark': 80, 'regular': 227, 'season': 2563, 'prestig': 4, 'within': 232, 'think': 655, 'realli': 400, 'good': 900, 'special': 437, 'thing': 840, 'group': 723, 'close': 1013, 'brian': 185, 'schmetzer': 4, 'day': 2501, 'sport': 519, 'octob': 523, 'vero': 2, 'beach': 420, 'man': 1776, 'charg': 986, 'batteri': 66, 'deputi': 356, 'say': 3094, 'spit': 8, 'slap': 21, 'make': 2168, 'america': 393, 'great': 526, 'hat': 58,

In [13]:
# obliczamy liczbe wystapien kazdego slowa w kazdym tekscie

def get_term_frequencies(corpus, news_indices):
    """Count how many times each word occurs in each article.

    Args:
        corpus: list of token lists.
        news_indices: mapping news_id -> index of that article in ``corpus``.

    Returns:
        Mapping ``news_id -> {word -> count}``; the inner mappings are
        ``defaultdict`` objects that yield 0 for unseen words.
    """
    results = {}
    for news_id, index in news_indices.items():
        word_counts = defaultdict(lambda: 0)
        for word in corpus[index]:
            word_counts[word] += 1
        results[news_id] = word_counts
    return results

# per-article word counts, keyed by news id
term_frequency = get_term_frequencies(stemmed_texts, news_indices)

In [14]:
# let's check the results: word counts of one sample article

term_frequency[news_ids[2]]

defaultdict(<function __main__.get_term_frequencies.<locals>.<dictcomp>.<lambda>()>,
            {'think': 1,
             'realli': 2,
             'good': 2,
             'team': 2,
             'special': 1,
             'thing': 1,
             'group': 1,
             'close': 1,
             'brian': 1,
             'schmetzer': 1})

In [15]:
# obliczamy metryke tf_idf
def calculate_tf_idf(term_frequency, document_frequency, corpus_size):
    """Weight every word count by the word's inverse document frequency.

    For each article and word the weight is
    ``count * ln(corpus_size / document_frequency[word])``.

    Args:
        term_frequency: mapping news_id -> {word -> count}.
        document_frequency: mapping word -> number of texts containing it.
        corpus_size: value used as the numerator of the ratio.

    Returns:
        Mapping ``news_id -> {word -> tf_idf}`` built from plain dicts.
    """
    return {
        news_id: {
            word: count * math.log(corpus_size / document_frequency[word])
            for word, count in word_counts.items()
        }
        for news_id, word_counts in term_frequency.items()
    }


# the corpus size is the number of articles
tf_idf = calculate_tf_idf(term_frequency, document_frequency, len(news_ids))

In [16]:
# let's check the results: TF-IDF weights of one sample article

tf_idf[news_ids[2]]

{'think': 4.360459856758821,
 'realli': 9.707261090572182,
 'good': 8.085400658139525,
 'team': 6.157356388748834,
 'special': 4.765161897298483,
 'thing': 4.111693200556713,
 'group': 4.261685870235309,
 'close': 3.9244235881453897,
 'brian': 5.624739267315748,
 'schmetzer': 9.458800731274183}

## Część 3. - Podobieństwo tekstów

In [5]:
# obliczmy odleglosc miedzy dwoma artykulami
# przetestuj rozne metryki odleglosci i wybierz najlepsza

import numpy as np

def cosine_similarity(tfidf_vector1, tfidf_vector2):
    """Return the cosine similarity of two equally long numeric vectors.

    Args:
        tfidf_vector1: sequence of numbers.
        tfidf_vector2: sequence of numbers of the same length.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a float.  When the product of the
        norms is zero (an all-zero or empty vector) the result is ``0.0``.
    """
    dot_product = np.dot(tfidf_vector1, tfidf_vector2)
    norm_product = np.linalg.norm(tfidf_vector1) * np.linalg.norm(tfidf_vector2)
    if norm_product == 0:
        # Dividing here would yield NaN plus a RuntimeWarning, and NaN scores
        # cannot be ordered reliably when similarities are sorted.
        return 0.0
    return float(dot_product / norm_product)

def euklides_distance(tfidf_vector1, tfidf_vector2):
    """Return the Euclidean (L2) distance between two equally long vectors."""
    first = np.array(tfidf_vector1)
    second = np.array(tfidf_vector2)
    difference = first - second
    return np.linalg.norm(difference)

def calculate_distance(tf_idf, id1, id2):
    """Score how alike two articles are from their TF-IDF weights.

    Despite the name, the value returned is a cosine *similarity*: a larger
    score means the articles are more alike.  Both articles are expanded to
    dense vectors over the union of their words; a word that an article
    lacks contributes 0 to that article's vector.

    Args:
        tf_idf: mapping news_id -> {word -> tf_idf}.
        id1: news id of the first article.
        id2: news id of the second article.
    """
    weights_a, weights_b = tf_idf[id1], tf_idf[id2]
    vocabulary = set(list(weights_a) + list(weights_b))
    vector_a, vector_b = [], []
    # one pass over the shared vocabulary keeps both vectors aligned
    for word in vocabulary:
        vector_a.append(weights_a.get(word, 0))
        vector_b.append(weights_b.get(word, 0))
    return cosine_similarity(vector_a, vector_b)

# show the similarity score of two sample articles
print(calculate_distance(tf_idf, news_ids[2], news_ids[1]))

NameError: name 'tf_idf' is not defined

In [19]:
# wyznaczmy k najpodobniejszych tekstow do danego
# pamietaj o odpowiedniej kolejnosci sortowania w zaleznosci od wykorzystanej metryki
# pamietaj, zeby wsrod podobnych tekstow nie bylo danego

def get_k_most_similar_news(tf_idf, n_id, k):
    """Return the ids of the ``k`` articles scoring highest against ``n_id``.

    Every other article in ``tf_idf`` is scored with ``calculate_distance``;
    the ids are then ordered by descending score and the article ``n_id``
    itself is never part of the result.
    """
    scores = {
        other_id: calculate_distance(tf_idf, n_id, other_id)
        for other_id in tf_idf
        if other_id != n_id
    }
    ranked_ids = sorted(scores, key=scores.get, reverse=True)
    return ranked_ids[:k]

def print_k_most_similar_news(tf_idf, n_id, k, corpus, news_indices):
    """Print an article followed by the ``k`` articles ranked most similar to it.

    Args:
        tf_idf: mapping news_id -> {word -> tf_idf}.
        n_id: news id of the query article.
        k: number of similar articles to list.
        corpus: texts to display, indexed through ``news_indices``.
        news_indices: mapping news_id -> index into ``corpus``.
    """
    neighbour_ids = get_k_most_similar_news(tf_idf, n_id, k)

    def describe(news_id):
        return f'id: {news_id}, text: {corpus[news_indices[news_id]]}'

    print(describe(n_id))
    print(f'\n{k} most similar:')
    for neighbour_id in neighbour_ids:
        print('\n' + describe(neighbour_id))

# demo: the 5 most similar articles for the article at position 42337 of news_ids
print_k_most_similar_news(tf_idf, news_ids[42337], 5, texts, news_indices)

id: N58544, text: A MAN claims he has created a car that might solve the world's traffic congestion problems. Rick Woodbury from Spokane, Washington USA, is the president, founder and sole employee of 'Commuter Cars.' The carmaker's flagship model is the 2005 super slim two-seater Tango T600, a high-performance electric car that preceded Tesla. Rick told BTV: "I started this company 21 years ago   it was based on an idea that I came up with in 1982." He was inspired by the shocking traffic congestion he had to face on a daily basis. "I used to drive a Porsche from Beverly Hills to Hermosa Beach every day and the traffic was horrendous," explained Rick. What really made Rick think about a solution was the fact that in most of the cars he would see in his commute were occupied by lone drivers. "I noticed that everybody around me was a single occupant in a car, taking up the whole lane," Rick said. Living and working in Los Angeles also helped inspire Rick's unique creation. "I thought, e