# Metody Obliczeniowe w Nauce i Technice Laboratorium 6
## Article searcher
### Paweł Gorgolewski

In [11]:
import os
import numpy as np
import pickle
import csv
import re
import math
import wikipedia as wiki
import random
import nltk
nltk.download('wordnet')

from typing import List, Dict
from wikipedia.exceptions import WikipediaException
from collections import namedtuple, defaultdict
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pawel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Generowanie artykułów z wikipedii
Dump indeksów artykułów z wikipedii został pobrany poprzez link `https://dumps.wikimedia.org/enwiki/20220401/`

Wykorzystując pobrany dump, pobieramy treść artykułu o danym tytule poprzez bibliotekę `wikipedia`. Załadowany w ten sposób artykuł zapisywany jest do folderu `data` z wykorzystaniem pythonowego modułu `pickle`.
Aby pobrać artykuły, należy uruchomić poniższy blok z odkomentowanymi dwiema ostatnimi liniami.

In [12]:
def get_first_n_titles(n=3500, file='data\\wiki-pages-indexes.txt'):
    """Read at most ``n`` article titles from a colon-delimited index file.

    The title is taken from the last ``:``-separated field of each row.
    Titles containing ``/`` are skipped and do not count towards ``n``.

    Args:
        n: Maximum number of titles to return; ``n <= 0`` yields an empty list.
        file: Path to the index file (read as UTF-8).

    Returns:
        The collected titles in random order.
    """
    titles = []
    if n <= 0:
        return titles

    # newline='' is what the csv module expects from the underlying file.
    with open(file, "r", encoding='utf8', newline='') as f:
        for row in csv.reader(f, delimiter=":"):
            if not row:
                # Blank lines produce an empty row; row[-1] would raise.
                continue

            # NOTE(review): a title that itself contains ':' is cut down to
            # its last segment here - confirm against the index file format.
            title = row[-1]
            if '/' in title:
                continue

            titles.append(title)
            if len(titles) >= n:
                break

    random.shuffle(titles)
    return titles

def get_articles_content_and_save_pickle(titles: List[str]):
    """Download each titled Wikipedia page and pickle it into the ``data`` folder.

    For every title a dict ``{'content': ..., 'url': ...}`` is written to a
    file named after the title. Titles for which the ``wikipedia`` library
    raises ``WikipediaException`` are counted and reported at the end; any
    other error is printed and the loop moves on to the next title.

    Args:
        titles: Article titles to download.
    """
    not_matched = 0
    for title in titles:
        try:
            wiki_page = wiki.page(title)
            title_dict = {'content': wiki_page.content, 'url': wiki_page.url}
            # The context manager guarantees the pickle is flushed and closed.
            with open(os.path.join('data', title), "wb") as article_file:
                pickle.dump(title_dict, article_file)
        except WikipediaException:
            not_matched += 1
        except Exception as ex:
            # Best effort: one failing article must not abort the whole run.
            print(f"EXCEPTION CAUGHT!\n{str(ex)}")

    print(f"Not matched {not_matched} titles")

#titles_to_load = get_first_n_titles()
#get_articles_content_and_save_pickle(titles_to_load)

## Tworzenie słownika z nazwą artykułu jako klucz oraz tekstem jako wartość
Poniższy kod zapisuje stworzoną strukturę do ścieżki `.\data\dumps`. Jest także możliwość wczytania zapisanej wcześniej struktury przy użyciu funkcji `get_articles_from_dump`.

In [13]:
def get_articles(to_omit_file='wiki-pages-indexes.txt'):
    """Load every pickled article from ``data`` and dump them as one dict.

    Files named ``to_omit_file`` and files whose name contains ``"dump"``
    are skipped. The combined dict is also pickled under ``data/dumps`` with
    the number of articles in the file name.

    Args:
        to_omit_file: File name inside ``data`` that is not an article pickle.

    Returns:
        Dict mapping file name (article title) to the unpickled article data.
    """
    data_by_title = dict()
    for title in pickles():
        if title == to_omit_file or "dump" in title:
            continue

        # Only unpickle files produced by this notebook - pickle is not safe
        # for untrusted data.
        with open(os.path.join("data", title), "rb") as article_file:
            data_by_title[title] = pickle.load(article_file)

    dump_dir = os.path.join("data", "dumps")
    os.makedirs(dump_dir, exist_ok=True)
    dump_path = os.path.join(dump_dir, f"articles_dict_dump_of_len_{len(data_by_title)}")
    with open(dump_path, "wb") as dump_file:
        pickle.dump(data_by_title, dump_file)
    return data_by_title

def pickles(path='data'):
    """Lazily yield the names of the regular files found directly in ``path``."""
    yield from (
        entry
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry))
    )

def get_articles_from_dump(dirpath="data\\dumps\\", dump_name="articles_dict_dump_of_len_2890"):
    """Load a previously pickled articles dictionary.

    Args:
        dirpath: Directory that holds the dump.
        dump_name: File name of the dump inside ``dirpath``.

    Returns:
        The unpickled object stored in the dump file.
    """
    path = os.path.join(dirpath, dump_name)
    # Only load dumps created by this notebook - pickle is not safe for
    # untrusted data.
    with open(path, "rb") as dump_file:
        return pickle.load(dump_file)

Aby stworzyć nowy słownik, należy odkomentować pierwszą i zakomentować drugą linijkę poniższego bloku.

In [14]:
# Rebuild the dictionary from the individual article pickles:
#loaded_articles = get_articles()
# Load the previously saved dictionary dump instead:
loaded_articles = get_articles_from_dump()

## ArticleParser
Poniższa klasa wykonuje wszystkie czynności potrzebne do późniejszego wyszukiwania artykułów poprzez metodę `parse_artciles_and_prepare_term_by_document`. Wykonuje ona następujące czynności:
1. przetwarza teksty artykułów
2. tworzy `bags_of_words`
3. tworzy rzadką macierz wektorów cech term-by-document
4. przetwarza macierz wektorów cech używając IDF
5. normalizuje wektory z macierzy cech
6. tworzy i zapisuje nową macierz przy użyciu svd (domyślne k to 200)

Po jej wykonaniu, należy użyć metody `find_articles`, która wypisze najbardziej trafne artykuły. Aby usunąć szumy, należy podać argument `Ak_matrix=True`, dzięki czemu algorytm wykona się na macierzy Ak (domyślne `k` do svd to 200)

In [None]:
class ArticlesParser:
    """Build a TF-IDF term-by-document matrix for articles and query it.

    ``articles`` maps an article title to a dict holding at least the keys
    ``'content'`` (article text) and ``'url'``. Rows of the matrix are
    vocabulary words, columns are articles in ``all_articles_titles`` order.
    """

    def __init__(self, articles: Dict[str, Dict[str, str]]):
        self.articles = articles
        self.parsed_articles = dict()
        self.ids_by_unique_word = None      # Dict[word: str, id: int]
        self.term_by_document = None        # sparse_matrix
        self.all_unique_words = None        # List[str]
        self.all_articles_titles = None     # List[str]
        self.all_words_data = defaultdict(int)
        self.lemmatizer = WordNetLemmatizer()
        self.Ak_matrix = None

    def parse_artciles_and_prepare_term_by_document(self):
        """Run the whole pipeline: parse, bag-of-words, IDF, normalise, SVD."""
        print("STARTED")
        self.parse_articles()
        print("AFTER ARTICLE PARSING")
        self.create_bags_of_words()
        print("AFTER CREATING BAGS")
        self.create_term_by_document_matrix()
        print("AFTER CREATING DOCUMENT BY TERM")
        self.multiply_term_by_document_by_IDF()
        print("AFTER MULTIPLYING BY IDF")
        self.normalize_vectors()
        print("AFTER NORMALIZATION")
        self.get_Ak_from_term_by_document()
        print("AFTER ALL")

    def normalize_vectors(self):
        """Scale every column of ``term_by_document`` to unit Euclidean norm.

        All-zero columns are left untouched instead of being divided by zero.
        """
        matrix = sparse.csr_matrix(self.term_by_document)
        norms = np.sqrt(np.asarray(matrix.power(2).sum(axis=0)).ravel())
        norms[norms == 0] = 1.0
        # Right-multiplying by a diagonal matrix scales the columns at once.
        self.term_by_document = sparse.csr_matrix(matrix @ sparse.diags(1.0 / norms))

    def parse_content(self, content: str, is_article=True):
        """Tokenise ``content`` and count its lemmatised words.

        The text is lower-cased, stripped of punctuation and ASCII digits,
        stop words are removed and every remaining word is lemmatised as a
        verb.

        Args:
            content: Raw text of an article or of a query.
            is_article: When true, the words are also added to the corpus
                vocabulary counter ``all_words_data``.

        Returns:
            Dict with ``words_data`` (word -> occurrences) and
            ``words_count`` (total number of counted words).
        """
        content = content.lower()
        content = re.sub(r'[^\w\s]', '', content)
        content = re.sub(r'[0-9]', '', content)
        content = re.sub(r' {2,}', ' ', content)
        content = remove_stopwords(content)

        words_data = defaultdict(int)
        words_count = 0
        for word in content.split():
            lemmatized_word = self.lemmatizer.lemmatize(word, pos='v')
            words_data[lemmatized_word] += 1
            if is_article:
                self.all_words_data[lemmatized_word] += 1
            words_count += 1

        return dict(words_data=words_data, words_count=words_count)

    def parse_articles(self):
        """Parse every article and build the vocabulary and word -> id map."""
        self.all_articles_titles = list(self.articles.keys())
        for article in self.all_articles_titles:
            self.parsed_articles[article] = {
                'content_data': self.parse_content(self.articles[article]['content'])
            }

        self.all_unique_words = list(self.all_words_data.keys())
        self.ids_by_unique_word = {
            word: word_id for word_id, word in enumerate(self.all_unique_words)
        }

    def create_bags_of_words(self):
        """Attach a ``bag_of_words`` column vector to every parsed article."""
        for article in self.all_articles_titles:
            parsed = self.parsed_articles[article]
            parsed['bag_of_words'] = self.create_bag_of_words(parsed['content_data'])

    def create_bag_of_words(self, content_data):
        """Turn word counts into a sparse term-frequency column vector.

        Words missing from the vocabulary (possible for queries) are ignored,
        and an empty text yields an all-zero vector.

        Args:
            content_data: Result of :meth:`parse_content`.

        Returns:
            ``csr_matrix`` of shape ``(vocabulary size, 1)`` holding
            ``occurrences / words_count`` for every known word.
        """
        rows, values = [], []
        for word, occurrences in content_data['words_data'].items():
            word_id = self.ids_by_unique_word.get(word)
            if word_id is None:
                continue
            rows.append(word_id)
            values.append(occurrences)

        values = np.asarray(values, dtype=float)
        words_count = content_data['words_count']
        if words_count:
            values = values / words_count

        rows = np.asarray(rows, dtype=np.intp)
        cols = np.zeros(len(rows), dtype=np.intp)
        shape = (len(self.ids_by_unique_word), 1)
        return sparse.csr_matrix((values, (rows, cols)), shape=shape)

    def create_term_by_document_matrix(self):
        """Stack the articles' bag-of-words vectors into one sparse matrix."""
        columns = [
            self.parsed_articles[title]['bag_of_words']
            for title in self.all_articles_titles
        ]
        if columns:
            self.term_by_document = sparse.hstack(columns, format="csr")
        else:
            # hstack rejects an empty list; keep a correctly shaped matrix.
            self.term_by_document = sparse.csr_matrix((len(self.all_unique_words), 0))

    def multiply_term_by_document_by_IDF(self):
        """Multiply every row (word) by ``log(N / articles containing it)``."""
        articles_count = len(self.all_articles_titles)
        matrix = sparse.csr_matrix(self.term_by_document)
        matrix.sum_duplicates()
        matrix.eliminate_zeros()

        # In CSR the number of stored entries per row is the number of
        # articles the word occurs in.
        document_frequency = np.diff(matrix.indptr)
        idf = np.zeros(matrix.shape[0])
        present = document_frequency > 0
        idf[present] = np.log(articles_count / document_frequency[present])

        self.term_by_document = sparse.csr_matrix(sparse.diags(idf) @ matrix)

    def calculate_articles_with_word(self, word: str):
        """Return the number of parsed articles that contain ``word``."""
        return sum(
            1
            for article in self.parsed_articles.values()
            if word in article['content_data']['words_data']
        )

    @staticmethod
    def get_norm_from_vector(vector):
        """Return the Euclidean norm of a sparse vector."""
        return math.sqrt(vector.power(2).sum())

    def find_articles(self, query, artciles_num_to_return, Ak_matrix=False):
        """Print the articles most similar to ``query`` by cosine similarity.

        Args:
            query: Free-text search phrase.
            artciles_num_to_return: How many top results to print.
            Ak_matrix: Use the low-rank ``Ak_matrix`` instead of
                ``term_by_document``.
        """
        matrix = self.Ak_matrix if Ak_matrix else self.term_by_document
        matrix = sparse.csr_matrix(matrix)

        query_words_data = self.parse_content(content=query, is_article=False)
        vector = self.create_bag_of_words(query_words_data)
        vector_norm = self.get_norm_from_vector(vector)

        print(f"Articles found for guery: {query}")
        if vector_norm == 0:
            # Nothing in the query maps onto the vocabulary.
            print("\tNo known words in query")
            return

        products = np.asarray((vector.T @ matrix).toarray()).ravel()
        article_norms = np.sqrt(np.asarray(matrix.power(2).sum(axis=0)).ravel())
        dividers = vector_norm * article_norms
        cosines = np.zeros(products.shape, dtype=float)
        # Articles with a zero vector keep a similarity of 0.
        np.divide(products, dividers, out=cosines, where=dividers != 0)

        # Stable sort keeps the original article order among equal scores.
        ranking = np.argsort(-cosines, kind="stable")
        for index in ranking[:artciles_num_to_return]:
            article = self.all_articles_titles[index]
            probability = cosines[index]
            print(f"\tARTICLE: {article}\t\tPROBABILITY: {probability}\t\tURL: {self.articles[article]['url']}")

    def get_Ak_from_term_by_document(self, k=200):
        """Store the rank-``k`` SVD approximation of the matrix in ``Ak_matrix``.

        Args:
            k: Number of singular values to keep; lowered when the matrix is
                too small for the requested rank.

        Raises:
            ValueError: If the matrix is too small for a truncated SVD.
        """
        # Imported here so the submodule is loaded regardless of how
        # scipy.sparse was imported at file level.
        from scipy.sparse.linalg import svds

        max_rank = min(self.term_by_document.shape) - 1
        if max_rank < 1:
            raise ValueError("term-by-document matrix is too small for a truncated SVD")

        u, s, vt = svds(self.term_by_document, k=min(k, max_rank))
        print("AFTER SVD")
        u = sparse.csr_matrix(u)
        s = sparse.diags(s)
        vt = sparse.csr_matrix(vt)
        # NOTE(review): the product is stored in sparse format although it
        # is built from dense SVD factors - check memory use on big corpora.
        self.Ak_matrix = u @ s @ vt


In [None]:
parser = ArticlesParser(loaded_articles)

# For creating everything use this:
#parser.parse_artciles_and_prepare_term_by_document()

# For loading dump structures use this:
# parse_articles() rebuilds the vocabulary that the loaded matrices are
# indexed by; presumably the dumps were created from the same articles
# dictionary - verify when the data changes.
parser.parse_articles()
with open(f"data\\dumps\\parser_term_by_document_with_{len(parser.articles)}_articles", "rb") as dump_file:
    parser.term_by_document = pickle.load(dump_file)
with open(f"data\\dumps\\parser_Ak_matrix_with_{len(parser.articles)}_articles", "rb") as dump_file:
    parser.Ak_matrix = pickle.load(dump_file)

## Wyszukiwanie dokumentów

In [None]:
# Print the 5 highest-ranked articles for the query, using term_by_document.
parser.find_articles("Football", 5)

In [None]:
#parser.find_articles("Football", 5, True)

## Zapisywanie wyliczonych struktur

In [None]:
def save_structures():
    """Pickle the structures computed by the global ``parser`` to ``data\\dumps``.

    Every structure is written to its own file whose name contains the
    structure name and the number of articles held by the parser.
    """
    articles_count = len(parser.articles)
    structures = {
        "ids_by_unique_word": parser.ids_by_unique_word,
        "all_unique_words": parser.all_unique_words,
        "all_articles_titles": parser.all_articles_titles,
        "Ak_matrix": parser.Ak_matrix,
        "term_by_document": parser.term_by_document,
    }
    for name, structure in structures.items():
        # The context manager makes sure each dump is flushed and closed.
        with open(f"data\\dumps\\parser_{name}_with_{articles_count}_articles", "wb") as dump_file:
            pickle.dump(structure, dump_file)

# Persist the parser structures computed above.
save_structures()

Wnioski:
1) Należałoby poprawić implementację, ponieważ wszelkie instrukcje zajmują bardzo dużo czasu.
2) Wyszukiwarka zdaje się zwracać odpowiednie wyniki