# Metody Obliczeniowe w Nauce i Technice Laboratorium 6
## Page searcher
### Paweł Gorgolewski

In [32]:
import os
import numpy as np
import pickle
import csv
import re
import wikipedia as wiki
from typing import List, Dict, DefaultDict
from wikipedia.exceptions import WikipediaException
from collections import namedtuple, defaultdict
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

from functools import reduce
from itertools import chain
from collections import Counter
from time import time
from concurrent.futures import ProcessPoolExecutor


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pawel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Generating wiki dump
Dump indeksów artykułów z Wikipedii został pobrany ze strony
https://dumps.wikimedia.org/enwiki/20220401/

Mając tytuły wszystkich artykułów, pobieramy treść każdego z nich za pomocą biblioteki *wikipedia*. Następnie zapisujemy ją jako pythonowy plik pickle, aby można ją było łatwo wczytać do programu.

In [25]:
def get_first_n_titles(n=1500, file='data\\wiki-pages-indexes.txt'):
    """Read at most ``n`` article titles from a colon-delimited index file.

    The title is taken from the last ``:``-separated field of every row.
    Titles containing ``/`` are skipped and do not count towards ``n``;
    blank rows are ignored.

    Args:
        n: Maximum number of titles to return. Values <= 0 give an empty list.
        file: Path of the index file, read as UTF-8.

    Returns:
        A list with at most ``n`` titles, in file order.
    """
    # NOTE(review): because the row is split on every ':', a title that itself
    # contains a colon is reduced to its last segment - confirm whether that is
    # acceptable for the index format in use.
    titles = []
    with open(file, "r", encoding='utf8') as f:
        for row in csv.reader(f, delimiter=":"):
            # Stop as soon as exactly n titles were collected (the previous
            # `n < 0` countdown let one extra title through).
            if len(titles) >= n:
                break

            # csv.reader yields an empty list for blank lines.
            if not row:
                continue

            title = row[-1]
            if '/' in title:
                continue

            titles.append(title)

    return titles

def get_articles_content_and_save_pickle(titles: List[str]):
    """Fetch the content of each titled page and pickle it under ``data\\``.

    Every title is looked up with ``wiki.page``; the page content is written
    to a file named after the title. Titles whose lookup raises
    ``WikipediaException`` are skipped and counted, and the count is printed
    once all titles have been processed.

    Args:
        titles: Page titles to download.
    """
    not_matched = 0
    for title in titles:
        try:
            wiki_content = wiki.page(title).content
        except WikipediaException:
            not_matched += 1
            continue

        # Context manager guarantees the handle is closed (and data flushed)
        # even if pickling fails; the bare open() used before leaked it.
        with open(f'data\\{title}', "wb") as f:
            pickle.dump(wiki_content, f)

    print(f"Not matched {not_matched} titles")

#titles = get_first_n_titles()
#get_articles_content_and_save_pickle(titles)

## Ładowanie artykułów

In [26]:
def get_articles(to_omit_file='wiki-pages-indexes.txt'):
    """Load every per-article pickle from ``data\\`` and save a combined dump.

    File names are taken from ``pickles()``. The index file and any file whose
    name contains ``"dump"`` are skipped. The resulting dict is also pickled
    to ``data\\dumps\\articles_dict_dump_of_len_<N>``, where ``N`` is the
    number of loaded articles.

    Args:
        to_omit_file: File name in the data directory that is not an article.

    Returns:
        Dict mapping file name (the article title) to the unpickled content.
    """
    data_by_title = dict()
    for title in pickles():
        if title == to_omit_file or "dump" in title:
            continue

        # SECURITY: pickle.load can execute arbitrary code - only use this on
        # files produced by this notebook.
        with open(f"data\\{title}", "rb") as f:
            data_by_title[title] = pickle.load(f)

    # Close the dump file deterministically so the data is fully flushed
    # before anyone tries to read it back.
    with open(f"data\\dumps\\articles_dict_dump_of_len_{len(data_by_title)}", "wb") as f:
        pickle.dump(data_by_title, f)
    return data_by_title

def pickles(path='data'):
    """Lazily yield the names of the regular files located directly in ``path``.

    Sub-directories are not descended into and are not yielded themselves.
    """
    yield from (
        entry
        for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry))
    )

def get_articles_from_dump(dirpath="data\\dumps\\", dump_name="articles_dict_dump_of_len_1255"):
    """Load a previously pickled articles dump.

    Args:
        dirpath: Directory that holds the dump files.
        dump_name: File name of the dump inside ``dirpath``.

    Returns:
        The unpickled object stored in the dump file.
    """
    path = os.path.join(dirpath, dump_name)
    # SECURITY: pickle.load can execute arbitrary code - only load dumps
    # created by this notebook.
    # The context manager closes the handle instead of leaking it.
    with open(path, "rb") as f:
        return pickle.load(f)

Używam wcześniej zapisanych struktur, aby nie tracić czasu na ich ponowne tworzenie.

In [27]:
# Rebuilding the dict from the per-article pickles takes time; uncomment to regenerate the dump:
# articles = get_articles()
# Load the previously saved dump instead.
articles = get_articles_from_dump()

## Tworzenie zbioru wszystkich słów

In [33]:
# Per-article statistics: word -> occurrence count, total word count and
# the number of distinct words.
ArticleData = namedtuple('ArticleData', ['words_data', 'words_count', 'unique_words_count'])


class ArticlesParser:
    """Tokenizes articles and builds a shared vocabulary for bag-of-words vectors.

    Attributes are filled in by ``parse_articles``:
      * ``parsed_articles``: title -> ``{'article_data': ArticleData}``
      * ``all_words_data``: word -> number of occurrences over all parsed text
      * ``all_words_count``: total number of words over all parsed text
      * ``ids_by_unique_word``: word -> index of that word in a bag-of-words vector
    """

    def __init__(self, articles: Dict[str, str]):
        """Store ``articles`` (title -> raw text) and initialise empty statistics."""
        self.articles = articles
        self.parsed_articles = dict()
        self.all_words_count = 0
        self.ids_by_unique_word = None
        # defaultdict(int) behaves like the former `lambda: 0` factory but,
        # unlike a lambda, can be pickled.
        self.all_words_data = defaultdict(int)
        self.lemmatizer = WordNetLemmatizer()

    def parse_article(self, content: str):
        """Normalise ``content`` and count its words.

        The text is lower-cased, stripped of punctuation and ASCII digits,
        filtered with ``remove_stopwords`` and every remaining token is
        lemmatized as a verb. Besides returning the per-article statistics,
        the call also adds the counts to ``all_words_data`` and
        ``all_words_count``.

        Args:
            content: Raw article text.

        Returns:
            ``ArticleData`` for this text.
        """
        content = content.lower()
        content = re.sub(r'[^\w\s]', '', content)
        content = re.sub('[0-9]', '', content)
        # Collapse runs of two or more spaces; the former pattern ' {2} +'
        # only matched runs of three or more.
        content = re.sub(' {2,}', ' ', content)
        content = remove_stopwords(content)

        words_data = defaultdict(int)
        words_count = 0
        for word in content.split():
            lemmatized_word = self.lemmatizer.lemmatize(word, pos='v')
            words_data[lemmatized_word] += 1
            self.all_words_data[lemmatized_word] += 1
            words_count += 1

        self.all_words_count += words_count
        return ArticleData(words_data=words_data, words_count=words_count, unique_words_count=len(words_data))

    def parse_articles(self):
        """Parse every article given to the constructor and index the vocabulary."""
        # Iterate over the instance's own articles; the previous version read
        # the notebook-level global `articles`, ignoring the constructor argument.
        for title, content in self.articles.items():
            self.parsed_articles[title] = {'article_data': self.parse_article(content)}

        # Vector position of each word follows its first-seen order.
        self.ids_by_unique_word = {word: idx for idx, word in enumerate(self.all_words_data)}

    def create_bag_of_words(self, article):
        """Return the bag-of-words vector of an already parsed article.

        Args:
            article: Title of an article present in ``parsed_articles``.

        Returns:
            A list whose length equals the vocabulary size; position
            ``ids_by_unique_word[word]`` holds the number of occurrences of
            ``word`` in the article, all other positions are 0.
        """
        words_data = self.parsed_articles[article]['article_data'].words_data
        vector = [0] * len(self.ids_by_unique_word)
        for word, count in words_data.items():
            vector[self.ids_by_unique_word[word]] = count
        # The vector used to be built and then discarded (implicit None).
        return vector
            
    
# Build the parser over the loaded articles and parse all of them.
parser = ArticlesParser(articles)
parser.parse_articles()
# Total number of words counted across the parsed articles (after stop-word removal).
print(parser.all_words_count)


2609801
0


In [35]:
# Titles of all parsed articles (iterating a dict yields its keys).
print(list(parser.parsed_articles))


['A', 'A cappella', 'A Clockwork Orange (novel)', 'A Fire Upon the Deep', 'A fortiori', 'A priori and a posterior knowledge', 'A priori and a posteriori knowledge', 'A roll', 'A. A. Milne', 'A. E. van Vogt', 'A. J. Ayer', 'A.A. Milne', 'A.D', 'A.E. van Vogt', 'A360media', 'Aa', 'Aachen', 'Aage Bohr', 'Aal', 'Aalborg Municipality', 'Aaliyah', 'Aardvark', 'Aardwolf', 'Aargau', 'Aarhus', 'Aaron', 'Aarons Rod', 'Ab', 'Ab urbe condita', 'Ababda people', 'AbacuS', 'Abacá', 'Abadan, Iran', 'Abaddon', 'Abadeh', 'AbalonE', 'Abana River', 'Abandonment of an easement', 'Abandonment of domicile', 'Abandonment of railways', 'Abano Terme', 'Abatement in heraldry', 'Abatement of a nuisance', 'Abatement of debts and legacies', 'Abati', 'Abatis', 'Abattoir', 'Abba Arikha', 'Abba Mari', 'AbbadideS', 'Abbahu', 'Abbas Mirza', 'Abbe number', 'AbbesS', 'AbbevilleFrance', 'AbbeY', 'Abbotsford, Scottish Borders', 'Abbreviation', 'Abbreviations', 'Abbreviator', 'ABC', 'Abd al-Latif al-Baghdadi', 'Abd al-Rahman

In [37]:
# Show the raw, unparsed text of a single article.
print(parser.articles['Atomic orbitals'])

In atomic theory and quantum mechanics, an atomic orbital is a mathematical function describing the location and wave-like behavior of an electron in an atom. This function can be used to calculate the probability of finding any electron of an atom in any specific region around the atom's nucleus. The term atomic orbital may also refer to the physical region or space where the electron can be calculated to be present, as predicted by the particular mathematical form of the orbital.Each orbital in an atom is characterized by a set of values of the three quantum numbers n, ℓ, and ml, which respectively correspond to the electron's energy, angular momentum, and an angular momentum vector component (the magnetic quantum number). Alternative to the magnetic quantum number, the orbitals are often labeled by the associated harmonic polynomials (e.g. xy, x2−y2). Each such orbital can be occupied by a maximum of two electrons, each with its own projection of spin 
  
    
      
        
      