In [1]:
from data_manager import Document, Vector
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.probability import FreqDist
from string import punctuation
import json
import pickle

Ostateczny alfabet zawierał 97308 wyrazów.

In [2]:
# English stop words from NLTK, lower-cased and stored in a set for fast membership tests.
stop_words = {stop_word.lower() for stop_word in stopwords.words('english')}
# Single English Snowball stemmer instance shared by all calls below.
snowball_stemmer = SnowballStemmer('english')

def process_text(text: str) -> dict[str, int]:
    """Turn raw text into a bag of stemmed words.

    Every character found in ``string.punctuation`` is replaced by a space,
    the result is split into sentences and then words with NLTK, tokens are
    lower-cased, tokens present in ``stop_words`` are dropped and the
    remaining ones are stemmed with ``snowball_stemmer``.

    Args:
        text: Raw text to process.

    Returns:
        Mapping from each stem to the number of times it occurs in ``text``.
    """
    # Map each punctuation character to a single space in one pass.
    punctuation_to_space = str.maketrans(punctuation, " " * len(punctuation))
    cleaned = text.translate(punctuation_to_space)

    stems = []
    for sentence in sent_tokenize(cleaned):
        for token in word_tokenize(sentence):
            token = token.lower()
            if token in stop_words:
                continue
            stems.append(snowball_stemmer.stem(token))
    return dict(FreqDist(stems))

In [3]:
# Load objects from the pickle file one after another until pickle signals
# the end of the stream with EOFError.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
docs: list[Document] = []
with open("data/RawDocuments.pickle", "rb") as raw_file:
    try:
        while True:
            docs.append(pickle.load(raw_file))
    except EOFError:
        # Nothing left to read: every stored object is now in `docs`.
        pass

In [4]:
%%time
# Build one Vector per document from its title, id and processed text.
vectors: list[Vector] = [
    Vector(doc.question_title, doc.question_id, process_text(doc.text))
    for doc in docs
]

CPU times: total: 15min 25s
Wall time: 16min 6s


In [5]:
# Store the vectors as consecutive pickle records in a single file.
with open("data/Documents.pickle", "wb") as out_file:
    for record in vectors:
        pickle.dump(record, out_file)

In [6]:
# initial_alphabet: word -> number of times the word is yielded while iterating
#   over the vectors (presumably once per vector, if vec.vector is a dict keyed
#   by word -- verify against Vector).
# word_count: total number of words yielded across all vectors.
initial_alphabet = {}
word_count = 0
for vec in vectors:
    for term in vec.vector:
        initial_alphabet[term] = initial_alphabet.get(term, 0) + 1
        word_count += 1

In [7]:
# Show the number of distinct words in the alphabet and the total tallied in word_count.
print(len(initial_alphabet), word_count)

676754 21235997


In [8]:
def remove_words_with_count_below_2(alphabet, threshold=2):
    """Return a copy of ``alphabet`` without its rarest words.

    NOTE(review): despite the function name, a word is kept only when its
    count is strictly greater than ``threshold``. With the default of 2,
    words counted once *or twice* are removed. The default is left as is
    because previously recorded results were produced with it; pass
    ``threshold=1`` to drop only the words with a count below 2.

    Args:
        alphabet: Mapping from word to its count.
        threshold: Words whose count is less than or equal to this value
            are removed.

    Returns:
        A new dict with the surviving ``word -> count`` pairs, in the same
        order as in ``alphabet``. The input mapping is not modified.
    """
    return {word: count for word, count in alphabet.items() if count > threshold}

In [9]:
# Replace the alphabet with the result of remove_words_with_count_below_2.
initial_alphabet = remove_words_with_count_below_2(initial_alphabet)

In [10]:
# Show how many words are in initial_alphabet at this point.
print(len(initial_alphabet))

98021


In [11]:
# Tally, for every word, how many times it is yielded while iterating the vectors.
# NOTE(review): this repeats the tally done earlier for initial_alphabet (before
# filtering) and word_c is not read by any later cell shown here -- confirm it is
# still needed.
word_c = {}
for v in vectors:
    for term in v.vector:
        word_c[term] = word_c.get(term, 0) + 1

In [12]:
def remove_weird_words(alphabet, max_length=20):
    """Return a copy of ``alphabet`` without overly long words.

    Args:
        alphabet: Mapping from word to an associated value (e.g. a count).
        max_length: Longest word length that is kept; words with more
            characters than this are removed. Defaults to 20.

    Returns:
        A new dict with the entries whose key has at most ``max_length``
        characters, in the same order as in ``alphabet``. The input mapping
        is not modified.
    """
    return {word: value for word, value in alphabet.items() if len(word) <= max_length}

In [13]:
# Replace the alphabet with the result of remove_weird_words.
initial_alphabet = remove_weird_words(initial_alphabet)

In [14]:
# Show how many words are in initial_alphabet at this point.
print(len(initial_alphabet))

97308


In [15]:
# Give every word a consecutive integer index, starting at 0 and following the
# iteration order of initial_alphabet.
indexed_alphabet = {word: position for position, word in enumerate(initial_alphabet)}
# ind ends up holding the number of indexed words.
ind = len(indexed_alphabet)

In [16]:
# Save the word -> index mapping as JSON.
# json.dump escapes every non-ASCII character by default (ensure_ascii=True), so
# the bytes written are pure ASCII and identical under any ASCII-compatible
# encoding. UTF-8 is declared because it is the standard encoding for JSON and
# keeps the write from failing if ensure_ascii is ever switched off.
with open("data/alphabet.json", "w", encoding="utf-8") as f:
    json.dump(indexed_alphabet, f)

In [17]:
# Restrict every vector to the words that are present in indexed_alphabet.
for vec in vectors:
    counts = vec.vector
    vec.vector = {word: counts[word] for word in counts if word in indexed_alphabet}

In [18]:
# Store the vectors as consecutive pickle records in a single file.
with open("data/Bag_of_words.pickle", "wb") as out_file:
    for bag in vectors:
        pickle.dump(bag, out_file)