In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
book_paths = [
    '/mnt/d/Projects/masters-thesis/data/books/a5/42466',
    '/mnt/d/Projects/masters-thesis/data/books/28/10479',
    '/mnt/d/Projects/masters-thesis/data/books/00/19',
    '/mnt/d/Projects/masters-thesis/data/books/63/25456',
    '/mnt/d/Projects/masters-thesis/data/books/25/9703'
]

In [3]:
def get_sentences_from_book(book_path: str) -> list[str]:
    with open(book_path, 'r') as file_descriptor:
        lines = file_descriptor.readlines()
        lines = [line.strip() for line in lines]
        lines = [line for line in lines if line != '']

        sentences = sum([sent_tokenize(line) for line in lines], [])
    
    return sentences


formatting_symbols = ['E>', 'E$', 'D>', 'D$', '@', 'C>', 'C$', 'P>', 'P$', 
                      'S>', 'S$', '\t', '|', '>', '#']

def cleanup_formatting(sentence: str) -> str:
    for symbol in formatting_symbols:
        while symbol in sentence:
            sentence = sentence.replace(symbol, '')
    return sentence

def tokenize_sentence(sentence: str) -> list[str]:
    
    sentence = sentence.lower()
    tokenized = word_tokenize(sentence)

    return tokenized

def parse_book(book_path: str) -> set[str]:
    sentences = get_sentences_from_book(book_path)
    sentences = [cleanup_formatting(sentence) for sentence in sentences]
    words = [tokenize_sentence(sentence) for sentence in sentences if sentence != '']

    words = sum(words, [])
    words = [word.lower() for word in words]
    return set(words)

In [12]:
words = set()
for book_path in book_paths[:1]:
    words = words | parse_book(book_path)

len(words)

13936

In [13]:
df = pd.DataFrame(words, columns=['дума'])
df['произход'] = 'bg'
df

Unnamed: 0,дума,произход
0,показват,bg
1,бебешка,bg
2,проговоря,bg
3,случаен,bg
4,потопят,bg
...,...,...
13931,зн…,bg
13932,спрял,bg
13933,забили,bg
13934,дългичко,bg


In [14]:
loanwords = pd.read_csv('/mnt/d/Projects/masters-thesis/data/loanwords_only.csv')

loanwords = pd.concat([loanwords, df])
loanwords = loanwords.dropna()

loanwords.to_csv('/mnt/d/Projects/masters-thesis/data/loanwords_3.csv', index=False)

In [7]:
loanwords = pd.read_csv('/mnt/d/Projects/masters-thesis/data/loanwords.csv')
del loanwords[loanwords.columns[0]]

In [10]:
loanwords.to_csv('/mnt/d/Projects/masters-thesis/data/loanwords_2.csv', index=False)