In [1]:
import os
import pandas as pd

from itertools import accumulate
from typing import List, Set, Optional
from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize

import transformer_model.src.constants as const

## Изречения

In [None]:
def tokenize_file_sentences(filename) -> Set[str]:
    """Return the set of distinct sentences found in a file.

    Each line is stripped, empty lines are skipped, and the remaining lines
    are split into sentences with ``sent_tokenize``.

    NOTE(review): ``read_file`` is not defined in the visible part of this
    notebook; it is presumably a helper returning an iterable of text lines
    - confirm before relying on this function.

    Args:
        filename: Passed unchanged to ``read_file``.

    Returns:
        The unique sentences of the file, as promised by the annotation
        (the previous version only printed a debug list and returned None).
    """
    lines = read_file(filename)

    stripped_lines = (line.strip() for line in lines)
    non_empty_lines = [line for line in stripped_lines if line != '']

    # Flatten the per-line sentence lists straight into a set.
    return {sentence for line in non_empty_lines for sentence in sent_tokenize(line)}


In [3]:
# Collect the full path of every file below the data directory.
# A nested comprehension flattens the walk in linear time; the previous
# `sum(list_of_lists, [])` copied the accumulated list for every directory.
# NOTE(review): this walks the `transcriptions` directory, yet the result is
# reported as "books" and the generation cell derives output paths by replacing
# 'books' with 'transcriptions' - confirm which directory is intended here.
all_filenames = [
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk('/mnt/d/Projects/masters-thesis/data/transcriptions')
    for filename in filenames
]

f'Amount of books: {len(all_filenames)}'

'Amount of books: 47825'

In [None]:
def get_sentences_from_book(book_path: str) -> List[str]:
    """Read a book file and split its non-empty lines into sentences.

    Args:
        book_path: Path of the text file to read.

    Returns:
        All sentences of the book in reading order. Lines are stripped first
        and lines that become empty are skipped.
    """
    print(book_path)
    # Decode explicitly instead of depending on the platform's locale default.
    # assumes the book files are stored as UTF-8 - TODO confirm
    with open(book_path, 'r', encoding='utf-8') as file_descriptor:
        lines = [line.strip() for line in file_descriptor]

    # Flatten in a single pass; `sum(list_of_lists, [])` is quadratic.
    return [
        sentence
        for line in lines
        if line != ''
        for sentence in sent_tokenize(line)
    ]

Текстовете съдържат форматиране, което е специфично за Читанка, и трябва да се премахне. Описано е [тук](https://forum.chitanka.info/topic511.html).

In [None]:
# Markup markers that are stripped from the text, in the order they are removed.
formatting_symbols = [
    'E>', 'E$', 'D>', 'D$', '@', 'C>', 'C$', 'P>', 'P$',
    'S>', 'S$', '\t', '|', '>', '#',
]


def cleanup_formatting(sentence: str) -> str:
    """Strip every marker listed in ``formatting_symbols`` from a sentence.

    Markers are processed one after another in list order. Removing a marker
    can join its neighbours into a new occurrence of the same marker, so the
    removal is repeated until the text no longer changes.

    Args:
        sentence: Text that may contain formatting markers.

    Returns:
        The text with all markers removed.
    """
    cleaned = sentence
    for marker in formatting_symbols:
        while True:
            reduced = cleaned.replace(marker, '')
            if reduced == cleaned:
                break
            cleaned = reduced
    return cleaned

def tokenize_sentence(sentence: str, word_to_id: dict[str, int]) -> tuple[str, list[int]]:
    """Lower-case a sentence and encode its tokens as vocabulary ids.

    Args:
        sentence: Text to tokenize.
        word_to_id: Vocabulary mapping token -> id. It is extended in place:
            every unseen token receives the next free id (the current size
            of the mapping).

    Returns:
        A pair ``(lower-cased sentence, list of token ids)``. The previous
        annotation claimed ``List[str]``, which did not match the value
        actually returned.
    """
    sentence = sentence.lower()

    # `setdefault` evaluates `len(word_to_id)` before inserting, so a new token
    # gets exactly the id the explicit "if token not in ..." form assigned.
    token_ids = [
        word_to_id.setdefault(token, len(word_to_id))
        for token in word_tokenize(sentence)
    ]
    return sentence, token_ids

In [None]:
# Smoke test of the pipeline on the first file: read, clean, then tokenize.
word_to_id = {}
sample_books = [
    tokenize_sentence(cleaned, word_to_id)
    for cleaned in map(cleanup_formatting, get_sentences_from_book(all_filenames[0]))
    if cleaned != ''  # sentences that consisted only of markers are dropped
]

sample_books[0]

## Ударения

In [None]:
def parse_emphasis(word: str):
    """
    Find the letters that are followed by a "`" mark.

    The returned positions index into the word as it looks once all "`"
    marks are removed: each mark points at the letter directly before it,
    and every earlier mark shifts the position one further to the left.

    Input: а`виобра`нш
    Output: (0, 6)
    """
    positions = []
    marks_seen = 0
    for index, letter in enumerate(word):
        if letter != '`':
            continue
        positions.append(index - 1 - marks_seen)
        marks_seen += 1

    return tuple(positions)

parse_emphasis('а`виобра`нш')

In [None]:
# Replace the marked transcription of every word by the parsed mark positions
# and store the result next to the input file.
words = (
    pd.read_csv('data/words.csv', header=None, names=['word', 'transcription'])
    .assign(emphasis_indexes=lambda frame: frame['transcription'].apply(parse_emphasis))
    .drop(columns=['transcription'])
)

words.to_csv('data/emphasis.csv', index=False)
words.head()


In [None]:
from transcription_generation import TranscriptionGeneration

## Генериране на двойки изречение - ударение

In [None]:
def generate_data_from_book(book_path: str, transcription_generator: TranscriptionGeneration, 
                            word_to_id: dict[str, int], transcription_to_id: dict[str, int]):
    """Build the sentence / transcription table for a single book.

    Args:
        book_path: Path of the book to process.
        transcription_generator: Object whose ``generate_transcription`` is
            called once per sentence.
        word_to_id: Vocabulary of sentence tokens, extended in place.
        transcription_to_id: Vocabulary of transcription tokens, extended in place.

    Returns:
        A DataFrame with the columns ``sentence``, ``tokens``,
        ``transcription`` and ``transcription_tokens``, one row per sentence.
    """
    raw_sentences = get_sentences_from_book(book_path)
    cleaned_sentences = [cleanup_formatting(raw) for raw in raw_sentences]

    # Each pair is (lower-cased sentence, token ids); empty sentences are skipped.
    encoded_pairs = [
        tokenize_sentence(text, word_to_id)
        for text in cleaned_sentences
        if text != ''
    ]
    sentences = [pair[0] for pair in encoded_pairs]
    token_ids = [pair[1] for pair in encoded_pairs]

    transcriptions = list(map(transcription_generator.generate_transcription, sentences))
    transcription_ids = [
        tokenize_sentence(text, transcription_to_id)[1]
        for text in transcriptions
    ]

    return pd.DataFrame({
        'sentence': sentences,
        'tokens': token_ids,
        'transcription': transcriptions,
        'transcription_tokens': transcription_ids,
    })

In [None]:
# Generate one CSV per book plus the two vocabularies. The whole step is
# skipped when the local output directory already exists.
if not os.path.exists('data/transcriptions'):
    os.makedirs('data/transcriptions')
    word_to_id = {}
    transcription_to_id = {}

    transcription = TranscriptionGeneration()
    for filename in tqdm(all_filenames):
        new_filename = filename.replace('books', 'transcriptions')
        if new_filename == filename:
            # Without a 'books' component the output path equals the input
            # path and `to_csv` would overwrite the source file.
            raise ValueError(
                f"Refusing to overwrite input file {filename!r}: "
                "its path does not contain 'books'"
            )

        df = generate_data_from_book(filename, transcription, word_to_id, transcription_to_id)
        os.makedirs(os.path.dirname(new_filename), exist_ok=True)
        df.to_csv(new_filename, index=False)
    
    # Persist both vocabularies so the ids stored in the CSVs can be decoded later.
    word_to_id_df = pd.DataFrame({'word': list(word_to_id.keys()), 'id': list(word_to_id.values())})
    word_to_id_df.to_csv('data/word_to_id.csv', index=False)

    transcription_to_id_df = pd.DataFrame({'transcription': list(transcription_to_id.keys()),
                                             'id': list(transcription_to_id.values())})
    transcription_to_id_df.to_csv('data/transcription_to_id.csv', index=False)

In [None]:
!du -sch data/transcriptions

## Dataset indexing

In [2]:
# Full path of every file below the transcriptions directory.
files = [
    os.path.join(dirpath, name)
    for dirpath, _, names in os.walk('/mnt/d/Projects/masters-thesis/data/transcriptions')
    for name in names
]

In [3]:
def get_amount_of_entries(filepath: str) -> int:
    """Count the data rows of a CSV file by counting its lines.

    The first line is treated as the header and is not counted.

    Args:
        filepath: Path of the CSV file.

    Returns:
        Number of lines after the header; 0 for a completely empty file
        (previously -1, which would corrupt cumulative indexes built from
        these sizes).
    """
    # The files are written by `DataFrame.to_csv`, whose default encoding is UTF-8.
    with open(filepath, encoding='utf-8') as fp:
        line_count = sum(1 for _ in fp)
    return max(line_count - 1, 0)

In [4]:
# Map every transcription file to its number of data rows.
filepath_to_size = dict(zip(files, map(get_amount_of_entries, files)))

KeyboardInterrupt: 

In [11]:
# Sanity check: number of files for which a size was recorded.
len(filepath_to_size)

47825

In [12]:
# Build a global row index over all files: the rows of a file occupy the
# half-open range [start_index, end_index) of the concatenated dataset.
filepath_to_size_df = pd.DataFrame({'filepath': list(filepath_to_size.keys()),
                                    'size': list(filepath_to_size.values())})

# Sort by path so the index layout does not depend on directory walk order.
filepath_to_size_df = filepath_to_size_df.sort_values(by=['filepath'])

# Running total of sizes = exclusive end of each file's range; the start is the
# end minus the file's own size. Unlike the list-based version this also works
# for an empty mapping and avoids the unusual `.iloc()` call.
filepath_to_size_df['end_index'] = filepath_to_size_df['size'].cumsum()
filepath_to_size_df['start_index'] = filepath_to_size_df['end_index'] - filepath_to_size_df['size']

filepath_to_size_df.head()

Unnamed: 0,filepath,size,end_index,start_index
0,/mnt/d/Projects/masters-thesis/data/transcript...,10731,10731,0
1,/mnt/d/Projects/masters-thesis/data/transcript...,136,10867,10731
2,/mnt/d/Projects/masters-thesis/data/transcript...,13136,24003,10867
3,/mnt/d/Projects/masters-thesis/data/transcript...,12301,36304,24003
4,/mnt/d/Projects/masters-thesis/data/transcript...,12298,48602,36304


In [15]:
# Total number of rows across all files, and the global row index at which
# the dataset is split.
# presumably const.TRAIN_TEST_SPLIT is the train fraction in [0, 1] - confirm in constants
lines_count = filepath_to_size_df['size'].sum()
split = int(const.TRAIN_TEST_SPLIT * lines_count)

split

64492427

In [None]:
# NOTE(review): recomputes `split` with the same expression used where
# `lines_count` is defined; this cell looks redundant.
split = int(const.TRAIN_TEST_SPLIT * lines_count)

In [15]:
# Persist the per-file sizes and index ranges so they need not be recomputed.
filepath_to_size_df.to_csv('/mnt/d/Projects/masters-thesis/data/filepath_to_size.csv', index=False)

## Stats

In [7]:
# Preview one generated transcription file (the second entry of `files`).
df = pd.read_csv(files[1])
df.head()

Unnamed: 0,sentence,tokens,transcription,transcription_tokens
0,"тук е пълно с гении, докторе.","[283, 65, 530, 187, 129444, 8, 44327, 58]","tok ɛ pɐlno s gɛnii , doktorɛ .","[283, 65, 529, 187, 128247, 8, 44188, 58]"
1,"тук всеки е написал нещо изумително, или изобр...","[283, 382, 65, 3264, 119, 56219, 8, 396, 68696...","tok vsɛki ɛ nʌpisʌl nɛʃto izomitɛlno , ili izo...","[283, 382, 65, 3258, 119, 56029, 8, 396, 68443..."
2,"тук ухае на велики идеи и велики творби, пък а...","[283, 67329, 3, 15763, 4280, 5, 15763, 7793, 8...","tok oxʌɛ nʌ vɛliki idɛi i vɛliki tvorbi , pɐk ...","[283, 67085, 3, 15716, 4271, 5, 15716, 7776, 8..."
3,тънък и дълъг дървен цилиндър.,"[3446, 5, 1072, 16521, 12915, 58]",tɐnɐk i dɐlɐg dɐrvɛn tsilindɐr .,"[3440, 5, 1069, 16473, 12879, 58]"
4,"долу съм вкопан в земята, затова дойдох при те...","[2072, 28, 51855, 30, 573, 8, 59, 8749, 138, 1...","dolo sɐm vkopʌn v zɛmjɐtʌ , zʌtovʌ dojdox pri ...","[1558, 28, 51680, 30, 572, 8, 59, 8730, 138, 1..."


In [8]:
# Load the saved word vocabulary; the row count is the number of distinct tokens.
vocabilaries = pd.read_csv('/mnt/d/Projects/masters-thesis/data/word_to_id.csv')
vocabilaries.shape[0]

2648747

In [14]:
# Summary statistics of the generated dataset.
print(f'Amount of books: {len(all_filenames)}')
print(f'Amount of different tokens: {vocabilaries.shape[0]}')

# NOTE(review): the assignment below rebinds `filepath_to_size` from the dict
# built during indexing to a DataFrame; re-running the indexing cells after
# this one will no longer see the dict.
filepath_to_size_path = '/mnt/d/Projects/masters-thesis/data/filepath_to_size.csv'
if os.path.exists(filepath_to_size_path):
    filepath_to_size = pd.read_csv(filepath_to_size_path)
    # Sum of the `size` column = total number of rows over all files.
    print(filepath_to_size['size'].sum())

Amount of books: 47825
Amount of different tokens: 2648747
80615534


# Word-based model

In [10]:
from transcription_generation import TranscriptionGeneration

# Generator instance used by the word-based cells that follow.
transcription = TranscriptionGeneration()

In [11]:
# Reload the word list without the mark positions and attach a generated
# transcription to every word.
words = (
    pd.read_csv('data/emphasis.csv')
    .drop(columns=['emphasis_indexes'])
    .assign(transcription=lambda frame: frame['word'].apply(transcription.generate_transcription))
)
words.head()

Unnamed: 0,word,transcription
0,аванпост,ʌvʌnpost
1,авиоас,ʌvioʌs
2,авиобос,ʌviobos
3,авиобранш,ʌviobrʌnʃ
4,авиоград,ʌviogrʌd


Tokenizing the text

In [12]:
def vectorize_word(word: str, word_to_id: dict[str, int]) -> list[int]:
    """Encode a word as the list of ids of its characters.

    Args:
        word: Text to encode; it is iterated character by character.
        word_to_id: Vocabulary mapping character -> id. It is extended in
            place: every unseen character receives the next free id (the
            current size of the mapping).

    Returns:
        One id per character, in order. The previous annotation claimed
        ``List[str]`` although integer ids are returned.
    """
    # `setdefault` evaluates `len(word_to_id)` before inserting, so a new
    # character gets exactly the id the explicit membership check assigned.
    return [word_to_id.setdefault(token, len(word_to_id)) for token in word]

In [13]:
# Separate character vocabularies for the words and for their transcriptions;
# both are filled while the columns are encoded.
word_to_id = {}
transcription_to_id = {}

words['word_vector'] = words['word'].apply(lambda text: vectorize_word(text, word_to_id))
words['transcription_vector'] = words['transcription'].apply(
    lambda text: vectorize_word(text, transcription_to_id)
)

words.head()

Unnamed: 0,word,transcription,word_vector,transcription_vector
0,аванпост,ʌvʌnpost,"[0, 1, 0, 2, 3, 4, 5, 6]","[0, 1, 0, 2, 3, 4, 5, 6]"
1,авиоас,ʌvioʌs,"[0, 1, 7, 4, 0, 5]","[0, 1, 7, 4, 0, 5]"
2,авиобос,ʌviobos,"[0, 1, 7, 4, 8, 4, 5]","[0, 1, 7, 4, 8, 4, 5]"
3,авиобранш,ʌviobrʌnʃ,"[0, 1, 7, 4, 8, 9, 0, 2, 10]","[0, 1, 7, 4, 8, 9, 0, 2, 10]"
4,авиоград,ʌviogrʌd,"[0, 1, 7, 4, 11, 9, 0, 12]","[0, 1, 7, 4, 11, 9, 0, 12]"


In [16]:
# Store the encoded words together with both character vocabularies.
words.to_csv('data/word-based/words.csv', index=False)

word_to_id_df = pd.DataFrame(list(word_to_id.items()), columns=['word', 'id'])
word_to_id_df.to_csv('data/word-based/word_to_id.csv', index=False)

transcription_to_id_df = pd.DataFrame(list(transcription_to_id.items()), columns=['transcription', 'id'])
transcription_to_id_df.to_csv('data/word-based/transcription_to_id.csv', index=False)

# Single-word

In [2]:
import pandas as pd
from src.dataset_utils import load_files

# NOTE(review): rebinds `files`, which the dataset indexing section assigns as
# a list of paths; what `load_files` returns is not visible here - confirm.
files = load_files('/mnt/d/Projects/masters-thesis/data/transcriptions')