In [5]:
import os
import pandas as pd
from typing import List, Set, Optional

from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize

## Изречения

In [2]:
def tokenize_file_sentences(filename) -> Set[str]:
    lines = read_file(filename)

    lines = [line.strip() for line in lines]
    lines = [line for line in lines if line != '']

    tokens_per_line = [sent_tokenize(line) for line in lines]

    print(tokens_per_line)


In [1]:
all_filenames = sum([[os.path.join(dirpath, filename) for filename in filenames] for dirpath, _, filenames in os.walk('data/books')], [])

f'Amount of books: {len(all_filenames)}'

'Amount of books: 47825'

In [4]:
def get_sentences_from_book(book_path: str) -> List[str]:
    print(book_path)
    with open(book_path, 'r') as file_descriptor:
        lines = file_descriptor.readlines()
        lines = [line.strip() for line in lines]
        lines = [line for line in lines if line != '']

        sentences = sum([sent_tokenize(line) for line in lines], [])
    
    return sentences

Текстовете съдържат форматиране, което е специфично за Читанка, и трябва да се премахне. Описано е [тук](https://forum.chitanka.info/topic511.html).

In [5]:
formatting_symbols = ['E>', 'E$', 'D>', 'D$', '@', 'C>', 'C$', 'P>', 'P$', 
                      'S>', 'S$', '\t', '|', '>', '#']

def cleanup_formatting(sentence: str) -> str:
    for symbol in formatting_symbols:
        while symbol in sentence:
            sentence = sentence.replace(symbol, '')
    return sentence

def tokenize_sentence(sentence: str, word_to_id: dict[str, int]) -> List[str]:
    
    sentence = sentence.lower()
    tokenized = word_tokenize(sentence)

    for token in tokenized:
        if token not in word_to_id:
            word_to_id[token] = len(word_to_id)
        
    tokenized = [word_to_id[token] for token in tokenized]
    return sentence, tokenized

In [6]:
word_to_id = {}
sample_books = get_sentences_from_book(all_filenames[0])
sample_books = [cleanup_formatting(sentence) for sentence in sample_books]
sample_books = [tokenize_sentence(sentence, word_to_id) for sentence in sample_books if sentence != '']

sample_books[0]

data/books/0d/3425


('като гледам муцуните на конете и лицата на хората, безбрежния жив поток, отприщен по моя воля и втурнал се наникъде през пурпурната залезна степ, често си мисля: къде съм аз в този поток?',
 [0,
  1,
  2,
  3,
  4,
  5,
  6,
  3,
  7,
  8,
  9,
  10,
  11,
  8,
  12,
  13,
  14,
  15,
  5,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  8,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  11,
  32])

## Ударения

In [7]:
def parse_emphasis(word: str):
    """
    Input: а`виобра`нш
    Output: [0, 6]
    """
    emphasis_positions = [i for i, letter in enumerate(word) if letter == '`']

    # i' = i - 1 - amount of previous ` symbols
    corrected_emphasis_positions = tuple(position - 1 - i for i, position in enumerate(emphasis_positions))

    return corrected_emphasis_positions

parse_emphasis('а`виобра`нш')

(0, 6)

In [8]:
words = pd.read_csv('data/words.csv', header=None, names=['word', 'transcription'])


words['emphasis_indexes'] = words['transcription'].apply(parse_emphasis)
words.drop(columns=['transcription'], inplace=True)


words.to_csv('data/emphasis.csv', index=False)
words.head()


Unnamed: 0,word,emphasis_indexes
0,аванпост,"(2,)"
1,авиоас,"(0, 4)"
2,авиобос,"(0, 5)"
3,авиобранш,"(0, 6)"
4,авиоград,"(0, 6)"


In [9]:
from transcription_generation import TranscriptionGeneration

## Генериране на двойки изречение - ударение

In [13]:
def generate_data_from_book(book_path: str, transcription_generator: TranscriptionGeneration, 
                            word_to_id: dict[str, int], transcription_to_id: dict[str, int]):
    senteces = get_sentences_from_book(book_path)
    senteces = [cleanup_formatting(sentence) for sentence in senteces]
    senteces = [tokenize_sentence(sentence, word_to_id) for sentence in senteces if sentence != '']

    tokens = [token for _, token in senteces]   
    senteces = [sentence for sentence, _ in senteces]
    transcriptions = [transcription_generator.generate_transcription(sentence) for sentence in senteces]

    tokenized_transcription = [tokenize_sentence(transcription, transcription_to_id)[1] for transcription in transcriptions]
    df = pd.DataFrame({'sentence': senteces, 'tokens': tokens, 
                       'transcription': transcriptions, 'transcription_tokens': tokenized_transcription})

    return df

In [17]:
if not os.path.exists('data/transcriptions'):
    os.makedirs('data/transcriptions')
    word_to_id = {}
    transcription_to_id = {}

    transcription = TranscriptionGeneration()
    for filename in tqdm(all_filenames):
        df = generate_data_from_book(filename, transcription, word_to_id, transcription_to_id)
        new_filename = filename.replace('books', 'transcriptions')
        os.makedirs(os.path.dirname(new_filename), exist_ok=True)
        df.to_csv(new_filename, index=False)
    
    word_to_id_df = pd.DataFrame({'word': list(word_to_id.keys()), 'id': list(word_to_id.values())})
    word_to_id_df.to_csv('data/word_to_id.csv', index=False)

    transcription_to_id_df = pd.DataFrame({'transcription': list(transcription_to_id.keys()),
                                             'id': list(transcription_to_id.values())})
    transcription_to_id_df.to_csv('data/transcription_to_id.csv', index=False)

  0%|          | 0/47825 [00:00<?, ?it/s]

data/books/0d/3425
data/books/0d/3348
data/books/0d/3366
data/books/0d/3374
data/books/0d/3536
data/books/0d/3353
data/books/0d/3481
data/books/0d/3433
data/books/0d/3478
data/books/0d/3381
data/books/0d/3454
data/books/0d/3469
data/books/0d/3424
data/books/0d/3409
data/books/0d/3513
data/books/0d/3343
data/books/0d/3355
data/books/0d/3426
data/books/0d/3403
data/books/0d/3449
data/books/0d/3528
data/books/0d/3405
data/books/0d/3505
data/books/0d/3568
data/books/0d/3520
data/books/0d/3390
data/books/0d/3477
data/books/0d/3385
data/books/0d/3464
data/books/0d/3456
data/books/0d/3443
data/books/0d/3367
data/books/0d/3404
data/books/0d/3440
data/books/0d/3531
data/books/0d/3393
data/books/0d/3566
data/books/0d/3414
data/books/0d/3442
data/books/0d/3506
data/books/0d/3501
data/books/0d/3581
data/books/0d/3444
data/books/0d/3508
data/books/0d/3334
data/books/0d/3351
data/books/0d/3378
data/books/0d/3435
data/books/0d/3386
data/books/0d/3411
data/books/0d/3570
data/books/0d/3580
data/books/0

In [1]:
!du -sch data/transcriptions

27G	data/transcriptions
27G	total


## Stats

In [4]:
files = [os.path.join(root, file)  for root, dirs, files in os.walk('data/transcriptions') for file in files]
files

['data/transcriptions/0d/3425',
 'data/transcriptions/0d/3348',
 'data/transcriptions/0d/3366',
 'data/transcriptions/0d/3374',
 'data/transcriptions/0d/3536',
 'data/transcriptions/0d/3353',
 'data/transcriptions/0d/3481',
 'data/transcriptions/0d/3433',
 'data/transcriptions/0d/3478',
 'data/transcriptions/0d/3381',
 'data/transcriptions/0d/3454',
 'data/transcriptions/0d/3469',
 'data/transcriptions/0d/3424',
 'data/transcriptions/0d/3409',
 'data/transcriptions/0d/3513',
 'data/transcriptions/0d/3343',
 'data/transcriptions/0d/3355',
 'data/transcriptions/0d/3426',
 'data/transcriptions/0d/3403',
 'data/transcriptions/0d/3449',
 'data/transcriptions/0d/3528',
 'data/transcriptions/0d/3405',
 'data/transcriptions/0d/3505',
 'data/transcriptions/0d/3568',
 'data/transcriptions/0d/3520',
 'data/transcriptions/0d/3390',
 'data/transcriptions/0d/3477',
 'data/transcriptions/0d/3385',
 'data/transcriptions/0d/3464',
 'data/transcriptions/0d/3456',
 'data/transcriptions/0d/3443',
 'data/t

In [2]:
df = pd.read_csv(files[1])
df.head()

NameError: name 'pd' is not defined

In [21]:
vocabilaries = pd.read_csv('data/word_to_id.csv')
vocabilaries.shape[0]

2648747

In [23]:
print(f'Amount of books: {len(all_filenames)}')
print(f'Amount of different tokens: {vocabilaries.shape[0]}')


Amount of books: 47825
Amount of different tokens: 2648747
Amount of sentences: 47825


In [2]:
def get_amount_of_entries(filepath: str) -> int:
    with open(filepath) as fp:
        return sum(1 for _ in fp) - 1

In [8]:
filepath_to_size = {filename: get_amount_of_entries(filename) for filename in files}

In [None]:
filepath_to_size_df = pd.DataFrame({'filepath': list(filepath_to_size.keys()),
                                             'size': list(filepath_to_size.values())})
filepath_to_size_df.to_csv('data/filepath_to_size.csv', index=False)