In [1]:
import os
import pandas as pd

from typing import List, Set

from nltk.tokenize import sent_tokenize

## Изречения

In [2]:
def tokenize_file_sentences(filename) -> Set[str]:
    """
    Split the text file at `filename` into its set of distinct sentences.

    Every line is stripped of surrounding whitespace, blank lines are
    skipped, and each remaining line is passed to `sent_tokenize`.

    Parameters:
        filename: path of the text file to read (decoded as UTF-8).

    Returns:
        The set of unique sentences found in the file.
    """
    # The file is read here directly: the previously used `read_file` helper
    # is not defined in this notebook, so the call failed on a fresh kernel.
    with open(filename, 'r', encoding='utf-8') as file_descriptor:
        stripped_lines = (line.strip() for line in file_descriptor)
        # Return the sentences (as the annotation promises) instead of
        # printing the nested per-line token lists.
        return {
            sentence
            for line in stripped_lines
            if line != ''
            for sentence in sent_tokenize(line)
        }


In [3]:
# Collect the path of every file under data/books (os.walk traversal order).
# A single nested comprehension is used instead of sum(list_of_lists, []),
# which re-copies the accumulated list once per directory.
all_filenames = [
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk('data/books')
    for filename in filenames
]

f'Amount of books: {len(all_filenames)}'

'Amount of books: 47825'

In [4]:
def get_sentences_from_book(book_path: str) -> List[str]:
    """
    Read a book file and return its sentences in reading order.

    Every line is stripped of surrounding whitespace, blank lines are
    skipped, and each remaining line is split with `sent_tokenize`.

    Parameters:
        book_path: path of the book's text file (decoded as UTF-8).

    Returns:
        A flat list of sentences; duplicates are kept.
    """
    # Progress trace: one printed line per processed book.
    print(book_path)

    sentences: List[str] = []
    # The encoding is explicit so decoding does not depend on the platform's
    # default locale.
    with open(book_path, 'r', encoding='utf-8') as file_descriptor:
        for line in file_descriptor:
            line = line.strip()
            if line != '':
                # extend() keeps this linear; sum(list_of_lists, []) re-copies
                # the accumulated result for every line.
                sentences.extend(sent_tokenize(line))

    return sentences

Текстовете съдържат форматиране, което е специфично за Читанка, и трябва да се премахне. Описано е [тук](https://forum.chitanka.info/topic511.html).

In [23]:
# Chitanka markup tokens that are stripped from the book text.
# Order matters to cleanup_formatting, which walks this list front to back:
# the two-character tags come before the bare '>' so the tag letter is
# removed together with it.
formatting_symbols = [
    'E>', 'E$',
    'D>', 'D$',
    '@',
    'C>', 'C$',
    'P>', 'P$',
    'S>', 'S$',
    '\t', '|', '>', '#',
]

In [24]:
def cleanup_formatting(sentence: str) -> str:
    """
    Remove every marker listed in `formatting_symbols` from `sentence`.

    Markers are processed in list order. Each one is removed repeatedly until
    it no longer occurs, because deleting occurrences can join the surrounding
    characters into a new occurrence of the same marker.

    Returns:
        The sentence with the markers removed.
    """
    text = sentence
    for marker in formatting_symbols:
        while True:
            if marker not in text:
                break
            text = text.replace(marker, '')
    return text

In [25]:
# Sanity check on the first book: tokenize it, strip the markup from each
# sentence and keep only the sentences that are still non-empty.
sample_books = [
    cleaned
    for cleaned in map(cleanup_formatting, get_sentences_from_book(all_filenames[0]))
    if cleaned != ''
]

sample_books[:5]

data/books/0d/3425


['Като гледам муцуните на конете и лицата на хората, безбрежния жив поток, отприщен по моя воля и втурнал се наникъде през пурпурната залезна степ, често си мисля: къде съм Аз в този поток?',
 'Чингиз Хан',
 'По ред причини името на действителния автор на настоящия ръкопис, създаден през първата половина на двайсетте години в един от манастирите във Вътрешна Монголия, не може да бъде назовано.',
 'Затова публикуваме ръкописа от името на редактора, който го е подготвил за печат.',
 'От оригинала са отпаднали описанията на някои магически процедури и значителни по обем спомени на разказвача за живота му в Петербург от преди революцията (така нареченият „Петербургски период“).']

## Ударения

In [8]:
def parse_emphasis(word: str):
    """
    Locate the emphasized characters of a transcription.

    A backtick marks the character immediately before it. The returned
    indices refer to the word with all backticks removed.

    Input: а`виобра`нш
    Output: (0, 6)
    """
    emphasized = []
    for position, letter in enumerate(word):
        if letter == '`':
            # Step back onto the marked character, then discount the
            # backticks already seen (one per recorded index).
            emphasized.append(position - 1 - len(emphasized))

    return tuple(emphasized)

parse_emphasis('а`виобра`нш')

(0, 6)

In [9]:
# Load the (word, transcription) pairs, derive the emphasis indexes from each
# transcription and keep only the word together with those indexes.
words = (
    pd.read_csv('data/words.csv', header=None, names=['word', 'transcription'])
    .assign(emphasis_indexes=lambda frame: frame['transcription'].apply(parse_emphasis))
    .drop(columns=['transcription'])
)

# Persist the result; the tuples are written to the CSV as their text form.
words.to_csv('data/emphasis.csv', index=False)
words.head()


Unnamed: 0,word,emphasis_indexes
0,аванпост,"(2,)"
1,авиоас,"(0, 4)"
2,авиобос,"(0, 5)"
3,авиобранш,"(0, 6)"
4,авиоград,"(0, 6)"


In [10]:
from transcription_generation import TranscriptionGeneration

## Генериране на двойки изречение - ударение

In [34]:
def generate_data_from_book(book_path: str, transcription_generator: TranscriptionGeneration):
    """
    Build the sentence/transcription table for one book.

    The book is split into sentences, the markup is stripped from each one,
    and sentences that end up empty are dropped. Each remaining sentence is
    passed to `transcription_generator.generate_transcription`.

    Parameters:
        book_path: path of the book's text file.
        transcription_generator: object providing `generate_transcription`.

    Returns:
        A DataFrame with the columns 'sentence' and 'transcription', one row
        per kept sentence.
    """
    sentences = [
        cleaned
        for cleaned in map(cleanup_formatting, get_sentences_from_book(book_path))
        if cleaned != ''
    ]
    transcriptions = list(map(transcription_generator.generate_transcription, sentences))

    return pd.DataFrame({'sentence': sentences, 'transcription': transcriptions})

In [37]:
# Generate one transcription CSV per book, mirroring the data/books layout
# under data/transcriptions.
#
# The check is done per output file rather than once for the whole directory,
# so a run that was interrupted half-way is resumed on the next execution
# instead of being skipped because the directory already exists.
books_root = 'data/books'
transcriptions_root = 'data/transcriptions'

pending = []
for filename in all_filenames:
    # Map the path relative to the books root; a plain str.replace('books', ...)
    # would also rewrite any other 'books' substring inside the path.
    new_filename = os.path.join(transcriptions_root, os.path.relpath(filename, books_root))
    if not os.path.exists(new_filename):
        pending.append((filename, new_filename))

# The generator is only constructed when there is work left to do.
if pending:
    transcription = TranscriptionGeneration()
    for filename, new_filename in pending:
        df = generate_data_from_book(filename, transcription)
        os.makedirs(os.path.dirname(new_filename), exist_ok=True)
        # Write to a sibling file and rename it into place, so an interrupted
        # write never leaves a truncated CSV that would look finished.
        partial_filename = new_filename + '.part'
        df.to_csv(partial_filename, index=False)
        os.replace(partial_filename, new_filename)

data/books/0d/3425
data/books/0d/3348
data/books/0d/3366
data/books/0d/3374
data/books/0d/3536
data/books/0d/3353
data/books/0d/3481
data/books/0d/3433
data/books/0d/3478
data/books/0d/3381
data/books/0d/3454
data/books/0d/3469
data/books/0d/3424
data/books/0d/3409
data/books/0d/3513
data/books/0d/3343
data/books/0d/3355
data/books/0d/3426
data/books/0d/3403
data/books/0d/3449
data/books/0d/3528
data/books/0d/3405
data/books/0d/3505
data/books/0d/3568
data/books/0d/3520
data/books/0d/3390
data/books/0d/3477
data/books/0d/3385
data/books/0d/3464
data/books/0d/3456
data/books/0d/3443
data/books/0d/3367
data/books/0d/3404
data/books/0d/3440
data/books/0d/3531
data/books/0d/3393
data/books/0d/3566
data/books/0d/3414
data/books/0d/3442
data/books/0d/3506
data/books/0d/3501
data/books/0d/3581
data/books/0d/3444
data/books/0d/3508
data/books/0d/3334
data/books/0d/3351
data/books/0d/3378
data/books/0d/3435
data/books/0d/3386
data/books/0d/3411
data/books/0d/3570
data/books/0d/3580
data/books/0

In [38]:
# Shell escape: report the total on-disk size of data/transcriptions
# (-s summary only, -c grand total line, -h human-readable units).
!du -sch data/transcriptions

16G	data/transcriptions
16G	total
