# Text fragmentation

## Overview
This demo shows text fragmentation and post-processing of retells with a parallel corpus:
- text fragmentation with localization (English, Russian)
- post-processing of model retells
- translating English retells into Russian
- writing valuable data to the parallel corpus

#### Text fragmentation with localization (English, Russian)
The FragmentsBuilder parses the text from the book corpus into a JSON file containing:
- the prompt
- text fragments limited to a preset number of words
- some info about each text frame (such as the first and last word ids in the book corpus, and the chapter).

First of all, build the book corpus from the md file.

In [2]:
from tg.grammar_ru.corpus import CorpusBuilder
from tg.grammar_ru.corpus import CorpusReader
from pathlib import Path

In [None]:
# Build the English book corpus archive.
# NOTE(review): the argument roles (corpus archive, source folder, label list)
# are inferred from the values passed — confirm against CorpusBuilder.
# NOTE: the 'puhishment' spelling of the archive name is reused by later cells,
# so it has to stay consistent across the notebook.
CorpusBuilder.convert_interformat_folder_to_corpus(
    Path('./files/eng_crime_and_puhishment.base.zip'),
    Path('./source/book/eng'),
    ['book']
)

In [None]:
# Build the Russian book corpus archive.
# NOTE(review): the argument roles (corpus archive, source folder, label list)
# are inferred from the values passed — confirm against CorpusBuilder.
CorpusBuilder.convert_interformat_folder_to_corpus(
    Path('./files/ru_crime_and_puhishment.base.zip'),
    Path('./source/book/ru'),
    ['book']
)

In [3]:
# Open the English book corpus and keep the index of its toc.
eng_book_reader = CorpusReader(Path('./files/eng_crime_and_puhishment.base.zip'))
eng_book = eng_book_reader.get_toc().index

# Same for the Russian book corpus.
ru_book_reader = CorpusReader(Path('./files/ru_crime_and_puhishment.base.zip'))
ru_book = ru_book_reader.get_toc().index

Fragmentation of the English and Russian books.

In [4]:
from tg.ca.book_fragments.fragments_builder import FragmentsBuilder
from tg.ca.book_fragments.localizators.ru_localizator import RussianLocalizator
from tg.ca.book_fragments.localizators.eng_localizator import EnglishLocalizator

In [None]:
# Corpus archive holding the English book.
eng_corpus = Path('./files/eng_crime_and_puhishment.base.zip')

# Configure fragmentation of the English book. No `prompt` is passed here,
# so FragmentsBuilder uses whatever default it defines.
eng_fragments_builder = FragmentsBuilder(
    eng_corpus, 
    output_path='./files/fragments', 
    file_name="eng_crime_and_punishment_fragments", 
    localizator=EnglishLocalizator()
)

# Produce the fragments JSON.
# NOTE(review): presumably written to
# ./files/fragments/eng_crime_and_punishment_fragments.json, the path read by
# the retell post-processing cell — confirm.
eng_fragments_builder.construct_fragments_json()

In [None]:
# Corpus archive holding the Russian book.
ru_corpus = Path('./files/ru_crime_and_puhishment.base.zip')

# Configure fragmentation of the Russian book.
# NOTE(review): prompt='{}' is a bare placeholder — presumably the fragment
# text is substituted without any surrounding instruction text; confirm.
ru_fragments_builder = FragmentsBuilder(
    ru_corpus, 
    output_path='./files/fragments', 
    file_name="ru_crime_and_punishment_fragments", 
    localizator=RussianLocalizator(),
    prompt='{}'
)

# Produce the fragments JSON.
ru_fragments_builder.construct_fragments_json()

#### Model retells post-processing

The model returns a JSON file with retells and some log info. The retells are then cleaned and prettified,
and the prepared texts are parsed into the corresponding corpora.

In [5]:
from tg.ca.book_fragments.utils.parse_retells_to_corpus import parse_retells_to_corpus

In [6]:
# Post-process the English retells into a corpus archive.
# NOTE(review): the three paths look like (fragments JSON, model retells JSON,
# output corpus archive) — confirm against parse_retells_to_corpus.
parse_retells_to_corpus(
    Path('./files/fragments/eng_crime_and_punishment_fragments.json'),
    Path('./source/retell/eng/eng_crime_and_punishment_fragments.json'),
    Path('./files/eng_crime_and_punishment_retell.base.zip'),
)

In [7]:
# Post-process the Russian retells into a corpus archive.
# NOTE(review): the three paths look like (fragments JSON, model retells JSON,
# output corpus archive) — confirm against parse_retells_to_corpus.
parse_retells_to_corpus(
    Path('./files/fragments/ru_crime_and_punishment_fragments.json'),
    Path('./source/retell/ru/ru_crime_and_punishment_fragments.json'),
    Path('./files/ru_crime_and_punishment_retell.base.zip'),
)

In [None]:
# Open the Russian retell corpus, keep its toc index, and display the first frame.
ru_retell_reader = CorpusReader(Path('./files/ru_crime_and_punishment_retell.base.zip'))
ru_retell = ru_retell_reader.get_toc().index
ru_retell_reader.get_frames().first()

In [None]:
# Open the English retell corpus, keep its toc index, and display the toc.
eng_retell_reader = CorpusReader(Path('./files/eng_crime_and_punishment_retell.base.zip'))
eng_retell = eng_retell_reader.get_toc().index
eng_retell_reader.get_toc()

Helper functions for conveniently assembling the parallel corpus.

In [9]:
import pandas as pd

def add_relation(df_1, df_2, name_1, name_2):
    """Build a two-way relation table between two collections of file ids.

    Despite their names, ``df_1`` and ``df_2`` are used as column values
    (this notebook passes toc indexes), and entries are paired by position.

    Args:
        df_1: File ids of the first subcorpus.
        df_2: File ids of the second subcorpus.
        name_1: Name of the first subcorpus.
        name_2: Name of the second subcorpus.

    Returns:
        A DataFrame with columns ``file_1``, ``file_2`` and ``relation_name``.
        The first half holds the ``"{name_1}_{name_2}"`` rows and the second
        half the mirrored ``"{name_2}_{name_1}"`` rows. Row labels of both
        halves are kept as-is, so they repeat.
    """
    directions = (
        (df_1, df_2, name_1, name_2),
        (df_2, df_1, name_2, name_1),
    )
    halves = [
        pd.DataFrame({
            'file_1': source_ids,
            'file_2': target_ids,
            'relation_name': f"{source_name}_{target_name}",
        })
        for source_ids, target_ids, source_name, target_name in directions
    ]
    return pd.concat(halves)

def add_dfs(name):
    """Map each toc index entry of a corpus reader to its frame.

    Args:
        name: A reader object exposing ``get_frames()`` and ``get_toc()``
            (this notebook passes ``CorpusReader`` instances).

    Returns:
        A dict whose keys come from ``name.get_toc().index`` and whose values
        are the items of ``name.get_frames()``, paired by position.
    """
    # Materialise the frames before reading the toc, in that order.
    loaded_frames = list(name.get_frames())
    toc_ids = name.get_toc().index
    return {toc_id: frame for toc_id, frame in zip(toc_ids, loaded_frames)}

Record the English and Russian book corpora.

In [None]:
# Open the English book corpus and display its toc.
eng_book_reader = CorpusReader(Path('./files/eng_crime_and_puhishment.base.zip'))
eng_book_reader.get_toc()

In [None]:
# Open the Russian book corpus and display its toc.
ru_book_reader = CorpusReader(Path('./files/ru_crime_and_puhishment.base.zip'))
ru_book_reader.get_toc()

In [16]:
# Record both book corpora in the parallel corpus, English first, then Russian.
# None is passed as the last argument: no relation table is supplied for the books.
for book_reader, subcorpus_name in (
    (eng_book_reader, "eng_book"),
    (ru_book_reader, "ru_book"),
):
    CorpusBuilder.update_parallel_data(
        Path('./files/parallel_corpus.zip'),
        add_dfs(book_reader),
        subcorpus_name,
        None,
    )

Record the English and Russian retells and add their relations to the books.

In [14]:
# Open the English retell corpus and display its toc.
eng_retell_reader = CorpusReader(Path('./files/eng_crime_and_punishment_retell.base.zip'))
eng_retell_reader.get_toc()

Unnamed: 0_level_0,filename,timestamp,part_index,token_count,character_count,ordinal,max_id
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c5e4cd7a-de31-4edc-b516-80919ca28e2e,,2024-01-15 15:49:18.101647,0,1111,4745,0,1112
39682f5e-2cc2-4508-a03c-11c8c4569c31,,2024-01-15 15:49:18.257646,1,1871,8318,1,12984
6e2ffc99-2d23-4379-9bcf-ef65409e71b5,,2024-01-15 15:49:18.350646,2,768,3470,2,23753
c34ace1a-9718-44f6-aacf-b27897e1fd16,,2024-01-15 15:49:18.509646,3,1606,7099,3,35360
708c8fbb-585b-419f-8b04-d7966861aff3,,2024-01-15 15:49:18.666646,4,1308,5539,4,46669
fbc04c3f-6faa-485f-b81f-8a6601df3d73,,2024-01-15 15:49:18.848647,5,1724,7486,5,58394
ba9c9c95-82ba-44d8-b3a8-7e49def0276f,,2024-01-15 15:49:19.016647,6,1391,5820,6,69786
ff12d4c6-addd-4caf-b9ff-5f5f995dda34,,2024-01-15 15:49:19.330647,7,2304,10145,7,82091
0487b31a-a58f-4bd5-84bd-96b94f6b78e7,,2024-01-15 15:49:19.492646,8,1466,6362,8,93558
7bb1aeb1-2d5f-4e72-880a-5963a197c01d,,2024-01-15 15:49:19.639647,9,1904,8681,9,105463


In [15]:
# Open the Russian retell corpus and display its toc.
ru_retell_reader = CorpusReader(Path('./files/ru_crime_and_punishment_retell.base.zip'))
ru_retell_reader.get_toc()

Unnamed: 0_level_0,filename,timestamp,part_index,token_count,character_count,ordinal,max_id
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
93929f1a-7900-44f6-8509-49b4bacb06ae,,2024-01-15 15:49:25.760659,0,724,3811,0,725
00f00596-9959-4246-961f-5ec9da9605c1,,2024-01-15 15:49:25.961645,1,2870,14396,1,13596
caecc37d-c1d8-4dac-a1bb-0f5f4b971b2d,,2024-01-15 15:49:26.063645,2,1030,5133,2,24627
4f6e402f-0399-43b0-9bb8-837dc1783b71,,2024-01-15 15:49:26.169646,3,1352,7025,3,35980
2999de05-28f9-49a2-a661-f82809975d21,,2024-01-15 15:49:26.293647,4,1392,6940,4,47373
2845cb14-8659-4f5b-bb77-22f7223fb6bd,,2024-01-15 15:49:26.492647,5,1697,9290,5,59071
cec6f89a-acf8-44e0-9555-cc24030b29c6,,2024-01-15 15:49:26.636645,6,1733,8433,6,70805
2e5a649a-6121-4878-ba53-ae2ad2809830,,2024-01-15 15:49:26.810645,7,2501,12361,7,83307
54bbed2a-233d-427a-b0cc-b7b0346d6ab1,,2024-01-15 15:49:26.922647,8,1371,6851,8,94679
f47b979b-567d-4699-b28e-9b0c3332e239,,2024-01-15 15:49:27.068645,9,1656,8291,9,106336


In [17]:
# Record the English retells together with their relation to the English book.
# add_relation pairs the two toc indexes by position, so both tocs are assumed
# to list the same number of parts in matching order — TODO confirm.
CorpusBuilder.update_parallel_data(
    Path('./files/parallel_corpus.zip'),
    add_dfs(eng_retell_reader),
    "eng_retell",
    add_relation(eng_book_reader.get_toc().index, eng_retell_reader.get_toc().index, "eng_book", "eng_retell"))

# Same for the Russian retells and the Russian book.
CorpusBuilder.update_parallel_data(
    Path('./files/parallel_corpus.zip'),
    add_dfs(ru_retell_reader),
    "ru_retell",
    add_relation(ru_book_reader.get_toc().index, ru_retell_reader.get_toc().index, "ru_book", "ru_retell"))

  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_w

In [19]:
# Same archive the update_parallel_data calls above write to.
path_parallel_corpus = Path('./files/parallel_corpus.zip')

In [20]:
!pip install googletrans==3.1.0a0



Add data with the translated English retell.

In [21]:
from tg.ca.utils_translate import translate_subcorpus

# Translate the "eng_retell" subcorpus of the parallel corpus.
# NOTE(review): the toc printed at the end of the notebook lists an extra
# 'ru_translate' subcorpus, presumably added by this call — confirm.
translate_subcorpus(path_parallel_corpus,"eng_retell")

  0%|          | 0/1 [00:00<?, ?it/s]

  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_w

Finally, show the contents of the parallel corpus.

In [24]:
# Read the parallel corpus toc and list the distinct subcorpus names in it.
reader = CorpusReader(path_parallel_corpus)
df = reader.get_toc()
df.subcorpus_name.unique()

array(['eng_book', 'ru_book', 'eng_retell', 'ru_retell', 'ru_translate'],
      dtype=object)

Clean up the intermediate corpora

In [26]:
import os
from pathlib import Path

# The two book corpora are left in place (their removal is commented out);
# uncomment the lines below to delete them as well.
# os.remove(Path('./files/eng_crime_and_puhishment.base.zip'))
# os.remove(Path('./files/ru_crime_and_puhishment.base.zip'))

# Archives to delete. NOTE(review): translate.base.zip is presumably written by
# translate_subcorpus — confirm.
leftover_archives = (
    './files/eng_crime_and_punishment_retell.base.zip',
    './files/ru_crime_and_punishment_retell.base.zip',
    './files/translate.base.zip',
)

for archive in leftover_archives:
    try:
        os.remove(Path(archive))
    except FileNotFoundError:
        # Already gone (e.g. the cell was run twice): skip it so the
        # remaining archives are still cleaned up.
        pass