# Seq2Vec Overview

In [1]:
from tg.grammar_ru.corpus import ParallelCorpus, CorpusReader
from tg.grammar_ru.common.separator.df_viewer import DfViewer
from tg.ca import Seq2VecMatcher
import pandas as pd
from pathlib import Path

### Now we can use the `ParallelCorpus` class to access the various subcorpora.

In [2]:
# Location of the zipped corpus, relative to the current working directory.
corpus_archive = Path('./files/parallel_corpus.zip')
# Wrap the archive so its sub-corpora can be reached through one object.
parallel_corpus = ParallelCorpus(corpus_archive)

In [3]:
# Reader for the `ru_book` sub-corpus of the parallel corpus.
book_reader: CorpusReader = parallel_corpus.ru_book
# Take the first table-of-contents entry by position. `.iloc[0]` does not depend
# on the index labels of the TOC, whereas `[0]` is a label lookup whose positional
# fallback is deprecated in pandas (assumes `filename` is a pandas Series - confirm).
# NOTE: the variable name carries a typo ("firts"); it is kept as is because a
# later cell reads this exact name.
book_firts_chapter_id = book_reader.get_toc().filename.iloc[0]
# Request the frames for that single chapter and keep the first one returned.
book_chapter_df = book_reader.get_frames([book_firts_chapter_id]).first()

### Now we know the ID of the first chapter in the book.

### We can use the parallel corpus mapping to get the first chapter of the retelling.

In [4]:
# Query the data mapped to the selected book chapter, restricted to the
# 'ru_book_ru_retell' relation.
mapped_chapters = parallel_corpus.get_mapped_data(
    [book_firts_chapter_id],
    ['ru_book_ru_retell'],
)
# Keep the first mapped result and pick the entry stored under that relation.
retell_chapter_df = mapped_chapters.first()['ru_book_ru_retell']

### Finally, we can see which sentences from the book and the retelling match.

#### For this task we will use the `get_matches` method of the `Seq2VecMatcher` class.

In [5]:
# Build the matcher with its default settings.
matcher = Seq2VecMatcher()
# Match the book chapter against the retell chapter; with need_matching_df=True
# the call yields two results, unpacked below.
match_result = matcher.get_matches(
    book_chapter_df,
    retell_chapter_df,
    need_matching_df=True,
)
matched_ids, matched_sentences = match_result

### Now we can see the matched sentences.

In [6]:
# Attach the match information to every row of the book chapter. The left join
# keeps book rows even when `matched_ids` has no entry for their sentence.
df_to_display = book_chapter_df.merge(matched_ids, on='sentence_id', how='left')

# Build one string per 'MatchedWith' value and turn the result into a frame
# so it can be merged back.
matched_sentence_strings = matcher.viewer.to_sentences_strings(df_to_display, 'MatchedWith').to_frame()

# Final frame: the book rows, each carrying the string built for its 'MatchedWith' value.
sentences_to_display = df_to_display.merge(matched_sentence_strings, on='MatchedWith', how='left')

In [7]:
# Show the merged frame as this cell's output (long frames are rendered truncated).
sentences_to_display

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length,file_id,corpus_id,MatchedWith,word_print
0,0,0,0,0,1,В,ru,1,5db7a05d-3e68-49d1-ac12-dcea52df2264,parallel_corpus.zip,631302,"В начале июля, в чрезвычайно жаркое время, под..."
1,1,0,1,0,1,начале,ru,6,5db7a05d-3e68-49d1-ac12-dcea52df2264,parallel_corpus.zip,631302,"В начале июля, в чрезвычайно жаркое время, под..."
2,2,0,2,0,0,июля,ru,4,5db7a05d-3e68-49d1-ac12-dcea52df2264,parallel_corpus.zip,631302,"В начале июля, в чрезвычайно жаркое время, под..."
3,3,0,3,0,1,",",punct,1,5db7a05d-3e68-49d1-ac12-dcea52df2264,parallel_corpus.zip,631302,"В начале июля, в чрезвычайно жаркое время, под..."
4,4,0,4,0,1,в,ru,1,5db7a05d-3e68-49d1-ac12-dcea52df2264,parallel_corpus.zip,631302,"В начале июля, в чрезвычайно жаркое время, под..."
...,...,...,...,...,...,...,...,...,...,...,...,...
3238,3238,183,4,49,1,будто,ru,5,5db7a05d-3e68-49d1-ac12-dcea52df2264,parallel_corpus.zip,631326,Но никто не разделял его счастия; молчаливый т...
3239,3239,183,5,49,1,в,ru,1,5db7a05d-3e68-49d1-ac12-dcea52df2264,parallel_corpus.zip,631326,Но никто не разделял его счастия; молчаливый т...
3240,3240,183,6,49,1,некотором,ru,9,5db7a05d-3e68-49d1-ac12-dcea52df2264,parallel_corpus.zip,631326,Но никто не разделял его счастия; молчаливый т...
3241,3241,183,7,49,0,волнении,ru,8,5db7a05d-3e68-49d1-ac12-dcea52df2264,parallel_corpus.zip,631326,Но никто не разделял его счастия; молчаливый т...


In [8]:
# Configure a viewer with 'auto' highlighting on 'word_print' and a tooltip
# taken from 'MatchedWith'.
match_viewer = DfViewer().highlight('word_print', 'auto').tooltip('MatchedWith')
# Render the merged frame as HTML; kept as the last expression so its value is the cell output.
match_viewer.to_html_display(sentences_to_display)