In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from grammar_ru.corpus import CorpusReader, CorpusBuilder
from diplom.utils.corpus_utils import CorpusFramework
from diplom.utils.dialog_markuper import DialogMarkupFeaturizer
from grammar_ru.features import SnowballFeaturizer
import matplotlib.pyplot as plt
# reported speech.

In [38]:
def make_merged_df(bundle, only_dialog = False):
    """Join a bundle's dialog markup with its source rows on ``word_id``.

    Args:
        bundle: object exposing ``src`` and ``dialog_markup`` DataFrames that
            both carry a ``word_id`` column.
        only_dialog: when True, drop rows whose ``dialog_token_type`` is
            ``'none'``.

    Returns:
        The merged DataFrame, optionally restricted to dialog rows.
    """
    # Merge against the bundle that was passed in. The previous version used a
    # notebook-level `chapter` here, so the result did not depend on `bundle.src`
    # (and raised NameError on a fresh kernel).
    merged = bundle.dialog_markup.merge(bundle.src, on='word_id')
    if only_dialog:
        return merged[merged.dialog_token_type != 'none']
    return merged

def get_chapter_bundle(author, series_id, chapter_id):
    """Look up a book by author and series id, then return one of its chapters.

    Both lookups go through the notebook-level ``corpus_framework``.

    Args:
        author: author value passed to ``get_book_by_author_and_series_id``.
        series_id: series id passed to the same lookup.
        chapter_id: chapter id passed to ``get_chapter_as_bundle``.

    Returns:
        Whatever ``corpus_framework.get_chapter_as_bundle`` returns for the
        selected book and chapter.
    """
    selected_book = corpus_framework.get_book_by_author_and_series_id(author, series_id)
    return corpus_framework.get_chapter_as_bundle(selected_book, chapter_id)

In [3]:
# Path of the corpus archive used by the rest of the notebook.
path_corpus = Path("./data/corpora/diplom.featurized.zip")
# Reader over that archive; later cells call `corpus.get_toc()` on it.
corpus = CorpusReader(path_corpus)
# Framework wrapper used for the book / chapter lookups below.
corpus_framework = CorpusFramework(corpus)

In [36]:
# Distinct author values from the corpus table of contents.
authors = corpus.get_toc().author.unique()
# NOTE(review): the unpacking assumes exactly three authors, in this order — confirm against the TOC.
djordan, rowling, martin = authors
# Columns shown when inspecting dialog markup.
info_columns = [
    'word_id',
    'sentence_id',
    'word',
    'dialog_token_type',
    'dialog_id',
]

In [41]:
# Load the chapter with series_id 1 and chapter_id 1 for each of the three authors.
djordan_chapter, rowling_chapter, martin_chapter = (
    get_chapter_bundle(writer, 1, 1) for writer in (djordan, rowling, martin)
)

In [44]:
# Merged markup + source rows for the Jordan chapter. The call previously passed
# `rowling_chapter`, which did not match the variable the result is stored in.
djordan_dialog = make_merged_df(djordan_chapter)

In [46]:
# NOTE(review): `author` is not assigned in any visible cell (only `authors`, `djordan`,
# `rowling` and `martin` are) — presumably left over from earlier kernel state; confirm
# which author is meant before relying on this cell after a restart.
book = corpus_framework.get_book_by_author_and_series_id(author,0)
# Chapter with id 1 of that book; later inspection cells read `chapter.src`.
chapter = corpus_framework.get_chapter_as_bundle(book, 1)

In [25]:
# Run the dialog markup featurizer over the corpus.
# NOTE(review): the destination below is the same file that `path_corpus` points to in the
# loading cell, so this call reads and writes one archive — confirm that is intended.
CorpusBuilder.featurize_corpus(
    path_corpus,
    # Plain string: the previous f-string had no placeholders.
    Path("./data/corpora/diplom.featurized.zip"),
    [
        # Quote characters that delimit dialog; the left curly quote was listed twice.
        DialogMarkupFeaturizer(['"', '“', '”']),
    ]
)

In [2]:
# Run SnowballFeaturizer over the dialog-featurized archive and write the result
# to a separate ".complete" archive.
path_to_dialog = Path("./data/corpora/diplom.featurized.zip")
CorpusBuilder.featurize_corpus(
    path_to_dialog,
    Path("./data/corpora/diplom.complete.featurized.zip"),
    [SnowballFeaturizer(language="eng")],
)

In [46]:
# Frequency of each punctuation token.
# NOTE(review): `src` is not assigned at notebook level in the visible cells — confirm its origin.
src[src.word_type == 'punct'].word.value_counts()

In [None]:
# Rows whose dialog_id lies strictly between 40 and 50, info columns only.
merged.loc[
    merged.dialog_id.gt(40) & merged.dialog_id.lt(50),
    info_columns,
]

In [6]:
def test_all_book_and_chapters(corpus, dialog_dashes, n_books=15):
    """Run the dialog markup on every chapter of the first author's books and check it.

    Args:
        corpus: corpus reader; its table of contents supplies the author list
            and it backs the framework used for book / chapter lookups.
        dialog_dashes: quote characters handed to ``DialogMarkupFeaturizer``.
        n_books: number of series ids (``0 .. n_books - 1``) to visit;
            defaults to the previously hard-coded 15.

    For each chapter the markup is merged with the source rows on ``word_id``
    and passed to ``check_to_monotonness``.
    NOTE(review): ``check_to_monotonness`` is not defined in the visible
    cells — it must exist in the kernel before this function is called.
    """
    # Build the framework from the `corpus` argument. The previous version took the
    # author from `corpus` but the books from the notebook-level `corpus_framework`,
    # so passing a different corpus silently mixed two data sources.
    framework = CorpusFramework(corpus)
    first_author = corpus.get_toc().author.unique()[0]
    for book_id in tqdm(range(n_books)):
        book = framework.get_book_by_author_and_series_id(first_author, book_id)
        for chapter_id in range(len(book)):
            chapter = framework.get_chapter_as_bundle(book, chapter_id)
            # A fresh featurizer per chapter, as before.
            DialogMarkupFeaturizer(dialog_dashes).featurize(chapter)
            merged = chapter.dialog_markup.merge(chapter.src, on='word_id')
            check_to_monotonness(merged)
            print(f"\n b_id: {book_id}, c_id: {chapter_id} --------------------------\n")

In [None]:
# Check every chapter using the straight and both curly double quotes as dialog delimiters.
test_all_book_and_chapters(corpus, ['"', '“', '”'])

In [23]:
# NOTE(review): neither `check_to_monotonness` nor a notebook-level `merged` is defined
# in the visible cells — presumably both come from earlier kernel state; confirm.
check_to_monotonness(merged)

In [12]:
# Print whatever `show_as_text` returns for `dialog_only`.
# NOTE(review): in the visible cells `dialog_only` is only assigned inside
# test_all_book_and_chapters, not at notebook level — confirm where it comes from.
print(corpus_framework.show_as_text(dialog_only))

In [9]:
# Display the `src` attribute of the current `chapter` bundle.
chapter.src

In [19]:
# Working copy of the bundle's source rows.
# NOTE(review): `db` is only created in a cell further down the notebook — run that cell first.
df = db.src.copy()

# Markup columns added to the frame, and the characters treated as dialog quotes.
dialog_columns = ['dialog_type', 'dialog_id', 'dialog_token_type']
dialog_dash = ['"', '“', '”']

# Every row starts as plain text with dialog id 0 and no token type.
df[dialog_columns] = 'text', 0, 'none'

# State carried across rows by the markup loop.
in_dialog = False
dialog_id = 0
prev_paragraph_id = df.paragraph_id.min()
prev_paragraph_dialog_id = 0
dialog_buffer = []


def dialog_trigger(df, buffer, id, dialog_id, in_dialog):
    """Handle a quote character: flush buffered rows, mark the quote, flip the state.

    Args:
        df: frame holding the ``dialog_type`` / ``dialog_id`` /
            ``dialog_token_type`` columns; modified in place.
        buffer: pairs of (row label, token type) collected since the dialog
            opened; each is written as ``'dialog'`` with the incoming
            ``dialog_id`` (column list taken from the notebook-level
            ``dialog_columns``).
        id: row label of the quote character itself.
        dialog_id: current dialog counter.
        in_dialog: whether the quote was met inside a dialog.

    Returns:
        ``(dialog_id + 2, not in_dialog, [])`` — the quote row itself receives
        ``dialog_id + 1``, and the empty list replaces the flushed buffer.
    """
    # Write out everything buffered so far under the incoming dialog id.
    for row_label, token_type in buffer:
        df.loc[row_label, dialog_columns] = ('dialog', dialog_id, token_type)

    # The quote character gets an id of its own, between the surrounding parts.
    quote_dialog_id = dialog_id + 1
    df.loc[id, 'dialog_token_type'] = 'dialog-dash'
    df.loc[id, 'dialog_type'] = 'dialog'
    df.loc[id, 'dialog_id'] = quote_dialog_id

    return quote_dialog_id + 1, not in_dialog, []


# Walk the rows in order and mark up quoted speech.
# Outside a dialog, a quote character opens one; inside, it closes the dialog and the
# buffered rows are flushed as speech / dialog symbols. A paragraph that ends with an
# open quote is flagged as 'wrong'.
for row in df.itertuples():
    word, word_type, word_id = row.word, row.word_type, row.word_id
    sentence_id, paragraph_id = row.sentence_id, row.paragraph_id

    if prev_paragraph_id != paragraph_id:
        if in_dialog:
            # The previous paragraph ended with an open quote: flag the whole paragraph
            # and roll the dialog counter back to the value it had when that paragraph began.
            in_dialog = False
            df.loc[df.paragraph_id == prev_paragraph_id, dialog_columns] = 'wrong', prev_paragraph_dialog_id + 1, 'none'
            dialog_id = prev_paragraph_dialog_id + 2
            # Rows buffered inside the broken paragraph must not be flushed later.
            dialog_buffer = []
        # Always advance the paragraph markers. Previously this happened only when no
        # dialog was open, so a paragraph that started with a quote right after a
        # 'wrong' paragraph had its opening quote discarded on the next row.
        prev_paragraph_dialog_id = dialog_id
        prev_paragraph_id = paragraph_id

    if in_dialog:
        if word in dialog_dash:  # exit from dialog
            dialog_id, in_dialog, dialog_buffer = dialog_trigger(df, dialog_buffer, row.Index, dialog_id, in_dialog)
            # Unmarked rows of the same sentence around the quote become 'action':
            # rows before it get the id in use before the dialog opened (dialog_id - 4),
            # rows after it get the new current id.
            df.loc[(df.sentence_id == sentence_id) & (df.word_id < word_id) & (df.dialog_token_type == 'none'), dialog_columns] = 'dialog', dialog_id - 4, 'action'
            df.loc[(df.sentence_id == sentence_id) & (df.word_id > word_id) & (df.dialog_token_type == 'none'), dialog_columns] = 'dialog', dialog_id, 'action'
        else:
            dialog_token_type = 'dialog-symbol' if word_type == 'punct' else 'speech'
            # Buffer the index label rather than word_id: dialog_trigger writes through
            # df.loc, which is label-based, just like the quote row passed as row.Index.
            dialog_buffer.append((row.Index, dialog_token_type))
    else:
        if word in dialog_dash:  # enter a dialog
            dialog_id, in_dialog, dialog_buffer = dialog_trigger(df, [], row.Index, dialog_id, in_dialog)
        else:
            df.loc[row.Index, 'dialog_id'] = dialog_id

# The last paragraph has no following row to trigger the boundary check above,
# so an open quote there is flagged here in the same way.
if in_dialog:
    in_dialog = False
    df.loc[df.paragraph_id == prev_paragraph_id, dialog_columns] = 'wrong', prev_paragraph_dialog_id + 1, 'none'
    dialog_id = prev_paragraph_dialog_id + 2
    dialog_buffer = []

In [20]:
# Display the frame annotated by the markup loop.
# NOTE(review): this renders the whole frame; `df.head()` may be enough for a quick look.
df

In [65]:
# Per sentence, count the rows whose word is one of the quote characters.
(
    df.loc[df.word.isin(dialog_dash), info_columns]
    .groupby('sentence_id')
    .count()
)

In [66]:
# Columns shown in the debugging slices below.
debug_columns = [
    'paragraph_id',
    'word_id',
    'sentence_id',
    'word',
    'dialog_token_type',
    'dialog_id',
    'dialog_type',
]

In [71]:
# Rows of paragraphs 20480306 through 20480308 (both ends included).
df.loc[df.paragraph_id.between(20480306, 20480308), debug_columns]

In [67]:
# Rows that the markup loop flagged as 'wrong'.
df.loc[df.dialog_type.eq('wrong'), debug_columns]

In [61]:
# Sentence 20480824 together with its immediate neighbours (ids one below and one above).
df.loc[df.sentence_id.between(20480824 - 1, 20480824 + 1), debug_columns]

In [48]:
# Window around word 20483751: ids strictly above (id - 30) and strictly below (id + 100).
df.loc[
    df.word_id.gt(20483751 - 30) & df.word_id.lt(20483751 + 100),
    info_columns,
]

In [2]:
# Two-line sample: the quote opened on the first line is never closed before the
# newline, while the second line contains a properly closed quote.
s = 'John says: "Hello, how are you? and gone.\n "Have a nice day", he said.'

In [3]:
from grammar_ru.common import Separator, DataBundle
# Separate the sample string and wrap the result in a bundle as its `src`.
fd = Separator.separate_string(s)
db = DataBundle(src=fd)
# Run the dialog markup with the straight double quote as the only quote character.
featurizer = DialogMarkupFeaturizer(['"'])
featurizer.featurize(db)

In [4]:
# Display the bundle's `dialog_markup` — presumably filled in by `featurizer.featurize(db)`; confirm.
db.dialog_markup