In [22]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from grammar_ru.corpus import CorpusReader, CorpusBuilder
from diplom.utils.corpus_utils import CorpusFramework
from diplom.utils.dialog_markuper import DialogMarkupFeaturizer
from diplom.utils.speech_action_maker import SpeechActionFeaturizer
import matplotlib.pyplot as plt
from torch.nn.functional import cosine_similarity
from collections import defaultdict
import torch
# Reported speech.

In [3]:
# Corpus archive passed as the first argument to CorpusBuilder.featurize_corpus
# in the next cell (presumably the source corpus — confirm against grammar_ru).
# NOTE: `path_corpus` is rebound to a different archive further down the notebook.
path_corpus = Path("./data/corpora/diplom.complete.featurized.zip")

In [4]:
# Run SpeechActionFeaturizer over the corpus at `path_corpus`. The second
# argument is the archive that CorpusReader opens later in this notebook.
# The list handed to SpeechActionFeaturizer is presumably the set of quote
# characters that delimit speech — confirm against the featurizer.
# NOTE(review): '“' is listed twice; one entry was probably meant to be a
# different quote character. Left unchanged because the featurizer is not
# visible from here.
CorpusBuilder.featurize_corpus(
    path_corpus,
    Path("./data/corpora/diplom.wow.zip"),  # plain literal: the f-prefix had no placeholders
    [
        SpeechActionFeaturizer(['"','“','”','“']),
    ]
)

2024-04-26 17:15:08.417266 INFO: Processing b6266398-300c-438f-b00a-6393b6556320 at #0, total 1395 
2024-04-26 17:15:08.425936 INFO: <class 'diplom.utils.speech_action_maker.SpeechActionFeaturizer'>
2024-04-26 17:15:08.507232 INFO: Processing c4f1c04c-bf42-431b-9224-7cc574d2424b at #1, total 1395 
2024-04-26 17:15:08.510875 INFO: <class 'diplom.utils.speech_action_maker.SpeechActionFeaturizer'>
2024-04-26 17:15:08.814850 INFO: Processing 222df8be-bd7b-48fa-a1a4-f8bce8a797da at #2, total 1395 
2024-04-26 17:15:08.819922 INFO: <class 'diplom.utils.speech_action_maker.SpeechActionFeaturizer'>
2024-04-26 17:15:09.513247 INFO: Processing 8243bdb4-db7a-4cb4-bf4c-856a1422ad29 at #3, total 1395 
2024-04-26 17:15:09.515820 INFO: <class 'diplom.utils.speech_action_maker.SpeechActionFeaturizer'>
2024-04-26 17:15:10.265221 INFO: Processing 261f3623-53b7-488b-b308-18251b5013b8 at #4, total 1395 
2024-04-26 17:15:10.266194 INFO: <class 'diplom.utils.speech_action_maker.SpeechActionFeaturizer'>
2024-

In [8]:
def make_merged_df(bundle, only_dialog=False):
    """Join a bundle's dialog markup with its source frame on ``word_id``.

    Args:
        bundle: object exposing ``src`` and ``dialog_markup`` DataFrames that
            both carry a ``word_id`` column.
        only_dialog: when truthy, keep only the rows whose
            ``dialog_token_type`` is not ``'none'``.

    Returns:
        The merged DataFrame (markup columns first), optionally filtered.
    """
    tokens = bundle.src
    markup = bundle.dialog_markup
    joined = markup.merge(tokens, on='word_id')
    if not only_dialog:
        return joined
    is_dialog_token = joined.dialog_token_type != 'none'
    return joined[is_dialog_token]


def get_chapter_bundle(corpus_framework, author, series_id, chapter_id):
    """Fetch one chapter of a book as a bundle.

    The book is looked up via
    ``corpus_framework.get_book_by_author_and_series_id(author, series_id)``
    and the chapter via
    ``corpus_framework.get_chapter_as_bundle(book, chapter_id)``.

    Returns:
        Whatever ``get_chapter_as_bundle`` returns for that book and chapter.
    """
    return corpus_framework.get_chapter_as_bundle(
        corpus_framework.get_book_by_author_and_series_id(author, series_id),
        chapter_id,
    )

In [9]:
# Open the featurized corpus archive and wrap the reader in the project's
# CorpusFramework helper used by the cells below.
# NOTE: this rebinds `path_corpus`, which earlier pointed at the input archive
# of the featurization step.
path_corpus = Path("./data/corpora/diplom.wow.zip")  # plain literal: the f-prefix had no placeholders
corpus = CorpusReader(path_corpus)
corpus_framework = CorpusFramework(corpus)

In [10]:
# Distinct author names taken from the corpus table of contents.
authors = corpus.get_toc().author.unique()
# NOTE(review): this unpacking requires exactly three authors and relies on the
# order in which unique() returns them; it breaks or mislabels if the corpus changes.
djordan,rowling,martin = authors
# Column subset for inspecting merged frames. 'word_x' is presumably the
# suffixed column produced by a merge of two frames that both have 'word' — TODO confirm.
info_columns = ['word_id','sentence_id','word_x', 'dialog_token_type', 'dialog_id']

In [16]:
# Sample chapter used for the exploration below: series 1, chapter 5 of the
# author bound to `djordan`.
chapter = get_chapter_bundle(corpus_framework, djordan, series_id=1,chapter_id=5)

In [21]:
chapter.speech_action

Unnamed: 0,sample_id,speech,action
0,0,What sort of place is this?,sounded
1,1,"Your pardon, Master Gleeman,",said
2,2,That pretty little slip of a girl?,exclaimed
3,3,"Old news, even in Baerlon,",said
4,4,"Master Fain has come often to Emond’s Field, M...",said
5,5,"Thom Merrilin,",said
6,6,"Master … ah … Master Merrilin,",said
7,7,"Do I look like a peddler, boy?",grumbled
8,8,"We’ve all seen the Mire, too,",sound
9,9,As far as that?,murmured


In [48]:
# Count, per author, how many rows the `speech_action` frame holds across all
# chapters of all of that author's books.
example_count_by_authors = defaultdict(int)

for author in tqdm(authors):
    series_count = len(corpus_framework.get_books_by_author(author))
    # NOTE(review): the printed messages of this cell report missing books for
    # ids 0 and 2, so series ids do not seem to form a dense 0-based range.
    # Iterating range(series_count) may therefore skip existing books —
    # confirm against CorpusFramework and iterate over the real ids if so.
    for book_id in range(series_count):
        try:
            book = corpus_framework.get_book_by_author_and_series_id(author, book_id)
        except IndexError:
            print(f"Не нашлась книга {book_id} для {author}, проверить df")
            continue
        chapters_count = len(book)
        for chapter_id in range(chapters_count):
            # Use a dedicated name: binding `chapter` here would overwrite the
            # sample chapter bundle that later cells pass to make_merged_df.
            chapter_bundle = corpus_framework.get_chapter_as_bundle(book, chapter_id)
            example_count = len(chapter_bundle.speech_action)
            example_count_by_authors[author] += example_count

  0%|          | 0/3 [00:00<?, ?it/s]

Не нашлась книга 0 для dzhoan rouling, проверить df
Не нашлась книга 2 для dzhoan rouling, проверить df
Не нашлась книга 0 для dzhordzh martin, проверить df


In [49]:
example_count_by_authors

defaultdict(int,
            {'robert dzhordan': 19626,
             'dzhoan rouling': 11718,
             'dzhordzh martin': 6735})

In [12]:
# Merge the sample chapter's dialog markup with its source tokens.
df = make_merged_df(chapter)

# Take only the speech that comes right before the action.
# Use the verb from the action as the label.
# Produce a databundle — a dict of DataFrames.
# Introduce sample_id: the speech before the verb and the verb itself must share the same sample_id but live in different DataFrames (input, target).

In [18]:
# Columns kept when slicing dialog tokens out of `df` in the next cell.
# NOTE(review): the name is a typo for "dialog_info_columns"; it is kept because the next cell refers to it.
diaog_info_columns = ['word','dialog_token_type','word_id', 'dialog_id','sentence_id','paragraph_id','word_tail']

In [19]:
# Keep only the rows marked as dialog, restricted to the columns of interest.
# A single .loc call plus .copy() makes `dialogs` an independent frame, so the
# column assignment below does not write into a slice of `df`
# (which is what triggers pandas' SettingWithCopyWarning).
dialogs = df.loc[df.dialog_type == 'dialog', diaog_info_columns].copy()

# Append ' ' repeated `word_tail` times to every word, so that concatenating
# words later restores spacing. Presumably word_tail is an integer count of
# trailing spaces — TODO confirm.
dialogs['word'] = dialogs.word + dialogs.assign(space=' ').space * dialogs.word_tail

In [20]:
# Concatenate the words of every dialog_id into one string
# (summing a string column joins the strings); the result is a Series indexed by dialog_id.
t_dialog = dialogs.groupby('dialog_id').word.sum()

In [21]:
# Attach the concatenated text to (sentence_id, dialog_id) pairs and drop duplicate rows.
# NOTE(review): `text_dialog` is reassigned by the very next cell (without sentence_id),
# so this result is never used when the notebook runs top to bottom.
text_dialog = df[['sentence_id','dialog_id']].merge(t_dialog, on='dialog_id').drop_duplicates()[['word','sentence_id','dialog_id']]

In [22]:
# One row per distinct (text, dialog_id) pair: merge the concatenated text onto
# the dialog ids, keep only `word` and `dialog_id`, then drop duplicate rows.
# The retained index labels come from the merged frame, hence the gaps.
text_dialog = df[['sentence_id','dialog_id']].merge(t_dialog, on='dialog_id')[['word','dialog_id']].drop_duplicates()

In [23]:
# Quote characters treated as dialog separators (opening and closing curly quotes).
dialog_seps = ['“','”']

In [24]:
# Rows of `text_dialog` whose text, after stripping surrounding whitespace,
# is not one of the separator quote characters.
without_sep = text_dialog.loc[~text_dialog.word.str.strip().isin(dialog_seps)]

In [25]:
text_dialog

Unnamed: 0,word,dialog_id
0,“,1
1,What was that about?,2
6,”,3
7,he demanded as soon as he was in the hall that...,4
22,“,5
...,...,...
3191,”,451
3192,Rand answered with a laugh.,452
3213,“,453
3214,He has to come down sooner or later.,454


Nothing, really. I told Adan al’Caar and some of his snot-nosed friends—Ewin Finngar and Dag Coplin—that some farmers had seen ghost hounds, breathing fire and running through the woods. They ate it up like clotted cream.

In [26]:
# Rows whose text contains a literal period. regex=False matches '.' literally,
# which avoids the invalid '\.' escape sequence in a non-raw string literal.
without_sep.loc[without_sep.word.str.contains('.', regex=False)]

Unnamed: 0,word,dialog_id
7,he demanded as soon as he was in the hall that...,4
37,"Mat said, peering past Rand’s shoulder into th...",12
63,He cut off abruptly as Mistress al’Vere bustle...,16
180,"There is more of this in the kitchen, if you t...",18
233,"Mat said, smacking his lips.",24
...,...,...
3138,he said mournfully.,432
3151,"Rand said, and the younger boy brightened.",436
3168,Mat added.,440
3192,Rand answered with a laugh.,452


In [27]:
from collections import defaultdict

# Walk the rows of `text_dialog` (columns: word, dialog_id) with a two-state
# machine and collect (speech, action) pairs where the action text directly
# follows a quoted speech. The result is a dict of parallel lists with the
# keys "sample_id", "speech" and "action".
in_dialog = False
speech, action = None, None
sample_id = 0
ans_dict = defaultdict(list)
# index=False keeps the loop from binding `_`, which IPython uses for the last output;
# total= lets tqdm render a real progress bar for the generator.
for word, dialog_id in tqdm(text_dialog.itertuples(index=False), total=len(text_dialog)):
    # Strip in both branches: words carry their trailing whitespace, and the
    # original only stripped when closing a dialog, so a padded opening quote
    # would have been recorded as an action.
    is_separator = word.strip() in dialog_seps
    if in_dialog:
        if is_separator:
            in_dialog = False
        else:
            speech = word
    elif is_separator:
        # If an action came first and the speech only afterwards, the action is
        # erased here, so only actions that follow a speech are kept.
        in_dialog = True
        speech, action = None, None
    else:
        action = word
    # Both can be set only outside a dialog, because opening a dialog resets
    # speech and action.
    if speech is not None and action is not None:
        # TODO: filter on action length / truncate the action, and drop the
        # sample if it contains no synonym of "said", etc.
        ans_dict["sample_id"].append(sample_id)
        ans_dict["speech"].append(speech)
        ans_dict["action"].append(action)
        speech, action = None, None
        sample_id += 1

0it [00:00, ?it/s]

If I were you, I’d worry more about Alsbet Luhhan than about the blacksmith. She’s almost as strong, and her temper is a lot worse. No matter, though. If you walk fast, maybe he won’t notice you.

Setting his final cask in the racks, he wiped crumbs from his mouth while Mat was unburdening himself, then said,

In [28]:
# Turn the collected parallel lists into a frame with columns sample_id, speech, action.
ans_df = pd.DataFrame.from_dict(ans_dict)

In [29]:
ans_df.head(7)

Unnamed: 0,sample_id,speech,action
0,0,What was that about?,he demanded as soon as he was in the hall that...
1,1,"It’s old Luhhan,","Mat said, peering past Rand’s shoulder into th..."
2,2,I think he suspects I was the one who—,He cut off abruptly as Mistress al’Vere bustle...
3,3,"Honeycakes,","Mat said, smacking his lips."
4,4,"After,","Rand told him firmly,"
5,5,"Now,","Rand said, as they set their casks in the racks,"
6,6,"If I were you, I’d worry more about Alsbet Luh...","Setting his final cask in the racks, he wiped ..."


In [30]:
from utils.Embeders import SpacyEmbeder
# TODO: add more verbs to collect
# Reference verbs are read one per line from verbs.txt. The encoding is pinned
# so the result does not depend on the platform default, and blank lines are
# skipped so that no empty string is embedded as a "verb".
with open('verbs.txt', encoding='utf-8') as f:
    verbs = [line.rstrip() for line in f if line.strip()]
embeder = SpacyEmbeder()
# Stack the per-verb embeddings into one tensor, one row per reference verb.
verbs_emded = torch.stack([embeder.get_embedding(token) for token in verbs])

In [31]:
# Minimum cosine similarity to a reference verb for a sample to be kept
# (the filtering cells below compare with `> threshold`).
threshold = 0.8

In [32]:
import spacy
# Imported under the alias cos_sim so it does not clash with the
# torch.nn.functional.cosine_similarity imported at the top of the notebook.
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

# spaCy pipeline used below for POS tags and lemmas of the action text.
nlp = spacy.load('en_core_web_sm')

In [39]:
# For every (speech, action) sample, pick the verb of the action text whose
# lemma is most similar to any reference verb and keep the sample only if that
# similarity exceeds `threshold`. Per-lemma embeddings and best similarities
# are cached in `emebs` / `cos_sims_d`, so each lemma is embedded once.
emebs = dict()
cos_sims_d = dict()
new_ans = defaultdict(list)
all_cos_sims = []
for row in ans_df.itertuples():
    sample_id, speech, action = row.sample_id, row.speech, row.action
    verb_tokens = [token for token in nlp(action) if token.pos_ == 'VERB']
    if not verb_tokens:
        # No verb was tagged in this action: the original zip(*[]) unpacking
        # raised ValueError here. Such a sample has no label, so skip it.
        continue
    maxis = []
    for verb in verb_tokens:
        lemma = verb.lemma_
        if lemma not in emebs:
            emb = embeder.get_embedding(lemma)
            emebs[lemma] = emb
            # Best similarity of this lemma against all reference verbs.
            cos_sims_d[lemma] = np.max(cos_sim(torch.unsqueeze(emb, 0), verbs_emded))
        maxis.append(cos_sims_d[lemma])
    amax = np.argmax(maxis)
    if maxis[amax] > threshold:
        new_ans["sample_id"].append(sample_id)
        new_ans["speech"].append(speech)
        # Store the surface form of the verb as a plain string.
        new_ans["action"].append(verb_tokens[amax].orth_)
        

In [41]:
type(list(new_ans.items())[2][1][0])

str

In [85]:
pd.DataFrame.from_dict(new_ans)

Unnamed: 0,sample_id,speech,action
0,1,"It’s old Luhhan,",said
1,3,"Honeycakes,",said
2,5,"Now,",said
3,6,"If I were you, I’d worry more about Alsbet Luh...",said
4,7,"They’re the ones I meant to tell you about,",muttered
5,12,"Her name is Moiraine,",said
6,13,"She asked the Wisdom for directions this morning,",said
7,15,"That’s too long for me,",muttered
8,18,"For the last time,",barked
9,19,"Filthy carrion eater,",muttered


In [83]:
# Same filtering as the cached variant, but computed with one similarity matrix
# per sample: rows are the verbs of the action text, columns the reference verbs.
# `all_cos_sims` additionally records (best similarity, best lemma) per sample.
new_ans = defaultdict(list)
all_cos_sims = []
for row in ans_df.itertuples():
    sample_id, speech, action = row.sample_id, row.speech, row.action
    verb_tokens = [token for token in nlp(action) if token.pos_ == 'VERB']
    if not verb_tokens:
        # No verb was tagged in this action: the original zip(*[]) unpacking
        # raised ValueError here. Such a sample has no label, so skip it.
        continue
    lemma_embed = [token.lemma_ for token in verb_tokens]
    action_verb_embed = torch.stack([embeder.get_embedding(lemma) for lemma in lemma_embed])
    cos_sims = cos_sim(action_verb_embed, verbs_emded)
    # (row, column) of the global maximum; the row index selects the action verb.
    amax = np.unravel_index(cos_sims.argmax(), cos_sims.shape)
    all_cos_sims.append((cos_sims[amax], lemma_embed[amax[0]]))
    if cos_sims[amax] > threshold:
        new_ans["sample_id"].append(sample_id)
        new_ans["speech"].append(speech)
        # Store the surface form as str (not the spaCy Token object), matching
        # the cached variant of this cell and keeping the frame free of Doc references.
        new_ans["action"].append(verb_tokens[amax[0]].orth_)

In [61]:
pd.DataFrame.from_dict(new_ans)

Unnamed: 0,sample_id,speech,action
0,1,"It’s old Luhhan,",said
1,3,"Honeycakes,",said
2,5,"Now,",said
3,6,"If I were you, I’d worry more about Alsbet Luh...",said
4,7,"They’re the ones I meant to tell you about,",muttered
5,12,"Her name is Moiraine,",said
6,13,"She asked the Wisdom for directions this morning,",said
7,15,"That’s too long for me,",muttered
8,18,"For the last time,",barked
9,19,"Filthy carrion eater,",muttered
