In [237]:
# !pip install --upgrade spacy
# !pip install --upgrade spacy[transformers]
# !pip install jsonlines
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_trf

In [239]:
from tqdm.autonotebook import tqdm
import re

import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_lg")
nlp_sm = spacy.load("en_core_web_sm")
nlp_trf = spacy.load("en_core_web_trf")



In [10]:
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.statistics import get_counts_and_percentages

from pathlib import Path
import pandas as pd
import json
import glob
import math

# Get a list of all json files in the directory, excluding 'dev'
files = [Path(f) for f in glob.glob(str(LOCAL_PROCESSED_DATA_PATH / "dialog-re-with-no-relation/*.json"))]
# files = [Path(f) for f in glob.glob(str(LOCAL_PROCESSED_DATA_PATH / "dialog-re-ternary/*.json"))]


# Create an empty DataFrame
df1 = pd.DataFrame(columns=["Dialogue", "Relations", "Origin"])

# Loop over all json files in the directory
for file_name in files:
    with open(file_name, 'r') as file:
        data = json.load(file)

        # Convert the data to a DataFrame
        df_temp = pd.DataFrame(data, columns=["Dialogue", "Relations"])

        # Add a new column to this DataFrame for the origin
        df_temp["Origin"] = file_name.stem  # This will get just the file name without the extension

        # Append the temporary DataFrame to the main DataFrame
        df1 = pd.concat([df1, df_temp], ignore_index=True)
df1

Unnamed: 0,Dialogue,Relations,Origin
0,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[{'y': 'casting director', 'x': 'Ann', 'rid': ...",dev
1,"[Speaker 1, Speaker 2: Hi, Speaker 3: Hi! Hey ...","[{'y': 'Speaker 2', 'x': 'Speaker 1', 'rid': [...",dev
2,"[Speaker 1, Speaker 2: Hi!, Speaker 3: Hey!, S...","[{'y': 'man', 'x': 'Speaker 4', 'rid': [37], '...",dev
3,[Speaker 1: Wow! It looks like we got a lot of...,"[{'y': 'baby', 'x': 'Speaker 2', 'rid': [37], ...",dev
4,"[Speaker 1: Now, Mom, everything's going fine,...","[{'y': '26', 'x': 'Speaker 1', 'rid': [25], 'r...",dev
...,...,...,...
1783,"[Speaker 1: Nice camoflauge man, for a minute ...","[{'y': 'Speaker 1', 'x': 'Speaker 2', 'rid': [...",train
1784,"[Speaker 1: Well, I'm sure you'll teach her a ...","[{'y': 'Sir', 'x': 'Speaker 1', 'rid': [37], '...",train
1785,[Speaker 1: You know what? I can't even worry ...,"[{'y': 'baby', 'x': 'Speaker 1', 'rid': [37], ...",train
1786,"[Speaker 1: And cut. Hey, Butt Guy, what the h...","[{'y': 'Butt Guy', 'x': 'Speaker 2', 'rid': [3...",train


In [117]:
all_dialogues = df1.Dialogue.apply(lambda x: '\n'.join(x))
dialogue_sample = all_dialogues[0]
sample_relations = df1.Relations.iloc[0]

In [None]:
doc = nlp(dialogue_sample)



In [21]:
# take a look at how many words in a document
len(doc)

595

In [22]:
# look document-level attributes
dir(doc)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_context',
 '_get_array_attrs',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'copy',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_dict',
 'from_disk',
 'from_docs',
 'from_json',
 'get_extension',
 'get_lca_matrix',
 'has_annotation',
 'has_extension',
 'has_unknown_spaces',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'noun_chunks',
 'noun_chunks_iterator',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set

In [50]:
# tokens in a document can by accessed by their number:
i=29
print(doc[i])
dir(doc[i])

can


['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [51]:
displacy.render(doc, style="ent", jupyter=True)

In [241]:
# doc = nlp_trf(dialogue_sample)
# displacy.render(doc, style="ent", jupyter=True)

In [53]:
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [256]:
i = 0
doc = nlp(all_dialogues[i])
sample_relations = df1.Relations.iloc[i]
annotated_entities = []
for r in sample_relations:
    annotated_entities.append(f"{r['x']}:{r['x_type']}")
    annotated_entities.append(f"{r['y']}:{r['y_type']}")
print('# ANNOTATED')
display(sorted(set([a for a in annotated_entities if 'Speaker ' not in a])))

print('\n# PREDICITED')
predicited_entities = [f"{t}:{t.ent_type_}" for t in doc if t.ent_type_ not in ('', 'CARDINAL')]
sorted(set(predicited_entities))

# ANNOTATED


['Ann:PER',
 'Annie:PER',
 'Estelle:PER',
 'Joey Tribbiani:PER',
 'Katelynn:PER',
 'Pheebs:PER',
 'Phoebe Buffay:PER',
 'agent:STRING',
 'casting director:STRING',
 'man:STRING']


# PREDICITED


["'s:PERSON",
 'Ann:PERSON',
 'Annie:PERSON',
 'Buffay:PERSON',
 'Chandler:PERSON',
 'Estelle:ORG',
 'Estelle:PERSON',
 'Joey:PERSON',
 'Katelynn:PERSON',
 'Phoebe:PERSON',
 'Tribbiani:PERSON',
 'first:ORDINAL',
 'today:DATE',
 'weeks:DATE']

In [120]:
docs = list(tqdm(nlp.pipe(all_dialogues), total=len(all_dialogues)))

  0%|          | 0/1788 [00:00<?, ?it/s]

In [121]:
from collections import Counter

all_orgs = []
for d in docs:
    orgs = [ent.text for ent in d.ents if ent.label_ == "ORG"]
    all_orgs.extend(orgs)

Counter(all_orgs).most_common(15)

[('Knicks', 20),
 ('sec', 15),
 ('Hugsy', 11),
 ('Pottery Barn', 10),
 ('Porsche', 8),
 ('Phoebs', 7),
 ('Bloomingdale’s', 6),
 ('Sophie', 5),
 ('Lilly', 5),
 ('Soapie', 4),
 ('Rache', 4),
 ('Carol', 4),
 ("Bloomingdale's", 4),
 ('Raych', 4),
 ('Dartmouth', 4)]

In [175]:
result_df = pd.DataFrame(columns=['Entity', 'Entity_Type'])

tmp = df1.Relations.explode().apply(lambda x: [(x['x'], x['x_type']), (x['y'], x['y_type'])]).explode()

import pandas as pd
from src.statistics import get_counts_and_percentages

# Assuming you already have the 'tmp' variable

result_df = pd.DataFrame(columns=['Entity', 'Entity_Type'])



In [190]:
mask = ~result_df.Entity.str.contains('Speaker ')
annotated_entity_freq = get_counts_and_percentages(result_df[mask], ['Entity', 'Entity_Type'])
annotated_entity_freq.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
Entity,Entity_Type,Unnamed: 2_level_1,Unnamed: 3_level_1
Pheebs,PER,858,3.5
honey,STRING,643,2.6
Ross,PER,612,2.5
Rachel,PER,586,2.4
man,STRING,560,2.3
Monica,PER,556,2.2
Chandler,PER,536,2.2
Rach,PER,512,2.1
baby,STRING,464,1.9
Joey,PER,362,1.5


In [191]:
annotated_entity_freq.tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
Entity,Entity_Type,Unnamed: 2_level_1,Unnamed: 3_level_1
athlete,STRING,1,0.0
Handsome,PER,1,0.0
Petty,PER,1,0.0
Mr. Millionaire,PER,1,0.0
Tag Sweetcheeks Jones,PER,1,0.0
Phaybobo,PER,1,0.0
Rabbi Tribbiani,PER,1,0.0
Victor Victoria,PER,1,0.0
cheapasaurus,STRING,1,0.0
Deep Impact,STRING,1,0.0


In [189]:
get_counts_and_percentages(result_df[mask], ['Entity_Type'])


Unnamed: 0_level_0,Counts,%
Entity_Type,Unnamed: 1_level_1,Unnamed: 2_level_1
PER,14861,59.8
STRING,7720,31.1
GPE,1278,5.1
ORG,701,2.8
VALUE,279,1.1


In [202]:
mask_ent = result_df['Entity_Type'] == 'GPE'
get_counts_and_percentages(result_df[mask & mask_ent], ['Entity', 'Entity_Type']).head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Counts,%
Entity,Entity_Type,Unnamed: 2_level_1,Unnamed: 3_level_1
New York,GPE,186,14.6
London,GPE,138,10.8
New York City,GPE,66,5.2
Vegas,GPE,59,4.6
Minsk,GPE,56,4.4
England,GPE,51,4.0
Barbados,GPE,45,3.5
Vermont,GPE,36,2.8
Rome,GPE,34,2.7
Queens,GPE,31,2.4


In [215]:
sent = list(doc.sents)[4]
displacy.render(sent, style="dep", jupyter=True)

In [203]:
print(doc)
tok = doc[21]  # "Aleppo"
print(tok)


def loc_to_verb(tok):
    verb_phrase = []
    # first, iterate through all the ancesters of the token
    for i in tok.ancestors:
        # when you get to a verb (using a POS tag)...
        if i.pos_ == "VERB":
            # ...add the verb to the verb phrase list
            verb_phrase.append(i)
            # then, also add the direct object(s) of the verb, as long as the original token
            # is in the same subtree as the direct object
            verb_phrase.extend([j for j in i.children if j.dep_ == "dobj" and tok in i.subtree])
            # we only want the first verb, so stop after we find one
            break
    # expand out the verb phrase to get modifiers ("amod") of the direct object
    for i in verb_phrase:
        for j in i.children:
            if j.dep_ == "amod":
                verb_phrase.append(j)

    # sort the tokens by their position in the original sentence
    new_list = sorted(verb_phrase, key=lambda x: x.i)
    # join them together with the correct whitespace and return
    return ''.join([i.text_with_ws for i in new_list]).strip()

loc_to_verb(tok)

Speaker 1: Hey!
Speaker 2: Hey.
Speaker 3: Hey, man. What's up?
Speaker 1: Maybe you can tell me. My agent would like to know why I didn't show up at the audition I didn't know I had today. The first good thing she gets me in weeks. How could you not give me the message?!
Speaker 3: Well, I'll tell ya I do enjoy guilt, but, ah, it wasn't me.
Speaker 2: Yes, it was! It was him! Uh huh! Okay, it was me!
Speaker 1: How is it you?
Speaker 2: Well, it was just, it was all so crazy, you know. I mean, Chandler was in the closet, counting to 10, and he was up to 7 and I hadn't found a place to hide yet. I-I-I meant to tell you, and I wrote it all down on my hand. See, all of it.
Speaker 1: Yep, that's my audition.
Speaker 4: See, now this is why I keep notepads everywhere.
Speaker 2: Yep, and that's why we don't invite you to play.
Speaker 5: What is the great tragedy here? You go get yourself another appointment.
Speaker 1: Well, Estelle tried, you know. The casting director told her that I m

'tell me'

In [220]:
actions = []

for d in docs:
    for i in d:
        if i.text == "Pheebs":
            actions.append(loc_to_verb(i))

sorted(list(set(actions)))

['',
 "'s",
 'Come',
 'Hold me young za',
 'Look',
 'Speaker ask',
 'Thank you',
 'act',
 'agree',
 'believe',
 'belongs',
 'bitten',
 'broke Pheebs',
 'check it',
 'come',
 'create own Pheebs',
 'cry',
 'do',
 'do that',
 'doing same thing',
 'earned it',
 'engaged',
 'get',
 'give ring',
 'giving deposit',
 'go',
 'go Pheebs',
 'going',
 'gon',
 'gone',
 'got',
 'got second',
 'guess',
 'had barrel',
 'have',
 'heard that',
 'help',
 'help me Pheebs',
 'knew',
 'know',
 'know Pheebs',
 'lay',
 'let',
 'lie Pheebs',
 'listen',
 'listen Pheebs',
 'looks',
 'love Pheebs',
 'make it',
 'make reservation',
 'mean amazing Pheebs',
 'miss you too Pheebs',
 'need',
 'need help',
 'pick one',
 'remember',
 'run it',
 'said',
 'saved',
 'see Pheebs',
 'speak Italian',
 'stay',
 'talk',
 'think',
 'think Pheebs',
 'thought',
 'tripped me',
 'understand',
 'use machine',
 'wait',
 'want',
 'want cookie',
 'want little taste',
 'want this',
 'what want',
 'wonderingSpeaker',
 '’m',
 '’re',
 '’s']

In [225]:
doc

Speaker 1: Hey!
Speaker 2: Hey.
Speaker 3: Hey, man. What's up?
Speaker 1: Maybe you can tell me. My agent would like to know why I didn't show up at the audition I didn't know I had today. The first good thing she gets me in weeks. How could you not give me the message?!
Speaker 3: Well, I'll tell ya I do enjoy guilt, but, ah, it wasn't me.
Speaker 2: Yes, it was! It was him! Uh huh! Okay, it was me!
Speaker 1: How is it you?
Speaker 2: Well, it was just, it was all so crazy, you know. I mean, Chandler was in the closet, counting to 10, and he was up to 7 and I hadn't found a place to hide yet. I-I-I meant to tell you, and I wrote it all down on my hand. See, all of it.
Speaker 1: Yep, that's my audition.
Speaker 4: See, now this is why I keep notepads everywhere.
Speaker 2: Yep, and that's why we don't invite you to play.
Speaker 5: What is the great tragedy here? You go get yourself another appointment.
Speaker 1: Well, Estelle tried, you know. The casting director told her that I m

In [235]:

def clean_phrase(subtree):
    """Sort and join tokens into a string"""
    new_list = sorted(list(subtree), key=lambda x: x.i)
    return ''.join([i.text_with_ws for i in new_list])


for i in doc:
    # Find instances of the word "backed" that play the role of an adjectival modifier
    if i.dep_ == "amod":
        print("\nToken:", i.text)
        # The children of "backed" will report who the backer is
        print("Backer: ", clean_phrase(i.children))
        # Next, we go up one level to the immediate parent of the word "backed"
        parent = list(i.ancestors)[0]
        branches = [parent]
        # for each of the children of that word, except for the original "backed" token,
        # add it to the branch
        for j in parent.children:
            if j != i:
                branches.append(j)

        print(branches)
    



Token: first
Backer:  
[thing, The, good, gets, .]

Token: good
Backer:  
[thing, The, first, gets, .]

Token: great
Backer:  
[tragedy, the]

Token: sad
Backer:  
[life, What, a, little]

Token: little
Backer:  
[life, What, a, sad]

Token: Right
Backer:  
[look, ,, well, ,, um, ,]


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

hugg = pipeline('question-answering', model=model_name, tokenizer=model_name)

all_text = "\n".join(all_dialogues.tolist())

QA_input = {
    'question': "Who married Chandler?",
    'context': all_text
}
res = hugg(QA_input)

print(res)