In [139]:
import os
os.listdir(LOCAL_PROCESSED_DATA_PATH)

['.gitkeep',
 'dialog-re-binary',
 'dialog-re-ternary',
 'dialog-re-ternary-oversampled',
 'dialog-re-ternary-undersampled',
 'dialog-re-with-no-relation']

In [140]:
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.processing.etl import DialogREDatasetTransformer
from src.processing.ner import EntityProcessor
from src.processing.utils import get_counts_and_percentages
import spacy
from spacy import displacy


dt = DialogREDatasetTransformer(LOCAL_PROCESSED_DATA_PATH / 'dialog-re-ternary')
df1 = dt.load_data_to_dataframe()


ep = EntityProcessor(df1)
ep.process_all_documents()
df2 = ep.enrich_data()

d = ep.docs[0]
type(d), d

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1788/1788 [00:44<00:00, 40.46it/s]


(spacy.tokens.doc.Doc,
 Speaker 1: Hey!
 Speaker 2: Hey.
 Speaker 3: Hey, man. What's up?
 Speaker 1: Maybe you can tell me. My agent would like to know why I didn't show up at the audition I didn't know I had today. The first good thing she gets me in weeks. How could you not give me the message?!
 Speaker 3: Well, I'll tell ya I do enjoy guilt, but, ah, it wasn't me.
 Speaker 2: Yes, it was! It was him! Uh huh! Okay, it was me!
 Speaker 1: How is it you?
 Speaker 2: Well, it was just, it was all so crazy, you know. I mean, Chandler was in the closet, counting to 10, and he was up to 7 and I hadn't found a place to hide yet. I-I-I meant to tell you, and I wrote it all down on my hand. See, all of it.
 Speaker 1: Yep, that's my audition.
 Speaker 4: See, now this is why I keep notepads everywhere.
 Speaker 2: Yep, and that's why we don't invite you to play.
 Speaker 5: What is the great tragedy here? You go get yourself another appointment.
 Speaker 1: Well, Estelle tried, you know. Th

## Goals of Notebook
1. Create entity distance metric
2. Create hobby keyword spotter

In [241]:
import re
from itertools import combinations, permutations

class EntityCombiner:
    def __init__(self, doc, ignore_entity_types=None, entities=None, relations=None):
        if ignore_entity_types is None:
            ignore_entity_types = []
        self.doc = doc
        if entities is None:
            self.entities = [ent for ent in doc.ents if ent.label_ not in ignore_entity_types]
        else:
            self.entities = [ent for ent in doc.ents if ent.text in entities and ent.label_ not in ignore_entity_types]
        self.relations = relations

    def _calculate_turn_distance(self, pairs_generator):
        entity_turns = dict()
        current_turn = 0
        for sentence in self.doc.sents:
            if re.match(r'Speaker \d+:', sentence.text):
                current_turn += 1
            for entity in self.entities:
                if entity.start >= sentence.start and entity.end <= sentence.end:
                    entity_turns[entity] = current_turn
        entity_pairs = pairs_generator(self.entities, 2)
        entity_pairs_distance = [(pair[0], pair[1], abs(entity_turns.get(pair[0], 0) - entity_turns.get(pair[1], 0))) for pair in entity_pairs]
        return entity_pairs_distance

    def _calculate_word_distance(self, pairs_generator):
        entity_pairs = pairs_generator(self.entities, 2)
        entity_pairs_distance = [(pair[0], pair[1], abs(pair[0].start - pair[1].end)) for pair in entity_pairs]
        return entity_pairs_distance

    def get_entity_pairs(self, distance_type='turn', combination_type='combinations'):
        if distance_type == 'turn':
            if combination_type == 'combinations':
                return self._calculate_turn_distance(combinations)
            elif combination_type == 'permutations':
                return self._calculate_turn_distance(permutations)
            else:
                raise ValueError("Invalid combination_type. Supported values are 'combinations' and 'permutations'.")
        elif distance_type == 'word':
            if combination_type == 'combinations':
                return self._calculate_word_distance(combinations)
            elif combination_type == 'permutations':
                return self._calculate_word_distance(permutations)
            else:
                raise ValueError("Invalid combination_type. Supported values are 'combinations' and 'permutations'.")
        else:
            raise ValueError("Invalid distance_type. Supported values are 'turn' and 'word'.")

    def get_relation_tuples(self):
        relation_tuples = []
        for r in self.relations:
            x_entity = next((e for e in self.entities if e.text == r['x']), None)
            y_entity = next((e for e in self.entities if e.text == r['y']), None)
            if x_entity and y_entity:
                turn_distance = abs(x_entity.start - y_entity.start)
                # the start of the first entity and the end of the second entity
                word_distance_start = abs(x_entity.start - y_entity.end)
                # the end of the first entity and the start of the second entity
                word_distance_end = abs(x_entity.end - y_entity.start)
                relation_tuples.append((x_entity.text, y_entity.text, turn_distance, word_distance_start, word_distance_end, r['r'][0]))
        return relation_tuples
    


In [242]:
df2.apply(lambda row: [e.split(':')[0] for e in row['UniqueEntities']], axis=1)

0       [Estelle, Ann, casting director, Katelynn, Ann...
1                                                  [Jack]
2       [Emma, Mrs. Geller, man, dad, Geller, one, Mr....
3                                          [roomie, baby]
4                                              [Ross, 26]
                              ...                        
1783                     [Susie Moss, man, Chandler Bing]
1784                                     [Bing, sir, Sir]
1785                            [Rach, baby, little girl]
1786                                           [Butt Guy]
1787                                   [Pheebs, Racquela]
Length: 1788, dtype: object

In [243]:
df2.Relations.iloc[0]

[{'y': 'casting director',
  'x': 'Ann',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'STRING'},
 {'y': 'Annie',
  'x': 'Ann',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'agent',
  'x': 'Estelle',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'STRING'},
 {'y': 'Speaker 1',
  'x': 'Estelle',
  'rid': [2],
  'r': ['with_relation'],
  't': ['agent'],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Katelynn',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Pheebs',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Speaker 1',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': ['friends'],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Phoebe Buffay',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relatio

In [244]:
def get_relation_tuples(row):
    unique_entities = [e.split(':')[0] for e in row['UniqueEntities']]
    ec = EntityCombiner(d, ignore_entity_types=['CARDINAL'], entities=unique_entities, relations=row['Relations'])
    return ec.get_relation_tuples()

df2['RelationTuples'] = df2.apply(get_relation_tuples, axis=1)


In [246]:
df2['RelationTuples']

0       [(Ann, Annie, 29, 30, 28, with_relation), (Ann...
1                                                      []
2                                                      []
3                                                      []
4                                                      []
                              ...                        
1783                                                   []
1784                                                   []
1785                                                   []
1786                                                   []
1787                                                   []
Name: RelationTuples, Length: 1788, dtype: object

In [237]:
df2.Relations.iloc[0]

[{'y': 'casting director',
  'x': 'Ann',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'STRING'},
 {'y': 'Annie',
  'x': 'Ann',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'agent',
  'x': 'Estelle',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'STRING'},
 {'y': 'Speaker 1',
  'x': 'Estelle',
  'rid': [2],
  'r': ['with_relation'],
  't': ['agent'],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Katelynn',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Pheebs',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Speaker 1',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': ['friends'],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Phoebe Buffay',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relatio

In [230]:
df2['RelationTuples'].iloc[0]

[('Ann', 'Annie', 29, 30, 'with_relation'),
 ('Annie', 'Annie', 0, 1, 'no_relation'),
 ('Estelle', 'Katelynn', 121, 122, 'no_relation'),
 ('Ann', 'Joey Tribbiani', 39, 41, 'no_relation'),
 ('Estelle', 'Annie', 163, 164, 'no_relation'),
 ('Annie', 'Joey Tribbiani', 10, 12, 'no_relation'),
 ('Ann', 'Katelynn', 13, 12, 'no_relation'),
 ('Estelle', 'Joey Tribbiani', 173, 175, 'no_relation'),
 ('Annie', 'Katelynn', 42, 41, 'no_relation')]

In [225]:

mask = df2['RelationTuples'].apply(lambda x: x!=[])
df2[mask]['RelationTuples']

0       [(Ann, Annie, 29, 30, with_relation), (Annie, ...
29              [(Chandler, Chandler, 0, 1, no_relation)]
36                [(Estelle, Estelle, 0, 1, no_relation)]
64              [(Chandler, Chandler, 0, 1, no_relation)]
77                      [(Joey, Joey, 0, 1, no_relation)]
                              ...                        
1651    [(Joey, Chandler, 373, 372, no_relation), (Joe...
1683            [(Chandler, Chandler, 0, 1, no_relation)]
1688    [(Joey Tribbiani, Joey Tribbiani, 0, 2, no_rel...
1723                [(Phoebe, Phoebe, 0, 1, no_relation)]
1755                    [(Joey, Joey, 0, 1, no_relation)]
Name: RelationTuples, Length: 104, dtype: object

In [214]:
unique_entities = df2.UniqueEntities.apply(lambda x: [e.split(':')[0] for e in x]).iloc[0]
unique_entities

ec = EntityCombiner(d, ignore_entity_types=['CARDINAL'], entities=unique_entities)
ec.get_entity_pairs(distance_type='turn', combination_type='permutations')

[(Estelle, Katelynn, 4),
 (Estelle, Ann, 4),
 (Estelle, Annie, 5),
 (Estelle, Joey Tribbiani, 5),
 (Estelle, Estelle, 5),
 (Estelle, Estelle, 6),
 (Estelle, Annie, 6),
 (Katelynn, Estelle, 4),
 (Katelynn, Ann, 0),
 (Katelynn, Annie, 1),
 (Katelynn, Joey Tribbiani, 1),
 (Katelynn, Estelle, 1),
 (Katelynn, Estelle, 2),
 (Katelynn, Annie, 2),
 (Ann, Estelle, 4),
 (Ann, Katelynn, 0),
 (Ann, Annie, 1),
 (Ann, Joey Tribbiani, 1),
 (Ann, Estelle, 1),
 (Ann, Estelle, 2),
 (Ann, Annie, 2),
 (Annie, Estelle, 5),
 (Annie, Katelynn, 1),
 (Annie, Ann, 1),
 (Annie, Joey Tribbiani, 0),
 (Annie, Estelle, 0),
 (Annie, Estelle, 1),
 (Annie, Annie, 1),
 (Joey Tribbiani, Estelle, 5),
 (Joey Tribbiani, Katelynn, 1),
 (Joey Tribbiani, Ann, 1),
 (Joey Tribbiani, Annie, 0),
 (Joey Tribbiani, Estelle, 0),
 (Joey Tribbiani, Estelle, 1),
 (Joey Tribbiani, Annie, 1),
 (Estelle, Estelle, 5),
 (Estelle, Katelynn, 1),
 (Estelle, Ann, 1),
 (Estelle, Annie, 0),
 (Estelle, Joey Tribbiani, 0),
 (Estelle, Estelle, 1),
 (

In [212]:
df2.Relations.iloc[0]

[{'y': 'casting director',
  'x': 'Ann',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'STRING'},
 {'y': 'Annie',
  'x': 'Ann',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'agent',
  'x': 'Estelle',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'STRING'},
 {'y': 'Speaker 1',
  'x': 'Estelle',
  'rid': [2],
  'r': ['with_relation'],
  't': ['agent'],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Katelynn',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Pheebs',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Speaker 1',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': ['friends'],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Phoebe Buffay',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relatio

In [142]:
entity_combiner = EntityCombiner(d, ignore_entity_types=['CARDINAL'])
entity_pairs = entity_combiner.get_entity_pairs(distance_type='turn', combination_type='combinations')
print("Combinations:")
print(entity_pairs)

entity_pairs = entity_combiner.get_entity_pairs(distance_type='turn', combination_type='permutations')
print("Permutations:")
print(entity_pairs)


Combinations:
[(today, first, 0), (today, weeks, 0), (today, Chandler, 4), (today, Estelle, 9), (today, Katelynn, 13), (today, Phoebe Buffay's, 13), (today, Ann, 13), (today, Phoebe, 13), (today, Annie, 14), (today, Joey Tribbiani, 14), (today, Estelle, 14), (today, Joey, 15), (today, Estelle, 15), (today, Annie, 15), (first, weeks, 0), (first, Chandler, 4), (first, Estelle, 9), (first, Katelynn, 13), (first, Phoebe Buffay's, 13), (first, Ann, 13), (first, Phoebe, 13), (first, Annie, 14), (first, Joey Tribbiani, 14), (first, Estelle, 14), (first, Joey, 15), (first, Estelle, 15), (first, Annie, 15), (weeks, Chandler, 4), (weeks, Estelle, 9), (weeks, Katelynn, 13), (weeks, Phoebe Buffay's, 13), (weeks, Ann, 13), (weeks, Phoebe, 13), (weeks, Annie, 14), (weeks, Joey Tribbiani, 14), (weeks, Estelle, 14), (weeks, Joey, 15), (weeks, Estelle, 15), (weeks, Annie, 15), (Chandler, Estelle, 5), (Chandler, Katelynn, 9), (Chandler, Phoebe Buffay's, 9), (Chandler, Ann, 9), (Chandler, Phoebe, 9), (Ch

In [147]:
df2.Relations.iloc[0]

[{'y': 'casting director',
  'x': 'Ann',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'STRING'},
 {'y': 'Annie',
  'x': 'Ann',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'agent',
  'x': 'Estelle',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'STRING'},
 {'y': 'Speaker 1',
  'x': 'Estelle',
  'rid': [2],
  'r': ['with_relation'],
  't': ['agent'],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Katelynn',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Pheebs',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': [''],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Speaker 1',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relation'],
  't': ['friends'],
  'x_type': 'PER',
  'y_type': 'PER'},
 {'y': 'Phoebe Buffay',
  'x': 'Speaker 2',
  'rid': [2],
  'r': ['with_relatio

In [153]:
df2.CorrectPredictions.iloc[0]

['Katelynn:PERSON',
 'Estelle:PERSON',
 'Joey Tribbiani:PERSON',
 'Annie:PERSON',
 'Ann:PERSON',
 'Phoebe Buffay:PERSON']

In [159]:
df2.apply(lambda row: [(r['x'], r['y'], r['r'][0]) 
                       for r in row['Relations'] 
                       if (r['x'] in [cp.split(':')[0] for cp in row['CorrectPredictions']] 
                           and r['y'] in [cp.split(':')[0] for cp in row['CorrectPredictions']])], 
                       axis=1).explode().value_counts()Q


(Ross, Ross, no_relation)                 61
(Rachel, Rachel, no_relation)             58
(Chandler, Chandler, no_relation)         47
(Monica, Monica, no_relation)             47
(Joey, Joey, no_relation)                 36
                                          ..
(Egypt, Egypt, no_relation)                1
(Pete, Richard, no_relation)               1
(Richard, Pete, no_relation)               1
(Phoebe, Phoebe Buffay, with_relation)     1
(Nancy, Nancy, no_relation)                1
Length: 1471, dtype: int64

In [176]:
df2.Relations.explode().apply( lambda x: x['r'][0])

0       with_relation
0       with_relation
0       with_relation
0       with_relation
0       with_relation
            ...      
1787      no_relation
1787      no_relation
1787      no_relation
1787      no_relation
1787      no_relation
Name: Relations, Length: 26238, dtype: object

In [179]:
get_counts_and_percentages(pd.DataFrame(df2.Relations.explode().apply( lambda x: x['r'][0])), cols=['Relations'])

Unnamed: 0_level_0,Counts,%
Relations,Unnamed: 1_level_1,Unnamed: 2_level_1
no_relation,16489,62.8
with_relation,7650,29.2
unanswerable,2099,8.0


In [184]:
get_counts_and_percentages(pd.DataFrame(df2.apply(lambda row: [(0, r['r'][0]) 
                       for r in row['Relations'] 
                       if (r['x'] in [cp.split(':')[0] for cp in row['CorrectPredictions']] 
                           and r['y'] in [cp.split(':')[0] for cp in row['CorrectPredictions']])], 
                       axis=1).explode().rename('Relations')), cols=['Relations'])


#.value_counts()


Unnamed: 0_level_0,Counts,%
Relations,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, no_relation)",1995,84.7
"(0, with_relation)",357,15.2
"(0, unanswerable)",3,0.1


In [65]:
displacy.render(d, style="ent", jupyter=True)


In [86]:
ignore_entity_types=['CARDINAL']
entities = set([ent for ent in d.ents if ent.label_ not in ignore_entity_types]) 
len(entities), entities

(15,
 {today,
  first,
  weeks,
  Chandler,
  Estelle,
  Katelynn,
  Phoebe Buffay's,
  Ann,
  Phoebe,
  Annie,
  Joey Tribbiani,
  Estelle,
  Joey,
  Estelle,
  Annie})

In [91]:
len(entities)*(len(entities)-1)/2

105.0

#### 

In [76]:
from itertools import permutations, combinations

def entity_pair_combinations(doc, ignore_entity_types=None):
    if ignore_entity_types is None:
        ignore_entity_types = []

    entities = set([ent for ent in doc.ents if ent.label_ not in ignore_entity_types])  # filter out entities in the ignore list
    entity_pairs = list(combinations(entities, 2))  # create permutations of entity pairs
    entity_pairs_distance = [(pair[0], pair[1], abs(pair[0].start - pair[1].end)) for pair in entity_pairs]  # calculate word distance
    return entity_pairs_distance


In [77]:
pairs = entity_pair_combinations(d, ignore_entity_types=['CARDINAL'])
len(pairs), pairs

(105,
 [(Estelle, Ann, 135),
  (Estelle, Chandler, 120),
  (Estelle, first, 216),
  (Estelle, Estelle, 192),
  (Estelle, Joey, 253),
  (Estelle, Estelle, 262),
  (Estelle, Phoebe, 138),
  (Estelle, Joey Tribbiani, 175),
  (Estelle, Katelynn, 122),
  (Estelle, today, 219),
  (Estelle, Annie, 164),
  (Estelle, Phoebe Buffay's, 127),
  (Estelle, weeks, 209),
  (Estelle, Annie, 269),
  (Ann, Chandler, 254),
  (Ann, first, 350),
  (Ann, Estelle, 58),
  (Ann, Joey, 119),
  (Ann, Estelle, 128),
  (Ann, Phoebe, 4),
  (Ann, Joey Tribbiani, 41),
  (Ann, Katelynn, 12),
  (Ann, today, 353),
  (Ann, Annie, 30),
  (Ann, Phoebe Buffay's, 7),
  (Ann, weeks, 343),
  (Ann, Annie, 135),
  (Chandler, first, 95),
  (Chandler, Estelle, 313),
  (Chandler, Joey, 374),
  (Chandler, Estelle, 383),
  (Chandler, Phoebe, 259),
  (Chandler, Joey Tribbiani, 296),
  (Chandler, Katelynn, 243),
  (Chandler, today, 98),
  (Chandler, Annie, 285),
  (Chandler, Phoebe Buffay's, 248),
  (Chandler, weeks, 88),
  (Chandler, A

In [105]:
len(d.text.split('\n'))

24

In [101]:
import re
from itertools import combinations

from itertools import combinations

def entity_pair_combinations_turn(doc, ignore_entity_types=None):
    if ignore_entity_types is None:
        ignore_entity_types = []

    entities = [ent for ent in doc.ents if ent.label_ not in ignore_entity_types]  # filter out entities in the ignore list

    # Find the turn number for each entity
    entity_turns = dict()
    current_turn = 0
    for sentence in doc.sents:
        if re.match(r'Speaker \d+:', sentence.text):
            current_turn += 1
        for entity in entities:
            if entity.start >= sentence.start and entity.end <= sentence.end:
                entity_turns[entity] = current_turn

    entity_pairs = combinations(entities, 2)  # create combinations of entity pairs
    entity_pairs_distance = [(pair[0], pair[1], abs(entity_turns.get(pair[0], 0) - entity_turns.get(pair[1], 0))) for pair in entity_pairs]  # calculate turn distance
    return entity_pairs_distance


pairs = entity_pair_combinations_turn(d, ignore_entity_types=['CARDINAL'])
len(pairs),pairs

(105,
 [(today, first, 0),
  (today, weeks, 0),
  (today, Chandler, 4),
  (today, Estelle, 9),
  (today, Katelynn, 13),
  (today, Phoebe Buffay's, 13),
  (today, Ann, 13),
  (today, Phoebe, 13),
  (today, Annie, 14),
  (today, Joey Tribbiani, 14),
  (today, Estelle, 14),
  (today, Joey, 15),
  (today, Estelle, 15),
  (today, Annie, 15),
  (first, weeks, 0),
  (first, Chandler, 4),
  (first, Estelle, 9),
  (first, Katelynn, 13),
  (first, Phoebe Buffay's, 13),
  (first, Ann, 13),
  (first, Phoebe, 13),
  (first, Annie, 14),
  (first, Joey Tribbiani, 14),
  (first, Estelle, 14),
  (first, Joey, 15),
  (first, Estelle, 15),
  (first, Annie, 15),
  (weeks, Chandler, 4),
  (weeks, Estelle, 9),
  (weeks, Katelynn, 13),
  (weeks, Phoebe Buffay's, 13),
  (weeks, Ann, 13),
  (weeks, Phoebe, 13),
  (weeks, Annie, 14),
  (weeks, Joey Tribbiani, 14),
  (weeks, Estelle, 14),
  (weeks, Joey, 15),
  (weeks, Estelle, 15),
  (weeks, Annie, 15),
  (Chandler, Estelle, 5),
  (Chandler, Katelynn, 9),
  (Ch

In [109]:
ec = EntityCombiner(d, ignore_entity_types=['CARDINAL'])

In [114]:
ec.get_entity_pairs()
ec.get_entity_pairs('word')

[(today, first, 4),
 (today, weeks, 11),
 (today, Chandler, 100),
 (today, Estelle, 221),
 (today, Katelynn, 342),
 (today, Phoebe Buffay's, 347),
 (today, Ann, 355),
 (today, Phoebe, 358),
 (today, Annie, 384),
 (today, Joey Tribbiani, 395),
 (today, Estelle, 412),
 (today, Joey, 473),
 (today, Estelle, 482),
 (today, Annie, 489),
 (first, weeks, 8),
 (first, Chandler, 97),
 (first, Estelle, 218),
 (first, Katelynn, 339),
 (first, Phoebe Buffay's, 344),
 (first, Ann, 352),
 (first, Phoebe, 355),
 (first, Annie, 381),
 (first, Joey Tribbiani, 392),
 (first, Estelle, 409),
 (first, Joey, 470),
 (first, Estelle, 479),
 (first, Annie, 486),
 (weeks, Chandler, 90),
 (weeks, Estelle, 211),
 (weeks, Katelynn, 332),
 (weeks, Phoebe Buffay's, 337),
 (weeks, Ann, 345),
 (weeks, Phoebe, 348),
 (weeks, Annie, 374),
 (weeks, Joey Tribbiani, 385),
 (weeks, Estelle, 402),
 (weeks, Joey, 463),
 (weeks, Estelle, 472),
 (weeks, Annie, 479),
 (Chandler, Estelle, 122),
 (Chandler, Katelynn, 243),
 (Chand

In [55]:
def extract_hobbies(doc):
    # List of common hobbies
    hobby_list = [
        'reading', 'writing', 'sports', 'music', 'cooking', 'painting', 'dancing', 'hiking', 'fishing', 
        'gardening', 'photography', 'knitting', 'sewing', 'drawing', 'crafts', 'cycling', 'baking', 'swimming', 
        'camping', 'pottery', 'yoga', 'martial arts', 'singing', 'rock climbing', 'horse riding', 'bird watching', 
        'stamp collecting', 'jewelry making', 'playing guitar', 'playing piano', 'playing violin', 
        'video games', 'board games', 'card games', 'running', 'weightlifting', 'woodworking', 'scrapbooking', 
        'calligraphy', 'quilting', 'embroidery', 'crocheting', 'crossword puzzles', 'jigsaw puzzles', 
        'roller skating', 'skateboarding', 'soccer', 'football', 'basketball', 'tennis', 'golf', 'volleyball', 
        'rugby', 'skiing', 'snowboarding', 'ice skating', 'origami', 'magic', 'acting', 'theatre', 'comedy',
        'traveling', 'cinema', 'astronomy', 'chess', 'billiards', 'snooker', 'poker', 'brewing beer', 'winemaking',
        'blogging', 'podcasting', 'streaming', 'collecting', 'fashion', 'makeup', 'cosplay', 'meditation', 
        'learning languages', 'genealogy', 'archery', 'fencing', 'sailing', 'canoeing', 'kayaking', 'windsurfing', 
        'surfing', 'scuba diving', 'drone racing', 'kart racing', 'drone photography'
    ]

    # Create a regex pattern that looks for these hobbies in the text
    pattern = r'\b(' + '|'.join(hobby_list) + r')\b'

    # Use the pattern to find matches in the text
    matches = re.findall(pattern, doc.text, re.IGNORECASE)
    
    return matches

hobbies = extract_hobbies(d)
print(hobbies)


[]


In [56]:
from tqdm import tqdm

In [115]:
import pandas as pd

In [122]:
hobbies = [extract_hobbies(d) for d in ep.docs]
hobbies = [h for h in hobbies if h != []]
tmp = pd.Series(hobbies)
hobby_df = pd.concat([
    tmp.explode().value_counts(),
    tmp.apply(set).explode().value_counts()
], axis=1
         ).rename({
    0: 'ent_count',
    1: 'doc_count'
         },
axis=1)
hobby_df

Unnamed: 0,ent_count,doc_count
writing,23,19
acting,20,17
music,17,14
fashion,16,12
running,15,13
reading,15,14
football,14,6
dancing,13,10
magic,10,9
Pottery,10,2


In [124]:
hobby_df['ratio'] = hobby_df['ent_count'] / hobby_df['doc_count']
hobby_df

Unnamed: 0,ent_count,doc_count,ratio
writing,23,19,1.210526
acting,20,17,1.176471
music,17,14,1.214286
fashion,16,12,1.333333
running,15,13,1.153846
reading,15,14,1.071429
football,14,6,2.333333
dancing,13,10,1.3
magic,10,9,1.111111
Pottery,10,2,5.0


[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['acting', 'acting'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['dancing', 'dancing'],
 [],
 [],
 [],
 [],
 ['magic'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['magic'],
 [],
 ['running'],
 [],
 [],
 [],
 [],
 ['cinema'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['magic'],
 [],
 ['fashion', 'fashion'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['basketball', 'basketball'],
 [],
 [],
 [],
 ['magic'],
 [],
 [],
 [],
 [],
 ['yoga'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['skiing', 'skiing'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['singing'],
 ['writing'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['theatre'],
 [],
 [],
 [],
 [],
 [],
 [],
