In [1]:
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.processing.etl import DialogREDatasetTransformer
from src.processing.ner import EntityProcessor
from src.processing.utils import get_counts_and_percentages
import spacy
from spacy import displacy


dt = DialogREDatasetTransformer(LOCAL_PROCESSED_DATA_PATH / 'dialog-re-with-no-relation')
df1 = dt.load_data_to_dataframe()


ep = EntityProcessor(df1)
ep.process_all_documents()
df2 = ep.enrich_data()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1788/1788 [00:54<00:00, 33.02it/s]


## Goals of Notebook
1. Create entity distance metric
2. Create hobby keyword spotter

In [63]:
import re
from itertools import combinations

class EntityCombiner:
    def __init__(self, doc, ignore_entity_types=None):
        if ignore_entity_types is None:
            ignore_entity_types = []
        self.entities = [ent for ent in doc.ents if ent.label_ not in ignore_entity_types]
        self.doc = doc

    def _calculate_turn_distance(self):
        entity_turns = dict()
        current_turn = 0
        for sentence in self.doc.sents:
            if re.match(r'Speaker \d+:', sentence.text):
                current_turn += 1
            for entity in self.entities:
                if entity.start >= sentence.start and entity.end <= sentence.end:
                    entity_turns[entity] = current_turn
        entity_pairs = combinations(self.entities, 2)
        entity_pairs_distance = [(pair[0], pair[1], abs(entity_turns.get(pair[0], 0) - entity_turns.get(pair[1], 0))) for pair in entity_pairs]
        return entity_pairs_distance

    def _calculate_word_distance(self):
        entity_pairs = combinations(self.entities, 2)
        entity_pairs_distance = [(pair[0], pair[1], abs(pair[0].start - pair[1].end)) for pair in entity_pairs]
        return entity_pairs_distance

    def get_entity_pairs(self, distance_type='turn'):
        if distance_type == 'turn':
            return self._calculate_turn_distance()
        elif distance_type == 'word':
            return self._calculate_word_distance()
        else:
            raise ValueError("Invalid distance_type. Supported values are 'turn' and 'word'.")



In [7]:
d = ep.docs[0]
type(d), d

(spacy.tokens.doc.Doc,
 Speaker 1: Hey!
 Speaker 2: Hey.
 Speaker 3: Hey, man. What's up?
 Speaker 1: Maybe you can tell me. My agent would like to know why I didn't show up at the audition I didn't know I had today. The first good thing she gets me in weeks. How could you not give me the message?!
 Speaker 3: Well, I'll tell ya I do enjoy guilt, but, ah, it wasn't me.
 Speaker 2: Yes, it was! It was him! Uh huh! Okay, it was me!
 Speaker 1: How is it you?
 Speaker 2: Well, it was just, it was all so crazy, you know. I mean, Chandler was in the closet, counting to 10, and he was up to 7 and I hadn't found a place to hide yet. I-I-I meant to tell you, and I wrote it all down on my hand. See, all of it.
 Speaker 1: Yep, that's my audition.
 Speaker 4: See, now this is why I keep notepads everywhere.
 Speaker 2: Yep, and that's why we don't invite you to play.
 Speaker 5: What is the great tragedy here? You go get yourself another appointment.
 Speaker 1: Well, Estelle tried, you know. Th

In [65]:
displacy.render(d, style="ent", jupyter=True)


In [86]:
ignore_entity_types=['CARDINAL']
entities = set([ent for ent in d.ents if ent.label_ not in ignore_entity_types]) 
len(entities), entities

(15,
 {today,
  first,
  weeks,
  Chandler,
  Estelle,
  Katelynn,
  Phoebe Buffay's,
  Ann,
  Phoebe,
  Annie,
  Joey Tribbiani,
  Estelle,
  Joey,
  Estelle,
  Annie})

In [91]:
len(entities)*(len(entities)-1)/2

105.0

#### 

In [76]:
from itertools import permutations, combinations

def entity_pair_combinations(doc, ignore_entity_types=None):
    if ignore_entity_types is None:
        ignore_entity_types = []

    entities = set([ent for ent in doc.ents if ent.label_ not in ignore_entity_types])  # filter out entities in the ignore list
    entity_pairs = list(combinations(entities, 2))  # create permutations of entity pairs
    entity_pairs_distance = [(pair[0], pair[1], abs(pair[0].start - pair[1].end)) for pair in entity_pairs]  # calculate word distance
    return entity_pairs_distance


In [77]:
pairs = entity_pair_combinations(d, ignore_entity_types=['CARDINAL'])
len(pairs), pairs

(105,
 [(Estelle, Ann, 135),
  (Estelle, Chandler, 120),
  (Estelle, first, 216),
  (Estelle, Estelle, 192),
  (Estelle, Joey, 253),
  (Estelle, Estelle, 262),
  (Estelle, Phoebe, 138),
  (Estelle, Joey Tribbiani, 175),
  (Estelle, Katelynn, 122),
  (Estelle, today, 219),
  (Estelle, Annie, 164),
  (Estelle, Phoebe Buffay's, 127),
  (Estelle, weeks, 209),
  (Estelle, Annie, 269),
  (Ann, Chandler, 254),
  (Ann, first, 350),
  (Ann, Estelle, 58),
  (Ann, Joey, 119),
  (Ann, Estelle, 128),
  (Ann, Phoebe, 4),
  (Ann, Joey Tribbiani, 41),
  (Ann, Katelynn, 12),
  (Ann, today, 353),
  (Ann, Annie, 30),
  (Ann, Phoebe Buffay's, 7),
  (Ann, weeks, 343),
  (Ann, Annie, 135),
  (Chandler, first, 95),
  (Chandler, Estelle, 313),
  (Chandler, Joey, 374),
  (Chandler, Estelle, 383),
  (Chandler, Phoebe, 259),
  (Chandler, Joey Tribbiani, 296),
  (Chandler, Katelynn, 243),
  (Chandler, today, 98),
  (Chandler, Annie, 285),
  (Chandler, Phoebe Buffay's, 248),
  (Chandler, weeks, 88),
  (Chandler, A

In [105]:
len(d.text.split('\n'))

24

In [101]:
import re
from itertools import combinations

from itertools import combinations

def entity_pair_combinations_turn(doc, ignore_entity_types=None):
    if ignore_entity_types is None:
        ignore_entity_types = []

    entities = [ent for ent in doc.ents if ent.label_ not in ignore_entity_types]  # filter out entities in the ignore list

    # Find the turn number for each entity
    entity_turns = dict()
    current_turn = 0
    for sentence in doc.sents:
        if re.match(r'Speaker \d+:', sentence.text):
            current_turn += 1
        for entity in entities:
            if entity.start >= sentence.start and entity.end <= sentence.end:
                entity_turns[entity] = current_turn

    entity_pairs = combinations(entities, 2)  # create combinations of entity pairs
    entity_pairs_distance = [(pair[0], pair[1], abs(entity_turns.get(pair[0], 0) - entity_turns.get(pair[1], 0))) for pair in entity_pairs]  # calculate turn distance
    return entity_pairs_distance


pairs = entity_pair_combinations_turn(d, ignore_entity_types=['CARDINAL'])
len(pairs),pairs

(105,
 [(today, first, 0),
  (today, weeks, 0),
  (today, Chandler, 4),
  (today, Estelle, 9),
  (today, Katelynn, 13),
  (today, Phoebe Buffay's, 13),
  (today, Ann, 13),
  (today, Phoebe, 13),
  (today, Annie, 14),
  (today, Joey Tribbiani, 14),
  (today, Estelle, 14),
  (today, Joey, 15),
  (today, Estelle, 15),
  (today, Annie, 15),
  (first, weeks, 0),
  (first, Chandler, 4),
  (first, Estelle, 9),
  (first, Katelynn, 13),
  (first, Phoebe Buffay's, 13),
  (first, Ann, 13),
  (first, Phoebe, 13),
  (first, Annie, 14),
  (first, Joey Tribbiani, 14),
  (first, Estelle, 14),
  (first, Joey, 15),
  (first, Estelle, 15),
  (first, Annie, 15),
  (weeks, Chandler, 4),
  (weeks, Estelle, 9),
  (weeks, Katelynn, 13),
  (weeks, Phoebe Buffay's, 13),
  (weeks, Ann, 13),
  (weeks, Phoebe, 13),
  (weeks, Annie, 14),
  (weeks, Joey Tribbiani, 14),
  (weeks, Estelle, 14),
  (weeks, Joey, 15),
  (weeks, Estelle, 15),
  (weeks, Annie, 15),
  (Chandler, Estelle, 5),
  (Chandler, Katelynn, 9),
  (Ch

In [109]:
ec = EntityCombiner(d, ignore_entity_types=['CARDINAL'])

In [114]:
ec.get_entity_pairs()
ec.get_entity_pairs('word')

[(today, first, 4),
 (today, weeks, 11),
 (today, Chandler, 100),
 (today, Estelle, 221),
 (today, Katelynn, 342),
 (today, Phoebe Buffay's, 347),
 (today, Ann, 355),
 (today, Phoebe, 358),
 (today, Annie, 384),
 (today, Joey Tribbiani, 395),
 (today, Estelle, 412),
 (today, Joey, 473),
 (today, Estelle, 482),
 (today, Annie, 489),
 (first, weeks, 8),
 (first, Chandler, 97),
 (first, Estelle, 218),
 (first, Katelynn, 339),
 (first, Phoebe Buffay's, 344),
 (first, Ann, 352),
 (first, Phoebe, 355),
 (first, Annie, 381),
 (first, Joey Tribbiani, 392),
 (first, Estelle, 409),
 (first, Joey, 470),
 (first, Estelle, 479),
 (first, Annie, 486),
 (weeks, Chandler, 90),
 (weeks, Estelle, 211),
 (weeks, Katelynn, 332),
 (weeks, Phoebe Buffay's, 337),
 (weeks, Ann, 345),
 (weeks, Phoebe, 348),
 (weeks, Annie, 374),
 (weeks, Joey Tribbiani, 385),
 (weeks, Estelle, 402),
 (weeks, Joey, 463),
 (weeks, Estelle, 472),
 (weeks, Annie, 479),
 (Chandler, Estelle, 122),
 (Chandler, Katelynn, 243),
 (Chand

In [55]:
def extract_hobbies(doc):
    # List of common hobbies
    hobby_list = [
        'reading', 'writing', 'sports', 'music', 'cooking', 'painting', 'dancing', 'hiking', 'fishing', 
        'gardening', 'photography', 'knitting', 'sewing', 'drawing', 'crafts', 'cycling', 'baking', 'swimming', 
        'camping', 'pottery', 'yoga', 'martial arts', 'singing', 'rock climbing', 'horse riding', 'bird watching', 
        'stamp collecting', 'jewelry making', 'playing guitar', 'playing piano', 'playing violin', 
        'video games', 'board games', 'card games', 'running', 'weightlifting', 'woodworking', 'scrapbooking', 
        'calligraphy', 'quilting', 'embroidery', 'crocheting', 'crossword puzzles', 'jigsaw puzzles', 
        'roller skating', 'skateboarding', 'soccer', 'football', 'basketball', 'tennis', 'golf', 'volleyball', 
        'rugby', 'skiing', 'snowboarding', 'ice skating', 'origami', 'magic', 'acting', 'theatre', 'comedy',
        'traveling', 'cinema', 'astronomy', 'chess', 'billiards', 'snooker', 'poker', 'brewing beer', 'winemaking',
        'blogging', 'podcasting', 'streaming', 'collecting', 'fashion', 'makeup', 'cosplay', 'meditation', 
        'learning languages', 'genealogy', 'archery', 'fencing', 'sailing', 'canoeing', 'kayaking', 'windsurfing', 
        'surfing', 'scuba diving', 'drone racing', 'kart racing', 'drone photography'
    ]

    # Create a regex pattern that looks for these hobbies in the text
    pattern = r'\b(' + '|'.join(hobby_list) + r')\b'

    # Use the pattern to find matches in the text
    matches = re.findall(pattern, doc.text, re.IGNORECASE)
    
    return matches

hobbies = extract_hobbies(d)
print(hobbies)


[]


In [56]:
from tqdm import tqdm

In [62]:
hobbies = [extract_hobbies(d) for d in ep.docs]
hobbies = [h for h in hobbies if h != []]
hobbies


[['acting', 'acting'],
 ['dancing', 'dancing'],
 ['magic'],
 ['magic'],
 ['running'],
 ['cinema'],
 ['magic'],
 ['fashion', 'fashion'],
 ['basketball', 'basketball'],
 ['magic'],
 ['yoga'],
 ['skiing', 'skiing'],
 ['singing'],
 ['writing'],
 ['theatre'],
 ['reading'],
 ['Pottery', 'Pottery', 'Pottery', 'Pottery'],
 ['acting'],
 ['reading'],
 ['sailing'],
 ['Singing'],
 ['running'],
 ['music'],
 ['Pottery', 'Pottery', 'Pottery', 'Pottery', 'Pottery', 'Pottery'],
 ['theatre'],
 ['fishing'],
 ['acting'],
 ['theatre'],
 ['basketball', 'Sports'],
 ['makeup'],
 ['golf'],
 ['golf'],
 ['music'],
 ['writing'],
 ['writing'],
 ['makeup'],
 ['music'],
 ['dancing', 'dancing'],
 ['dancing', 'dancing'],
 ['writing'],
 ['cooking'],
 ['basketball'],
 ['acting'],
 ['reading'],
 ['writing'],
 ['poker'],
 ['writing', 'writing'],
 ['drawing'],
 ['running'],
 ['makeup'],
 ['reading'],
 ['running'],
 ['singing'],
 ['Cooking'],
 ['theatre'],
 ['writing'],
 ['running'],
 ['writing', 'writing', 'writing'],
 ['d

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['acting', 'acting'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['dancing', 'dancing'],
 [],
 [],
 [],
 [],
 ['magic'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['magic'],
 [],
 ['running'],
 [],
 [],
 [],
 [],
 ['cinema'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['magic'],
 [],
 ['fashion', 'fashion'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['basketball', 'basketball'],
 [],
 [],
 [],
 ['magic'],
 [],
 [],
 [],
 [],
 ['yoga'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['skiing', 'skiing'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['singing'],
 ['writing'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['theatre'],
 [],
 [],
 [],
 [],
 [],
 [],
