In [1]:
from tqdm.autonotebook import tqdm
import re

import spacy
from spacy import displacy

from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.statistics import get_counts_and_percentages

from pathlib import Path
import pandas as pd
import json
import glob
import math

nlp = spacy.load("en_core_web_lg")


# Get a list of all json files in the directory, excluding 'dev'
files = [Path(f) for f in glob.glob(str(LOCAL_PROCESSED_DATA_PATH / "dialog-re-with-no-relation/*.json"))]
# files = [Path(f) for f in glob.glob(str(LOCAL_PROCESSED_DATA_PATH / "dialog-re-ternary/*.json"))]


# Create an empty DataFrame
df1 = pd.DataFrame(columns=["Dialogue", "Relations", "Origin"])

# Loop over all json files in the directory
for file_name in files:
    with open(file_name, 'r') as file:
        data = json.load(file)

        # Convert the data to a DataFrame
        df_temp = pd.DataFrame(data, columns=["Dialogue", "Relations"])

        # Add a new column to this DataFrame for the origin
        df_temp["Origin"] = file_name.stem  # This will get just the file name without the extension

        # Append the temporary DataFrame to the main DataFrame
        df1 = pd.concat([df1, df_temp], ignore_index=True)
df1

  from tqdm.autonotebook import tqdm


Unnamed: 0,Dialogue,Relations,Origin
0,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[{'y': 'casting director', 'x': 'Ann', 'rid': ...",dev
1,"[Speaker 1, Speaker 2: Hi, Speaker 3: Hi! Hey ...","[{'y': 'Speaker 2', 'x': 'Speaker 1', 'rid': [...",dev
2,"[Speaker 1, Speaker 2: Hi!, Speaker 3: Hey!, S...","[{'y': 'man', 'x': 'Speaker 4', 'rid': [37], '...",dev
3,[Speaker 1: Wow! It looks like we got a lot of...,"[{'y': 'baby', 'x': 'Speaker 2', 'rid': [37], ...",dev
4,"[Speaker 1: Now, Mom, everything's going fine,...","[{'y': '26', 'x': 'Speaker 1', 'rid': [25], 'r...",dev
...,...,...,...
1783,"[Speaker 1: Nice camoflauge man, for a minute ...","[{'y': 'Speaker 1', 'x': 'Speaker 2', 'rid': [...",train
1784,"[Speaker 1: Well, I'm sure you'll teach her a ...","[{'y': 'Sir', 'x': 'Speaker 1', 'rid': [37], '...",train
1785,[Speaker 1: You know what? I can't even worry ...,"[{'y': 'baby', 'x': 'Speaker 1', 'rid': [37], ...",train
1786,"[Speaker 1: And cut. Hey, Butt Guy, what the h...","[{'y': 'Butt Guy', 'x': 'Speaker 2', 'rid': [3...",train


In [2]:
all_dialogues = df1.Dialogue.apply(lambda x: '\n'.join(x))
all_dialogues

0       Speaker 1: Hey!\nSpeaker 2: Hey.\nSpeaker 3: H...
1       Speaker 1, Speaker 2: Hi\nSpeaker 3: Hi! Hey m...
2       Speaker 1, Speaker 2: Hi!\nSpeaker 3: Hey!\nSp...
3       Speaker 1: Wow! It looks like we got a lot of ...
4       Speaker 1: Now, Mom, everything's going fine, ...
                              ...                        
1783    Speaker 1: Nice camoflauge man, for a minute t...
1784    Speaker 1: Well, I'm sure you'll teach her a l...
1785    Speaker 1: You know what? I can't even worry a...
1786    Speaker 1: And cut. Hey, Butt Guy, what the he...
1787    Speaker 1: Buon Giorno, Bella Phoebe!\nSpeaker...
Name: Dialogue, Length: 1788, dtype: object

In [3]:
docs = list(tqdm(nlp.pipe(all_dialogues), total=len(all_dialogues)))

  0%|          | 0/1788 [00:00<?, ?it/s]

In [4]:
dir(docs[0][0])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [5]:
[t.ent_iob_+'-'+t.ent_type_ for t in docs[0] if t.ent_type_ != '']

['B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-DATE',
 'B-ORDINAL',
 'B-DATE',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-PERSON',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-PERSON',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-PERSON',
 'B-PERSON',
 'I-PERSON',
 'I-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-PERSON',
 'B-PERSON',
 'I-PERSON',
 'B-ORG',
 'B-CARDINAL',
 'B-CARDINAL',
 'B-PERSON',
 'B-PERSON',
 'B-PERSON',
 'B-CARDINAL',
 'B-CARDINAL']

In [134]:
def extract_unique_entities_relations(row_relations, ignore_containing=['Speaker ']):
    # Initialize a set to store the unique entities
    unique_entities = set()
    
    # Iterate over each relation in the row
    for relation in row_relations:
        # Add the 'x' and 'y' entities and their types to the set
        if not any(sub_str in relation['x'] for sub_str in ignore_containing):
            unique_entities.add(f"{relation['x']}:{relation['x_type']}")
        if not any(sub_str in relation['y'] for sub_str in ignore_containing):
            unique_entities.add(f"{relation['y']}:{relation['y_type']}")
    
    return list(unique_entities)

def extract_unique_entities_spacy(doc):
    unique_entities = set()
    
    entity = ""
    entity_type = ""
    
    for token in doc:
        if token.ent_iob_ == "B":
            # If an entity is currently being constructed, add it to the set
            if entity:
                entity_processed = entity.strip()
                if entity_processed.endswith("'s"):
                    entity_processed = entity_processed[:-2].strip()
                unique_entities.add(f"{entity_processed}:{entity_type}")
            
            # Start a new entity
            entity = token.text
            entity_type = token.ent_type_
        elif token.ent_iob_ == "I":
            # Continue the entity
            entity += " " + token.text
        else:
            # If an entity is currently being constructed, add it to the set
            if entity:
                entity_processed = entity.strip()
                if entity_processed.endswith("'s"):
                    entity_processed = entity_processed[:-2].strip()
                unique_entities.add(f"{entity_processed}:{entity_type}")
            # Reset the entity
            entity = ""
            entity_type = ""
    
    # If an entity is currently being constructed at the end of the document, add it to the set
    if entity:
        entity_processed = entity.strip()
        if entity_processed.endswith("'s"):
            entity_processed = entity_processed[:-2].strip()
        unique_entities.add(f"{entity_processed}:{entity_type}")
    
    return list(unique_entities)


# Apply to the documents
df1['PredictedEntities'] = [extract_unique_entities_spacy(doc) for doc in docs]


# Apply to the documents

# Apply the function to each row in the 'Relations' column
df1['UniqueEntities'] = df1['Relations'].apply(extract_unique_entities_relations)

# Apply the function to each document in docs
df1['PredictedEntities'] = [extract_unique_entities_spacy(doc) for doc in docs]


In [141]:
df1['UniqueEntities'].explode().value_counts().head(50)

Rach:PER                    161
honey:STRING                159
man:STRING                  149
Pheebs:PER                  145
baby:STRING                 117
Rachel:PER                   80
Ross:PER                     80
Monica:PER                   72
my friend:STRING             71
girl:STRING                  67
Chandler:PER                 65
sweetie:STRING               63
Mon:PER                      58
Honey:STRING                 58
actor:STRING                 57
Joey:PER                     54
Joe:PER                      53
Joey Tribbiani:PER           48
dude:STRING                  48
Dude:STRING                  36
Emma:PER                     34
Rachel Green:PER             31
Phoebe:PER                   31
Emily:PER                    30
Chandler Bing:PER            30
boy:STRING                   29
dad:STRING                   26
Man:STRING                   26
Ben:PER                      25
Phoebe Buffay:PER            25
sweetheart:STRING            24
Ross Gel

In [142]:
df1.PredictedEntities.explode().value_counts().sort_values().iloc[80:120]

Ten dollars:MONEY            1
' 76:DATE                    1
MTV:ORG                      1
Aunt Syl:PERSON              1
the end of the date:DATE     1
the end of a date:DATE       1
Oscar:PERSON                 1
Vincent:PERSON               1
Richard Crosby:PERSON        1
Smartie Pants:PERSON         1
the 18th:DATE                1
October 25th:DATE            1
February 16th:DATE           1
Omaha Beach:GPE              1
Adrienne Turner:PERSON       1
Three months:DATE            1
the first four years:DATE    1
Sha - la - lap:PERSON        1
Rocky:PERSON                 1
Raquel:PERSON                1
Pablo Diaz:PERSON            1
Arthur Fonzerelli:PERSON     1
Brady Smith:PERSON           1
Jethro Tull:PERSON           1
Bob Greenmore:PERSON         1
Victorian:NORP               1
Buh - bye:PERSON             1
Cupert:PERSON                1
Laurie:PERSON                1
almost fake midnight:TIME    1
twelve:CARDINAL              1
India:GPE                    1
Fonz:PER

In [143]:
df1.UniqueEntities.apply(lambda x: [xi.split(':')[1] for xi in x]).explode().value_counts()

PER       2537
STRING    2105
GPE        137
ORG         92
VALUE       81
Name: UniqueEntities, dtype: int64

In [144]:
df1.PredictedEntities.apply(lambda x: [xi.split(':')[1] for xi in x]).explode().value_counts()

CARDINAL                      6621
PERSON                        4826
DATE                           950
TIME                           523
ORG                            434
GPE                            340
ORDINAL                        322
NORP                           104
MONEY                           86
WORK_OF_ART                     83
FAC                             43
PRODUCT                         37
LOC                             29
EVENT                           27
QUANTITY                        24
PERCENT                         20
00                              10
LAW                              7
LANGUAGE                         7
 G'night                         2
30 in the morning                2
30                               2
15                               1
00 A.M.                          1
00 AM                            1
30 on a                          1
 Thissa Time Itsa Personal       1
 An hour                         1
45                  

In [145]:
df1.UniqueEntities.apply(lambda x: [xi.split(':')[0] for xi in x if xi.split(':')[1] == 'STRING']).explode().value_counts().head(10)

honey        159
man          149
baby         117
my friend     71
girl          67
sweetie       63
Honey         58
actor         57
dude          48
Dude          36
Name: UniqueEntities, dtype: int64

In [146]:
# Define a mapping from your annotation types to SpaCy types
type_mapping = {
    'PER': 'PERSON',
    'STRING': 'STRING', # No direct equivalent in SpaCy, keep as is
    'GPE': 'GPE',
    'ORG': 'ORG',
    'VALUE': 'CARDINAL' # Assuming 'VALUE' corresponds to numbers in your dataset
}

# Apply the mapping to the 'UniqueEntities' column
df1['StandardizedUniqueEntities'] = df1['UniqueEntities'].apply(lambda entities: [f"{entity.split(':')[0]}:{type_mapping.get(entity.split(':')[1], 'OTHER')}" for entity in entities])
df1.StandardizedUniqueEntities.apply(lambda x: [xi.split(':')[1] for xi in x]).explode().value_counts()

PERSON      2537
STRING      2105
GPE          137
ORG           92
CARDINAL      81
Name: StandardizedUniqueEntities, dtype: int64

In [147]:
df1 = df1.drop(['Precision', 'Recall', 'F1'], axis=1)

In [148]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(row):
    y_true = set(row['StandardizedUniqueEntities'])
    y_pred = set(row['PredictedEntities'])
    
    # We're treating this as a binary classification problem. Entity is either correct (1) or not (0).
    y_true_bin = [1 if entity in y_true else 0 for entity in y_true.union(y_pred)]
    y_pred_bin = [1 if entity in y_pred else 0 for entity in y_true.union(y_pred)]
    
    precision, recall, f1, _ = precision_recall_fscore_support(y_true_bin, y_pred_bin, average='binary')
    
    return pd.Series({'Precision': precision, 'Recall': recall, 'F1': f1})

# Apply to dataframe
metrics_df = df1.apply(compute_metrics, axis=1)

# Join with original dataframe
df1 = df1.join(metrics_df)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [149]:
def missing_entities(row):
    y_true = set(row['StandardizedUniqueEntities'])
    y_pred = set(row['PredictedEntities'])
    
    missing_from_ground_truth = list(y_pred.difference(y_true))
    missing_from_predictions = list(y_true.difference(y_pred))
    
    return pd.Series({'MissingFromGroundTruth': missing_from_ground_truth, 
                      'MissingFromPredictions': missing_from_predictions})

def correct_predictions(row):
    y_true = set(row['StandardizedUniqueEntities'])
    y_pred = set(row['PredictedEntities'])
    
    correct_predictions = list(y_true.intersection(y_pred))
    
    return pd.Series({'CorrectPredictions': correct_predictions})


# Apply the function
df1[['MissingFromGroundTruth', 'MissingFromPredictions']] = df1.apply(missing_entities, axis=1)
# Apply the function
df1['CorrectPredictions'] = df1.apply(correct_predictions, axis=1)


In [150]:
df1.head()

Unnamed: 0,Dialogue,Relations,Origin,UniqueEntities,PredictedEntities,StandardizedUniqueEntities,MissingFromGroundTruth,MissingFromPredictions,CorrectPredictions,Precision,Recall,F1
0,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[{'y': 'casting director', 'x': 'Ann', 'rid': ...",dev,"[agent:STRING, Joey Tribbiani:PER, Katelynn:PE...","[first:ORDINAL, 10:CARDINAL, Katelynn:PERSON, ...","[agent:STRING, Joey Tribbiani:PERSON, Katelynn...","[7:CARDINAL, first:ORDINAL, 10:CARDINAL, Phoeb...","[man:STRING, agent:STRING, casting director:ST...","[Estelle:PERSON, Katelynn:PERSON, Joey Tribbia...",0.3,0.6,0.4
1,"[Speaker 1, Speaker 2: Hi, Speaker 3: Hi! Hey ...","[{'y': 'Speaker 2', 'x': 'Speaker 1', 'rid': [...",dev,[Jack:PER],"[Jack:PERSON, 2:CARDINAL, 35 years:DATE, 3:CAR...",[Jack:PERSON],"[2:CARDINAL, 35 years:DATE, 3:CARDINAL, 1:CARD...",[],[Jack:PERSON],0.166667,1.0,0.285714
2,"[Speaker 1, Speaker 2: Hi!, Speaker 3: Hey!, S...","[{'y': 'man', 'x': 'Speaker 4', 'rid': [37], '...",dev,"[Mrs. Geller:PER, Geller:PER, Mr. Geller:PER, ...","[three:CARDINAL, first:ORDINAL, Geller:PERSON,...","[Mrs. Geller:PERSON, Geller:PERSON, Mr. Geller...","[three:CARDINAL, first:ORDINAL, Ross:PERSON, 5...","[man:STRING, dad:STRING, one:CARDINAL, Mrs. Ge...","[Emma:PERSON, Geller:PERSON]",0.2,0.285714,0.235294
3,[Speaker 1: Wow! It looks like we got a lot of...,"[{'y': 'baby', 'x': 'Speaker 2', 'rid': [37], ...",dev,"[baby:STRING, roomie:STRING]","[Woo - hoo:PERSON, Ross:PERSON, 2:CARDINAL, ei...","[baby:STRING, roomie:STRING]","[Woo - hoo:PERSON, Ross:PERSON, 2:CARDINAL, ei...","[baby:STRING, roomie:STRING]",[],0.0,0.0,0.0
4,"[Speaker 1: Now, Mom, everything's going fine,...","[{'y': '26', 'x': 'Speaker 1', 'rid': [25], 'r...",dev,"[Ross:PER, 26:VALUE]","[1:CARDINAL, only 26:CARDINAL, Ross:PERSON]","[Ross:PERSON, 26:CARDINAL]","[1:CARDINAL, only 26:CARDINAL]",[26:CARDINAL],[Ross:PERSON],0.333333,0.5,0.4


In [183]:
i = 2
row = df1.iloc[i]
print(f"# SAMPLE INSPECTION - #{i}\n")
print(row[['Precision', 'Recall', 'F1']])


# Validation of the metrics by re-computation
TP = len(row.CorrectPredictions)
FP = len(row.MissingFromGroundTruth)
FN = len(row.MissingFromPredictions)


print(f"\n## GT: MissingFromPredictions (FN={FN})")
display(row.MissingFromPredictions)

print(f"\n## PREDICTIONS: MissingFromGroundTruth (FP={FP})")
display(row.MissingFromGroundTruth)

print(f"\n## PREDICTIONS: CorrectPredictions (TP={TP})")
display(row.CorrectPredictions)


Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1 = 2 * Precision * Recall / (Precision + Recall)

print("\n## VALIDATED METRICS")
print(f"""
- Precision: {Precision:.1%} \tTP / (TP + FP) = {TP} / ({TP} + {FP})
- Recall:    {Recall:.1%} \tTP / (TP + FN) = {TP} / ({TP} + {FN})
- F1 Score:  {F1:.1%} \t2 * Pre * Rec / (Prec + Rec) = 2 * {Precision:.1%} * {Recall:.1%} / ({Precision:.1%} + {Recall:.1%})""")


# SAMPLE INSPECTION - #2

Precision         0.2
Recall       0.285714
F1           0.235294
Name: 2, dtype: object

## GT: MissingFromPredictions (FN=5)


['man:STRING',
 'dad:STRING',
 'one:CARDINAL',
 'Mrs. Geller:PERSON',
 'Mr. Geller:PERSON']


## PREDICTIONS: MissingFromGroundTruth (FP=8)


['three:CARDINAL',
 'first:ORDINAL',
 'Ross:PERSON',
 '5:CARDINAL',
 '2:CARDINAL',
 '3:CARDINAL',
 '1:CARDINAL',
 '4:CARDINAL']


## PREDICTIONS: CorrectPredictions (TP=2)


['Emma:PERSON', 'Geller:PERSON']


## VALIDATED METRICS

- Precision: 20.0% 	TP / (TP + FP) = 2 / (2 + 8)
- Recall:    28.6% 	TP / (TP + FN) = 2 / (2 + 5)
- F1 Score:  23.5% 	2 * Pre * Rec / (Prec + Rec) = 2 * 20.0% * 28.6% / (20.0% + 28.6%)


In [188]:
df1.MissingFromPredictions.explode().value_counts().head(50)

honey:STRING                159
man:STRING                  149
Pheebs:PERSON               145
baby:STRING                 117
my friend:STRING             71
girl:STRING                  67
sweetie:STRING               63
Mon:PERSON                   58
Honey:STRING                 58
actor:STRING                 57
dude:STRING                  48
Dude:STRING                  36
boy:STRING                   29
dad:STRING                   26
Man:STRING                   26
sweetheart:STRING            24
chef:STRING                  23
doctor:STRING                23
Days of Our Lives:STRING     21
buddy:STRING                 20
restaurant:STRING            17
sir:STRING                   17
professor:STRING             15
waitress:STRING              15
Ralph Lauren:ORG             14
teacher:STRING               14
scientist:STRING             13
paleontologist:STRING        13
bitch:STRING                 12
dear:STRING                  11
Phoebs:PERSON                11
pal:STRI

In [189]:
df1.MissingFromGroundTruth.explode().value_counts().head(50)

1:CARDINAL              1788
2:CARDINAL              1781
3:CARDINAL              1154
4:CARDINAL               629
Ross:PERSON              338
5:CARDINAL               305
Joey:PERSON              272
Chandler:PERSON          245
Monica:PERSON            240
Phoebe:PERSON            216
Rachel:PERSON            204
first:ORDINAL            202
one:CARDINAL             187
two:CARDINAL             165
6:CARDINAL               137
today:DATE               112
tonight:TIME              99
a minute:TIME             75
tomorrow:DATE             69
7:CARDINAL                61
three:CARDINAL            50
second:ORDINAL            48
last night:TIME           34
Emma:PERSON               28
One:CARDINAL              27
Thanksgiving:DATE         26
Ben:PERSON                23
half:CARDINAL             23
yesterday:DATE            22
Mike:PERSON               22
Geller:PERSON             21
Ralph Lauren:PERSON       19
First:ORDINAL             19
Christmas:DATE            19
Umm:PERSON    

In [192]:
df1.PredictedEntities

0       [first:ORDINAL, 10:CARDINAL, Katelynn:PERSON, ...
1       [Jack:PERSON, 2:CARDINAL, 35 years:DATE, 3:CAR...
2       [three:CARDINAL, first:ORDINAL, Geller:PERSON,...
3       [Woo - hoo:PERSON, Ross:PERSON, 2:CARDINAL, ei...
4             [1:CARDINAL, only 26:CARDINAL, Ross:PERSON]
                              ...                        
1783    [fourth:ORDINAL, Stephen Hurs:PERSON, Susie Mo...
1784    [Douglas:PERSON, Tuesday:DATE, the holiday wee...
1785    [1:CARDINAL, Rach:PERSON, 2:CARDINAL, shh shhh...
1786            [1:CARDINAL, Butt Guy:PERSON, 2:CARDINAL]
1787    [Paolo:PERSON, Bella Phoebe !:PERSON, 7:CARDIN...
Name: PredictedEntities, Length: 1788, dtype: object