In [1]:
import json
import pandas as pd
pd.set_option('display.max_colwidth', 150)

In [2]:
df_opener_en_test = pd.read_json("data/opener_en/test.json")
df_en_BERT_review = pd.read_json("data/preds/en_BERT_review_pred.json")
df_en_bert_large = pd.read_json("data/preds/en_bert-large-uncased_pred.json")



df_opener_es_test = pd.read_json("data/opener_es/test.json")
df_es_roberta_bne = pd.read_json("data/preds/es_roberta-large-bne_pred.json")
df_es_xlm_roberta = pd.read_json("data/preds/es_xlm-roberta-large_pred.json")

In [3]:
def normalize_jsons(filepath):

    with open(filepath, 'r') as json_file:
        json_data = json.load(json_file)
        df = pd.json_normalize(
            json_data,
            record_path=["opinions"],
            sep="_",
            meta=[
                ['sent_id'],
                ['text']
            ],
            errors='ignore'
        )

    df[['Source_string','Source_index']] = pd.DataFrame(df['Source'].tolist(), index = df.index)
    df[['Target_string','Target_index']] = pd.DataFrame(df['Target'].tolist(), index = df.index)
    df[['Polar_expression_string','Polar_expression_index']] = pd.DataFrame(df['Polar_expression'].tolist(), index = df.index)
    df['Implicit_holder'] = df['Source_string'].apply(lambda x: 1 if len(x) == 0 else 0)
    df['Implicit_target'] = df['Target_string'].apply(lambda x: 1 if len(x) == 0 else 0)
    df['Implicit_holder_target'] = (df['Implicit_holder'] == 1) & (df['Implicit_target'] == 1)
    df['Implicit_expression'] = df['Polar_expression_string'].apply(lambda x: 1 if len(x) == 0 else 0)
    df['Implicit_target_exclusive'] = (df['Implicit_target']==1) & (df['Implicit_holder'] == 0) & (df['Implicit_expression']==0 ) 
    df['Implicit_holder_exclusive'] = (df['Implicit_target']==0) & (df['Implicit_holder'] == 1) & (df['Implicit_expression']==0 )
    
    df['H-A-O'] = df.apply(lambda row: 1 if (row['Implicit_target'] == 0 and row['Implicit_holder'] ==0) and row['Implicit_expression'] ==0 else 0, axis=1)
    
#    df['H-A-O'] =  (df['Implicit_target']==0) & (df['Implicit_holder'] == 0) & (df['Implicit_expression']==0 )
    df['H-IA-IO'] =(df['Implicit_target']==1) & (df['Implicit_holder'] == 0) & (df['Implicit_expression']==1 )
    df['H-A-IO'] = (df['Implicit_target']==0) & (df['Implicit_holder'] == 0) & (df['Implicit_expression']==1 )
    df['H-IA-O']=  (df['Implicit_target']==1) & (df['Implicit_holder'] == 0) & (df['Implicit_expression']==0 )
    df['IH-A-O']=  (df['Implicit_target']==0) & (df['Implicit_holder'] == 1) & (df['Implicit_expression']==0 )
    df['IH-A-IO']= (df['Implicit_target']==0) & (df['Implicit_holder'] == 1) & (df['Implicit_expression']==1 )
    df['IH-IA-O']= (df['Implicit_target']==1) & (df['Implicit_holder'] == 1) & (df['Implicit_expression']==0 )
    df['IH-IA-IO']= (df['Implicit_target']==1) & (df['Implicit_holder'] == 1) & (df['Implicit_expression']==1 )


    df = df.reindex(
        columns= [
        'text',
        'sent_id',
        'Source',
        'Source_string',
        'Source_index',
        'Target',
        'Target_string',
        'Target_index',
        'Polar_expression',
        'Polar_expression_string',
        'Polar_expression_index',
        'Polarity',
        'Intensity',
        'Implicit_holder',
        'Implicit_target',
        'Implicit_holder_target',
        'Implicit_expression',
        'Implicit_target_exclusive',
        'Implicit_holder_exclusive',
        'H-A-O', 'H-IA-IO','H-A-IO','H-IA-O','IH-A-O', 'IH-A-IO', 'IH-IA-O', 'IH-IA-IO'
        ]
    )

    return df

In [4]:
df_opener_en_test = normalize_jsons("data/opener_en/test.json")
df_en_BERT_review = normalize_jsons("data/preds/en_BERT_review_pred.json")
df_en_bert_large = normalize_jsons("data/preds/en_bert-large-uncased_pred.json")

df_opener_es_test = normalize_jsons("data/opener_es/test.json")
df_es_roberta_bne = normalize_jsons("data/preds/es_roberta-large-bne_pred.json")
df_es_xlm_roberta = normalize_jsons("data/preds/es_xlm-roberta-large_pred.json")


In [5]:
df_opener_en_test.columns

# df_en.dtypes
# df_en.columns
# df_en.shape

Index(['text', 'sent_id', 'Source', 'Source_string', 'Source_index', 'Target',
       'Target_string', 'Target_index', 'Polar_expression',
       'Polar_expression_string', 'Polar_expression_index', 'Polarity',
       'Intensity', 'Implicit_holder', 'Implicit_target',
       'Implicit_holder_target', 'Implicit_expression',
       'Implicit_target_exclusive', 'Implicit_holder_exclusive', 'H-A-O',
       'H-IA-IO', 'H-A-IO', 'H-IA-O', 'IH-A-O', 'IH-A-IO', 'IH-IA-O',
       'IH-IA-IO'],
      dtype='object')

In [6]:
df_opener_en_test.head(5)

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_target_exclusive,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO
0,They were there just to keep us happy .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-6,"[[us], [29:31]]",[us],[29:31],"[[They], [0:4]]",[They],[0:4],"[[to keep, happy], [21:28, 32:37]]","[to keep, happy]",...,False,False,1,False,False,False,False,False,False,False
1,A wonderful place to go and we are planning to return as soon as we can .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-7,"[[], []]",[],[],"[[place], [12:17]]",[place],[12:17],"[[wonderful], [2:11]]",[wonderful],...,False,True,0,False,False,False,True,False,False,False
2,A wonderful place to go and we are planning to return as soon as we can .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-7,"[[we], [28:30]]",[we],[28:30],"[[], []]",[],[],"[[planning to return], [35:53]]",[planning to return],...,True,False,0,False,False,True,False,False,False,False
3,Thank you for such a wonderful experience .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-8,"[[], []]",[],[],"[[experience], [31:41]]",[experience],[31:41],"[[wonderful], [21:30]]",[wonderful],...,False,True,0,False,False,False,True,False,False,False
4,Hotel was under construction .,opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-2,"[[], []]",[],[],"[[Hotel], [0:5]]",[Hotel],[0:5],"[[under construction], [10:28]]",[under construction],...,False,True,0,False,False,False,True,False,False,False


In [7]:
df_en_BERT_review.head(5)

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_target_exclusive,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO
0,They were there just to keep us happy .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-6,"[[They], [0:4]]",[They],[0:4],"[[They], [0:4]]",[They],[0:4],"[[keep], [24:28]]",[keep],...,False,False,1,False,False,False,False,False,False,False
1,They were there just to keep us happy .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-6,"[[They], [0:4]]",[They],[0:4],"[[They], [0:4]]",[They],[0:4],"[[happy], [32:37]]",[happy],...,False,False,1,False,False,False,False,False,False,False
2,A wonderful place to go and we are planning to return as soon as we can .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-7,"[[we], [28:30]]",[we],[28:30],"[[place], [12:17]]",[place],[12:17],"[[wonderful], [2:11]]",[wonderful],...,False,False,1,False,False,False,False,False,False,False
3,Thank you for such a wonderful experience .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-8,"[[], []]",[],[],"[[experience], [31:41]]",[experience],[31:41],"[[such a wonderful], [14:30]]",[such a wonderful],...,False,True,0,False,False,False,True,False,False,False
4,Hotel was under construction .,opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-2,"[[], []]",[],[],"[[Hotel], [0:5]]",[Hotel],[0:5],"[[under construction], [10:28]]",[under construction],...,False,True,0,False,False,False,True,False,False,False


# Load Highest Scoring F1 Score Predictions and Labels

In [8]:
df_opener_en_test = normalize_jsons("data/opener_en/test.json")
df_opener_es_test = normalize_jsons("data/opener_es/test.json")

df_es_xlm_roberta = normalize_jsons("es_xlm-roberta-large_pred.json")
df_en_roberta_largev1 = normalize_jsons("en_all-roberta-large-v1_pred.json")



# Set of Aspects

In [9]:
# Target

en_target_tokens_test = set(sum(df_opener_en_test['Target_string'].tolist(), []))
en_target_tokens_pred = set(sum(df_en_roberta_largev1['Target_string'].tolist(), []))

es_target_tokens_test = set(sum(df_opener_es_test['Target_string'].tolist(), []))
es_target_tokens_pred = set(sum(df_es_xlm_roberta['Target_string'].tolist(), []))

# Holder

en_holder_tokens_test = set(sum(df_opener_en_test['Source_string'].tolist(), []))
en_holder_tokens_pred = set(sum(df_en_roberta_largev1['Source_string'].tolist(), []))

es_holder_tokens_test = set(sum(df_opener_es_test['Source_string'].tolist(), []))
es_holder_tokens_pred = set(sum(df_es_xlm_roberta['Source_string'].tolist(), []))

# Expression

en_expression_tokens_test = set(sum(df_opener_en_test['Polar_expression_string'].tolist(), []))
en_expression_tokens_pred = set(sum(df_en_roberta_largev1['Polar_expression_string'].tolist(), []))

es_expression_tokens_test = set(sum(df_opener_es_test['Polar_expression_string'].tolist(), []))
es_expression_tokens_pred = set(sum(df_es_xlm_roberta['Polar_expression_string'].tolist(), []))


# Polarity
en_pos_test = sum(df_opener_en_test['Polarity'] == "Positive") 
en_pos_pred = sum(df_en_roberta_largev1['Polarity'] == "Positive") 
en_neg_test = sum(df_opener_en_test['Polarity'] == "Negative") 
en_neg_pred = sum(df_en_roberta_largev1['Polarity'] == "Negative") 

es_pos_test = sum(df_opener_es_test['Polarity'] == "Positive") 
es_pos_pred = sum(df_es_xlm_roberta['Polarity'] == "Positive") 
es_neg_test = sum(df_opener_es_test['Polarity'] == "Negative") 
es_neg_pred = sum(df_es_xlm_roberta['Polarity'] == "Negative") 


## Polarity Counts

In [10]:
print(en_pos_test, en_pos_pred, en_neg_test, en_neg_pred)

596 589 269 257


In [11]:
print(es_pos_test, es_pos_pred, es_neg_test, es_neg_pred)

768 603 189 142


## Set Target

In [12]:
en_target_diff_test = en_target_tokens_test.difference(en_target_tokens_pred)
en_target_diff_pred = en_target_tokens_test.difference(en_target_tokens_pred)

print(len(en_target_diff_test),len(en_target_tokens_test), len(en_target_tokens_pred))

136 396 380


In [13]:
en_target_diff_test

{'24 hr bar',
 'A la carte restaurant steak House',
 'A stay',
 'All inclusive',
 'Busdriver Peter Langhout',
 'Coffee maker',
 'French cuisine',
 'He',
 'Hotel and Villa Dievole',
 'Kettle',
 'Melia Grand Hermitage',
 'Metro',
 'Numerous shopping',
 'One bed',
 'Our room',
 'Pool',
 'RS375 off',
 'The TV',
 'The all inclusive',
 'The house · keeping',
 'The technology provided',
 'This',
 'This B & amp ; B',
 'Tuscan location',
 'Wellness offer and quality',
 'Wi · Fi in the room',
 'a choice',
 'a hotel',
 'a hotel with such staff',
 'a rubber duck',
 'a waiter on hand',
 'access to the city and Messe',
 'all Steigenburger hotels',
 'all rounder',
 'anything to complain about',
 'back massage',
 'bars',
 'bath',
 'bathroom door',
 'caffees',
 'chairs / tables',
 'check in',
 'chill out',
 'choice of drinks',
 'cocktails',
 'cocktails.',
 'complex',
 'curtains',
 'customer',
 'distance from firenze',
 'dvd player',
 'electrical outlet',
 'enough seats',
 'ever.staff',
 'everything els

In [40]:
df_opener_en_test

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO,Token Count
0,They were there just to keep us happy .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-6,"[[us], [29:31]]",[us],[29:31],"[[They], [0:4]]",[They],[0:4],"[[to keep, happy], [21:28, 32:37]]","[to keep, happy]",...,False,1,False,False,False,False,False,False,False,11
1,A wonderful place to go and we are planning to return as soon as we can .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-7,"[[], []]",[],[],"[[place], [12:17]]",[place],[12:17],"[[wonderful], [2:11]]",[wonderful],...,True,0,False,False,False,True,False,False,False,19
2,A wonderful place to go and we are planning to return as soon as we can .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-7,"[[we], [28:30]]",[we],[28:30],"[[], []]",[],[],"[[planning to return], [35:53]]",[planning to return],...,False,0,False,False,True,False,False,False,False,19
3,Thank you for such a wonderful experience .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-8,"[[], []]",[],[],"[[experience], [31:41]]",[experience],[31:41],"[[wonderful], [21:30]]",[wonderful],...,True,0,False,False,False,True,False,False,False,10
4,Hotel was under construction .,opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-2,"[[], []]",[],[],"[[Hotel], [0:5]]",[Hotel],[0:5],"[[under construction], [10:28]]",[under construction],...,True,0,False,False,False,True,False,False,False,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
860,"Front desk wasn 't the best , quite slow for passport registration and clerk insisted that we pay up for a better room .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-8,"[[], []]",[],[],"[[Front desk], [0:10]]",[Front desk],[0:10],"[[quite slow], [30:40]]",[quite slow],...,True,0,False,False,False,True,False,False,False,27
861,"Front desk wasn 't the best , quite slow for passport registration and clerk insisted that we pay up for a better room .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-8,"[[], []]",[],[],"[[Front desk], [0:10]]",[Front desk],[0:10],"[[wasn 't the best], [11:27]]",[wasn 't the best],...,True,0,False,False,False,True,False,False,False,27
862,"In · room breakfast was tasty and quite large , awesome .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-9,"[[], []]",[],[],"[[breakfast], [10:19]]",[breakfast],[10:19],"[[quite large], [34:45]]",[quite large],...,True,0,False,False,False,True,False,False,False,14
863,"In · room breakfast was tasty and quite large , awesome .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-9,"[[], []]",[],[],"[[breakfast], [10:19]]",[breakfast],[10:19],"[[tasty], [24:29]]",[tasty],...,True,0,False,False,False,True,False,False,False,14


In [39]:
df_opener_en_test.loc[0]['Target_string']

['They']

In [52]:
df_opener_en_test[df_opener_en_test['sent_id'] == "opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6"]

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO,Token Count
565,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[a waiter on hand], [87:103]]",[a waiter on hand],[87:103],"[[there was always], [70:86]]",[there was always],...,True,0,False,False,False,True,False,False,False,28
566,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[24 hr bar], [16:25]]",[24 hr bar],[16:25],"[[well run], [30:38]]",[well run],...,True,0,False,False,False,True,False,False,False,28


In [51]:
df_en_roberta_largev1[df_en_roberta_largev1['sent_id'] == "opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6"]

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_target_exclusive,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO
523,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[The first floor], [0:15]]",[The first floor],[0:15],"[[24 hr], [16:21]]",[24 hr],...,False,True,0,False,False,False,True,False,False,False
524,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[The first floor], [0:15]]",[The first floor],[0:15],"[[well run], [30:38]]",[well run],...,False,True,0,False,False,False,True,False,False,False
525,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[bar], [22:25]]",[bar],[22:25],"[[well run], [30:38]]",[well run],...,False,True,0,False,False,False,True,False,False,False
526,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[a waiter], [87:95]]",[a waiter],[87:95],"[[there was always], [70:86]]",[there was always],...,False,True,0,False,False,False,True,False,False,False
527,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[a waiter], [87:95]]",[a waiter],[87:95],"[[on hand], [96:103]]",[on hand],...,False,True,0,False,False,False,True,False,False,False


In [None]:
df_opener_en_test[df_opener_en_test['sent_id'] == "opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6"]

In [44]:
df_en_roberta_largev1[df_en_roberta_largev1['text'].str.contains("24 hr bar")]

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_target_exclusive,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO
523,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[The first floor], [0:15]]",[The first floor],[0:15],"[[24 hr], [16:21]]",[24 hr],...,False,True,0,False,False,False,True,False,False,False
524,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[The first floor], [0:15]]",[The first floor],[0:15],"[[well run], [30:38]]",[well run],...,False,True,0,False,False,False,True,False,False,False
525,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[bar], [22:25]]",[bar],[22:25],"[[well run], [30:38]]",[well run],...,False,True,0,False,False,False,True,False,False,False
526,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[a waiter], [87:95]]",[a waiter],[87:95],"[[there was always], [70:86]]",[there was always],...,False,True,0,False,False,False,True,False,False,False
527,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[a waiter], [87:95]]",[a waiter],[87:95],"[[on hand], [96:103]]",[on hand],...,False,True,0,False,False,False,True,False,False,False


In [14]:
es_target_diff_test = es_target_tokens_test.difference(es_target_tokens_pred)
es_target_diff_pred = es_target_tokens_test.difference(es_target_tokens_pred)

print(len(es_target_diff_test),len(es_target_tokens_test), len(es_target_tokens_pred))

215 472 370


In [15]:
es_target_diff_test

{'7 dias',
 'APARTAMENTO',
 'Algo',
 'Alojamiento',
 'Camas de 1,05',
 'Camp Nou',
 'El Gran Bahía Principe Ambar',
 'El complejo',
 'El tamaño de la habitación y el baño',
 'Fin de semana',
 'La comida y cena',
 'Los de recepción',
 'Museos',
 'Neptuno',
 'Puerta del Sol',
 'Sol',
 'Tiendas',
 'WIFI',
 'a lounge',
 'a todos lados',
 'acogimiento por parte de todos los empleados',
 'actividades',
 'acudir a un musical',
 'aire acondicionado',
 'al estadio de futbol del barca',
 'algo',
 'amplitud de menu',
 'autobus',
 'bandeja de infusiones y café',
 'bar',
 'barandillas',
 'baños',
 'bollería',
 'brares',
 'bus urbanos.se',
 'cadena Playa Senator',
 'cafes',
 'cafetera eléctrica',
 'calidad',
 'camas queen',
 'centro de Peñiscola',
 'cereales',
 'cesta de frutas',
 'cines',
 'con Moet Chandon',
 'congreso de los diputados',
 'conocer la ciudad',
 'cualquier punto monumental de la ciudad',
 'cuarto de baño',
 'cuartos',
 'cuidado',
 'cultura',
 'de Sevilla capital',
 'de cocoteros',
 

## Holder

In [16]:
en_holder_diff_test = en_holder_tokens_test.difference(en_holder_tokens_pred)
en_holder_diff_pred = en_holder_tokens_pred.difference(en_holder_tokens_test)

print(len(en_holder_tokens_test), len(en_holder_tokens_pred))

print(len(en_holder_diff_test),len(en_holder_diff_pred))

10 12
5 7


In [17]:
en_holder_diff_test

# find you, me holder sentences

{'all eight couples', 'eyes.I', 'me', 'my experience', 'you'}

In [18]:
en_holder_diff_pred

{'building',
 'everyone',
 'he',
 'many different activities',
 'reason',
 'the linen',
 'the room'}

In [19]:
es_holder_diff_test = es_holder_tokens_test.difference(es_holder_tokens_pred)
es_holder_diff_pred = es_holder_tokens_pred.difference(es_holder_tokens_test)

print(len(es_holder_tokens_test), len(es_holder_tokens_pred))

print(len(es_holder_diff_test),len(es_holder_diff_pred))

8 3
5 0


In [20]:
print(es_holder_diff_test , es_holder_diff_pred) # check yo, nosotros

{'me', 'hemos', 'Yo', 'nosotros', 'yo'} set()


In [21]:
## Set Expression

In [22]:
en_express_diff_test = en_expression_tokens_test.difference(en_expression_tokens_pred)
en_express_diff_pred = en_expression_tokens_pred.difference(en_expression_tokens_test)

print(len(en_expression_tokens_test), len(en_expression_tokens_pred))

print(len(en_express_diff_test),len(en_express_diff_pred))

594 547
220 173


In [23]:
en_express_diff_test

{"'s not worth",
 '12 times',
 'BEST',
 'Central',
 'Clean',
 'Dirty.our',
 'Friendly',
 'Greek dishes with a general European twist',
 'Many people',
 'NOT to eat in the buffet restaurant',
 'Not',
 'One of the best holidays',
 'Only',
 'Relax and enjoy',
 'Robbed in elevator of hotel !',
 'Similar to',
 'There are not',
 'There is no',
 'To be avoided at all costs',
 'To expensive',
 'Took',
 'Totally overpriced',
 'Within 7 minutes',
 'a good option in this category',
 'a happy',
 'a real 4star hotel',
 'a top hotel !',
 'abit repetitive at times',
 'accompanied by a live singer',
 'advise everyone',
 "all in length , which isn 't the most convenient",
 'all tastes',
 'already booked',
 'already went 14 times',
 'always will remember with joy and happiness',
 'annoying to wait for',
 'are satisfied',
 'at affordable prices',
 'attrackts back packing , party focused youngsters',
 'bad quality',
 'be definitely back soon !',
 'bit dated',
 'black mould on tiling',
 'boasts outstanding

In [24]:
es_express_diff_test = es_expression_tokens_test.difference(es_expression_tokens_pred)
es_express_diff_pred = es_expression_tokens_pred.difference(es_expression_tokens_test)

print(len(es_expression_tokens_test), len(es_expression_tokens_pred))

print(len(es_express_diff_test),len(es_express_diff_pred))

688 530
331 173


In [25]:
es_express_diff_test

{'%',
 '0,50 €',
 '10 · 15 min.',
 '2,5 €',
 'Bien',
 'Comodidad',
 'Descanso',
 'ESTUPENDO',
 'El gran problema',
 'En este sentido deja mucho que desear',
 'Enhorabuena',
 'Ideal para pasar un fin de semana de turismo por la capital',
 'Imposible',
 'Inútil',
 'La cercanía',
 'Lamentable',
 'Llegando a la madrugada no hubo',
 'Lo unico q vale',
 'Muy amable',
 'Muy amables',
 'Muy bien',
 'Muy bien ubicado',
 'Muy buenas',
 'NI OS ACERQUÉIS',
 'No',
 'No es un',
 'Nos encantaría volver',
 'Problemas con el estado de la cama',
 'Remoñones',
 'Repetiremos',
 'Repetiremos sin duda',
 'Tranquilo',
 'Tuvieron',
 'Una verdader tortura',
 'a 15 minutos andando',
 'a 5 minutos',
 'a cualquier hora',
 'a la altura de un resort !',
 'a la entrada',
 'a pesar de tener ya algunos años',
 'a una parada de metro',
 'abusando',
 'acogedor',
 'aconsejan',
 'aconsejaria',
 'acorde a la categoria de hotel',
 'acorde con las características que ofrecen',
 'al igual que',
 'al lado',
 'alejado',
 'algo 

# Transformer, token length

In [26]:
fpath = !pwd

from transformers import AutoTokenizer

plm_model_name = 'roberta-large'
plm_model_path= f'{fpath[0]}/pretrained_models/roberta-large'
tokenizer = AutoTokenizer.from_pretrained(plm_model_path)

length = len(tokenizer.encode('text'))
length

3

In [54]:
# Create a new column with the number of words from the 'text' column
df_opener_en_test['Token Count'] = df_opener_en_test['text'].apply(lambda x: len(tokenizer.encode(x)))
df_opener_es_test['Token Count'] = df_opener_es_test['text'].apply(lambda x: len(tokenizer.encode(x)))


In [55]:
df_en_roberta_largev1[df_en_roberta_largev1['sent_id'] == "opener_en/kaf/hotel/english00134_94658e39af0d72b00ead2ac314bf7074-2"]
df_opener_en_test[df_opener_en_test['sent_id'] == "opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6"] 

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO,Token Count
565,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[a waiter on hand], [87:103]]",[a waiter on hand],[87:103],"[[there was always], [70:86]]",[there was always],...,True,0,False,False,False,True,False,False,False,28
566,The first floor 24 hr bar was well run and no matter what time of day there was always a waiter on hand to serve .,opener_en/kaf/hotel/english00214_f731285c2d232cf15e5cdae66ab186b1-6,"[[], []]",[],[],"[[24 hr bar], [16:25]]",[24 hr bar],[16:25],"[[well run], [30:38]]",[well run],...,True,0,False,False,False,True,False,False,False,28


In [49]:
# df_opener_en_test.sort_values(by="Token Count", ascending=False)

df_opener_en_test[df_opener_en_test['sent_id'] =='opener_en/kaf/hotel/english00134_94658e39af0d72b00ead2ac314bf7074-2']['text']

# df_opener_en_test[df_opener_en_test['sent_id'] =='opener_en/kaf/hotel/english00134_94658e39af0d72b00ead2ac314bf7074-2']

56    really nice hotel , well laid out all rooms over look the pool. rooms are nice and large have tea coffee making facilties , mini fridge. loads of ...
57    really nice hotel , well laid out all rooms over look the pool. rooms are nice and large have tea coffee making facilties , mini fridge. loads of ...
58    really nice hotel , well laid out all rooms over look the pool. rooms are nice and large have tea coffee making facilties , mini fridge. loads of ...
59    really nice hotel , well laid out all rooms over look the pool. rooms are nice and large have tea coffee making facilties , mini fridge. loads of ...
60    really nice hotel , well laid out all rooms over look the pool. rooms are nice and large have tea coffee making facilties , mini fridge. loads of ...
61    really nice hotel , well laid out all rooms over look the pool. rooms are nice and large have tea coffee making facilties , mini fridge. loads of ...
62    really nice hotel , well laid out all rooms over look the 

In [29]:
df_opener_es_test.sort_values(by="Token Count", ascending=False)[:3]

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO,Token Count
15,"Es un hotel muy casero en todo lo que respecta al trato y a la comida , personal muy atento y simpatico , esta situado en primera linea de playa y...",opener_es/kaf/hotel/spanish00172_bf69ca6910c0aa687b0eab91d0353fd1-2,"[[], []]",[],[],"[[un lavado de cara], [535:552]]",[un lavado de cara],[535:552],"[[necesita], [526:534]]",[necesita],...,True,0,False,False,False,True,False,False,False,220
9,"Es un hotel muy casero en todo lo que respecta al trato y a la comida , personal muy atento y simpatico , esta situado en primera linea de playa y...",opener_es/kaf/hotel/spanish00172_bf69ca6910c0aa687b0eab91d0353fd1-2,"[[], []]",[],[],"[[trato], [50:55]]",[trato],[50:55],"[[muy casero], [12:22]]",[muy casero],...,True,0,False,False,False,True,False,False,False,220
18,"Es un hotel muy casero en todo lo que respecta al trato y a la comida , personal muy atento y simpatico , esta situado en primera linea de playa y...",opener_es/kaf/hotel/spanish00172_bf69ca6910c0aa687b0eab91d0353fd1-2,"[[], []]",[],[],"[[mobiliario], [647:657]]",[mobiliario],[647:657],"[[bastante antiguo], [658:674]]",[bastante antiguo],...,True,0,False,False,False,True,False,False,False,220


## Implicit Percentages

In [30]:
def json_eda(df):
# some basic counts and percentages
    
    sentances = df['sent_id'].nunique()
    holders = len(df[df['Source_string'].apply(len) > 0])
    targets = len(df[df['Target_string'].apply(len) > 0])
    polarity = len(df[df['Polarity'].apply(len) > 0])
    expression = len(df[df['Polar_expression_string'].apply(len) > 0])

    print(f'n sentances: {sentances} \nn holders: {holders} \nn targets: {targets} \nn polarity: {polarity} \nn expression: {expression}')

    perIH = df['Implicit_holder'].sum()/len(df)
    perIT = df['Implicit_target'].sum()/len(df)
    perIE = df['Implicit_expression'].sum()/len(df)
    perIHIT = df['Implicit_holder_target'].sum()/len(df)
    exclIT = df['Implicit_target_exclusive'].sum()/len(df)
    exclIH = df['Implicit_holder_exclusive'].sum()/len(df)

    print(f'\nImplicit percentages in the dataset: \npercent implicit holder: {perIH},\npercent implicit target: {perIT},\npercent implicit holder implicit target: {perIHIT}, \npercent implicit expression: {perIE}')

    print(f'\nImplicit target exclusive: {exclIT}')
    print(f'\nImplicit holder exclusive: {exclIH}')

    HAO=df['H-A-O'].sum()/len(df)
    HIAIO=df['H-IA-IO'].sum()/len(df)
    HAIO=df['H-A-IO'].sum()/len(df)
    HIAO=df['H-IA-O'].sum()/len(df)
    IHAO=df['IH-A-O'].sum()/len(df)
    IHAIO=df['IH-A-IO'].sum()/len(df)
    IHIAO=df['IH-IA-O'].sum()/len(df)

    print(f'H-A-O: {HAO}')
    print(f'H-IA-IO: {HIAIO}')
    print(f'H-A-IO: {HAIO}')
    print(f'H-IA-O: {HIAIO}')
    print(f'IH-A-O: {IHAO}')
    print(f'IH-A-IO: {IHAIO}')
    print(f'IH-IA-O: {IHIAO}')
        

    cross_polarity_intensity = pd.crosstab(df['Polarity'], df['Intensity'], margins = True, normalize='all') * 100
    display(cross_polarity_intensity)

    df_count_per_sent = pd.DataFrame(df.groupby(['sent_id'])['Polar_expression'].count())

    count_mult_opinion =  (df_count_per_sent['Polar_expression'] > 1).sum()
    perc_mult_opinion = (df_count_per_sent['Polar_expression'] > 1).sum() / df.shape[0]

    print(f'The total count of sentiments with multiple opinions is: {count_mult_opinion}')
    print(f'The percent of sentiments with multiple opinions is: {perc_mult_opinion}')
    
    

    

In [31]:
json_eda(df_en_train)

NameError: name 'df_en_train' is not defined

In [None]:
# df_en_train = normalize_jsons("data/opener_en/train.json")
# df_es_train = normalize_jsons("data/opener_es/train.json")
# df_en_dev = normalize_jsons("data/opener_en/dev.json")
# df_es_dev = normalize_jsons("data/opener_es/dev.json")
# df_en_test = normalize_jsons("data/opener_en/test.json")
# df_es_test = normalize_jsons("data/opener_es/test.json")


In [None]:
json_eda(df_es_train)

n sentances: 1252 
n holders: 176 
n targets: 2748 
n polarity: 3042 
n expression: 3042

Implicit percentages in the dataset: 
percent implicit holder: 0.9421433267587114,
percent implicit target: 0.09664694280078895,
percent implicit holder implicit target: 0.08481262327416174, 
percent implicit expression: 0.0

Implicit target exclusive: 0.011834319526627219

Implicit holder exclusive: 0.8573307034845496
H-A-O: 0.04602235371466141
H-IA-IO: 0.0
H-A-IO: 0.0
H-IA-O: 0.0
IH-A-O: 0.8573307034845496
IH-A-IO: 0.0
IH-IA-O: 0.08481262327416174


Intensity,Standard,Strong,All
Polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,16.074951,2.662722,18.737673
Positive,51.314924,29.947403,81.262327
All,67.389875,32.610125,100.0


The total count of sentiments with multiple opinions is: 752
The percent of sentiments with multiple opinions is: 0.24720578566732412


In [None]:
json_eda(df_en_dev)


n sentances: 198 
n holders: 49 
n targets: 371 
n polarity: 400 
n expression: 400

Implicit percentages in the dataset: 
percent implicit holder: 0.8775,
percent implicit target: 0.0725,
percent implicit holder implicit target: 0.0525, 
percent implicit expression: 0.0

Implicit target exclusive: 0.02

Implicit holder exclusive: 0.825
H-A-O: 0.1025
H-IA-IO: 0.0
H-A-IO: 0.0
H-IA-O: 0.0
IH-A-O: 0.825
IH-A-IO: 0.0
IH-IA-O: 0.0525


Intensity,Standard,Strong,All
Polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,24.75,4.25,29.0
Positive,42.5,28.5,71.0
All,67.25,32.75,100.0


The total count of sentiments with multiple opinions is: 115
The percent of sentiments with multiple opinions is: 0.2875


In [None]:
json_eda(df_es_dev)

n sentances: 174 
n holders: 23 
n targets: 363 
n polarity: 387 
n expression: 387

Implicit percentages in the dataset: 
percent implicit holder: 0.9405684754521964,
percent implicit target: 0.06201550387596899,
percent implicit holder implicit target: 0.06201550387596899, 
percent implicit expression: 0.0

Implicit target exclusive: 0.0

Implicit holder exclusive: 0.8785529715762274
H-A-O: 0.059431524547803614
H-IA-IO: 0.0
H-A-IO: 0.0
H-IA-O: 0.0
IH-A-O: 0.8785529715762274
IH-A-IO: 0.0
IH-IA-O: 0.06201550387596899


Intensity,Standard,Strong,All
Polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,16.020672,2.067183,18.087855
Positive,52.713178,29.198966,81.912145
All,68.73385,31.26615,100.0


The total count of sentiments with multiple opinions is: 97
The percent of sentiments with multiple opinions is: 0.25064599483204136


# Fine Grain F1 scores

In [None]:
from nltk.tokenize.simple import SpaceTokenizer
st = SpaceTokenizer()

DATA_DIR = './data'
id2sentiment = {3: 'Negative', 4: 'Neutral', 5: 'Positive'}


class SemEvalEvaluator:

    def __init__(self, predicted_tuples, dataset_name, mode="manual"):

        self.mode = mode

        if mode == "valid":
            gold_path = DATA_DIR / dataset_name / 'test.json'
            self.sent_ids, self.golden_triplets, self.golden_quadtuples = self.load_gold_from_file(gold_path)
            #self.predicted_triplets, self.predicted_quadtuples = self.convert_to_submission_tuple_format(predicted_tuples)
            self.predicted_triplets, self.predicted_quadtuples = self.convert_to_submission_tuple_format_V2(predicted_tuples)
        
        elif mode == "manual":
            gold_path = dataset_name
            pred_path = predicted_tuples
            self.sent_ids, self.golden_quadtuples = self.load_gold_from_file(gold_path)
            self.sent_ids, self.predicted_quadtuples = self.load_preds_from_file(pred_path)
            #self.predicted_triplets, self.predicted_quadtuples = self.convert_to_submission_tuple_format(predicted_tuples)
            # self.predicted_triplets, self.predicted_quadtuples = self.convert_to_submission_tuple_format_V2(predicted_tuples)


    def convert_opinion_to_quadtuples(self, sentence):
        text = sentence["text"]
        opinions = sentence["opinions"]
        opinion_tuples = []
        token_offsets = list(st.span_tokenize(text))
        #
        if len(opinions) > 0:
            for opinion in opinions:
                holder_char_idxs = opinion["Source"][1]
                target_char_idxs = opinion["Target"][1]
                exp_char_idxs = opinion["Polar_expression"][1]
                polarity = opinion["Polarity"]
                #
                holder = self.convert_char_offsets_to_token_idxs(holder_char_idxs, token_offsets)
                target = self.convert_char_offsets_to_token_idxs(target_char_idxs, token_offsets)
                exp = self.convert_char_offsets_to_token_idxs(exp_char_idxs, token_offsets)
                opinion_tuples.append((holder, target, exp, polarity))
        return opinion_tuples

    def convert_char_offsets_to_token_idxs(self, char_offsets, token_offsets):
        """
        char_offsets: list of str
        token_offsets: list of tuples

        >>> text = "I think the new uni ( ) is a great idea"
        >>> char_offsets = ["8:19"]
        >>> token_offsets =
        [(0,1), (2,7), (8,11), (12,15), (16,19), (20,21), (22,23), (24,26), (27,28), (29,34), (35,39)]

        >>> convert_char_offsets_to_token_idxs(char_offsets, token_offsets)
        >>> (2,3,4)
        """
        token_idxs = []
        #
        for char_offset in char_offsets:
            bidx, eidx = char_offset.split(":")
            bidx, eidx = int(bidx), int(eidx)
            intoken = False
            for i, (b, e) in enumerate(token_offsets):
                if b == bidx:
                    intoken = True
                if intoken:
                    token_idxs.append(i)
                if e == eidx:
                    intoken = False
        return set(token_idxs)

    def load_gold_from_file(self, gold_path):

        with open(gold_path) as infile:
            gold = json.load(infile)

        sent_ids = [s["sent_id"] for s in gold] # It is orderly

        if self.mode=='valid' or self.mode=='test' or self.mode=='manual':
        
            golden_quadtuples = dict([(s["sent_id"], self.convert_opinion_to_quadtuples(s)) for s in gold])

            return sent_ids, golden_quadtuples

        elif self.mode=='predict':

            return sent_ids
    
    def load_preds_from_file(self, pred_path): # DL

        with open(pred_path) as infile:
            pred = json.load(infile)

        sent_ids = [s["sent_id"] for s in pred] # It is orderly

        if self.mode=='valid' or self.mode=='test' or self.mode=='manual':
        
            pred_quadtuples = dict([(s["sent_id"], self.convert_opinion_to_quadtuples(s)) for s in pred])

            return sent_ids, pred_quadtuples

        elif self.mode=='predict':

            return sent_ids


    def convert_to_submission_tuple_format_V2(self, tuples): #JR

        assert len(self.sent_ids) == len(tuples)

        all_converted_triplets = {}
        all_converted_quadtuples = {}

        for sent_id, i in zip(self.sent_ids, tuples):

            converted_triplets = []
            converted_quadtuples = []

            for tuple in i:
                holder_start_idx, holder_end_idx = tuple[0], tuple[1]
                aspect_start_idx, aspect_end_idx = tuple[2], tuple[3]
                opinion_start_idx, opinion_end_idx  = tuple[4], tuple[5]
                
                holder_range_set = set(range(holder_start_idx-1, holder_end_idx)) if set(range(holder_start_idx-1, holder_end_idx)) != set([-1]) else set()
                aspect_range_set = set(range(aspect_start_idx-1, aspect_end_idx)) if set(range(aspect_start_idx-1, aspect_end_idx)) != set([-1]) else set()
                opinion_range_set = set(range(opinion_start_idx-1, opinion_end_idx)) if set(range(opinion_start_idx-1, opinion_end_idx)) != set([-1]) else set()
            
                converted_triplets.append((aspect_range_set, opinion_range_set, id2sentiment[tuple[6]]))
                converted_quadtuples.append((holder_range_set, aspect_range_set, opinion_range_set, id2sentiment[tuple[6]]))

            all_converted_triplets[sent_id] = converted_triplets
            all_converted_quadtuples[sent_id] = converted_quadtuples

        return all_converted_triplets, all_converted_quadtuples


    def sent_quadtuples_in_list(self, sent_tuple1, list_of_sent_tuples, keep_polarity=True):
        holder1, target1, exp1, pol1 = sent_tuple1
        if len(holder1) == 0:
            holder1 = frozenset(["_"])
        if len(target1) == 0:
            target1 = frozenset(["_"])
        for holder2, target2, exp2, pol2 in list_of_sent_tuples:
            if len(holder2) == 0:
                holder2 = frozenset(["_"])
            if len(target2) == 0:
                target2 = frozenset(["_"])
            if (
                len(holder1.intersection(holder2)) > 0
                and len(target1.intersection(target2)) > 0
                and len(exp1.intersection(exp2)) > 0
            ):
                if keep_polarity:
                    if pol1 == pol2:
                        return True
                else:
                    return True
        return False

    def quadtuple_weighted_score(self, sent_tuple1, list_of_sent_tuples):
        best_overlap = 0
        holder1, target1, exp1, pol1 = sent_tuple1
        if len(holder1) == 0:
            holder1 = frozenset(["_"])
        if len(target1) == 0:
            target1 = frozenset(["_"])
        for holder2, target2, exp2, pol2 in list_of_sent_tuples:
            if len(holder2) == 0:
                holder2 = frozenset(["_"])
            if len(target2) == 0:
                target2 = frozenset(["_"])
            if (
                len(holder2.intersection(holder1)) > 0
                and len(target2.intersection(target1)) > 0
                and len(exp2.intersection(exp1)) > 0
            ):
                holder_overlap = len(holder2.intersection(holder1)) / len(holder1)
                target_overlap = len(target2.intersection(target1)) / len(target1)
                exp_overlap = len(exp2.intersection(exp1)) / len(exp1)
                overlap = (holder_overlap + target_overlap + exp_overlap) / 3
                if overlap > best_overlap:
                    best_overlap = overlap
        return best_overlap

    def quadtuple_precision(self, gold, pred, keep_polarity=True, weighted=True):
        """
        Weighted true positives / (true positives + false positives)
        """
        weighted_tp = []
        tp = []
        fp = []
        #
        for sent_idx in pred.keys():
            
            atomic_tp = [] # DL
            atomic_fp = [] # Dl

            ptuples = pred[sent_idx]
            gtuples = gold[sent_idx]
            for stuple in ptuples:
                if self.sent_quadtuples_in_list(stuple, gtuples, keep_polarity):
                    if weighted:
                        weighted_tp.append(self.quadtuple_weighted_score(stuple, gtuples))
                        # print('added weighted_tp') # DL
                        tp.append(1)
                        atomic_tp.append(1) # DL

                    else:
                        weighted_tp.append(1)
                        tp.append(1)
                        atomic_fp.append(1) # DL
                else:
                    fp.append(1)
                    atomic_fp.append(1) # DL
                
                print(f'sent_idx: {sent_idx}, {sum(weighted_tp) / (sum(atomic_tp) + sum(atomic_fp) + 0.0000000000000001)}' ) # DL

        return sum(weighted_tp) / (sum(tp) + sum(fp) + 0.0000000000000001)

    def quadtuple_recall(self, gold, pred, keep_polarity=True, weighted=True):
        """
        Weighted true positives / (true positives + false negatives)
        """
        weighted_tp = []
        tp = []
        fn = []
        #
        assert len(gold) == len(pred)
        #
        for sent_idx in pred.keys():
            
            atomic_weighted_tp = [] # DL
            atomic_tp = [] # DL
            atomic_fn = [] # Dl


            ptuples = pred[sent_idx]
            gtuples = gold[sent_idx]
            for stuple in gtuples:
                if self.sent_quadtuples_in_list(stuple, ptuples, keep_polarity):
                    if weighted:
                        weighted_tp.append(self.quadtuple_weighted_score(stuple, ptuples))
                        tp.append(1)
                        atomic_tp.append(1) # DL
                    else:
                        weighted_tp.append(1)
                        tp.append(1)
                        atomic_tp.append(1) # DL
                else:
                    fn.append(1)
                    atomic_fn.append(1)

                print(f'sent_idx: {sent_idx}, {sum(weighted_tp) / (sum(atomic_tp) + sum(atomic_fp) + 0.0000000000000001)}' ) # DL

        return sum(weighted_tp) / (sum(tp) + sum(fn) + 0.0000000000000001)

    def get_all_quadtuple_metrics(self, keep_polarity=True, weighted=True):

        prec = self.quadtuple_precision(self.golden_quadtuples, self.predicted_quadtuples, keep_polarity, weighted)
        rec = self.quadtuple_recall(self.golden_quadtuples, self.predicted_quadtuples, keep_polarity, weighted)

        return prec, rec, 2 * (prec * rec) / (prec + rec + 0.00000000000000001)

In [None]:
semeval_evaluator = SemEvalEvaluator(predicted_tuples="./en_all-roberta-large-v1_pred.json", dataset_name="./data/opener_en/test.json", mode="manual")
_precision, _recall, _f1 = semeval_evaluator.get_all_quadtuple_metrics()
print(_precision, _recall, _f1)

added weighted_tp
sent_idx: opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-8, 1e+16
added weighted_tp
sent_idx: opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-2, 1e+16
added weighted_tp
sent_idx: opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-3, 1e+16
added weighted_tp
sent_idx: opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-4, 1e+16
added weighted_tp
sent_idx: opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-4, 2e+16
added weighted_tp
sent_idx: opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-5, 1e+16
added weighted_tp
sent_idx: opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-6, 1e+16
sent_idx: opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-7, 0.0
added weighted_tp
sent_idx: opener_en/kaf/hotel/english00029_2a5891717bff19947a0b858a3b1e8230-1, 1e+16
added weighted_tp
sent_idx: opener_en/kaf/hotel/english00029_2a5891717bff19947a0b858a3b1e8230

In [None]:
df_en_roberta_largev1

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_target_exclusive,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO
0,Thank you for such a wonderful experience .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-8,"[[], []]",[],[],"[[experience], [31:41]]",[experience],[31:41],"[[wonderful], [21:30]]",[wonderful],...,False,True,0,False,False,False,True,False,False,False
1,Hotel was under construction .,opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-2,"[[], []]",[],[],"[[Hotel], [0:5]]",[Hotel],[0:5],"[[under construction], [10:28]]",[under construction],...,False,True,0,False,False,False,True,False,False,False
2,Carpets where very old .,opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-3,"[[], []]",[],[],"[[Carpets], [0:7]]",[Carpets],[0:7],"[[very old], [14:22]]",[very old],...,False,True,0,False,False,False,True,False,False,False
3,Staircase only on the outside and not heated .,opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-4,"[[], []]",[],[],"[[Staircase], [0:9]]",[Staircase],[0:9],"[[only on the outside], [10:29]]",[only on the outside],...,False,True,0,False,False,False,True,False,False,False
4,Staircase only on the outside and not heated .,opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-4,"[[], []]",[],[],"[[Staircase], [0:9]]",[Staircase],[0:9],"[[not heated], [34:44]]",[not heated],...,False,True,0,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,"Front desk wasn 't the best , quite slow for passport registration and clerk insisted that we pay up for a better room .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-8,"[[], []]",[],[],"[[Front desk], [0:10]]",[Front desk],[0:10],"[[wasn 't the best], [11:27]]",[wasn 't the best],...,False,True,0,False,False,False,True,False,False,False
842,"Front desk wasn 't the best , quite slow for passport registration and clerk insisted that we pay up for a better room .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-8,"[[], []]",[],[],"[[better room], [107:118]]",[better room],[107:118],"[[pay], [94:97]]",[pay],...,False,True,0,False,False,False,True,False,False,False
843,"In · room breakfast was tasty and quite large , awesome .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-9,"[[], []]",[],[],"[[room breakfast], [5:19]]",[room breakfast],[5:19],"[[tasty], [24:29]]",[tasty],...,False,True,0,False,False,False,True,False,False,False
844,"In · room breakfast was tasty and quite large , awesome .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-9,"[[], []]",[],[],"[[room breakfast], [5:19]]",[room breakfast],[5:19],"[[quite large], [34:45]]",[quite large],...,False,True,0,False,False,False,True,False,False,False


In [None]:
df_opener_en_test

Unnamed: 0,text,sent_id,Source,Source_string,Source_index,Target,Target_string,Target_index,Polar_expression,Polar_expression_string,...,Implicit_target_exclusive,Implicit_holder_exclusive,H-A-O,H-IA-IO,H-A-IO,H-IA-O,IH-A-O,IH-A-IO,IH-IA-O,IH-IA-IO
0,They were there just to keep us happy .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-6,"[[us], [29:31]]",[us],[29:31],"[[They], [0:4]]",[They],[0:4],"[[to keep, happy], [21:28, 32:37]]","[to keep, happy]",...,False,False,1,False,False,False,False,False,False,False
1,A wonderful place to go and we are planning to return as soon as we can .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-7,"[[], []]",[],[],"[[place], [12:17]]",[place],[12:17],"[[wonderful], [2:11]]",[wonderful],...,False,True,0,False,False,False,True,False,False,False
2,A wonderful place to go and we are planning to return as soon as we can .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-7,"[[we], [28:30]]",[we],[28:30],"[[], []]",[],[],"[[planning to return], [35:53]]",[planning to return],...,True,False,0,False,False,True,False,False,False,False
3,Thank you for such a wonderful experience .,opener_en/kaf/hotel/english00148_ddb06f1e4ab012d85f9120c394168c48-8,"[[], []]",[],[],"[[experience], [31:41]]",[experience],[31:41],"[[wonderful], [21:30]]",[wonderful],...,False,True,0,False,False,False,True,False,False,False
4,Hotel was under construction .,opener_en/kaf/hotel/english00028_2a1e03a3e5d6afe2c5024926cb7ff9ab-2,"[[], []]",[],[],"[[Hotel], [0:5]]",[Hotel],[0:5],"[[under construction], [10:28]]",[under construction],...,False,True,0,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
860,"Front desk wasn 't the best , quite slow for passport registration and clerk insisted that we pay up for a better room .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-8,"[[], []]",[],[],"[[Front desk], [0:10]]",[Front desk],[0:10],"[[quite slow], [30:40]]",[quite slow],...,False,True,0,False,False,False,True,False,False,False
861,"Front desk wasn 't the best , quite slow for passport registration and clerk insisted that we pay up for a better room .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-8,"[[], []]",[],[],"[[Front desk], [0:10]]",[Front desk],[0:10],"[[wasn 't the best], [11:27]]",[wasn 't the best],...,False,True,0,False,False,False,True,False,False,False
862,"In · room breakfast was tasty and quite large , awesome .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-9,"[[], []]",[],[],"[[breakfast], [10:19]]",[breakfast],[10:19],"[[quite large], [34:45]]",[quite large],...,False,True,0,False,False,False,True,False,False,False
863,"In · room breakfast was tasty and quite large , awesome .",opener_en/kaf/hotel/english00113_8161bacaf54503f451c7246b073003a5-9,"[[], []]",[],[],"[[breakfast], [10:19]]",[breakfast],[10:19],"[[tasty], [24:29]]",[tasty],...,False,True,0,False,False,False,True,False,False,False
