In [1]:
import os
from pathlib import Path


def find_project_root(cwd):
    """Return the directory that contains the nearest ``notebooks`` folder.

    Walks from ``cwd`` upwards and, at the first path component that is named
    exactly ``notebooks``, returns its parent. If no component has that name,
    ``cwd`` is returned unchanged.

    Only whole path components are compared, so a directory such as
    ``my_notebooks_repo`` is left intact (a plain ``str.replace`` on the path
    would have removed the substring from it).
    """
    for candidate in (cwd, *cwd.parents):
        if candidate.name == 'notebooks':
            return candidate.parent
    return cwd


# Run the notebook from the project root so that the relative paths used in
# later cells (e.g. './data/...', './trained_models/...') resolve.
os.chdir(find_project_root(Path.cwd()))


In [2]:
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from torch import nn
from torch.utils.data import DataLoader
import pandas as pd

In [3]:
%load_ext autoreload

In [15]:
%autoreload 2

from create_train_data import read_sem_eval
from source.ngram_utils import find_token_range

# High Level

High level use cases here are:
- Generate factually correct text from relationship / entity linkage detail
- Be able to scan text by certain relationships of interest or combinations of entity - relationship: "Owners of boats", "Causes of risk"

The main difficulty is that there are two types of entities:
- Named entities: John, Germany, Notre Dame
- Concepts: democracy, operational risk, acquisitions
Most entity models focus on the former, and there is no clear framework for defining the latter. Queries are likely to target the latter, because the user will not know the names in advance or may be searching for concepts to begin with.

## Tool flow
- Specify concepts / entities / events and type of relationship
- Scan text for entities. Annotate the text with entity tags (e.g. `<e1></e1>`, `<e2></e2>`) so it is ready for the model to go through.
- Expand each sentence into multiple versions, one for each pair of entities
- For the relationships needed, prepare a list of sample relationship sentences to run through the model
- Embed both sets and look for sentences most similar to each relationship

# Expanding prototype samples

In [42]:
# Hand-written prototype sentences, keyed by relation name. Every entry holds
# the raw sentence plus a head ('h') and tail ('t') entity, each given as
# [surface text, '--'].
rel_prototype_sents = {
    'cause-effect': [
        {
            'sent': 'Drinking too much causes serious headache and possible brain damage',
            'h': ['Drinking', '--'],
            't': [tail_text, '--'],
        }
        # Same sentence and head, one entry per tail entity.
        for tail_text in ('serious headache', 'possible brain damage')
    ],
}

In [43]:
# Expand each prototype sentence: store its whitespace tokens and pass the
# head/tail entities through `update_loc` together with those tokens.

# Imported in this cell because the notebook otherwise imports `update_loc`
# only in a later cell, which makes this cell fail on Restart & Run All.
from source.data_prep import update_loc

for rel, sents in rel_prototype_sents.items():
    for sent in sents:
        sent_toks = sent['sent'].split()
        sent['tokens'] = sent_toks
        for role in ('h', 't'):
            # Hand over only the original [text, '--'] pair so that re-running
            # this cell does not feed an already-expanded entity back in.
            # NOTE(review): assumes update_loc extends the list it is given
            # with location info and returns it — confirm against source.data_prep.
            entity = sent[role][:2]
            sent[role] = update_loc(sent_toks, entity[0], entity)

In [44]:
# Inspect the prototype entries after the expansion step.
rel_prototype_sents

{'cause-effect': [{'sent': 'Drinking too much causes serious headache and possible brain damage',
   'h': ['Drinking', '--', [[0, 0]]],
   't': ['serious headache', '--', [[4, 5]]],
   'tokens': ['Drinking',
    'too',
    'much',
    'causes',
    'serious',
    'headache',
    'and',
    'possible',
    'brain',
    'damage']},
  {'sent': 'Drinking too much causes serious headache and possible brain damage',
   'h': ['Drinking', '--', [[0, 0]]],
   't': ['possible brain damage', '--', [[7, 8, 9]]],
   'tokens': ['Drinking',
    'too',
    'much',
    'causes',
    'serious',
    'headache',
    'and',
    'possible',
    'brain',
    'damage']}]}

In [38]:
from source.data_prep import update_loc

In [None]:
# Opening/closing tag pair for each entity role: head ('h') uses e1, tail ('t') uses e2.
entity_tokens = {
    role: [f'<{tag}>', f'</{tag}>']
    for role, tag in (('h', 'e1'), ('t', 'e2'))
}
  

In [6]:
# Read the SemEval-2010 Task 8 file with the project helper and preview the first rows.
# NOTE(review): the variable is called `df_tr`, but the path points at
# TEST_FILE_FULL.TXT under the testing keys — confirm this is the intended split.
df_tr = read_sem_eval('./data/SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT')
df_tr.head()

Unnamed: 0_level_0,sentence,label,comment
i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8001,The most common <e1>audits</e1> were about <e2...,"Message-Topic(e1,e2)",Comment: Assuming an audit = an audit document.
8002,The <e1>company</e1> fabricates plastic <e2>ch...,"Product-Producer(e2,e1)",Comment: (a) is satisfied
8003,The school <e1>master</e1> teaches the lesson ...,"Instrument-Agency(e2,e1)",Comment:
8004,The suspect dumped the dead <e1>body</e1> into...,"Entity-Destination(e1,e2)",Comment:
8005,Avian <e1>influenza</e1> is an infectious dise...,"Cause-Effect(e2,e1)",Comment:


In [55]:
# Relationships model
# Loads a previously saved SentenceTransformer from a path relative to the
# current working directory, so the earlier chdir cell must have run.

model_save_path = './trained_models/distilbert-base-uncased-mtb-rnd_sbert_bi_few_rel/'

model = SentenceTransformer(model_save_path)

Attempting to do something stupid here, namely removing all entity tags and looking for similarity. The results are not convincing, and the sentences suddenly all show very high similarity. This is possibly an indication that the model is very confused when there are no tags and considers everything to be the same in the absence of tags.

In [52]:

import re

# Reproducible sample (fixed random_state) of 10 sentences labelled
# 'Cause-Effect(e2,e1)', kept with their <e1>/<e2> entity tags.
sample_cause_effects = list(
    df_tr[df_tr.label == 'Cause-Effect(e2,e1)']
    .sample(10, random_state=1701)
    .sentence
    .values
)

# Same sentences with the entity tags removed. The previous pattern
# '<[/]?e[1,2]>' used the class [1,2], which also matches a literal comma;
# [12] matches only the digits 1 and 2.
sample_cause_effects_clean = [re.sub(r'</?e[12]>', '', s) for s in sample_cause_effects]

In [77]:
# Embed the sampled Cause-Effect sentences with the loaded model.
# NOTE(review): this encodes the tagged `sample_cause_effects`; the tag-stripped
# `sample_cause_effects_clean` is built in the notebook but not used here —
# confirm which variant is intended.
emb_cause_effect = model.encode(sample_cause_effects)

In [53]:
# Directory and file name of the sentence-level CSV for the risk article.
# The directory is relative to the current working directory.
path_assets = '../endeavor/text_structure_extract/data/assets'
art_name = 'risk__top_10_operational_risks_for_2020_conf_d8d7d7f94f01b730dc3876fc7387d82c_flat_sentence.csv'

# The first CSV column becomes the DataFrame index.
df_risk = pd.read_csv('/'.join([path_assets, art_name]), index_col=0)

In [54]:
# Plain Python list of the values in the 'sentence_0' column, in row order.
list_risk = list(df_risk['sentence_0'].values)

In [57]:
%%time 

# Report how many sentences are about to be embedded.
print(len(list_risk))

# Embed every risk-article sentence with the same model used for the Cause-Effect samples.
emb_list_risk = model.encode(list_risk)

158
CPU times: user 6.35 s, sys: 370 ms, total: 6.72 s
Wall time: 5.83 s


In [59]:
from sklearn.metrics.pairwise import cosine_similarity

In [78]:
# Pairwise cosine similarity between each risk-sentence embedding (rows) and each
# sampled Cause-Effect embedding (columns), averaged over the columns so that
# every risk sentence gets a single mean-similarity score.
sim_score = cosine_similarity(emb_list_risk, emb_cause_effect).mean(axis=1)

In [79]:
# Attach the mean similarity score as a new 'cause' column (modifies df_risk in place).
df_risk['cause'] = sim_score

In [80]:
# Do not truncate long cell text when rendering DataFrames (global pandas option).
pd.set_option('display.max_colwidth', None)

# Top 20 sentences ranked by their 'cause' score, highest first.
(
    df_risk
    .loc[:, ['sentence_0', 'cause']]
    .sort_values(by='cause', ascending=False)
    .head(20)
)

Unnamed: 0,sentence_0,cause
105,The category is an aggregation of two key subsets of the risk – mis-selling and unauthorised trading – which have appeared repeatedly in previous years.,0.87303
118,"​Regulatory risk slips back a few places to rank at eighth in this year’s Top 10 – a function, perhaps, of a slowdown in the printing press of rulemakings that have reshaped the post-crisis financial landscape.",0.870863
54,"While the march of progress may produce all sorts of convoluted, tech-centric crime, naturally theft and fraud can still take place in a more mundane fashion.",0.865506
37,"In this year’s Top 10, data management, a discrete category in previous top 10 lists, has been folded into data compromise to form a single topic.",0.865048
145,"Geopolitical risk continues to manifest itself in plenty of other ways, too, such as regulatory uncertainty.",0.857644
22,"In this year’s survey, IT failure has been considered alongside IT disruption, where last year the categories were considered separately.",0.857326
120,"Given the breadth and volume of new sets of rules, the potential for mis-steps and misinterpretation is manifest.",0.854154
107,"“Conduct by its nature tends to take some time to be identified, and then often takes a long time to manifest itself in outflows from fines or restitution.",0.848606
93,"Conversions of this sort, new projects and procedures – such as the long-overdue overhaul of domain models, for example – and the hatching of new enterprises often mean more work for employees who are already under pressure.",0.848598
23,"Although the drivers and risk management of the issues are very different, the consequences – the loss of critical services leading to parts or all of an organisation being unable to function – end up looking much the same.",0.846775


In [None]:
# Names of ten risk categories; several of them (e.g. 'Regulatory risk',
# 'Geopolitical risk') appear in the risk-article sentences ranked above.
# NOTE(review): no cell visible in this notebook reads this list yet.
top_10_risks = [
    'IT disruption',
    'Data compromise',
    'Resilience risk',
    'Theft and fraud',
    'Third-party risk',
    'Conduct risk',
    'Regulatory risk',
    'Organisational change',
    'Geopolitical risk',
    'Employee wellbeing',
]

In [7]:
# List the distinct relation labels present in the loaded SemEval data.
df_tr.label.unique()

array(['Message-Topic(e1,e2)', 'Product-Producer(e2,e1)',
       'Instrument-Agency(e2,e1)', 'Entity-Destination(e1,e2)',
       'Cause-Effect(e2,e1)', 'Component-Whole(e1,e2)',
       'Product-Producer(e1,e2)', 'Member-Collection(e2,e1)', 'Other',
       'Entity-Origin(e1,e2)', 'Content-Container(e1,e2)',
       'Entity-Origin(e2,e1)', 'Cause-Effect(e1,e2)',
       'Component-Whole(e2,e1)', 'Content-Container(e2,e1)',
       'Instrument-Agency(e1,e2)', 'Message-Topic(e2,e1)',
       'Member-Collection(e1,e2)', 'Entity-Destination(e2,e1)'],
      dtype=object)