In [1]:
base_dir = '/mnt/efs/shared/meg_shared_scripts/meg-kb'

In [2]:
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [182]:
from tqdm.notebook import tqdm
import argparse
import re
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, entropy
import random
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import json
from collections import defaultdict

import logging
from sklearn.cluster import KMeans, AgglomerativeClustering
import pandas as pd
import os
import sys
import math
from annoy import AnnoyIndex

from spacy.lang.en import English
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
spacy_tokenizer = nlp.tokenizer

from compute_concept_clusters import load_embeddings, knn
from compute_keyphrase_embeddings import get_masked_contexts, ensure_tensor_on_device, mean_pooling
from relation_extraction_avg_scores import LMProbe

from roberta_ses.interface import Roberta_SES_Entailment

# Data Preprocessing

In [2]:
# Input: text corpus
# step 1: extract key phrases (autophrase)
# step 2: generate embeddings

## Extract Key Phrases

In [3]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/keyword_extraction

In [8]:
#change to keyword extractor directory
%cd $base_dir/src/keyword_extraction/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction


In [9]:
!chmod +x ./corpusProcess.sh

In [5]:
# select the datasets and the number of threads
data_ac = 'indeeda-meg-ac'
data_pt = 'indeeda-meg-pt'
thread = 8

In [463]:
# process the corpus and generate key phrases
!./corpusProcess.sh $data_ac $thread

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction
[32m===Corpus Name: sample-indeeda-meg-ac===[m
[32m===Current Path: /mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction===[m
[32m===Cleaning input corpus===[m
[32m===Running AutoPhrase===[m
make: Nothing to be done for 'all'.
[32m===RAW_TRAIN: ../../../data/sample-indeeda-meg-ac/source/corpus.clean.txt===[m
auto_phrase.sh parameters: sample-indeeda-meg-ac ../../../data/sample-indeeda-meg-ac/source/corpus.clean.txt 10 data/EN/wiki_quality.txt 8
[32m===Compilation===[m
[32m===Tokenization===[m
Current step: Tokenizing input file...[0K
real	0m0.702s
user	0m1.668s
sys	0m0.100s
Detected Language: EN[0K
Current step: Tokenizing wikipedia phrases...[0K
No provided expert labels.[0K
[32m===Part-Of-Speech Tagging===[m
[32m===AutoPhrasing===[m
=== Current Settings ===
Iterations = 2
Minimum Support Threshold = 10
Maximum Length Threshold = 6
POS-Tagging Mode Disabled
Discard Ratio = 0.050000
Numbe

In [464]:
# copy these results to the $data_pt dataset directory
!cp -r ../../data/$data_ac ../../data/$data_pt

## Corpus with company names

In [33]:
dataset_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/question_answers.csv'
company_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/fccid-companyName.csv'
entity_emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum+seeds.txt')
out_corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences_with_company.json')

In [34]:
with open(entity_emb_num_path, 'r') as f:
    entities = [l.strip().rsplit(' ', 1)[0] for l in f.readlines()]
len(entities)

8053

In [35]:
df_dataset = pd.read_csv(dataset_path) 
df_dataset = df_dataset[df_dataset['answerContent'].notna()]
df_company = pd.read_csv(company_path)

df_merged_dataset = df_dataset.merge(df_company, how='inner', on='fccompanyId')
df_merged_dataset.head(10)

Unnamed: 0,questionId,fccompanyId,questionCode,topics,questionContent,answerId,DateAnswered,jobTitle,jobLocation,answerContent,companyName
0,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1albfs54rak5j9ff,2016-06-16T02:15:39.931Z,"Retail associate, floor maintenance","Lake Placid, NY",I went to the open interview sessions they had...,Marshalls
1,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1amcsjj6oakaid9l,2016-06-29T01:33:07.160Z,,,I applied online and submitted all attachments...,Marshalls
2,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1aonur8nlas2r8di,2016-07-28T05:15:18.133Z,Sales Associate,"San Marcos, TX",He said he liked me and saw nothing that he di...,Marshalls
3,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1ap63mfneaqi1drb,2016-08-02T17:09:26.382Z,Equipment Operator,"Decatur, GA",It was good,Marshalls
4,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1apm7cii7aka0csa,2016-08-08T23:21:46.823Z,Sales Associate,"Stone Mountain, GA",Applied for the job.,Marshalls
5,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1aq859am1ak569l9,2016-08-15T22:31:23.073Z,Sales Associate,"Redlands, CA",I got my first interview with Marshalls by goi...,Marshalls
6,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1au922hjvaqh08g7,2016-10-04T23:57:16.799Z,Customer Service Associate,"Fort Worth, TX 76119",After putting in my application at my nearest ...,Marshalls
7,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1b55n7nr00kbr607,2016-12-29T16:40:02.144Z,"Sales Associate, Cashier","Aventura, FL",I walk in and ask to apply,Marshalls
8,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1b5b6bkk10kbr2nj,2016-12-31T19:40:30.721Z,Cashier/Fitting Room Attendant,"Miller Place, NY",I received my first interview after a friend r...,Marshalls
9,1albfs54rak5j9ff,403,FIRST_INTERVIEW,INTERVIEW,How did you get your first interview at Marsha...,1b5tkjj5gb85ae4v,2017-01-07T23:35:51.216Z,Inventory Specialist/Sales Associate,"Redmond, WA",I see the hiring sign and apply at marshall,Marshalls


In [36]:
df_merged_dataset.shape

(307122, 11)

In [37]:
row = df_merged_dataset.iloc[1]
_d = nlp(row["answerContent"])
list(_d.sents)
list(list(_d.sents)[0])

[I, applied, online, and, submitted, all, attachments, that, I, could, .]

In [38]:
# Build one record per sentence: its tokens, the company it was written about,
# and every known entity that occurs in "<company> : <sentence>" (lower-cased,
# matched as a space-delimited substring).
out_corpus = []
n_rows = df_merged_dataset.shape[0]

for i, row in df_merged_dataset.iterrows():
    if i > 0 and i % 5000 == 0:
        print(f'Progress: {i} / {n_rows}')

    company = row["companyName"]
    for sentence in nlp(row["answerContent"]).sents:
        words = [str(tok) for tok in sentence]
        # Padding with spaces lets ' entity ' match at either end of the text.
        haystack = f' {company} : {" ".join(words)} '.lower()
        out_corpus.append({
            "tokens": words,
            "company": company,
            "entities": [ent for ent in entities if f' {ent} ' in haystack],
        })

Progress: 5000 / 307122
Progress: 10000 / 307122
Progress: 15000 / 307122
Progress: 20000 / 307122
Progress: 25000 / 307122
Progress: 30000 / 307122
Progress: 35000 / 307122
Progress: 40000 / 307122
Progress: 45000 / 307122
Progress: 50000 / 307122
Progress: 55000 / 307122
Progress: 60000 / 307122
Progress: 65000 / 307122
Progress: 70000 / 307122
Progress: 75000 / 307122
Progress: 80000 / 307122
Progress: 85000 / 307122
Progress: 90000 / 307122
Progress: 95000 / 307122
Progress: 100000 / 307122
Progress: 105000 / 307122
Progress: 110000 / 307122
Progress: 115000 / 307122
Progress: 120000 / 307122
Progress: 125000 / 307122
Progress: 130000 / 307122
Progress: 135000 / 307122
Progress: 140000 / 307122
Progress: 145000 / 307122
Progress: 150000 / 307122
Progress: 155000 / 307122
Progress: 160000 / 307122
Progress: 165000 / 307122
Progress: 170000 / 307122
Progress: 175000 / 307122
Progress: 180000 / 307122
Progress: 185000 / 307122
Progress: 190000 / 307122
Progress: 195000 / 307122
Progre

In [39]:
len(out_corpus), out_corpus[0]

(413232,
 {'tokens': ['I',
   'went',
   'to',
   'the',
   'open',
   'interview',
   'sessions',
   'they',
   'had',
   'an',
   'applied',
   '.'],
  'company': 'Marshalls',
  'entities': ['marshalls']})

In [43]:
# Persist the corpus as JSON lines (one sentence record per line).
with open(out_corpus_path, 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in out_corpus)

# Generate Embeddings

In [465]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [11]:
#change to concept learning directory
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## Sentence Embedding

In [467]:
!python compute_keyphrase_embeddings.py -m bert-base-uncased -et ac -d ../../data/$data_ac/intermediate -c 750

loading corpus: 100%|█████████████████████| 694/694 [00:00<00:00, 194471.34it/s]
computing entity-wise embedding: 100%|████████| 177/177 [00:03<00:00, 50.59it/s]
Saving embedding


## Concatenated Token Embedding

In [468]:
!python compute_keyphrase_embeddings.py -m bert-base-uncased -et pt -d ../../data/$data_pt/intermediate -c 750

loading corpus: 100%|█████████████████████| 694/694 [00:00<00:00, 191566.11it/s]
computing entity-wise embedding: 100%|████████| 177/177 [00:03<00:00, 53.88it/s]
Saving embedding


## Token Embedding

In [469]:
# change directory to autophrase
%cd $base_dir/src/tools/AutoPhrase

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/tools/AutoPhrase


In [470]:
data_corel = 'sample-indeeda-corel'

In [471]:
!CUDA_VISIBLE_DEVICES=0 python extractBertEmbedding.py ../../../data/$data_corel/intermediate/ $thread

1
2021-06-18 00:36:18,384 : INFO : loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ubuntu/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2021-06-18 00:36:18,776 : INFO : loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ubuntu/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
2021-06-18 00:36:18,777 : INFO : Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embedding

## Add embeddings for seed instances

In [12]:
def load_seed_concepts(path):
    """Load the seed-concept table from a CSV file.

    Every cell of the ``seedInstances`` column is converted to a string and
    parsed as a Python literal, so the returned column holds real Python
    objects instead of their text form.

    :param path: anything ``pd.read_csv`` accepts (path or file-like object)
    :return: DataFrame with the ``seedInstances`` column parsed
    :raises ValueError, SyntaxError: if a cell is not a valid Python literal
    """
    # Local import so the function does not depend on the notebook's import cell.
    import ast

    df = pd.read_csv(path)
    # literal_eval accepts literals only, so a malformed or malicious CSV cell
    # cannot execute arbitrary code the way eval() would.
    df["seedInstances"] = df["seedInstances"].map(lambda s: ast.literal_eval(str(s)))
    return df

def load_seed_aligned_concepts(path):
    """Load the aligned seed-concept table, dropping rows marked as excluded.

    Rows whose ``generalizations`` value is exactly ``"x"`` are removed; the
    ``seedInstances`` column of the remaining rows is parsed from text into
    Python objects.

    :param path: anything ``pd.read_csv`` accepts (path or file-like object)
    :return: filtered DataFrame with the ``seedInstances`` column parsed
    :raises ValueError, SyntaxError: if a kept cell is not a valid Python literal
    """
    # Local import so the function does not depend on the notebook's import cell.
    import ast

    df = pd.read_csv(path)
    # .copy() detaches the filtered rows from the original frame, so the column
    # assignment below is unambiguous (no SettingWithCopyWarning).
    df = df[df["generalizations"] != "x"].copy()
    # literal_eval instead of eval: CSV content must never be executed as code.
    df["seedInstances"] = df["seedInstances"].map(lambda s: ast.literal_eval(str(s)))
    return df

def load_seed_aligned_relations(path):
    """Read the aligned seed-relation CSV and drop rows whose ``range`` is ``"x"``.

    :param path: anything ``pd.read_csv`` accepts (path or file-like object)
    :return: the remaining rows, with their original index labels
    """
    relations = pd.read_csv(path)
    keep = relations["range"] != "x"
    return relations[keep]

In [13]:
def get_masked_contexts_for_entities(entities, input_file):
    """Collect, for each entity, the corpus sentences that mention it, with the mention masked.

    Each non-blank line of ``input_file`` is parsed as JSON and must provide a
    ``tokens`` list (the original author's note: the input should be
    sentences.json). The tokens are joined with single spaces and lower-cased.
    An entity matches when it occurs as a space-delimited substring; the
    mention is replaced by ``[MASK]``. A context is kept only if it contains
    exactly one ``[MASK]`` and is at least 15 characters long.

    :param entities: iterable of entity strings; repeated entries are processed once
    :param input_file: path to the JSON-lines corpus
    :return: ``(ent_freq, dedup_context)``: ``ent_freq`` maps each entity to the
        number of kept contexts (counted before de-duplication), and
        ``dedup_context`` maps each entity to its distinct contexts in
        first-seen order
    """
    # De-duplicate while preserving order: a repeated entity would otherwise be
    # matched twice per sentence and have its frequency double-counted.
    unique_entities = list(dict.fromkeys(entities))
    # The search pattern is the same for every sentence, so build it once.
    patterns = [(ent, f' {ent} ') for ent in unique_entities]

    ent_freq = {ent: 0 for ent in unique_entities}
    ent_context = {ent: [] for ent in unique_entities}

    with open(input_file, "r") as fin:
        lines = fin.readlines()

    for line in tqdm(lines, total=len(lines), desc="loading corpus"):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            continue
        json_dict = json.loads(line)
        # Surrounding spaces let ' entity ' match at the start and end of the sentence.
        sent = ' ' + ' '.join(json_dict['tokens']).lower() + ' '

        for entity, pat in patterns:
            if pat not in sent:
                continue

            context = sent.replace(pat, ' [MASK] ').strip()
            # Keep only sentences with exactly one mention (one [MASK]).
            if len(context.split('[MASK]')) != 2:
                continue

            # Ignore contexts that are too short.
            if len(context) < 15:
                continue

            ent_freq[entity] += 1
            ent_context[entity].append(context)

    # dict.fromkeys removes duplicates but, unlike set(), keeps a stable order,
    # so downstream sampling sees the same list on every run.
    dedup_context = {ent: list(dict.fromkeys(ctxs)) for ent, ctxs in ent_context.items()}
    return ent_freq, dedup_context


In [14]:
def get_avg_context_embedding_for_entities(entities, model_path, input_file, max_context_ct):
    '''
    Compute one embedding per entity by averaging the pooled embeddings of its
    masked contexts (mean pooling from sentence-transformers).

    :param entities: List[str], the entities to compute embeddings for
    :param model_path: model name or path given to AutoTokenizer / AutoModel ``from_pretrained``
    :param input_file: corpus file passed on to get_masked_contexts_for_entities
    :param max_context_ct: maximum number of contexts randomly sampled per entity
    :return: (entity_embeddings, ent_freq): entity -> embedding as a list of
        floats, and the entity -> frequency dict from get_masked_contexts_for_entities
    :raises AssertionError: if an entity has no usable context in the corpus
    '''
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    mask_token_id = tokenizer.mask_token_id

    ent_freq, ent_context = get_masked_contexts_for_entities(entities, input_file)

    batch_size = 100  # contexts encoded per forward pass
    entity_embeddings = {}
    for entity, en_context_lst in tqdm(ent_context.items(), total=len(ent_context), desc="computing entity-wise embedding"):
        # Cap the work per entity; sampling is random (unseeded), so results can vary between runs.
        en_context_lst = random.sample(en_context_lst, min(len(en_context_lst), max_context_ct))
        chunks = [en_context_lst[i:i + batch_size] for i in range(0, len(en_context_lst), batch_size)]
        all_context_embeddings = []
        for chunk in chunks:
            encoded_input = tokenizer.batch_encode_plus(chunk, return_token_type_ids=True, add_special_tokens=True, max_length=128, return_tensors='pt', padding=True, pad_to_max_length=True, truncation=True)
            # True everywhere except at [MASK] positions.
            # NOTE(review): this mask does not exclude padding tokens; whether
            # padding is handled depends on mean_pooling - confirm.
            mask = encoded_input['input_ids'] != mask_token_id
            with torch.no_grad():
                encoded_input = ensure_tensor_on_device(device, **encoded_input)
                model_output = model(**encoded_input)  # Compute token embeddings
            context_embeddings = mean_pooling(model_output, mask)  # mean pooling
            all_context_embeddings.append(context_embeddings)

        # torch.cat fails on an empty list; fail with a message naming the entity instead.
        assert len(all_context_embeddings) > 0, f'no usable context found for entity: {entity!r}'

        # Average over all contexts of this entity -> a single vector.
        entity_embedding = torch.mean(torch.cat(all_context_embeddings, dim=0), dim=0).cpu().detach().numpy().tolist()
        entity_embeddings[entity] = entity_embedding

    return entity_embeddings, ent_freq

In [None]:
# Inputs: the sentence corpus and the benchmark's aligned seed concepts.
corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences.json')
seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')

# Existing embeddings / frequencies produced by compute_keyphrase_embeddings.py.
orig_bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed.txt')
orig_bert_emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum.txt')

# Outputs: the same tables extended with the seed instances.
new_bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')
new_bert_emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum+seeds.txt')

# Load the ORIGINAL embeddings. This used to read `bert_emb_path`, which is not
# defined yet on a fresh kernel and is later bound to the "+seeds" output file.
orig_emb_df = load_embeddings(orig_bert_emb_path, 768)
emb_dict = dict(zip(orig_emb_df['entity'].to_list(), orig_emb_df['embedding'].to_list()))

# Each line is "<entity> <freq>"; the entity itself may contain spaces, hence rsplit.
with open(orig_bert_emb_num_path, 'r') as f:
    lines = f.readlines()
    emb_freq_dict = dict([l.strip().rsplit(' ', 1) for l in lines])

concepts_df = load_seed_aligned_concepts(seed_aligned_concepts_path)
seed_instances_list = [inst for _, (_a_con, _u_con, _gnrl, _seed_instances) in concepts_df.iterrows()
                           for inst in _seed_instances]

## debug: only every 10th seed instance is processed; remove this line for a full run
seed_instances_list = seed_instances_list[::10]

print(seed_instances_list)

entity_embeddings, ent_freq = \
    get_avg_context_embedding_for_entities(entities=seed_instances_list, 
                                           model_path='bert-base-uncased',
                                           input_file=corpus_path,
                                           max_context_ct=750)

In [None]:
# Merge the freshly computed seed embeddings into the existing tables.
# An entity that already has an embedding is reported and left untouched.
for inst in seed_instances_list:
    emb, freq = entity_embeddings[inst], ent_freq[inst]
    if inst not in emb_dict:
        emb_dict[inst] = emb
        emb_freq_dict[inst] = freq
        continue
    print(f'Already exists: {inst}')

In [None]:
entity_embeddings.keys()

In [None]:
# Write the merged tables: every original entity plus the newly added seeds.
# Iterating over seed_instances_list here would drop all original entities
# from the "+seeds" files, so the merged dicts are written instead.
with open(new_bert_emb_path, 'w') as f, open(new_bert_emb_num_path, 'w') as f2:
    for inst, emb in emb_dict.items():
        # The merged frequency table covers both original and seed entities.
        freq = emb_freq_dict[inst]
        f.write("{} {}\n".format(inst, ' '.join([str(x) for x in emb])))
        f2.write("{} {}\n".format(inst, freq))


In [735]:
# Using script

!python add_seed_instances_embeddings.py -m bert-base-uncased -et ac -d $base_dir/data/$data_ac/intermediate -b $base_dir/data/indeed-benchmark -c 750


Seed instances: ['walmart', 'amazon', 'subway', 'microsoft', 'target', 'business casual', 'uniform', 'hair color', 'tattoos', 'facial hair', 'shoes', 'piercings', 'delivery driver', 'store manager', 'cashier', 'package handler', 'sales associate', 'barista', 'dishwasher', 'weekly', 'biweekly', 'friday', 'saturday', 'health insurance', 'flexible schedule', '401k', 'paid vacation', 'sick leave', 'vision insurance', 'base pay', 'stock options', 'benefits', 'overtime pay', 'bonus', 'checks', 'direct deposit', 'prepaid card', 'drug test', 'criminal background check', 'employment verification', 'felons', 'criminals', 'disabled', 'drug addicts', 'high schoolers', 'misdemeanor', 'pregnant', 'students', 'seniors', 'hiring age', 'bachelors degree', 'prior experience', 'working permit', 'heavy lifting', 'night shift', 'dinner shift', 'early morning shift', '8 hour shift', 'christmas eve', 'early morning', 'hoilday', '7 days', 'saturday', 'sunday', 'weekend', 'full time', 'part time', 'seasonal', 

## Check embeddings

In [15]:
data_sub_dir = data_ac
bert_emb_path = os.path.join(base_dir, f'data/{data_sub_dir}/intermediate/BERTembed+seeds.txt')

embeddings = load_embeddings(bert_emb_path, 768)
len(embeddings)

8053

In [16]:
embeddings[embeddings['entity'] == 'biweekly']

Unnamed: 0,entity,embedding
8023,biweekly,"[0.06975648552179337, -0.06970633566379547, 0...."


## (X) Other ways of embeddings / clustering

In [155]:
input_file_path = os.path.join(base_dir, f'data/{data_sub_dir}/intermediate/sent_segmentation.txt')
ent_freq, dedup_context = get_masked_contexts(input_file_path)
len(ent_freq), len(dedup_context)

loading corpus: 100%|██████████| 458/458 [00:00<00:00, 73813.30it/s]


(175, 175)

In [452]:
ent_freq['candy'], dedup_context['candy']

(2,
 ["we dropped by in hopes of finding atkinson 's peanut_butter bars ( we first tried them from honey salt 's [MASK] bowl ) and after searching a few minutes , we found it .",
  "if you 're searching for a [MASK] or soda_pop you grew up with and can no longer find , there 's a good chance you 'll find it here ."])

In [208]:
def get_all_context_embeddings(model_path, input_file, max_context_ct):
    '''
    Variant of get_avg_context_embeddings() that keeps every per-context
    embedding instead of averaging them (intended for max-similarity knn).

    :param model_path: model name or path given to AutoTokenizer / AutoModel ``from_pretrained``
    :param input_file: corpus file passed on to get_masked_contexts
    :param max_context_ct: maximum number of contexts randomly sampled per entity
    :return: (entity_embeddings, ent_freq): entity -> list of per-context
        embeddings (nested lists of floats), and the frequency dict returned
        by get_masked_contexts
    '''
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    mask_token_id = tokenizer.mask_token_id

    ent_freq, ent_context = get_masked_contexts(input_file)

    batch_size = 100  # contexts encoded per forward pass
    entity_embeddings = {}
    progress = tqdm(ent_context.items(), total=len(ent_context), desc="computing entity-wise embedding")
    for entity, contexts in progress:
        sample_size = min(len(contexts), max_context_ct)
        sampled = random.sample(contexts, sample_size)

        pooled_batches = []
        for start in range(0, len(sampled), batch_size):
            batch = sampled[start:start + batch_size]
            encoded_input = tokenizer.batch_encode_plus(batch, return_token_type_ids=True, add_special_tokens=True, max_length=128, return_tensors='pt', padding=True, pad_to_max_length=True, truncation=True)
            # True everywhere except at [MASK] positions.
            mask = encoded_input['input_ids'] != mask_token_id
            with torch.no_grad():
                encoded_input = ensure_tensor_on_device(device, **encoded_input)
                model_output = model(**encoded_input)  # token embeddings
            pooled_batches.append(mean_pooling(model_output, mask))

        # One row per sampled context; no averaging across contexts.
        stacked = torch.cat(pooled_batches, dim=0)
        entity_embeddings[entity] = stacked.cpu().detach().numpy().tolist()

    return entity_embeddings, ent_freq

In [209]:
model_path = 'bert-base-uncased'
input_file_path = os.path.join(base_dir, f'data/{data_sub_dir}/intermediate/sent_segmentation.txt')
max_context_ct = 10

entity_embeddings, ent_freq = get_all_context_embeddings(model_path, input_file_path, max_context_ct)
len(entity_embeddings), len(ent_freq)

loading corpus: 100%|██████████| 458/458 [00:00<00:00, 150194.78it/s]
computing entity-wise embedding: 100%|██████████| 175/175 [00:04<00:00, 41.74it/s]


(175, 175)

In [213]:
len(entity_embeddings['candy'][0])

768

In [234]:
def _knn(entity_embeddings, embedding_dim, cluster_size, thread_ct=None, cluster_dest=None, **kwargs):
    """Nearest-neighbour search over every per-context embedding.

    Each embedding of an entity becomes its own index item, labelled
    ``"<entity>-<k>"`` where ``k`` is its position in that entity's list.

    :param entity_embeddings: dict mapping entity -> list of embedding vectors
    :param embedding_dim: dimensionality of every vector
    :param cluster_size: neighbours requested per item (one extra is fetched,
        since an item is its own nearest neighbour and is skipped)
    :param thread_ct: unused
    :param cluster_dest: unused
    :return: DataFrame with one row per (item, neighbour) pair and columns
        ``entity``, ``neighbor``, ``sim`` (cosine similarity)
    """
    index = AnnoyIndex(embedding_dim, 'angular')

    # Item ids are assigned consecutively, so labels[item_id] names the item.
    labels = []
    for name, vectors in tqdm(entity_embeddings.items(), total=len(entity_embeddings)):
        for ctx_no, vec in enumerate(vectors):
            index.add_item(len(labels), vec)
            labels.append(f'{name}-{ctx_no}')

    index.build(100)

    rows = []
    for item_id, label in enumerate(tqdm(labels, desc="finding nearest neighbors by entity")):
        ids, dists = index.get_nns_by_item(item_id, cluster_size + 1, include_distances=True)
        # convert angular distance to cosine similarity
        scored = [(nid, (2 - dist ** 2) / 2) for nid, dist in zip(ids, dists)]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        rows.extend(
            {"entity": label, "neighbor": labels[nid], "sim": sim}
            for nid, sim in scored
            if labels[nid] != label  # skip the item itself
        )
    return pd.DataFrame(rows)

In [235]:
knn_results = _knn(entity_embeddings, 768, 20)

100%|██████████| 175/175 [00:00<00:00, 24854.50it/s]
finding nearest neighbors by entity: 100%|██████████| 269/269 [00:00<00:00, 6006.44it/s]


In [None]:
query = 'meat'

df = knn_results

n_embs = len(entity_embeddings[query])
sub_frames = []
for _i in range(n_embs):
    ent_name = f'{query}-{_i}'
    sub_frames.append(df[df['entity'] == ent_name])

pd.concat(sub_frames).sort_values('sim', ascending=False).head(10)

In [None]:
# original avg context knn 
knn_path = os.path.join(base_dir, f'data/{data_sub_dir}/intermediate/knn_100.csv')

knn_results = pd.read_csv(knn_path)
df = knn_results

query = 'walmart'
sub_frame = df[df['entity'] == query]
sub_frame.sort_values('sim', ascending=False).head(10)

# Expand Seed Entities (clustering)

In [99]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [299]:
#change to concept learning directory
%cd ../../concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## knn sentence-embedding

In [365]:
clusters = 100
output = '../../data/'+data_ac+'/intermediate/knn_'+str(clusters)+'.csv'
dim = 768

In [366]:
!python compute_concept_clusters.py -d ../../data/$data_ac/intermediate/ -ca knn -s $clusters -dim $dim -o $output

building entity index: 100%|████████████████| 177/177 [00:00<00:00, 5435.26it/s]
finding nearest neighbors by entity: 100%|██| 177/177 [00:00<00:00, 2001.57it/s]


## knn token concatenated

In [308]:
clusters = 20
output = '../../data/'+data_pt+'/intermediate/knn_'+str(clusters)+'.csv'
dim = 3072

In [309]:
!python compute_concept_clusters.py -d ../../data/$data_pt/intermediate/ -ca knn -s $clusters -dim $dim -o $output

building entity index: 100%|████████████████| 177/177 [00:00<00:00, 3661.18it/s]
finding nearest neighbors by entity: 100%|██| 177/177 [00:00<00:00, 4052.00it/s]


## knn token

In [None]:
clusters = 20
# The token-embedding run reads from data_corel, so its result is written there
# too; writing under data_pt would overwrite the concatenated-token
# knn_20.csv produced in the previous section.
output = '../../data/'+data_corel+'/intermediate/knn_'+str(clusters)+'.csv'
dim = 768

In [None]:
!python compute_concept_clusters.py -d ../../data/$data_corel/intermediate/ -ca knn -s $clusters -dim $dim -o $output

## Analyzing Clustering Results

In [107]:
#Visit here: /meg_shared_scripts/meg-kb/src/analysis/concept_learning-test.ipynb

## Seed instances clustering
(using all seed instances of a concept to find neighbors)

In [571]:
seed_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_concepts.csv')
seed_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_relations.csv')

seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')
seed_aligned_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_relations.csv')

In [744]:
def get_concept_knn(embed_src, embedding_dim, seed_aligned_concept_src, cluster_size, thread_ct, cluster_dest, **kwargs):
    """Find the nearest entities to each concept, using the mean of its seed embeddings.

    For every row of the aligned seed-concept table, the embeddings of the seed
    instances that exist in the embedding table are averaged into one concept
    vector, which is used to query an Annoy index built over all entities.
    Neighbours that are themselves seed instances of the concept are dropped.

    :param embed_src: embedding file passed to ``load_embeddings``
    :param embedding_dim: dimensionality of the embeddings
    :param seed_aligned_concept_src: CSV passed to ``load_seed_aligned_concepts``
    :param cluster_size: ``cluster_size + 1`` neighbours are requested per concept
    :param thread_ct: unused
    :param cluster_dest: path of the CSV to write, with columns
        ``concept``, ``neighbor``, ``sim``
    :return: None; the result is written to ``cluster_dest``
    """
    seed_concepts_df = load_seed_aligned_concepts(seed_aligned_concept_src)

    entity_embeddings = load_embeddings(embed_src, embedding_dim)
    t = AnnoyIndex(embedding_dim, 'angular')
    entities = entity_embeddings['entity'].tolist()
    vectors = entity_embeddings['embedding'].tolist()
    # Use positional ids so entities[nn_idx] below is right even when the
    # DataFrame index is not 0..n-1.
    for idx, vec in enumerate(tqdm(vectors, total=len(vectors), desc="building entity index")):
        t.add_item(idx, vec)
    t.build(100)

    entity_emb_dict = dict(zip(entities, vectors))

    neighbors = []
    for i, (a_concept, u_concept, gnrl, seed_instances) in tqdm(seed_concepts_df.iterrows(),
                                                                total=len(seed_concepts_df),
                                                                desc="finding nearest neighbors by concept"):
        seed_set = set(seed_instances)  # constant-time membership test below
        embs = []
        for inst in seed_instances:
            if inst in entity_emb_dict:
                embs.append(entity_emb_dict[inst])
            else:
                print(f"{inst} not found in entity_emb_dict??")
        if len(embs) == 0:
            # None of this concept's seeds has an embedding; nothing to query with.
            continue
        concept_emb = np.mean(embs, axis=0)

        # NOTE(review): the "+ 1" only compensates for one excluded item, but
        # every seed instance is excluded below, so a concept can end up with
        # fewer than cluster_size neighbours - confirm whether that is intended.
        nns, dists = t.get_nns_by_vector(concept_emb, cluster_size + 1, include_distances=True)
        cos_sim_scores = [(2 - d ** 2) / 2 for d in dists]  # convert angular distance to cosine similarity
        sorted_nns = sorted(zip(nns, cos_sim_scores), key=lambda x: x[1], reverse=True)
        for nn_idx, d in sorted_nns:
            neighbor_entity = entities[nn_idx]
            if neighbor_entity in seed_set:
                continue
            neighbors.append({"concept": a_concept, "neighbor": neighbor_entity, "sim": d})

    # Explicit columns keep the CSV header even when no neighbour was found.
    c_df = pd.DataFrame(neighbors, columns=["concept", "neighbor", "sim"])
    c_df.to_csv(cluster_dest, index=None)

In [None]:
cluster_size = 1000

bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')
seed_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_concepts.csv')
seed_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_relations.csv')
concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/concept_knn_{cluster_size}.csv')

# Pass cluster_size itself rather than a second hard-coded 1000, so the size
# in the output file name always matches the size actually used.
get_concept_knn(embed_src=bert_emb_path,
            embedding_dim=768,
            seed_aligned_concept_src=seed_aligned_concepts_path,
            cluster_size=cluster_size,
            thread_ct=1,
            cluster_dest=concept_knn_path)


In [24]:
%cd $base_dir/src/concept_learning

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [29]:
# Use script
# cluster_size = 1000
!python compute_concept_seeds_knn.py -d $base_dir/data/$data_ac/intermediate -b $base_dir/data/indeed-benchmark -s 1000 -o $base_dir/data/$data_ac/intermediate/concept_knn_1000.csv

building entity index: 100%|██████████████| 8053/8053 [00:01<00:00, 6459.29it/s]
finding nearest neighbors by concept: 14it [00:00, 115.53it/s]


In [31]:
# check results 
concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/concept_knn_{cluster_size}.csv')

df = pd.read_csv(concept_knn_path)
df[df['concept'] == 'company'].head(10)

Unnamed: 0,concept,neighbor,sim
0,company,wal mart,0.997038
1,company,costco,0.997013
2,company,publix,0.996753
3,company,walgreens,0.996623
4,company,kroger,0.996477
5,company,home depot,0.996124
6,company,sam 's club,0.995978
7,company,dollar general,0.995846
8,company,family dollar,0.995645
9,company,jcpenney,0.995513


In [32]:
df = pd.read_csv(concept_knn_path)
df[df['concept'] == 'pay_schedule'].head(10)

Unnamed: 0,concept,neighbor,sim
2985,pay_schedule,sunday,0.989753
2986,pay_schedule,weekend,0.989047
2987,pay_schedule,7 days,0.984987
2988,pay_schedule,part time,0.984129
2989,pay_schedule,8 hour shift,0.982685
2990,pay_schedule,bonus,0.981976
2991,pay_schedule,full time,0.981485
2992,pay_schedule,seasonal,0.979928
2993,pay_schedule,training,0.978852
2994,pay_schedule,orientation,0.978359


## LM probs correlation

In [None]:
# corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences_with_company.json')

# with open(corpus_path, 'r') as f:
#     sent_dicts = [json.loads(l) for l in f]

In [164]:
lm_probe = LMProbe()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [148]:
input_file_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sent_segmentation.txt')
ent_freq, dedup_context = get_masked_contexts(input_file_path)

loading corpus: 100%|██████████| 465226/465226 [00:01<00:00, 264452.90it/s]


In [158]:
len(dedup_context['health insurance'])

426

In [160]:
len(dedup_context['sick leave'])

66

In [161]:
entities = list(ent_freq.keys())
len(entities)

7973

In [318]:
all_ents_tokenized = [tuple(lm_probe.tokenizer.tokenize(e)) for e in entities]
all_ents_tokenized = list(set(all_ents_tokenized))
[sum([len(e_t) == _l for e_t in all_ents_tokenized]) for _l in (1,2,3,4,5)]

[2334, 4086, 1126, 319, 54]

In [321]:
## TODO: use geo-avg instead of prod for scoring 
## TODO: collect contexts from both sides (base / expansion) 

def entity_expansion_multiways(entity,
                               contexts=None,
                               all_ents_tokenized=all_ents_tokenized,
                               lm_probe=lm_probe,
                               top_k=300,
                               max_contexts=50,
                               n_grams_range=(1, 2, 3)):
    '''
    Expand `entity` by comparing masked-LM scores of all candidate entities
    over the contexts in which `entity` occurs, using three rankings.

    For each context, the '[MASK]' slot is widened to n masks for every n in
    `n_grams_range`, and every candidate with exactly n tokens is scored with
    `lm_probe.score_candidates`. Each candidate thus gets one score per context.

    :param entity: target entity (surface string); it must itself be among the
        scored candidates, since its score vector is the reference.
    :param contexts: List[str] of masked contexts; defaults to the notebook-level
        `dedup_context[entity]`.
    :param all_ents_tokenized: iterable of token tuples, one per candidate.
    :param lm_probe: object exposing `score_candidates`; a fresh `LMProbe()` is
        built when None.
    :param top_k: size of each returned set.
    :param max_contexts: only the first `max_contexts` contexts are used
        (None = use all).
    :param n_grams_range: candidate token lengths to score.
    :return: (mean_set, kl_set, pearson_set): top-`top_k` candidates by mean
        score (descending), by `entropy(target, cand)` (ascending), and by
        Pearson correlation with the target's scores (descending).
    :raises ValueError: if `entity` received no scores (no contexts, or its
        tokenization is not covered by `n_grams_range` / `all_ents_tokenized`).
    '''
    if lm_probe is None:
        lm_probe = LMProbe()
    if contexts is None:
        contexts = dedup_context[entity]

    # Candidate buckets depend only on token length, so build them once
    # instead of re-filtering the whole candidate list for every context.
    _cands_by_len = {
        n_grams: [e_t for e_t in all_ents_tokenized if len(e_t) == n_grams]
        for n_grams in n_grams_range
    }

    entity2probs = defaultdict(list)

    for _context in tqdm(contexts[:max_contexts]):
        for n_grams, _cands in _cands_by_len.items():
            _ctxt = _context.replace('[MASK]', '[MASK]' + ' [MASK]' * (n_grams-1))
            _ctxt = '[CLS] ' + _ctxt + ' [SEP]'
            _cand_scores = lm_probe.score_candidates(_ctxt, _cands)

            for _d in _cand_scores:
                # Re-join word pieces into the surface form used as the key.
                _c = ' '.join(_d['cand']).replace(' ##', '')
                entity2probs[_c].append(_d['score'])

    # Check membership explicitly: indexing a defaultdict would silently insert
    # an empty list (and mutate the dict while it is being iterated below).
    if entity not in entity2probs:
        raise ValueError(
            f'entity {entity!r} received no LM scores; check that it has contexts and that '
            f'its tokenization length is covered by n_grams_range={tuple(n_grams_range)}')

    _n_scores = len(entity2probs[entity])
    for _e, _ss in entity2probs.items():
        # Every candidate must have exactly one score per context; a mismatch
        # means two tokenizations collapsed onto the same surface string.
        assert len(_ss) == _n_scores, \
            f'entity: {_e} | {lm_probe.tokenizer.tokenize(_e)}; len(_ss): {len(_ss)}'

    _target_ss = np.asarray(entity2probs[entity])
    _target_ss = _target_ss / np.sum(_target_ss)

    mean_l = [(_e, np.mean(_ss)) for _e, _ss in entity2probs.items()]
    mean_l.sort(key=lambda p : p[-1], reverse=True)
    kl_l = [(_e, entropy(_target_ss, _ss)) for _e, _ss in entity2probs.items()]
    kl_l.sort(key=lambda p : p[-1], reverse=False)
    pearson_l = [(_e, pearsonr(_target_ss, _ss)[0]) for _e, _ss in entity2probs.items()]
    pearson_l.sort(key=lambda p : p[-1], reverse=True)

    mean_set = set([_e for _e, _s in mean_l[:top_k]])
    kl_set = set([_e for _e, _s in kl_l[:top_k]])
    pearson_set = set([_e for _e, _s in pearson_l[:top_k]])

    return mean_set, kl_set, pearson_set
    

In [322]:
_entity = 'health insurance'
mean_set, kl_set, pearson_set = entity_expansion_multiways(_entity)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [323]:
len(mean_set & kl_set & pearson_set)

24

In [324]:
mean_set & pearson_set & kl_set

{'bank',
 'basic',
 'cancer',
 'community',
 'disability',
 'family',
 'federal',
 'general',
 'government',
 'health',
 'health insurance',
 'healthcare',
 'insurance',
 'life insurance',
 'marriage',
 'medical',
 'medical insurance',
 'mortgage',
 'pregnancy',
 'private',
 'property',
 'public',
 'social life',
 'state'}

In [313]:
_entity = 'black jeans'
mean_set, kl_set, pearson_set = entity_expansion_multiways(_entity)

n_grams: 2


HBox(children=(FloatProgress(value=0.0, max=95.0), HTML(value='')))




In [314]:
len(mean_set & kl_set & pearson_set), mean_set & kl_set & pearson_set

(64,
 {'beige pants',
  'bermuda shorts',
  'black church',
  'black dress',
  'black jeans',
  'black pants',
  'black polo',
  'black slip',
  'black tennis',
  'black tie',
  'blue collar',
  'blue denim',
  'blue hair',
  'blue jacket',
  'blue jean',
  'blue jeans',
  'blue polo',
  'blue shield',
  'brown apron',
  'brown pants',
  'brown shoes',
  'casual clothes',
  'colored hair',
  'colored jeans',
  'colored pants',
  'colored shoes',
  'dark jeans',
  'dark pants',
  'denim jeans',
  'dress pants',
  'dress slacks',
  'green polo',
  'hot pockets',
  'jean pants',
  'nice jeans',
  'orange apron',
  'orange shirt',
  'pink hair',
  'purple hair',
  'purple heart',
  'red button',
  'red carpet',
  'red cross',
  'red hair',
  'red jacket',
  'red polo',
  'red robin',
  'red shirt',
  'red square',
  'red tape',
  'redbook',
  'ripped jeans',
  'skinny jeans',
  'sweat pants',
  'tan pants',
  'wear jeans',
  'white bread',
  'white button',
  'white city',
  'white collar'

In [315]:
_entity = 'walmart'
mean_set, kl_set, pearson_set = entity_expansion_multiways(_entity)

n_grams: 2


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [316]:
len(mean_set & kl_set & pearson_set), mean_set & kl_set & pearson_set

(2, {'apple store', 'general store'})

In [None]:
len(mean_set & kl_set), mean_set & kl_set

In [None]:
len(mean_set & pearson_set), mean_set & pearson_set

In [None]:
len(pearson_set & kl_set), pearson_set & kl_set

In [None]:
mean_set

## Entity expansion evaluation
Evaluate the expanded entities against the benchmark entities, using mean reciprocal rank (MRR) as the metric.

In [17]:
# Benchmark inputs (seed concepts / relations and the annotated benchmark).
seed_aligned_concepts_path = os.path.join(base_dir, 'data/indeed-benchmark/seed_aligned_concepts.csv')
seed_aligned_relations_path = os.path.join(base_dir, 'data/indeed-benchmark/seed_aligned_relations.csv')
benchmark_path = os.path.join(base_dir, 'data/indeed-benchmark/benchmark.csv')
# Concept kNN expansion computed for the selected dataset.
concept_knn_path = os.path.join(base_dir, 'data', data_ac, 'intermediate/concept_knn_1000.csv')

seed_aligned_concepts = load_seed_aligned_concepts(seed_aligned_concepts_path)
seed_aligned_relations = load_seed_aligned_relations(seed_aligned_relations_path)
benchmark = pd.read_csv(benchmark_path)
concept_knn = pd.read_csv(concept_knn_path)

# Sanity check: concept names, relation names, concepts present in the kNN
# table, and the benchmark's shape, one per line.
print(
    seed_aligned_concepts['alignedCategoryName'].tolist(),
    seed_aligned_relations['alignedRelationName'].tolist(),
    set(concept_knn['concept'].tolist()),
    benchmark.shape,
    sep='\n',
)

['company', 'dress_code', 'job_position', 'pay_schedule', 'benefits', 'compensation', 'payment_option', 'background_screening', 'person', 'hire_prerequisite', 'shifts', 'schedule', 'employee_type', 'onboarding_steps']
['has_pay_schedule', 'has_pay_schedule', 'has_dress_code', 'has_dress_code', 'has_background_screening', 'has_benefits', 'has_benefits', 'hires_person', 'has_compensation', 'has_compensation', 'has_hire_prerequisite', 'operates_on', 'hires_employee_type', 'has_onboarding_steps', 'has_shifts', 'has_shifts', 'has_job_position', 'has_hiring_policy', 'has_payment_option']
{'dress_code', 'shifts', 'job_position', 'background_screening', 'hire_prerequisite', 'onboarding_steps', 'payment_option', 'benefits', 'employee_type', 'schedule', 'compensation', 'company', 'pay_schedule', 'person'}
(706, 16)


In [18]:
# For every seed concept, rank the benchmark instances of that concept within
# the concept's kNN expansion and report the mean reciprocal rank (MRR).
for i, d in seed_aligned_concepts.iterrows():
    a_concept = d["alignedCategoryName"]
    u_concept = d["unalignedCategoryName"]
    seed_instances = d["seedInstances"]

    concept_knn_instances = concept_knn[concept_knn["concept"] == a_concept]["neighbor"].to_list()
    # 1-based position of each neighbor in the kNN table (first occurrence
    # wins, matching list.index); avoids a linear scan per benchmark instance.
    _knn_rank = {}
    for _pos, _neighbor in enumerate(concept_knn_instances, start=1):
        _knn_rank.setdefault(_neighbor, _pos)

    # Benchmark instances of this concept, whether they appear as head or tail.
    _b_head_instances = benchmark[benchmark["n_head_category"] == a_concept]["n_head"].to_list()
    _b_tail_instances = benchmark[benchmark["n_tail_category"] == a_concept]["n_tail"].to_list()
    benchmark_instances = list(set(_b_head_instances + _b_tail_instances))

    print(f'Concept: {a_concept} / {u_concept}')
    print(f'seeds: {seed_instances}')
#     print(f'expanded (concept_knn_instances): {concept_knn_instances}')
#     print(f'benchmark_instances: {benchmark_instances}')
    b_inst_ranks = dict()
    recip_ranks = []
    for _inst in benchmark_instances:
        if _inst in seed_instances:
            # Seeds are given, not retrieved: mark with -1 and exclude from MRR.
            b_inst_ranks[_inst] = -1
        elif _inst in _knn_rank:
            _rank = _knn_rank[_inst]
            b_inst_ranks[_inst] = _rank
            recip_ranks.append(1.0 / _rank)
        else:
            # Not retrieved at all: contributes a reciprocal rank of 0.
            b_inst_ranks[_inst] = float('nan')
            recip_ranks.append(0.0)

    print(json.dumps(b_inst_ranks, indent=4))
    # np.mean([]) would emit a RuntimeWarning; report nan directly instead.
    print('MRR:', np.mean(recip_ranks) if recip_ranks else float('nan'))
    print()

Concept: company / company
seeds: ['walmart', 'amazon', 'subway', 'microsoft', 'target']
{
    "burger king": 32,
    "subway": -1,
    "tim hortons": 97,
    "dunkin donuts": 43,
    "home depot": 6,
    "chipotle": 44,
    "target": -1,
    "family dollar": 9,
    "training": NaN,
    "planet fitness": 52,
    "walgreens": 4,
    "company": 54,
    "pizza hut": 11,
    "primark": 121,
    "wendys": NaN,
    "bonus": NaN,
    "tj maxx": 45,
    "hobby lobby": 18,
    "kroger": 5,
    "mcdonald": 55,
    "panera": NaN,
    "ups": NaN,
    "fedex": 40,
    "frito lay": 42,
    "whataburger": NaN,
    "amazon.com": NaN,
    "dd": 61,
    "costco": 2,
    "pepsico": 30,
    "extensive background checks": 606,
    "dollar tree": 15,
    "cvs": 29,
    "lowes": NaN,
    "marshalls": 26,
    "safeway": 17,
    "electric": NaN,
    "amazon": -1,
    "ihop": 39,
    "heb": NaN,
    "taco bell": 25,
    "extensive background check": NaN,
    "olive garden": 28,
    "starbucks": 12,
    "dollar 

# Relation Extraction Baselines
Currently implemented only for the `has_dress_code` relation. TODO: extend to all other relations.

In [421]:
# Imported from lm_probing.ipynb 
# TODO: for scoring purpose, maybe better to use GPT-2

class LMProbe(object):
    '''
    Thin wrapper around a BERT masked language model used to either propose
    fillers for '[MASK]' slots (`fill_multi_mask`) or score given candidate
    fillers (`score_candidates`).

    Inputs are raw strings that already contain the '[CLS]' / '[SEP]' / '[MASK]'
    markers; they are tokenized with `self.tokenizer.tokenize` and fed to the
    model as a single segment (all token type ids are 0).
    '''

    def __init__(self, model_name='bert-base-uncased', use_gpu=False):
        '''
        :param model_name: checkpoint name/path passed to `from_pretrained`.
        :param use_gpu: run on CUDA when True and CUDA is available, else CPU.
        '''
        self.device = torch.device('cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

        self.mask_token = self.tokenizer.mask_token

    def _mask_positions_and_probs(self, tokenized_txt):
        '''
        Run the model once on an already tokenized input.

        :param tokenized_txt: List[str] of tokens.
        :return: (mask_indices, probs): positions of '[MASK]' tokens, and a
            2-D tensor indexed as probs[position, token_id] holding the softmax
            of the model's output scores over the last dimension.
        '''
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_txt)
        tokens_tensor = torch.tensor([indexed_tokens])
        mask_indices = [i for i, x in enumerate(tokenized_txt) if x == "[MASK]"]
        # Single-segment input: every token type id is 0.
        segments_tensors = (tokens_tensor * 0).to(self.device)
        tokens_tensor = tokens_tensor.to(self.device)

        with torch.no_grad():
            outputs = self.model(tokens_tensor, token_type_ids=segments_tensors)
            predictions = outputs[0]

        # [0] drops the batch dimension (the batch always holds one sequence).
        probs = torch.softmax(predictions, dim=-1)[0]
        return mask_indices, probs

    def fill_multi_mask(self, input_txt, topk=3):
        '''
        Propose fillers for every '[MASK]' slot in `input_txt`.

        The k-th returned sequence pairs the k-th most probable token of each
        mask position (positions are ranked independently of each other).

        :param input_txt: string starting with '[CLS]', ending with '[SEP]' and
            containing at least one '[MASK]'.
        :param topk: number of sequences to return.
        :return: List (length `topk`) of lists (one entry per mask) of dicts
            with keys "token" (id), "token_str", "prob", "masked_pos".
        :raises ValueError: if `input_txt` violates the format above.
        '''
        if not (input_txt.startswith('[CLS]') and input_txt.endswith('[SEP]')):
            raise ValueError('Input string must start with [CLS] and end with [SEP]')
        if not '[MASK]' in input_txt:
            raise ValueError('Input string must have at least one mask token')
        tokenized_txt = self.tokenizer.tokenize(input_txt)
        mask_indices, probs = self._mask_positions_and_probs(tokenized_txt)

        # Only the mask rows are ever read, so sort just those rows instead of
        # the vocabulary distribution of every position.
        mask_rows = torch.tensor(mask_indices, dtype=torch.long, device=probs.device)
        sorted_probs, sorted_idx = probs.index_select(0, mask_rows).sort(dim=-1, descending=True)
        sorted_probs = sorted_probs.detach().cpu().numpy()
        sorted_idx = sorted_idx.detach().cpu().numpy()

        masked_cands = []
        for k in range(topk):
            predicted_indices = [sorted_idx[row, k].item() for row in range(len(mask_indices))]
            predicted_tokens = self.tokenizer.convert_ids_to_tokens(predicted_indices)
            predicted_probs = [sorted_probs[row, k].item() for row in range(len(mask_indices))]
            seq = []
            for token_id, token, prob, masked_index in zip(predicted_indices, predicted_tokens, predicted_probs,
                                                           mask_indices):
                seq.append({"token": token_id, "token_str": token, "prob": prob, "masked_pos": masked_index})
            masked_cands.append(seq)

        return masked_cands

    def score_candidates(self, input_txt, cands):
        '''
        Score given candidate fillers for the '[MASK]' slots of `input_txt`.

        :param input_txt: string whose tokenization starts with '[CLS]', ends
            with '[SEP]' and contains at least one '[MASK]'.
        :param cands: List[List[str]], tokenized candidates; each must have
            exactly as many tokens as there are masks.
        :return: List[Dict] with keys "cand" and "score", sorted by score in
            descending order. The score is the product of the per-mask
            probabilities of the candidate's tokens.
        :raises ValueError: if `input_txt` violates the format above.
        '''
        # cands: List[List[str]], list of tokenized candidates
        tokenized_txt = self.tokenizer.tokenize(input_txt)

        if tokenized_txt[0] != "[CLS]" or tokenized_txt[-1] != "[SEP]":
            raise ValueError(f'Input string must start with [CLS] and end with [SEP], got {input_txt}')
        if "[MASK]" not in tokenized_txt:
            raise ValueError(f'Input string must have at least one mask token, got {input_txt}')

        # Nothing to score: skip the forward pass entirely.
        if len(cands) == 0:
            return []

        mask_indices, probs = self._mask_positions_and_probs(tokenized_txt)
        probs = probs.detach().cpu().numpy()

        cand_scores = []
        for c in cands:
            assert len(c) == len(mask_indices), f'cand {c}; len(mask_indices) = {len(mask_indices)}'

            c_token_ids = self.tokenizer.convert_tokens_to_ids(c)
            # i-th candidate token is read at the i-th mask position.
            _scores = [probs[i, token_id].item() for i, token_id in zip(mask_indices, c_token_ids)]
            score = np.prod(_scores)
            cand_scores.append({"cand": c, "score": score})

        cand_scores.sort(key=lambda d : d["score"], reverse=True)
        return cand_scores
    

In [None]:
lm_probe = LMProbe()

In [598]:
# Hand-designed. 
# TODO: mining 
# TODO: put to a file

# Cloze templates, one list per relation. In every template slot {0} is the
# head entity and slot {1} is the tail entity; the probing functions below fill
# one slot with a known entity and the other with '[MASK]' tokens.
# The strings are fed to the language model verbatim, so wording, spacing and
# punctuation all affect the scores.
offers_job_position_templates = [
    '{0} hires {1} .',
    '{0} is hiring {1} .',
    '{0} can hire you as a {1} .',
#     'You can get a {1} job at {0} .',
#     'Double check with the {1} at {0} .'
]

has_benefits_templates = [
    '{0} offer {1} for their employees.',
    '{0} provide {1} for employees.',
    '{0} have {1} for their employees.',
]

has_pay_schedule_templates = [
    '{0} pay their employees every {1}',
    '{0} has a pay schedule of {1}',
    '{0} employees get paid {1}',
]

# Mixes negated and affirmative phrasings of the dress-code relation.
has_dress_code_templates = [
    '{0} don\'t allow workers to wear {1}',
    '{0} allow workers to wear {1}',
    '{0} has a dress code of {1}',
    '{0} require employees to wear {1}',
]

In [348]:
def get_direct_probing_candidates(templates,
                                  lm_probe=None,
                                  head_entity=None,
                                  tail_entity=None,
                                  context=None,
                                  topk=10):
    '''
    Direct probing: ask the masked LM to propose the missing side of a relation.

    Exactly one of `head_entity` / `tail_entity` must be given; the other slot
    is replaced by one and then two '[MASK]' tokens, so both single-token and
    two-token fillers are proposed.

    :param templates: List[str]: each has 2 slots, {0} for head, {1} for tail
    :param lm_probe: object exposing `fill_multi_mask`; built with `LMProbe()` when None
    :param head_entity: known head (the tail is proposed), or None
    :param tail_entity: known tail (the head is proposed), or None
    :param context: optional extra text appended as a second '[SEP]'-delimited segment
    :param topk: proposals requested per query, and size of the returned dict
    :return: Dict[str, float]: proposed entity -> score averaged over all the
        queries that proposed it, best first, at most `topk` entries
    '''
    # Exactly one side must be provided; the other one is what gets proposed.
    assert (head_entity is None) != (tail_entity is None), f'{head_entity} {tail_entity}'

    probe = LMProbe() if lm_probe is None else lm_probe
    query_suffix = '[SEP]' + context + '[SEP]' if context else '[SEP]'

    scores_by_name = {}
    for template in templates:
        for mask_slot in ('[MASK]', '[MASK] [MASK]'):
            if head_entity is not None:
                filled = template.format(head_entity, mask_slot)   # head -> tail
            else:
                filled = template.format(mask_slot, tail_entity)   # tail -> head

            for pred in probe.fill_multi_mask('[CLS] ' + filled + query_suffix, topk=topk):
                # Glue word pieces back together to obtain the surface form.
                surface = ' '.join(piece['token_str'] for piece in pred).replace(' ##', '')
                joint_prob = np.prod([piece['prob'] for piece in pred])
                scores_by_name.setdefault(surface, []).append(joint_prob)

    averaged = [(surface, float(sum(vals)) / len(vals)) for surface, vals in scores_by_name.items()]
    averaged.sort(key=lambda pair: pair[1], reverse=True)
    return dict(averaged[:topk])

In [785]:
def _bin_tokenized_candidates(candidates, tokenizer, lengths=(1, 2)):
    '''Tokenize each candidate and group the token lists by length, keeping only `lengths`.'''
    bins = {n_tokens: [] for n_tokens in lengths}
    for cand in candidates:
        cand_tokens = tokenizer.tokenize(cand)
        if len(cand_tokens) in bins:
            bins[len(cand_tokens)].append(cand_tokens)
    return bins


def _average_lm_scores(lm_probe, templates, cand_bins, fill_template):
    '''Score every binned candidate under every template and average the scores per candidate.'''
    per_template = []
    for template in templates:
        scored = []
        for n_masks, cands in cand_bins.items():
            query = '[CLS] ' + fill_template(template, ' '.join(['[MASK]'] * n_masks)) + '[SEP]'
            scored.extend(lm_probe.score_candidates(query, cands))
        # Sort by token list so the candidates line up across templates.
        scored.sort(key=lambda d : d["cand"])
        per_template.append(scored)

    averaged = []  # List[Dict["cand", "score"]], for each "cand" the average score
    for same_cand in zip(*per_template):
        # same_cand: one Dict["cand", "score"] per template, all for the same "cand"
        cand = same_cand[0]["cand"]
        assert all(d["cand"] == cand for d in same_cand), same_cand
        averaged.append({"cand": cand, "score": np.mean([d["score"] for d in same_cand])})
    return averaged


def _extract_for_seed(seed, seed_is_head, cand_sims, cand_bins, templates,
                      lm_probe, entity_emb_dict, scores_agg_func, topk):
    '''Rank all candidates for the opposite side of `seed` and return the top `topk` result rows.'''
    if seed_is_head:
        fill_template = lambda template, mask_slot: template.format(seed, mask_slot)
    else:
        fill_template = lambda template, mask_slot: template.format(mask_slot, seed)

    rows = []
    # All candidates are scored; truncation happens only after sorting by the
    # overall score (the averaged list is ordered by token list, not by score).
    for d in _average_lm_scores(lm_probe, templates, cand_bins, fill_template):
        cand = ' '.join(d["cand"]).replace(' ##', '')
        if cand not in cand_sims:
            continue
        head, tail = (seed, cand) if seed_is_head else (cand, seed)

        lm_score = d["score"]
        try:
            ht_sim_score = 1 - cosine(entity_emb_dict[head], entity_emb_dict[tail])
        except KeyError:
            print(f'** embedding of {head}: {(head in entity_emb_dict)}')
            print(f'** embedding of {tail}: {(tail in entity_emb_dict)}')
            ht_sim_score = float("nan")
        concept_sim_score = cand_sims[cand]
        overall_score = scores_agg_func(ht_sim_score, concept_sim_score, lm_score)

        rows.append({'head': head, 'tail': tail, 'base': 'HEAD' if seed_is_head else 'TAIL',
                     'ht_sim_score': ht_sim_score,
                     'concept_sim_score': concept_sim_score,
                     'lm_score': lm_score,
                     'overall_score': overall_score})

    # NaN compares false with everything, which makes the sort order undefined;
    # rank NaN scores last explicitly (s != s is true only for NaN).
    rows.sort(key=lambda r : float('-inf') if r['overall_score'] != r['overall_score'] else r['overall_score'],
              reverse=True)
    return rows[:topk]


def direct_probing_RE_v3(seed_aligned_concepts_path,
                         seed_aligned_relations_path,
                         emb_path,
                         concept_knn_path,
                         templates,
                         lm_probe=None,
                         emb_dim=768,
                         scores_agg_func=None,
                         topk=10,
                         save_path=None,
                         head_type="company",
                         tail_type="dress_code"):
    '''
    For each seed head / tail, rank candidate tails / heads by overall scores.
    Current (default) overall score: 0.1 * ht_sim + 10 * concept_sim + log10(lm_prob)

    Candidates come from the concept kNN table (plus the seeds themselves, with
    concept similarity 1.0); only candidates that tokenize to 1 or 2 tokens are
    scored. The LM score of a candidate is its `score_candidates` score
    averaged over `templates`.

    :param seed_aligned_concepts_path: CSV read by `load_seed_aligned_concepts`
    :param seed_aligned_relations_path: currently unused (the relation's types
        are given by `head_type` / `tail_type`)
    :param emb_path: entity embeddings file read by `load_embeddings`
    :param concept_knn_path: CSV with columns 'concept', 'neighbor', 'sim'
    :param templates: List[str]: each has 2 slots, {0} for head, {1} for tail
    :param lm_probe: object exposing `tokenizer` and `score_candidates`;
        built with `LMProbe()` when None
    :param emb_dim: embedding dimension passed to `load_embeddings`
    :param scores_agg_func: callable(ht_sim, concept_sim, lm_prob) -> overall score
    :param topk: number of results kept per seed
    :param save_path: if not None, the results are also written there as CSV
    :param head_type: concept name of the relation's head
    :param tail_type: concept name of the relation's tail
    :return: DataFrame with columns head, tail, base ('HEAD' when the seed is
        the head, 'TAIL' when it is the tail), ht_sim_score, concept_sim_score,
        lm_score, overall_score
    '''
    seed_concepts_df = load_seed_aligned_concepts(seed_aligned_concepts_path)
    entity_embeddings = load_embeddings(emb_path, emb_dim)
    entity_emb_dict = dict(zip(entity_embeddings['entity'].tolist(),
                               entity_embeddings['embedding'].tolist()))
    concept_knn_results = pd.read_csv(concept_knn_path)

    if lm_probe is None:
        lm_probe = LMProbe()
    if scores_agg_func is None:
        scores_agg_func = lambda ht_sim, concept_sim, lm_prob : 0.1 * ht_sim + 10 * concept_sim + np.log10(lm_prob)

    print(head_type, '\t', tail_type)
    seed_heads = seed_concepts_df[seed_concepts_df['alignedCategoryName'] == head_type]['seedInstances'].item()
    seed_tails = seed_concepts_df[seed_concepts_df['alignedCategoryName'] == tail_type]['seedInstances'].item()
    print('seed_heads:', seed_heads)
    print('seed_tails:', seed_tails)

    # Candidate heads / tails from concept knn
    cand_heads_df = concept_knn_results[concept_knn_results['concept'] == head_type]
    cand_tails_df = concept_knn_results[concept_knn_results['concept'] == tail_type]
    cand_heads_dict = dict(zip(cand_heads_df['neighbor'].tolist(), cand_heads_df['sim'].tolist()))
    cand_tails_dict = dict(zip(cand_tails_df['neighbor'].tolist(), cand_tails_df['sim'].tolist()))
    # Seeds are added as candidates with maximal concept similarity.
    for h in seed_heads:
        assert h not in cand_heads_dict
        cand_heads_dict[h] = 1.0
    for t in seed_tails:
        assert t not in cand_tails_dict
        cand_tails_dict[t] = 1.0

    # Candidate tokenization does not depend on the seed: do it once per side.
    ## TODO: allow higher grams; switch to GPT-2 for fair probs
    tail_bins = _bin_tokenized_candidates(cand_tails_dict.keys(), lm_probe.tokenizer)
    head_bins = _bin_tokenized_candidates(cand_heads_dict.keys(), lm_probe.tokenizer)

    all_extraction_results = []

    # head -> tail
    for seed_head in seed_heads:
        print(f'seed_head: {seed_head}')
        all_extraction_results.extend(
            _extract_for_seed(seed_head, True, cand_tails_dict, tail_bins, templates,
                              lm_probe, entity_emb_dict, scores_agg_func, topk))

    # tail -> head
    for seed_tail in seed_tails:
        print(f'seed_tail: {seed_tail}')
        all_extraction_results.extend(
            _extract_for_seed(seed_tail, False, cand_heads_dict, head_bins, templates,
                              lm_probe, entity_emb_dict, scores_agg_func, topk))

    results_df = pd.DataFrame(all_extraction_results)
    if save_path is not None:
        results_df.to_csv(save_path, index=None)
    return results_df

In [None]:
# Benchmark seed files (unaligned and aligned variants).
seed_concepts_path = os.path.join(base_dir, 'data/indeed-benchmark/seed_concepts.csv')
seed_relations_path = os.path.join(base_dir, 'data/indeed-benchmark/seed_relations.csv')
seed_aligned_concepts_path = os.path.join(base_dir, 'data/indeed-benchmark/seed_aligned_concepts.csv')
seed_aligned_relations_path = os.path.join(base_dir, 'data/indeed-benchmark/seed_aligned_relations.csv')

# Per-dataset intermediate artifacts: concept kNN table and entity embeddings.
concept_knn_path = os.path.join(base_dir, 'data', data_ac, 'intermediate/concept_knn_1000.csv')
bert_emb_path = os.path.join(base_dir, 'data', data_ac, 'intermediate/BERTembed+seeds.txt')

# Where the extracted relations are written (set to None to skip saving).
extraction_save_path = os.path.join(base_dir, 'data', data_ac, 'intermediate/rel_extraction.csv')

extraction_results = direct_probing_RE_v3(
    seed_aligned_concepts_path=seed_aligned_concepts_path,
    seed_aligned_relations_path=seed_aligned_relations_path,
    emb_path=bert_emb_path,
    concept_knn_path=concept_knn_path,
    templates=has_dress_code_templates,
    lm_probe=lm_probe,
    topk=300,
    save_path=extraction_save_path,
)

extraction_results

In [None]:
df = extraction_results.copy()
df[df['head'] == 'walmart'].head(50)
# df['overall_score'] = df['ht_sim_score'] * 0.1 + df['concept_sim_score'] * 10 + np.log10(df['lm_score'])
# df.sort_values(by='overall_score', ascending=False)

In [None]:
df = extraction_results.copy()
df[df['tail'] == 'hair color'].head(50)
# df['overall_score'] = df['ht_sim_score'] * 0.1 + df['concept_sim_score'] * 10 + np.log10(df['lm_score'])
# df.sort_values(by='overall_score', ascending=False).head(50)

In [145]:
# Use script 
!python relation_extraction_avg_scores.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-o $base_dir/data/$data_ac/intermediate/rel_extraction_2.csv \
-cknn $base_dir/data/$data_ac/intermediate/concept_knn_1000.csv \
-topk 300 \
-dim 768


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
company 	 dress_code
seed_heads: ['walmart', 'amazon', 'subway', 'microsoft', 'target']
seed_tails: ['business casual', 'uniform', 'hair color', 'tattoos', 'facial hair', 'shoes', 'piercings']
seed_head: walmart
seed_head: amazon
seed_head: subway
seed_head: microsoft
seed_head: target
seed_tail: business casual
seed_tail: uniform
seed_tail: hair color
seed_tail: t

In [101]:
## Evaluation 

def load_benchmark_relations(benchmark_path):
    '''Currently only has_dress_code.'''
    benchmark = pd.read_csv(benchmark_path)
    
    # List[Dict[head, tail]]
    rel_pairs = []
    
    for i, row in benchmark.iterrows():
        if row['relation_name'] != 'has_dress_code':
            continue
        if row['n_head_category'] != 'company' or row['n_tail_category'] != 'dress_code':
            continue
        if row['type'] != 'fact':
            continue
        
        if row['n_head'] != 'company':
            # already instance 
            rel_pairs.append({'head': str(row['n_head']).lower(),
                              'tail': str(row['n_tail']).lower()})
            continue
        
        evidence_sents = eval(str(row['sentences']))
        head_instances = eval(str(row['Evidence']))
        assert len(evidence_sents) == len(head_instances), f'Line {i} length mismatch'
        
        for inst in head_instances:
            if len(inst) > 0:
                rel_pairs.append({'head': inst.lower(),
                                  'tail': str(row['n_tail']).lower()})
        
    return rel_pairs

In [12]:
benchmark_path = os.path.join(base_dir, f'data/indeed-benchmark/benchmark_evidence_clean.csv')

benchmark_relations_list = load_benchmark_relations(benchmark_path)
len(benchmark_relations_list)

138

In [15]:
rel_extraction_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/rel_extraction.csv')

rel_extraction = pd.read_csv(rel_extraction_path)
rel_extraction_list = rel_extraction[['head', 'tail']].to_dict('records')

In [16]:
# Compare gold and extracted relations as (head, tail) tuples.
benchmark_relations_set = {tuple(pair.values()) for pair in benchmark_relations_list}
rel_extraction_set = {tuple(pair.values()) for pair in rel_extraction_list}

intersection = benchmark_relations_set.intersection(rel_extraction_set)

# (#gold pairs, #extracted pairs, #pairs found in both)
len(benchmark_relations_set), len(rel_extraction_set), len(intersection)


(107, 3597, 6)

# Knowledge Verification baseline
(finding co-occurrences of head / tail from corpus)

In [44]:
# rel_extraction_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/rel_extraction.csv')
# # corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences.json')

# indeed_dataset_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/question_answers.csv'
# company_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/fccid-companyName.csv'

# # with open(corpus_path, 'r') as f:
# #     sent_dicts = [json.loads(l) for l in tqdm(f.readlines())]

# indeed_dataset = pd.read_csv(indeed_dataset_path)
# indeed_dataset = indeed_dataset[indeed_dataset['answerContent'].notna()]
# company_df = pd.read_csv(company_path)
# company_dict = dict(zip(company_df["fccompanyId"].to_list(), company_df["companyName"].to_list()))

# indeed_dataset.shape, len(company_dict)

In [45]:
rel_extraction_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/rel_extraction.csv')
corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences_with_company.json')

In [46]:
df_relations = pd.read_csv(rel_extraction_path)
df_relations.shape

(3600, 7)

In [47]:
with open(corpus_path, 'r') as f:
    sent_dicts = [json.loads(l) for l in tqdm(f.readlines())]

len(sent_dicts)

HBox(children=(FloatProgress(value=0.0, max=413232.0), HTML(value='')))




413232

In [55]:
sent_dicts[1010]

{'tokens': ['What', 'is', 'the', 'age', 'limit', '.'],
 'company': 'Marshalls',
 'entities': ['age limit', 'marshalls']}

In [50]:
## Entailment model 
yutong_base_dir = "/home/ubuntu/users/yutong"
roberta_ses_dir = os.path.join(yutong_base_dir, "repos", "Roberta_SES")

In [51]:
# 0 = contra, 1 = neutral, 2 = entail
entailment_model = Roberta_SES_Entailment(roberta_path='/home/ubuntu/users/yutong/models/roberta-large',
        ckpt_path=os.path.join(roberta_ses_dir, 'checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt'),
        max_length=512,
        device_name='cpu')

In [52]:
entailment_model.predict(
    "walmart : no you can have tattoo",
    "walmart allows tattoos"
)

(tensor(0), tensor([0.9972, 0.0014, 0.0014]))

In [60]:
# KV for (walmart, has_dress_code, uniform)

# Hypothesis templates for the entailment check in find_evidences: each is
# formatted with (head, tail) and passed as the second argument of
# entailment_model.predict, with the corpus sentence as the first argument.
# _pos_templates state that the relation holds.
_pos_templates = [
    '{0} allows {1}',
    '{0} requires {1}',
]

# _neg_templates state that the relation does not hold.
_neg_templates = [
    '{0} doesn\'t allow {1}',
    '{0} doesn\'t require {1}',
]

def find_evidences(head, tail, corpus=sent_dicts):
    # (s1(evid), s2(tmpl), score)
    _pos_evidences = []
    _neg_evidences = []

    for i, d in enumerate(sent_dicts):
        if i > 0 and i % 50000 == 0:
            print(f'Progress: {i} / {len(sent_dicts)}')
            
#         _company_id = row['fccompanyId']
#         _company = company_dict[_company_id]

#         _answer = row['answerContent']
#         _tokens = [str(t) for t in spacy_tokenizer(_answer)]
#         _s = f" {_company.lower()} : {' '.join(_tokens).lower()} "
        _company = d['company']
        _tokens = d['tokens']
        _s = f"{_company.lower()} : {' '.join(_tokens).lower()}"

        if head in d['entities'] and tail in d['entities']:
            # Try all pos/neg relation templates, save the best template  
            _max_pos_ev = (None, None, 0)
            for _tmpl in _pos_templates:
                _tmpl_filled = _tmpl.format(head, tail)
                _entail_pred, _entail_probs = entailment_model.predict(_s, _tmpl_filled)
                _entail_score = _entail_probs[2].item()
                if _entail_score > _max_pos_ev[-1]:
                    _max_pos_ev = (_s, _tmpl_filled, _entail_score)

            _max_neg_ev = (None, None, 0)
            for _tmpl in _neg_templates:
                _tmpl_filled = _tmpl.format(head, tail)
                _entail_pred, _entail_probs = entailment_model.predict(_s, _tmpl_filled)
                _entail_score = _entail_probs[2].item()
                if _entail_score > _max_neg_ev[-1]:
                    _max_neg_ev = (_s, _tmpl_filled, _entail_score)

            _pos_evidences.append(_max_pos_ev)
            _neg_evidences.append(_max_neg_ev)
    
    _pos_evidences.sort(key=lambda p : p[-1], reverse=True)
    _neg_evidences.sort(key=lambda p : p[-1], reverse=True)
    
    return _pos_evidences, _neg_evidences

In [None]:
# Example query: evidence for the pair ('walmart', 'long sleeve shirt').
# Shows the 10 highest-scoring entries for each polarity (the returned lists
# are sorted by entailment score, descending).
_pos_evidences, _neg_evidences = find_evidences('walmart', 'long sleeve shirt')
'POS:', _pos_evidences[:10], 'NEG:', _neg_evidences[:10]

In [80]:
def find_evidences_RE(df_relations, corpus=sent_dicts, p_thres=0.7, rel_slice=slice(200, None, 200)):
    """Collect entailment evidence for a selection of extracted relations.

    For each selected (head, 'has_dress_code', tail) relation, every corpus
    sentence that mentions both entities is scored with `entailment_model`
    against the positive and negative templates. Per sentence and polarity the
    best template is kept, and it is recorded only if its entailment
    probability (class index 2) is strictly greater than `p_thres`.

    Args:
        df_relations: DataFrame with 'head' and 'tail' columns.
        corpus: iterable of dicts with keys 'company', 'tokens' and
            'entities'. Defaults to the `sent_dicts` list that existed when
            this function was defined.
        p_thres: minimum entailment probability for an evidence to be kept.
        rel_slice: slice applied to the list of relations before scoring.
            The default processes every 200th relation starting at index 200
            (the subsample this notebook has always used); pass `slice(None)`
            to process all relations.

    Returns:
        (pos_evidences, neg_evidences): two defaultdict(list) mapping
        (head, rel, tail) to a list of (premise, filled_template, score)
        tuples sorted by score, descending. Every selected relation is present
        as a key in both, possibly with an empty list.
    """
    ## TODO: to script

    def _best_evidence(premise, templates, head, tail):
        # Highest-scoring filled template for this premise.
        best = (None, None, 0)
        for tmpl in templates:
            hypothesis = tmpl.format(head, tail)
            _, entail_probs = entailment_model.predict(premise, hypothesis)
            score = entail_probs[2].item()
            if score > best[-1]:
                best = (premise, hypothesis, score)
        return best

    # Dict[Tuple(head, rel, tail): List[Tuple(s1(evid), s2(tmpl), score)]]
    pos_evidences = defaultdict(list)
    neg_evidences = defaultdict(list)

    # collect all relations (the relation name is fixed for now)
    _r = 'has_dress_code'
    rels = [(_h, _r, _t) for _h, _t in zip(df_relations['head'], df_relations['tail'])]
    selected_rels = rels[rel_slice]

    # collect sents for each entity; only entities of the selected relations
    # are indexed, since nothing else is ever looked up
    wanted_entities = {_e for _h, _, _t in selected_rels for _e in (_h, _t)}
    entity2sents = defaultdict(set)
    for d in corpus:
        _hits = wanted_entities.intersection(d['entities'])
        if not _hits:
            continue
        _s = f"{d['company']} : {' '.join(d['tokens'])}".lower()
        for _e in _hits:
            entity2sents[_e].add(_s)

    for _h, _r, _t in tqdm(selected_rels):
        # assure key existence (defaultdict creates the empty lists on access)
        _pos_list = pos_evidences[(_h, _r, _t)]
        _neg_list = neg_evidences[(_h, _r, _t)]

        # sentences that mention both the head and the tail
        intersect_sents = entity2sents[_h] & entity2sents[_t]

        for _s in intersect_sents:
            _ss = _s.strip()

            # Try all pos/neg relation templates, save the best template
            _max_pos_ev = _best_evidence(_ss, _pos_templates, _h, _t)
            _max_neg_ev = _best_evidence(_ss, _neg_templates, _h, _t)

            if _max_pos_ev[-1] > p_thres:
                _pos_list.append(_max_pos_ev)
            if _max_neg_ev[-1] > p_thres:
                _neg_list.append(_max_neg_ev)

    for _evidences in pos_evidences.values():
        _evidences.sort(key=lambda p : p[-1], reverse=True)
    for _evidences in neg_evidences.values():
        _evidences.sort(key=lambda p : p[-1], reverse=True)

    return pos_evidences, neg_evidences

In [None]:
# Run the evidence search with the default threshold (p_thres=0.7).
# With default arguments only a subsample of the relations is scored
# (every 200th, starting at index 200).
pos, neg = find_evidences_RE(df_relations)

In [None]:
# Show positive evidence as a plain dict: (head, rel, tail) -> list of evidences.
dict(pos)

In [None]:
# Show negative evidence as a plain dict: (head, rel, tail) -> list of evidences.
dict(neg)

In [99]:
# Use script
# Presumably the scripted counterpart of find_evidences_RE (see its
# "TODO: to script"). Flag meanings are inferred from the values passed here
# (data dir, benchmark dir, output file, RoBERTa model dir, Roberta_SES
# checkpoint, probability threshold) — TODO confirm against the script's
# argument parser. The logged run below took several hours.
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-o $base_dir/data/$data_ac/intermediate/kv_evidences.json \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p 0.7

Loading files...
Finding evidence for rels: 100%|██████████| 3600/3600 [4:46:02<00:00,  4.77s/it]


In [102]:
# Evaluate
# Load the benchmark relations that the filtered evidence is compared against.
benchmark_path = os.path.join(base_dir, 'data/indeed-benchmark/benchmark_evidence_clean.csv')
benchmark_relations_list = load_benchmark_relations(benchmark_path)

len(benchmark_relations_list)

138

In [103]:
# Read the evidence file written by the verification script: one JSON record per line.
kv_evidences_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/kv_evidences.json')

with open(kv_evidences_path, 'r') as f:
    kv_evidences = list(map(json.loads, f))
len(kv_evidences)

933

In [126]:
p_thres = 0.9

# Keep the (head, tail) pair of every record whose first positive or first
# negative evidence scores above p_thres. The first entry is presumably the
# strongest one, assuming the lists are stored sorted by score.
kv_filtered_rels = []
for d in kv_evidences:
    _h, _r, _t = d['relation']
    _evidence_lists = (d['pos_evidences'], d['neg_evidences'])
    if any(len(_evs) > 0 and _evs[0][-1] > p_thres for _evs in _evidence_lists):
        kv_filtered_rels.append((_h, _t))

len(kv_filtered_rels)

666

In [127]:
# Compare as sets of tuples: each benchmark record contributes the tuple of its
# values, in the record's own key order.
benchmark_relations_set = {tuple(d.values()) for d in benchmark_relations_list}
kv_filtered_rels_set = set(kv_filtered_rels)

intersection = benchmark_relations_set.intersection(kv_filtered_rels_set)

# sizes: benchmark, filtered predictions, overlap
len(benchmark_relations_set), len(kv_filtered_rels_set), len(intersection)


(107, 665, 6)

In [128]:
# Pairs found both in the benchmark and among the evidence-filtered relations.
intersection

{('best buy', 'uniform'),
 ('costco', 'hair color'),
 ('dd', 'facial hair'),
 ('dollar tree', 'uniform'),
 ('family dollar', 'facial hair'),
 ('walmart', 'uniform')}

In [129]:
# For every record whose (head, tail) pair is in the benchmark overlap, print
# the pair followed by its single highest-scoring evidence of either polarity.
for d in kv_evidences:
    _h, _r, _t = d['relation']
    _pos_evs = d['pos_evidences']
    _neg_evs = d['neg_evidences']

    if (_h, _t) not in intersection:
        continue

    print(_h, _t)
    _ranked = _pos_evs + _neg_evs
    _ranked.sort(key=lambda p : p[-1], reverse=True)
    _max = _ranked[0]
    print(_max)
    print()

walmart uniform
['walmart : no a uniform shirt is required', "walmart doesn't require uniform", 0.9957914352416992]

best buy uniform
['best buy : the required uniform', 'best buy requires uniform', 0.989712119102478]

dollar tree uniform
['dollar tree : yes we have to where uniform that we supply ourselves .', 'dollar tree requires uniform', 0.9870349764823914]

costco hair color
["costco wholesale : costco does n't discriminate with any hair color", 'costco allows hair color', 0.9456473588943481]

dd facial hair
["dunkin' donuts : yes , dd does allow facial hair .", 'dd allows facial hair', 0.9918413162231445]

family dollar facial hair
['family dollar : yes , they allow facial hair .', 'family dollar allows facial hair', 0.9905114769935608]



In [None]:
# Rank relations by their single strongest evidence (positive or negative).
# New dicts are built instead of editing the records in place: list(kv_evidences)
# would only be a shallow copy, so assigning into its dicts would also overwrite
# the evidence lists stored in kv_evidences and add a 'max_ev' key to them.
test_kv_evidences = []

for d in kv_evidences:
    # keep at most the first 5 evidences per polarity for display
    _pos_evs = d['pos_evidences'][:5]
    _neg_evs = d['neg_evidences'][:5]

    _candidates = _pos_evs + _neg_evs
    if not _candidates:
        # No evidence on either side: there is nothing to rank for this record.
        continue

    test_kv_evidences.append({
        **d,
        'pos_evidences': _pos_evs,
        'neg_evidences': _neg_evs,
        # max() returns the first highest-scoring item, like sorted(...)[0] did
        'max_ev': max(_candidates, key=lambda p : p[-1]),
    })

sorted(test_kv_evidences, key=lambda d : d['max_ev'][-1], reverse=True)[:20]

In [None]:
# Discussions:
# coherence clustering / ensemble models?
# trying for other relations or entities
# using entities in sub-categories
# fine-tuning
# ambiguous samples (high for pos and neg)
# quantitative-evaluation

# Mine Prompts

In [108]:
# Explore various techniques
# Get prompts "between" entities
# Get prompts by syntactic parsing
# Get prompts by paraphrasing
# Get prompts using AutoPrompt

In [109]:
# visit here: /meg-kb/src/analysis/pattern_mining.ipynb

# Retrieve Prompt Evidence

In [109]:
# visit here: /meg-kb/src/analysis/lm_probing.ipynb

# Suggest Quality Prompts