In [316]:
base_dir = '/mnt/efs/shared/meg_shared_scripts/meg-kb'

In [660]:
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [809]:
from tqdm.notebook import tqdm
import argparse
import re
import numpy as np
from scipy.spatial.distance import cosine
import random
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import json

import logging
from sklearn.cluster import KMeans, AgglomerativeClustering
import pandas as pd
import os
import math
from annoy import AnnoyIndex

from spacy.lang.en import English
nlp = English()
spacy_tokenizer = nlp.tokenizer

from compute_concept_clusters import load_embeddings, knn

from compute_keyphrase_embeddings import get_masked_contexts, ensure_tensor_on_device, mean_pooling

# Data Preprocessing

In [564]:
# Input: text corpus
# step 1: extract key phrases (autophrase)
# step 2: generate embeddings

## Extract Key Phrases

In [565]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/keyword_extraction

In [566]:
#change to keyword extractor directory
%cd $base_dir/src/keyword_extraction/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction


In [567]:
!chmod +x ./corpusProcess.sh

In [568]:
# select the dataset and thread no
data_ac = 'indeeda-meg-ac'
data_pt = 'indeeda-meg-pt'
thread = 8

In [463]:
# process corpus and generate key prhases
!./corpusProcess.sh $data_ac $thread

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction
[32m===Corpus Name: sample-indeeda-meg-ac===[m
[32m===Current Path: /mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction===[m
[32m===Cleaning input corpus===[m
[32m===Running AutoPhrase===[m
make: Nothing to be done for 'all'.
[32m===RAW_TRAIN: ../../../data/sample-indeeda-meg-ac/source/corpus.clean.txt===[m
auto_phrase.sh parameters: sample-indeeda-meg-ac ../../../data/sample-indeeda-meg-ac/source/corpus.clean.txt 10 data/EN/wiki_quality.txt 8
[32m===Compilation===[m
[32m===Tokenization===[m
Current step: Tokenizing input file...[0K
real	0m0.702s
user	0m1.668s
sys	0m0.100s
Detected Language: EN[0K
Current step: Tokenizing wikipedia phrases...[0K
No provided expert labels.[0K
[32m===Part-Of-Speech Tagging===[m
[32m===AutoPhrasing===[m
=== Current Settings ===
Iterations = 2
Minimum Support Threshold = 10
Maximum Length Threshold = 6
POS-Tagging Mode Disabled
Discard Ratio = 0.050000
Numbe

In [464]:
# copy these results to sample-meg-pt
!cp -r ../../data/$data_ac ../../data/$data_pt

# Generate Embeddings

In [465]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [466]:
#change to concept learning directory
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## Sentence Embedding

In [467]:
!python compute_keyphrase_embeddings.py -m bert-base-uncased -et ac -d ../../data/$data_ac/intermediate -c 750

loading corpus: 100%|█████████████████████| 694/694 [00:00<00:00, 194471.34it/s]
computing entity-wise embedding: 100%|████████| 177/177 [00:03<00:00, 50.59it/s]
Saving embedding


## Concatenated Token Embedding

In [468]:
!python compute_keyphrase_embeddings.py -m bert-base-uncased -et pt -d ../../data/$data_pt/intermediate -c 750

loading corpus: 100%|█████████████████████| 694/694 [00:00<00:00, 191566.11it/s]
computing entity-wise embedding: 100%|████████| 177/177 [00:03<00:00, 53.88it/s]
Saving embedding


## Token Embedding

In [469]:
# change directory to autophrase
%cd $base_dir/src/tools/AutoPhrase

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/tools/AutoPhrase


In [470]:
data_corel = 'sample-indeeda-corel'

In [471]:
!CUDA_VISIBLE_DEVICES=0 python extractBertEmbedding.py ../../../data/$data_corel/intermediate/ $thread

1
2021-06-18 00:36:18,384 : INFO : loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ubuntu/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2021-06-18 00:36:18,776 : INFO : loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ubuntu/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
2021-06-18 00:36:18,777 : INFO : Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embedding

## Add embeddings for seed instances

In [714]:
def load_seed_concepts(path):
    df = pd.read_csv(path)    
    df["seedInstances"] = df["seedInstances"].map(lambda s : eval(str(s)))
    return df

def load_seed_aligned_concepts(path):
    df = pd.read_csv(path)
    df = df[df["generalizations"] != "x"]
    df["seedInstances"] = df["seedInstances"].map(lambda s : eval(str(s)))
    return df

def load_seed_aligned_relations(path):
    df = pd.read_csv(path)
    df = df[df["range"] != "x"]
    return df

In [712]:
def get_masked_contexts_for_entities(entities, input_file):
    """Return a (list of) sentence(s) with entity replaced with MASK."""
    """YS: input should be sentences.json"""
    
    ent_freq = {ent : 0 for ent in entities}
    ent_context = {ent : [] for ent in entities}
    
    with open(input_file, "r") as fin:
        lines = fin.readlines()
        for line in tqdm(lines, total=len(lines), desc="loading corpus"):
            json_dict = json.loads(line)
            sent = ' ' + ' '.join(json_dict['tokens']).lower() + ' '
            #entities = [match.group(1) for match in re.finditer(pat, line)]
            
            for entity in entities:
                pat = f' {entity} '
                if pat not in sent:
                    continue

                context = sent.replace(pat, ' [MASK] ').strip()
                c = context.split('[MASK]')
                if len(c) != 2:  # sanity to not have too many repeating phrases in the context
                    continue

                # ignore too short contexts
                if len(context) < 15:
                    continue

                # print(entity)
                # print(context)
                
                _freq = ent_freq.get(entity, 0)
                ent_freq[entity] = _freq + 1

                context_lst = ent_context.get(entity, [])
                context_lst.append(context)
                ent_context[entity] = context_lst

    dedup_context = {}
    for e, v in ent_context.items():
        dedup_context[e] = list(set(v))
    return ent_freq, dedup_context


In [723]:
def get_avg_context_embedding_for_entities(entities, model_path, input_file, max_context_ct):
    '''
    mean pooling from sentence-transformers
    :param entity: List[str], the entities to compute embeddings for
    :param model_path:
    :param input_file:
    :return:
    '''
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    mask_token_id = tokenizer.mask_token_id

    ent_freq, ent_context = get_masked_contexts_for_entities(entities, input_file)
    
    entity_embeddings = {}
    for entity, en_context_lst in tqdm(ent_context.items(), total=len(ent_context), desc="computing entity-wise embedding"):
#     for entity, en_context_lst in ent_context.items():
        print(entity)
        en_context_lst = random.sample(en_context_lst, min(len(en_context_lst), max_context_ct))
        chunks = [en_context_lst[i:i + 100] for i in range(0, len(en_context_lst), 100)]
        # print(entity)
        # print(len(en_context_lst))
        all_context_embeddings = []
        for chunk in chunks:
            encoded_input = tokenizer.batch_encode_plus(chunk, return_token_type_ids=True, add_special_tokens=True, max_length=128, return_tensors='pt', padding=True, pad_to_max_length=True, truncation=True)
            mask = encoded_input['input_ids'] != mask_token_id
            with torch.no_grad():
                encoded_input = ensure_tensor_on_device(device, **encoded_input)
                model_output = model(**encoded_input)  # Compute token embeddings
            context_embeddings = mean_pooling(model_output, mask)  # mean pooling
            all_context_embeddings.append(context_embeddings)
            
        assert len(all_context_embeddings) > 0
            
        entity_embedding = torch.mean(torch.cat(all_context_embeddings, dim=0), dim=0).cpu().detach().numpy().tolist()
        entity_embeddings[entity] = entity_embedding
    
    return entity_embeddings, ent_freq

In [None]:
corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences.json')
seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')

orig_bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed.txt')
orig_bert_emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum.txt')

new_bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')
new_bert_emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum+seeds.txt')

orig_emb_df = load_embeddings(bert_emb_path, 768)
emb_dict = dict(zip(orig_emb_df['entity'].to_list(), orig_emb_df['embedding'].to_list()))

with open(orig_bert_emb_num_path, 'r') as f:
    lines = f.readlines()
    emb_freq_dict = dict([l.strip().rsplit(' ', 1) for l in lines])

concepts_df = load_seed_aligned_concepts(seed_aligned_concepts_path)
seed_instances_list = [inst for _, (_a_con, _u_con, _gnrl, _seed_instances) in concepts_df.iterrows()
                           for inst in _seed_instances]

## debug
seed_instances_list = seed_instances_list[::10]

print(seed_instances_list)

entity_embeddings, ent_freq = \
    get_avg_context_embedding_for_entities(entities=seed_instances_list, 
                                           model_path='bert-base-uncased',
                                           input_file=corpus_path,
                                           max_context_ct=750)

In [None]:
for inst in seed_instances_list:
    emb = entity_embeddings[inst]
    freq = ent_freq[inst]
    if inst in emb_dict:
        print(f'Already exists: {inst}')
#         assert np.allclose(emb_dict[inst], emb)
#         assert emb_freq_dict[inst] == freq, f'{inst}: orig {emb_freq_dict[inst]} != new {freq}'
#         print(f'Check passed: {inst}')
    else:
        emb_dict[inst] = emb
        emb_freq_dict[inst] = freq

In [None]:
entity_embeddings.keys()

In [None]:
with open(new_bert_emb_path, 'w') as f, open(new_bert_emb_num_path, 'w') as f2:
    for inst in seed_instances_list:
        emb = emb_dict[inst]
        freq = ent_freq[inst]
        f.write("{} {}\n".format(inst, ' '.join([str(x) for x in emb])))
        f2.write("{} {}\n".format(inst, freq))


In [735]:
# Using script

!python add_seed_instances_embeddings.py -m bert-base-uncased -et ac -d $base_dir/data/$data_ac/intermediate -b $base_dir/data/indeed-benchmark -c 750


Seed instances: ['walmart', 'amazon', 'subway', 'microsoft', 'target', 'business casual', 'uniform', 'hair color', 'tattoos', 'facial hair', 'shoes', 'piercings', 'delivery driver', 'store manager', 'cashier', 'package handler', 'sales associate', 'barista', 'dishwasher', 'weekly', 'biweekly', 'friday', 'saturday', 'health insurance', 'flexible schedule', '401k', 'paid vacation', 'sick leave', 'vision insurance', 'base pay', 'stock options', 'benefits', 'overtime pay', 'bonus', 'checks', 'direct deposit', 'prepaid card', 'drug test', 'criminal background check', 'employment verification', 'felons', 'criminals', 'disabled', 'drug addicts', 'high schoolers', 'misdemeanor', 'pregnant', 'students', 'seniors', 'hiring age', 'bachelors degree', 'prior experience', 'working permit', 'heavy lifting', 'night shift', 'dinner shift', 'early morning shift', '8 hour shift', 'christmas eve', 'early morning', 'hoilday', '7 days', 'saturday', 'sunday', 'weekend', 'full time', 'part time', 'seasonal', 

## Check embeddings

In [738]:
data_sub_dir = data_ac
bert_emb_path = os.path.join(base_dir, f'data/{data_sub_dir}/intermediate/BERTembed+seeds.txt')

embeddings = load_embeddings(bert_emb_path, 768)
len(embeddings)

8053

In [739]:
embeddings[embeddings['entity'] == 'biweekly']

Unnamed: 0,entity,embedding
8023,biweekly,"[0.06975648552179337, -0.06970633566379547, 0...."


## (X) Other ways of embeddings / clustering

In [155]:
input_file_path = os.path.join(base_dir, f'data/{data_sub_dir}/intermediate/sent_segmentation.txt')
ent_freq, dedup_context = get_masked_contexts(input_file_path)
len(ent_freq), len(dedup_context)

loading corpus: 100%|██████████| 458/458 [00:00<00:00, 73813.30it/s]


(175, 175)

In [452]:
ent_freq['candy'], dedup_context['candy']

(2,
 ["we dropped by in hopes of finding atkinson 's peanut_butter bars ( we first tried them from honey salt 's [MASK] bowl ) and after searching a few minutes , we found it .",
  "if you 're searching for a [MASK] or soda_pop you grew up with and can no longer find , there 's a good chance you 'll find it here ."])

In [208]:
def get_all_context_embeddings(model_path, input_file, max_context_ct):
    '''
    Adapted from get_avg_context_embeddings()
    keep all context embeddings, using max similarity for knn
    :param model_path:
    :param input_file:
    :return:
    '''
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    mask_token_id = tokenizer.mask_token_id

    ent_freq, ent_context = get_masked_contexts(input_file)
    entity_embeddings = {}
    for entity, en_context_lst in tqdm(ent_context.items(), total=len(ent_context), desc="computing entity-wise embedding"):
        en_context_lst = random.sample(en_context_lst, min(len(en_context_lst), max_context_ct))
        chunks = [en_context_lst[i:i + 100] for i in range(0, len(en_context_lst), 100)]
        # print(entity)
        # print(len(en_context_lst))
        all_context_embeddings = []
        for chunk in chunks:
            encoded_input = tokenizer.batch_encode_plus(chunk, return_token_type_ids=True, add_special_tokens=True, max_length=128, return_tensors='pt', padding=True, pad_to_max_length=True, truncation=True)
            mask = encoded_input['input_ids'] != mask_token_id
            with torch.no_grad():
                encoded_input = ensure_tensor_on_device(device, **encoded_input)
                model_output = model(**encoded_input)  # Compute token embeddings
            context_embeddings = mean_pooling(model_output, mask)  # mean pooling
            # print(context_embeddings.size())
            all_context_embeddings.append(context_embeddings)
            
        # entity_embedding = torch.mean(torch.cat(all_context_embeddings, dim=0), dim=0).cpu().detach().numpy().tolist()
        # entity_embeddings[entity] = entity_embedding
        entity_embeddings[entity] = torch.cat(all_context_embeddings, dim=0).cpu().detach().numpy().tolist()
        
    return entity_embeddings, ent_freq

In [209]:
model_path = 'bert-base-uncased'
input_file_path = os.path.join(base_dir, f'data/{data_sub_dir}/intermediate/sent_segmentation.txt')
max_context_ct = 10

entity_embeddings, ent_freq = get_all_context_embeddings(model_path, input_file_path, max_context_ct)
len(entity_embeddings), len(ent_freq)

loading corpus: 100%|██████████| 458/458 [00:00<00:00, 150194.78it/s]
computing entity-wise embedding: 100%|██████████| 175/175 [00:04<00:00, 41.74it/s]


(175, 175)

In [213]:
len(entity_embeddings['candy'][0])

768

In [234]:
def _knn(entity_embeddings, embedding_dim, cluster_size, thread_ct=None, cluster_dest=None, **kwargs):
    # entity_embeddings = load_embeddings(embed_src, embedding_dim)
    
    t = AnnoyIndex(embedding_dim, 'angular')
    # entities = entity_embeddings['entity'].tolist()
    entities = [f'{entity}-{_i}' for entity, embs in entity_embeddings.items() for _i in range(len(embs))]
    # print(entities)
    # for i, row in tqdm(entity_embeddings.iterrows(), total=entity_embeddings.shape[0], desc="building entity index"):
    #     t.add_item(i, row['embedding'])
    i = 0
    for entity, embs in tqdm(entity_embeddings.items(), total=len(entity_embeddings)):
        for emb in embs:
            t.add_item(i, emb)
            i += 1
    assert i == len(entities)
    
    t.build(100)
    
    neighbors = []
    for i, entity in enumerate(tqdm(entities, desc="finding nearest neighbors by entity")):
        # print(i, entity)
        nns, dists = t.get_nns_by_item(i, cluster_size + 1, include_distances=True)
        cos_sim_scores = [(2 - d ** 2) / 2 for d in dists]  # convert angular distance to cosine similarity
        zipped = list(zip(nns, cos_sim_scores))
        sorted_nns = sorted(zipped, key=lambda x: x[1], reverse=True)
        if len(sorted_nns) > 0:
            for nn_idx, d in sorted_nns:
                neighbor_entity = entities[nn_idx]
                if neighbor_entity == entity:
                    continue
                neighbors.append({"entity": entity, "neighbor": neighbor_entity, "sim": d})
    c_df = pd.DataFrame(neighbors)
    return c_df

In [235]:
knn_results = _knn(entity_embeddings, 768, 20)

100%|██████████| 175/175 [00:00<00:00, 24854.50it/s]
finding nearest neighbors by entity: 100%|██████████| 269/269 [00:00<00:00, 6006.44it/s]


In [None]:
query = 'meat'

df = knn_results

n_embs = len(entity_embeddings[query])
sub_frames = []
for _i in range(n_embs):
    ent_name = f'{query}-{_i}'
    sub_frames.append(df[df['entity'] == ent_name])

pd.concat(sub_frames).sort_values('sim', ascending=False).head(10)

In [None]:
# original avg context knn 
knn_path = os.path.join(base_dir, f'data/{data_sub_dir}/intermediate/knn_100.csv')

knn_results = pd.read_csv(knn_path)
df = knn_results

query = 'walmart'
sub_frame = df[df['entity'] == query]
sub_frame.sort_values('sim', ascending=False).head(10)

# Expand Seed Entities (clustering)

In [99]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [299]:
#change to concept learning directory
%cd ../../concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## knn sentence-embedding

In [365]:
clusters = 100
output = '../../data/'+data_ac+'/intermediate/knn_'+str(clusters)+'.csv'
dim = 768

In [366]:
!python compute_concept_clusters.py -d ../../data/$data_ac/intermediate/ -ca knn -s $clusters -dim $dim -o $output

building entity index: 100%|████████████████| 177/177 [00:00<00:00, 5435.26it/s]
finding nearest neighbors by entity: 100%|██| 177/177 [00:00<00:00, 2001.57it/s]


## knn token concatenated

In [308]:
clusters = 20
output = '../../data/'+data_pt+'/intermediate/knn_'+str(clusters)+'.csv'
dim = 3072

In [309]:
!python compute_concept_clusters.py -d ../../data/$data_pt/intermediate/ -ca knn -s $clusters -dim $dim -o $output

building entity index: 100%|████████████████| 177/177 [00:00<00:00, 3661.18it/s]
finding nearest neighbors by entity: 100%|██| 177/177 [00:00<00:00, 4052.00it/s]


## knn token

In [None]:
clusters = 20
output = '../../data/'+data_pt+'/intermediate/knn_'+str(clusters)+'.csv'
dim = 768

In [None]:
!python compute_concept_clusters.py -d ../../data/$data_corel/intermediate/ -ca knn -s $clusters -dim $dim -o $output

## Analyzing Clustering Results

In [107]:
#Visit here: /meg_shared_scripts/meg-kb/src/analysis/concept_learning-test.ipynb

## Seed instances clustering
(using all seed instances of a concept to find neighbors)

In [571]:
seed_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_concepts.csv')
seed_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_relations.csv')

seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')
seed_aligned_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_relations.csv')

In [744]:
def get_concept_knn(embed_src, embedding_dim, seed_aligned_concept_src, cluster_size, thread_ct, cluster_dest, **kwargs):
    seed_concepts_df = load_seed_aligned_concepts(seed_aligned_concept_src)
    
    entity_embeddings = load_embeddings(embed_src, embedding_dim)
    t = AnnoyIndex(embedding_dim, 'angular')
    entities = entity_embeddings['entity'].tolist()
    for i, row in tqdm(entity_embeddings.iterrows(), total=entity_embeddings.shape[0], desc="building entity index"):
        t.add_item(i, row['embedding'])
    t.build(100)
    
    entity_emb_dict = dict(zip(entities, entity_embeddings['embedding'].tolist()))

    neighbors = []
    for i, (a_concept, u_concept, gnrl, seed_instances) in tqdm(seed_concepts_df.iterrows(), desc="finding nearest neighbors by concept"):
        embs = []
        for inst in seed_instances:
            try:
                embs.append(entity_emb_dict[inst])
            except KeyError:
                print(f"{inst} not found in entity_emb_dict??")
                continue
        if len(embs) == 0:
            continue
        concept_emb = np.mean(embs, axis=0)
        
        nns, dists = t.get_nns_by_vector(concept_emb, cluster_size + 1, include_distances=True)
        cos_sim_scores = [(2 - d ** 2) / 2 for d in dists]  # convert angular distance to cosine similarity
        zipped = list(zip(nns, cos_sim_scores))
        sorted_nns = sorted(zipped, key=lambda x: x[1], reverse=True)
        if len(sorted_nns) > 0:
            for nn_idx, d in sorted_nns:
                neighbor_entity = entities[nn_idx]
                if neighbor_entity in seed_instances:
                    continue
                neighbors.append({"concept": a_concept, "neighbor": neighbor_entity, "sim": d})
    c_df = pd.DataFrame(neighbors)
    c_df.to_csv(cluster_dest, index=None)

In [745]:
cluster_size = 1000

bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')
seed_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_concepts.csv')
seed_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_relations.csv')
concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/concept_knn_{cluster_size}.csv')

get_concept_knn(embed_src=bert_emb_path,
            embedding_dim=768,
            seed_aligned_concept_src=seed_aligned_concepts_path,
            cluster_size=1000,
            thread_ct=1,
            cluster_dest=concept_knn_path)


HBox(children=(FloatProgress(value=0.0, description='building entity index', max=8053.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='finding nearest neighbors by concept', …




In [748]:
df = pd.read_csv(concept_knn_path)
df[df['concept'] == 'company'].head(10)

Unnamed: 0,concept,neighbor,sim
0,company,wal mart,0.997038
1,company,costco,0.997013
2,company,publix,0.996753
3,company,walgreens,0.996623
4,company,kroger,0.996477
5,company,home depot,0.996124
6,company,sam 's club,0.995978
7,company,dollar general,0.995846
8,company,family dollar,0.995645
9,company,jcpenney,0.995513


In [750]:
df = pd.read_csv(concept_knn_path)
df[df['concept'] == 'pay_schedule'].head(10)

Unnamed: 0,concept,neighbor,sim
2985,pay_schedule,sunday,0.989753
2986,pay_schedule,weekend,0.989047
2987,pay_schedule,7 days,0.984987
2988,pay_schedule,part time,0.984129
2989,pay_schedule,8 hour shift,0.982685
2990,pay_schedule,bonus,0.981976
2991,pay_schedule,full time,0.981485
2992,pay_schedule,seasonal,0.979928
2993,pay_schedule,training,0.978852
2994,pay_schedule,orientation,0.978359


## Entity expansion evaluation
Now using benchmark entities, mean reciprocal rank

In [751]:
seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')
seed_aligned_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_relations.csv')
benchmark_path = os.path.join(base_dir, f'data/indeed-benchmark/benchmark.csv')
concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/concept_knn_1000.csv')

seed_aligned_concepts = load_seed_aligned_concepts(seed_aligned_concepts_path)
seed_aligned_relations = load_seed_aligned_relations(seed_aligned_relations_path)
benchmark = pd.read_csv(benchmark_path)
concept_knn = pd.read_csv(concept_knn_path)

print(seed_aligned_concepts['alignedCategoryName'].tolist())
print(seed_aligned_relations['alignedRelationName'].tolist())
print(set(concept_knn['concept'].tolist()))
print(benchmark.shape)

['company', 'dress_code', 'job_position', 'pay_schedule', 'benefits', 'compensation', 'payment_option', 'background_screening', 'person', 'hire_prerequisite', 'shifts', 'schedule', 'employee_type', 'onboarding_steps']
['has_pay_schedule', 'has_pay_schedule', 'has_dress_code', 'has_dress_code', 'has_background_screening', 'has_benefits', 'has_benefits', 'hires_person', 'has_compensation', 'has_compensation', 'has_hire_prerequisite', 'operates_on', 'hires_employee_type', 'has_onboarding_steps', 'has_shifts', 'has_shifts', 'has_job_position', 'has_hiring_policy', 'has_payment_option']
{'background_screening', 'compensation', 'schedule', 'dress_code', 'benefits', 'job_position', 'payment_option', 'hire_prerequisite', 'employee_type', 'shifts', 'pay_schedule', 'onboarding_steps', 'person', 'company'}
(706, 16)


In [753]:
for i, d in seed_aligned_concepts.iterrows():
    a_concept = d["alignedCategoryName"]
    u_concept = d["unalignedCategoryName"]
    seed_instances = d["seedInstances"]

    concept_knn_instances = concept_knn[concept_knn["concept"] == a_concept]["neighbor"].to_list()
    
    _b_head_instances = benchmark[benchmark["n_head_category"] == a_concept]["n_head"].to_list()
    _b_tail_instances = benchmark[benchmark["n_tail_category"] == a_concept]["n_tail"].to_list()
    benchmark_instances = list(set(_b_head_instances + _b_tail_instances))
    
    print(f'Concept: {a_concept} / {u_concept}')
    print(f'seeds: {seed_instances}')
#     print(f'expanded (concept_knn_instances): {concept_knn_instances}')
#     print(f'benchmark_instances: {benchmark_instances}')
    b_inst_ranks = dict()
    recip_ranks = []
    for _inst in benchmark_instances:
        if _inst in seed_instances:
            b_inst_ranks[_inst] = -1
        elif _inst in concept_knn_instances:
            _rank = concept_knn_instances.index(_inst) + 1
            b_inst_ranks[_inst] = _rank
            recip_ranks.append(1.0 / _rank)
        else:
            b_inst_ranks[_inst] = float('nan')
            recip_ranks.append(0.0)
        
    print(json.dumps(b_inst_ranks, indent=4))
    print('MRR:', np.mean(recip_ranks))
    print()

Concept: company / company
seeds: ['walmart', 'amazon', 'subway', 'microsoft', 'target']
{
    "chipotle": 44,
    "olive garden": 28,
    "planet fitness": 52,
    "chilis": 88,
    "dunkin donuts": 43,
    "subways": NaN,
    "lowes": NaN,
    "mcdonalds": 24,
    "taco bell": 25,
    "dollar tree": 15,
    "kroger": 5,
    "primark": 121,
    "amazon.com": NaN,
    "pizza hut": 11,
    "bonus": NaN,
    "safeway": 17,
    "subway": -1,
    "walgreens": 4,
    "burger king": 32,
    "hobby lobby": 18,
    "training": NaN,
    "frito lay": 42,
    "walmart": -1,
    "tj maxx": 45,
    "frito": NaN,
    "company": 54,
    "starbucks": 12,
    "petsmart": 33,
    "extensive background check": NaN,
    "electric": NaN,
    "cvs": 29,
    "home depot": 6,
    "g4s": 48,
    "heb": NaN,
    "instacart": NaN,
    "ihop": 39,
    "costco": 2,
    "marshalls": 26,
    "pepsi": 35,
    "pepsico": 30,
    "extensive background checks": 606,
    "dd": 61,
    "target": -1,
    "tim hortons": 97,

# Relation Extraction Baselines
Currently only for has_dress_code. TODO: include all other relations

In [421]:
# Imported from lm_probing.ipynb 
# TODO: for scoring purpose, maybe better to use GPT-2

class LMProbe(object):
    def __init__(self, model_name='bert-base-uncased', use_gpu=False):
        self.device = torch.device('cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

        self.mask_token = self.tokenizer.mask_token

    def fill_multi_mask(self, input_txt, topk=3):
        if not (input_txt.startswith('[CLS]') and input_txt.endswith('[SEP]')):
            raise Exception('Input string must start with [CLS] and end with [SEP]')
        if not '[MASK]' in input_txt:
            raise Exception('Input string must have at least one mask token')
        tokenized_txt = self.tokenizer.tokenize(input_txt)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_txt)
        tokens_tensor = torch.tensor([indexed_tokens])
        mask_indices = [i for i, x in enumerate(tokenized_txt) if x == "[MASK]"]
        segment_idx = tokens_tensor * 0
        tokens_tensor = tokens_tensor.to(self.device)
        segments_tensors = segment_idx.to(self.device)

        with torch.no_grad():
            outputs = self.model(tokens_tensor, token_type_ids=segments_tensors)
            predictions = outputs[0]

        probs = torch.softmax(predictions, dim=-1)[0]
        sorted_probs, sorted_idx = probs.sort(dim=-1, descending=True)
        sorted_probs = sorted_probs.detach().cpu().numpy()
        sorted_idx = sorted_idx.detach().cpu().numpy()

        masked_cands = []
        for k in range(topk):
            predicted_indices = [sorted_idx[i, k].item() for i in mask_indices]
            predicted_tokens = self.tokenizer.convert_ids_to_tokens(predicted_indices)
            predicted_probs = [sorted_probs[i, k].item() for i in mask_indices]
            seq = []
            for token_id, token, prob, masked_index in zip(predicted_indices, predicted_tokens, predicted_probs,
                                                           mask_indices):
                seq.append({"token": token_id, "token_str": token, "prob": prob, "masked_pos": masked_index})
            masked_cands.append(seq)

        return masked_cands
    
    def score_candidates(self, input_txt, cands):
        # cands: List[List[str]], list of tokenized candidates 
        tokenized_txt = self.tokenizer.tokenize(input_txt)
        
        if tokenized_txt[0] != "[CLS]" or tokenized_txt[-1] != "[SEP]":
            raise Exception(f'Input string must start with [CLS] and end with [SEP], got {input_txt}')
        if "[MASK]" not in tokenized_txt:
            raise Exception(f'Input string must have at least one mask token, got {input_txt}')
        
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_txt)
        tokens_tensor = torch.tensor([indexed_tokens])
        mask_indices = [i for i, x in enumerate(tokenized_txt) if x == "[MASK]"]
        segment_idx = tokens_tensor * 0
        tokens_tensor = tokens_tensor.to(self.device)
        segments_tensors = segment_idx.to(self.device)

        with torch.no_grad():
            outputs = self.model(tokens_tensor, token_type_ids=segments_tensors)
            predictions = outputs[0]

        probs = torch.softmax(predictions, dim=-1)[0]
        # sorted_probs, sorted_idx = probs.sort(dim=-1, descending=True)
        # sorted_probs = sorted_probs.detach().cpu().numpy()
        # sorted_idx = sorted_idx.detach().cpu().numpy()
        probs = probs.detach().cpu().numpy()

        cand_scores = []
        for c in cands:
            assert len(c) == len(mask_indices), f'cand {c}; len(mask_indices) = {len(mask_indices)}'
            
            # predicted_indices = [sorted_idx[i, k].item() for i in mask_indices]
            # predicted_tokens = self.tokenizer.convert_ids_to_tokens(predicted_indices)
            # predicted_probs = [sorted_probs[i, k].item() for i in mask_indices]
            _scores = []
            c_token_ids = self.tokenizer.convert_tokens_to_ids(c)
            for i, token_id in zip(mask_indices, c_token_ids):
                _scores.append(probs[i, token_id].item())
            score = np.prod(_scores)
            cand_scores.append({"cand": c, "score": score})

        cand_scores.sort(key=lambda d : d["score"], reverse=True)
        return cand_scores
    

In [None]:
lm_probe = LMProbe()

In [598]:
# Hand-designed. 
# TODO: mining 
# TODO: put to a file

offers_job_position_templates = [
    '{0} hires {1} .',
    '{0} is hiring {1} .',
    '{0} can hire you as a {1} .',
#     'You can get a {1} job at {0} .',
#     'Double check with the {1} at {0} .'
]

has_benefits_templates = [
    '{0} offer {1} for their employees.',
    '{0} provide {1} for employees.',
    '{0} have {1} for their employees.',
]

has_pay_schedule_templates = [
    '{0} pay their employees every {1}',
    '{0} has a pay schedule of {1}',
    '{0} employees get paid {1}',
]

has_dress_code_templates = [
    '{0} don\'t allow workers to wear {1}',
    '{0} allow workers to wear {1}',
    '{0} has a dress code of {1}',
    '{0} require employees to wear {1}',
]

In [348]:
def get_direct_probing_candidates(templates,
                                  lm_probe=None,
                                  head_entity=None,
                                  tail_entity=None,
                                  context=None,
                                  topk=10):
    '''
    Direct probing: let BERT propose possible entities  
    :param templates: List[str]: each have 2 slots, {0} for head, {1} for tail 
    :return: Dict[str, float]: proposed entities and scores 
    '''
    
    # ensure given one and propose one 
    assert (head_entity is None) != (tail_entity is None), f'{head_entity} {tail_entity}'

    if lm_probe is None:
        lm_probe = LMProbe()
    
    names_scores = {}
    for template in templates:
        if head_entity is not None:
            # head -> tail 
            _unigram_template = template.format(head_entity, '[MASK]')
            _bigram_template = template.format(head_entity, '[MASK] [MASK]')
        else:
            # tail -> head 
            _unigram_template = template.format('[MASK]', tail_entity)
            _bigram_template = template.format('[MASK] [MASK]', tail_entity)
        
        for _template in [_unigram_template, _bigram_template]:
            if context:
                query = '[CLS] ' + _template + '[SEP]' + context + '[SEP]'
            else:
                query = '[CLS] ' + _template + '[SEP]'
            preds = lm_probe.fill_multi_mask(query, topk=topk)
            for pred in preds:
                name = ' '.join([p['token_str'] for p in pred])
                name = name.replace(' ##', '')
                score = np.prod([p['prob'] for p in pred])
                scores = names_scores.get(name, [])
                scores.append(score)
                names_scores[name] = scores
                
    names_avg_scores = {k: float(sum(v))/ len(v) for k,v in names_scores.items()}
    names_avg_scores = {k: v for k, v in sorted(names_avg_scores.items(), reverse=True, key=lambda item: item[1])[:topk]}
    return names_avg_scores

In [406]:
def direct_probing_RE(seed_concepts_path,
                      seed_relations_path,
                      knn_result_path,
                      lm_probe=None,
                      topk=10):
    
    seed_concepts_df = pd.read_csv(seed_concepts_path)
    seed_relations_df = pd.read_csv(seed_relations_path)
    seed_relations_df = seed_relations_df.iloc[1]  ## Only using offer_job_position for now 
    knn_results = pd.read_csv(knn_path)

    if lm_probe is None:
        lm_probe = LMProbe()
    
    head_type = seed_relations_df['domain']
    tail_type = seed_relations_df['range']
    print(head_type, '\t', tail_type)
    seed_heads = seed_concepts_df[seed_concepts_df['categoryName'] == head_type]['seedInstances']
    seed_heads = eval(list(seed_heads)[0])
    seed_tails = seed_concepts_df[seed_concepts_df['categoryName'] == tail_type]['seedInstances']
    seed_tails = eval(list(seed_tails)[0])
    
    # print(seed_heads)
    # print(seed_tails)
    
    extraction_results = []
    
    # head -> tail 
    for seed_head in seed_heads:
        lm_cands = get_direct_probing_candidates(offers_job_position_templates,
                              lm_probe=lm_probe,
                              head_entity=seed_head,
                              topk=topk)
        neighbors = knn_results[knn_results['entity'] == seed_head]['neighbor']
        
        extracted_tails = list(set(lm_cands.keys()) & set(neighbors))
        
        print(f'seed_head: {seed_head}')
        print(f'extr_tails: {extracted_tails}')
        
        for _e in extracted_tails:
            extraction_results.append({'head': seed_head, 'tail': _e, 'new': 'TAIL'})
        
    # tail -> head 
    for seed_tail in seed_tails:
        lm_cands = get_direct_probing_candidates(offers_job_position_templates,
                              lm_probe=lm_probe,
                              tail_entity=seed_tail,
                              topk=topk)
        neighbors = knn_results[knn_results['entity'] == seed_tail]['neighbor']
        
        extracted_heads = list(set(lm_cands.keys()) & set(neighbors))
        
        print(f'seed_tail: {seed_tail}')
        print(f'extr_heads: {extracted_heads}')
        
        for _e in extracted_heads:
            extraction_results.append({'head': _e, 'tail': seed_tail, 'new': 'HEAD'})
    
    return pd.DataFrame(extraction_results)

In [455]:
def direct_probing_RE_v2(seed_concepts_path,
                         seed_relations_path,
                         knn_result_path,
                         templates,
                         lm_probe=None,
                         topk=10):
    
    seed_concepts_df = pd.read_csv(seed_concepts_path)
    seed_relations_df = pd.read_csv(seed_relations_path)
    seed_relations_df = seed_relations_df.iloc[1]  ## Only using offer_job_position for now 
    knn_results = pd.read_csv(knn_path)

    if lm_probe is None:
        lm_probe = LMProbe()
    
    head_type = seed_relations_df['domain']
    tail_type = seed_relations_df['range']
    print(head_type, '\t', tail_type)
    seed_heads = seed_concepts_df[seed_concepts_df['categoryName'] == head_type]['seedInstances']
    seed_heads = eval(list(seed_heads)[0])
    seed_tails = seed_concepts_df[seed_concepts_df['categoryName'] == tail_type]['seedInstances']
    seed_tails = eval(list(seed_tails)[0])
    
    # print(seed_heads)
    # print(seed_tails)
    
    all_extraction_results = []
    
    # head -> tail 
    for seed_head in seed_heads:
        print(f'seed_head: {seed_head}')
        extraction_results = []
        
        _df = knn_results[knn_results['entity'] == seed_head]
        cand_tails = set(_df['neighbor'])
        
        print(cand_tails)
        
        cand_bins = {1: [], 2: []}
        for c in cand_tails:
            c_tokenized = lm_probe.tokenizer.tokenize(c)
            if len(c_tokenized) in [1, 2]:
                cand_bins[len(c_tokenized)].append(c_tokenized)
        
        cand_scores_per_template = []
        for template in templates:
            _unigram_template = '[CLS] ' + template.format(seed_head, '[MASK]') + '[SEP]'
            _bigram_template = '[CLS] ' + template.format(seed_head, '[MASK] [MASK]') + '[SEP]'

            _cand_scores_1 = lm_probe.score_candidates(_unigram_template, cand_bins[1])
            _cand_scores_2 = lm_probe.score_candidates(_bigram_template, cand_bins[2])
            _cand_scores = sorted(_cand_scores_1 + _cand_scores_2, key=lambda d : d["cand"])
            # List[Dict["cand", "score"]]
            cand_scores_per_template.append(_cand_scores)
    
        cand_scores = []  # List[Dict["cand", "score"]], for each "cand" the average score 
        for _cand_score_lst in zip(*cand_scores_per_template):
            # _cand_score_lst: List[Dict["cand", "score"]], for the same "cand" and different template 
            _cand = _cand_score_lst[0]["cand"]
            assert all(d["cand"] == _cand for d in _cand_score_lst), _cand_score_lst
            _score = np.mean([d["score"] for d in _cand_score_lst])
            cand_scores.append({"cand": _cand, "score": _score})
        cand_scores.sort(key = lambda d : d["score"], reverse=True)
    
#         extracted_tails = [d["cand"] for d in cand_scores[:topk]]
#         print(f'seed_head: {seed_head}')
#         print(f'extr_tails: {extracted_tails}')

        for d in cand_scores[:topk]:
            e_tail = ' '.join(d["cand"]).replace(' ##', '')
            if e_tail not in cand_tails:
                continue
            emb_score = _df[_df["neighbor"] == e_tail]["sim"].item()
            lm_score = d["score"]
            extraction_results.append({'head': seed_head, 'tail': e_tail, 'new': 'TAIL',
                                       'emb_score': emb_score, 'lm_score': lm_score})
        
        all_extraction_results.extend(extraction_results)
        
    # tail -> head 
    for seed_tail in seed_tails:
        print(f'seed_tail: {seed_tail}')
        extraction_results = []
        
        _df = knn_results[knn_results['entity'] == seed_tail]
        cand_heads = set(_df['neighbor'])
        
        print(cand_heads)
        
        cand_bins = {1: [], 2: []}
        for c in cand_heads:
            c_tokenized = lm_probe.tokenizer.tokenize(c)
            if len(c_tokenized) in [1, 2]:
                cand_bins[len(c_tokenized)].append(c_tokenized)
        
        cand_scores_per_template = []
        for template in templates:
            _unigram_template = '[CLS] ' + template.format('[MASK]', seed_tail) + '[SEP]'
            _bigram_template = '[CLS] ' + template.format('[MASK] [MASK]', seed_tail) + '[SEP]'

            _cand_scores_1 = lm_probe.score_candidates(_unigram_template, cand_bins[1])
            _cand_scores_2 = lm_probe.score_candidates(_bigram_template, cand_bins[2])
            _cand_scores = sorted(_cand_scores_1 + _cand_scores_2, key=lambda d : d["cand"])
            # List[Dict["cand", "score"]]
            cand_scores_per_template.append(_cand_scores)
            
        print(cand_scores_per_template)
    
        cand_scores = []  # List[Dict["cand", "score"]], for each "cand" the average score 
        for _cand_score_lst in zip(*cand_scores_per_template):
            # _cand_score_lst: List[Dict["cand", "score"]], for the same "cand" and different template 
            _cand = _cand_score_lst[0]["cand"]
            assert all(d["cand"] == _cand for d in _cand_score_lst), _cand_score_lst
            _score = np.mean([d["score"] for d in _cand_score_lst])
            cand_scores.append({"cand": _cand, "score": _score})
        cand_scores.sort(key = lambda d : d["score"], reverse=True)
        
        print(cand_scores)

        for d in cand_scores[:topk]:
            e_head = ' '.join(d["cand"]).replace(' ##', '')
            if e_head not in cand_heads:
                continue
            emb_score = _df[_df["neighbor"] == e_head]["sim"].item()
            lm_score = d["score"]
            extraction_results.append({'head': e_head, 'tail': sead_tail, 'new': 'HEAD',
                                       'emb_score': emb_score, 'lm_score': lm_score})
        
        all_extraction_results.extend(extraction_results)
        
    return pd.DataFrame(all_extraction_results)

In [785]:
def direct_probing_RE_v3(seed_aligned_concepts_path,
                         seed_aligned_relations_path,
                         emb_path,
                         concept_knn_path,
                         templates,
                         lm_probe=None,
                         emb_dim=768,
                         scores_agg_func=None,
                         topk=10,
                         save_path=None):
    '''
    For each head / tail, rank candidate tails / heads by overall scores. 
    Current (default) overall score: 0.1 * ht_sim + 10 * concept_sim + log(lm_prob)
    '''
    
    
    
    seed_concepts_df = load_seed_aligned_concepts(seed_aligned_concepts_path)
#     seed_relations_df = pd.read_csv(seed_relations_path)
#     seed_relations_df = seed_relations_df.iloc[1]
    entity_embeddings = load_embeddings(emb_path, emb_dim)
    entity_emb_dict = dict(zip(entity_embeddings['entity'].tolist(),
                               entity_embeddings['embedding'].tolist()))
    concept_knn_results = pd.read_csv(concept_knn_path)

    if lm_probe is None:
        lm_probe = LMProbe()
    if scores_agg_func is None:
        scores_agg_func = lambda ht_sim, concept_sim, lm_prob : 0.1 * ht_sim + 10 * concept_sim + np.log10(lm_prob)
    
#     head_type = seed_relations_df['domain']
#     tail_type = seed_relations_df['range']
    ## Just for testing
    head_type = "company"
    tail_type = "dress_code"
    print(head_type, '\t', tail_type)
    seed_heads = seed_concepts_df[seed_concepts_df['alignedCategoryName'] == head_type]['seedInstances'].item()
#     seed_heads = eval(list(seed_heads)[0])
    seed_tails = seed_concepts_df[seed_concepts_df['alignedCategoryName'] == tail_type]['seedInstances'].item()
#     seed_tails = eval(list(seed_tails)[0])
    print('seed_heads:', seed_heads)
    print('seed_tails:', seed_tails)

    # Candidate heads / tails from concept knn 
    cand_heads_df = concept_knn_results[concept_knn_results['concept'] == head_type]
    cand_tails_df = concept_knn_results[concept_knn_results['concept'] == tail_type]
    cand_heads_dict = dict(zip(cand_heads_df['neighbor'].tolist(), cand_heads_df['sim'].tolist()))
    cand_tails_dict = dict(zip(cand_tails_df['neighbor'].tolist(), cand_tails_df['sim'].tolist()))
    for h in seed_heads:
        assert h not in cand_heads_dict
        cand_heads_dict[h] = 1.0
    for t in seed_tails:
        assert t not in cand_tails_dict
        cand_tails_dict[t] = 1.0
    
#     print(cand_heads_dict)
#     print(cand_tails_dict)
    
    all_extraction_results = []
    
    # head -> tail 
    for seed_head in seed_heads:
        print(f'seed_head: {seed_head}')
        extraction_results = []

        ## For each tail, extract concept sim, head sim, lm score, combine and report
        
        cand_bins = {1: [], 2: []} ## TODO: allow higher grams; switch to GPT-2 for fair probs 
        for c in cand_tails_dict.keys():
            c_tokenized = lm_probe.tokenizer.tokenize(c)
            if len(c_tokenized) in [1, 2]:
                cand_bins[len(c_tokenized)].append(c_tokenized)
        
        cand_scores_per_template = []
        for template in templates:
            _unigram_template = '[CLS] ' + template.format(seed_head, '[MASK]') + '[SEP]'
            _bigram_template = '[CLS] ' + template.format(seed_head, '[MASK] [MASK]') + '[SEP]'

            _cand_scores_1 = lm_probe.score_candidates(_unigram_template, cand_bins[1])
            _cand_scores_2 = lm_probe.score_candidates(_bigram_template, cand_bins[2])
            _cand_scores = sorted(_cand_scores_1 + _cand_scores_2, key=lambda d : d["cand"])
            # List[Dict["cand", "score"]]
            cand_scores_per_template.append(_cand_scores)
    
        cand_scores = []  # List[Dict["cand", "score"]], for each "cand" the average score 
        for _cand_score_lst in zip(*cand_scores_per_template):
            # _cand_score_lst: List[Dict["cand", "score"]], for the same "cand" and different template 
            _cand = _cand_score_lst[0]["cand"]
            assert all(d["cand"] == _cand for d in _cand_score_lst), _cand_score_lst
            _score = np.mean([d["score"] for d in _cand_score_lst])
            cand_scores.append({"cand": _cand, "score": _score})
#         cand_scores.sort(key = lambda d : d["score"], reverse=True)

        for d in cand_scores:
            e_tail = ' '.join(d["cand"]).replace(' ##', '')
            if e_tail not in cand_tails_dict:
                continue

            lm_score = d["score"]
            try:
                ht_sim_score = 1 - cosine(entity_emb_dict[seed_head], entity_emb_dict[e_tail])
            except KeyError:
                print(f'** embedding of {seed_head}: {(seed_head in entity_emb_dict)}')
                print(f'** embedding of {e_tail}: {(e_tail in entity_emb_dict)}')
                ht_sim_score = float("nan")
            concept_sim_score = cand_tails_dict[e_tail]
            overall_score = scores_agg_func(ht_sim_score, concept_sim_score, lm_score)

            extraction_results.append({'head': seed_head, 'tail': e_tail, 'base': 'HEAD',
                                       'ht_sim_score': ht_sim_score,
                                       'concept_sim_score': concept_sim_score,
                                       'lm_score': lm_score,
                                       'overall_score': overall_score})
        
        extraction_results.sort(key=lambda d : d['overall_score'], reverse=True)
        all_extraction_results.extend(extraction_results[:topk])
        
    # tail -> head 
    for seed_tail in seed_tails:
        print(f'seed_tail: {seed_tail}')
        extraction_results = []
        
        ## For each tail, extract concept sim, head sim, lm score, combine and report
        
        cand_bins = {1: [], 2: []}
        for c in cand_heads_dict.keys():
            c_tokenized = lm_probe.tokenizer.tokenize(c)
            if len(c_tokenized) in [1, 2]:
                cand_bins[len(c_tokenized)].append(c_tokenized)
        
        cand_scores_per_template = []
        for template in templates:
            _unigram_template = '[CLS] ' + template.format('[MASK]', seed_tail) + '[SEP]'
            _bigram_template = '[CLS] ' + template.format('[MASK] [MASK]', seed_tail) + '[SEP]'

            _cand_scores_1 = lm_probe.score_candidates(_unigram_template, cand_bins[1])
            _cand_scores_2 = lm_probe.score_candidates(_bigram_template, cand_bins[2])
            _cand_scores = sorted(_cand_scores_1 + _cand_scores_2, key=lambda d : d["cand"])
            # List[Dict["cand", "score"]]
            cand_scores_per_template.append(_cand_scores)
    
        cand_scores = []  # List[Dict["cand", "score"]], for each "cand" the average score 
        for _cand_score_lst in zip(*cand_scores_per_template):
            # _cand_score_lst: List[Dict["cand", "score"]], for the same "cand" and different template 
            _cand = _cand_score_lst[0]["cand"]
            assert all(d["cand"] == _cand for d in _cand_score_lst), _cand_score_lst
            _score = np.mean([d["score"] for d in _cand_score_lst])
            cand_scores.append({"cand": _cand, "score": _score})
#         cand_scores.sort(key = lambda d : d["score"], reverse=True)

        for d in cand_scores[:topk]:
            e_head = ' '.join(d["cand"]).replace(' ##', '')
            if e_head not in cand_heads_dict:
                continue
                
            lm_score = d["score"]
            try:
                ht_sim_score = 1 - cosine(entity_emb_dict[e_head], entity_emb_dict[seed_tail])
            except KeyError:
                print(f'** embedding of {e_head}: {(e_head in entity_emb_dict)}')
                print(f'** embedding of {seed_tail}: {(seed_tail in entity_emb_dict)}')
                ht_sim_score = float("nan")
            concept_sim_score = cand_heads_dict[e_head]
            overall_score = scores_agg_func(ht_sim_score, concept_sim_score, lm_score)
        
            extraction_results.append({'head': e_head, 'tail': seed_tail, 'base': 'TAIL',
                                       'ht_sim_score': ht_sim_score,
                                       'concept_sim_score': concept_sim_score,
                                       'lm_score': lm_score,
                                       'overall_score': overall_score})
        
        extraction_results.sort(key=lambda d : d['overall_score'], reverse=True)
        all_extraction_results.extend(extraction_results[:topk])
        
    results_df = pd.DataFrame(all_extraction_results)
    if save_path is not None:
        results_df.to_csv(save_path, index=None)
    return results_df

In [786]:
seed_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_concepts.csv')
seed_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_relations.csv')
seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')
seed_aligned_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_relations.csv')
# knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/knn_{cluster_size}.csv')
concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/concept_knn_1000.csv')
bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')

extraction_save_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/rel_extraction.csv')
# extraction_save_path = None

extraction_results = direct_probing_RE_v3(seed_aligned_concepts_path=seed_aligned_concepts_path,
                                          seed_aligned_relations_path=seed_aligned_relations_path,
                                          emb_path=bert_emb_path,
                                          concept_knn_path=concept_knn_path,
                                          templates=has_dress_code_templates,
                                          lm_probe=lm_probe,
                                          topk=300,
                                          save_path=extraction_save_path)

extraction_results

company 	 dress_code
seed_heads: ['walmart', 'amazon', 'subway', 'microsoft', 'target']
seed_tails: ['business casual', 'uniform', 'hair color', 'tattoos', 'facial hair', 'shoes', 'piercings']
seed_head: walmart
seed_head: amazon
seed_head: subway
seed_head: microsoft
seed_head: target
seed_tail: business casual
seed_tail: uniform
seed_tail: hair color
seed_tail: tattoos
seed_tail: facial hair
seed_tail: shoes
seed_tail: piercings


Unnamed: 0,head,tail,base,ht_sim_score,concept_sim_score,lm_score,overall_score
0,walmart,shoes,HEAD,0.882266,1.000000,6.872273e-07,3.925327
1,walmart,clothing,HEAD,0.952076,0.976629,7.555166e-07,3.739739
2,walmart,uniform,HEAD,0.916240,1.000000,1.908441e-07,3.372303
3,walmart,public,HEAD,0.983622,0.952179,5.193604e-07,3.335625
4,walmart,pink,HEAD,0.925074,0.947710,4.597000e-07,3.232084
...,...,...,...,...,...,...,...
3595,exact same,piercings,TAIL,0.917901,0.971146,4.260951e-12,-1.567248
3596,decent amount,piercings,TAIL,0.923236,0.971478,1.833157e-12,-1.929699
3597,exact amount,piercings,TAIL,0.927078,0.970303,1.432278e-12,-2.048235
3598,fairly easy,piercings,TAIL,0.917490,0.967498,9.032076e-13,-2.277485


In [787]:
df = extraction_results.copy()
df[df['head'] == 'walmart'].head(50)
# df['overall_score'] = df['ht_sim_score'] * 0.1 + df['concept_sim_score'] * 10 + np.log10(df['lm_score'])
# df.sort_values(by='overall_score', ascending=False)

Unnamed: 0,head,tail,base,ht_sim_score,concept_sim_score,lm_score,overall_score
0,walmart,shoes,HEAD,0.882266,1.0,6.872273e-07,3.925327
1,walmart,clothing,HEAD,0.952076,0.976629,7.555166e-07,3.739739
2,walmart,uniform,HEAD,0.91624,1.0,1.908441e-07,3.372303
3,walmart,public,HEAD,0.983622,0.952179,5.193604e-07,3.335625
4,walmart,pink,HEAD,0.925074,0.94771,4.597e-07,3.232084
5,walmart,perfume,HEAD,0.932586,0.955838,2.919247e-07,3.116911
6,walmart,watch,HEAD,0.964459,0.971305,2.017021e-07,3.114202
7,walmart,products,HEAD,0.970532,0.948523,2.951575e-07,3.052335
8,walmart,logo,HEAD,0.914076,0.944882,3.047899e-07,3.024233
9,walmart,short,HEAD,0.977509,0.966827,1.743601e-07,3.007472


In [802]:
df = extraction_results.copy()
df[df['tail'] == 'hair color'].head(50)
# df['overall_score'] = df['ht_sim_score'] * 0.1 + df['concept_sim_score'] * 10 + np.log10(df['lm_score'])
# df.sort_values(by='overall_score', ascending=False).head(50)

Unnamed: 0,head,tail,base,ht_sim_score,concept_sim_score,lm_score,overall_score
2100,banks,hair color,TAIL,0.907055,0.978189,0.00081,6.780825
2101,customs,hair color,TAIL,0.901078,0.968866,0.000612,6.565287
2102,company,hair color,TAIL,0.921313,0.99199,0.000117,6.078816
2103,doctors,hair color,TAIL,0.906826,0.968265,0.00016,5.978069
2104,airports,hair color,TAIL,0.909719,0.974302,0.0001,5.835198
2105,california,hair color,TAIL,0.884908,0.967407,8.5e-05,5.693969
2106,factory,hair color,TAIL,0.920048,0.989166,5e-05,5.683523
2107,chicago,hair color,TAIL,0.884863,0.970459,7.3e-05,5.654504
2108,ceo,hair color,TAIL,0.908835,0.981629,5.3e-05,5.629846
2109,contract,hair color,TAIL,0.91839,0.985901,4.6e-05,5.612891


In [789]:
## Evaluation 

def load_benchmark_relations(benchmark_path):
    '''Currently only has_dress_code.'''
    benchmark = pd.read_csv(benchmark_path)
    
    # List[Dict[head, tail]]
    rel_pairs = []
    
    for i, row in benchmark.iterrows():
        if row['relation_name'] != 'has_dress_code':
            continue
        if row['n_head_category'] != 'company' or row['n_tail_category'] != 'dress_code':
            continue
        if row['type'] != 'fact':
            continue
        
        if row['n_head'] != 'company':
            # already instance 
            rel_pairs.append({'head': str(row['n_head']).lower(),
                              'tail': str(row['n_tail']).lower()})
            continue
        
        evidence_sents = eval(str(row['sentences']))
        head_instances = eval(str(row['Evidence']))
        assert len(evidence_sents) == len(head_instances), f'Line {i} length mismatch'
        
        for inst in head_instances:
            if len(inst) > 0:
                rel_pairs.append({'head': inst.lower(),
                                  'tail': str(row['n_tail']).lower()})
        
    return rel_pairs

In [792]:
benchmark_path = os.path.join(base_dir, f'data/indeed-benchmark/benchmark_evidence.csv')

benchmark_relations_list = load_benchmark_relations(benchmark_path)

In [793]:
rel_extraction_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/rel_extraction.csv')

rel_extraction = pd.read_csv(rel_extraction_path)
rel_extraction_list = rel_extraction[['head', 'tail']].to_dict('records')

In [803]:
benchmark_relations_set = set([tuple(d.values()) for d in benchmark_relations_list])
rel_extraction_set = set([tuple(d.values()) for d in rel_extraction_list])

intersection = benchmark_relations_set & rel_extraction_set

len(benchmark_relations_set), len(rel_extraction_set), len(intersection)


(60, 3597, 3)

# Knowledge Verification baseline
(finding co-occurrences of head / tail from corpus)

In [821]:
rel_extraction_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/rel_extraction.csv')
# corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences.json')

indeed_dataset_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/question_answers.csv'
company_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/fccid-companyName.csv'

# with open(corpus_path, 'r') as f:
#     sent_dicts = [json.loads(l) for l in tqdm(f.readlines())]

indeed_dataset = pd.read_csv(indeed_dataset_path)
indeed_dataset = indeed_dataset[indeed_dataset['answerContent'].notna()]
company_df = pd.read_csv(company_path)
company_dict = dict(zip(company_df["fccompanyId"].to_list(), company_df["companyName"].to_list()))

indeed_dataset.shape, len(company_dict)

((307122, 10), 100)

In [822]:
# for sent_dict in sent_dicts:
#     _s = ' ' + ' '.join(sent_dict['tokens']).lower() + ' '
#     if (' walmart ' in _s) and (' red ' in _s):
#         print(_s.strip())

In [826]:
for i, row in tqdm(indeed_dataset.iterrows()):
    _company_id = row['fccompanyId']
    _company = company_dict[_company_id]
    
    _answer = row['answerContent']
    _tokens = [str(t) for t in spacy_tokenizer(_answer)]
    _s = f" {_company} : {' '.join(_tokens).lower()} "
    if (' walmart ' in _s) and (' perfume ' in _s):
        print(_s.strip())

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

KeyboardInterrupt: 

# Mine Prompts

In [108]:
# Explore various techniques
# Get prompts "between" entities
# Get prompts by syntactic parsing
# Get prompts by paraphrasing
# Get prompts uisng AutoPrompt

In [109]:
# visit here: /meg-kb/src/analysis/pattern_mining.ipynb

# Retrieve Prompt Evidence

In [109]:
# visit here: /meg-kb/src/analysis/lm_probing.ipynb

# Suggest Quality Prompts