In [None]:
import os

# Root of the shared meg-kb checkout. Every `%cd $base_dir/...` and
# `$base_dir/data/...` path in this notebook is built from it.
# Set MEG_KB_BASE_DIR to run the notebook against a different checkout.
base_dir = os.environ.get("MEG_KB_BASE_DIR", "/mnt/efs/shared/meg_shared_scripts/meg-kb")

# Directory holding the roberta-large model and the Roberta_SES checkpoint that
# the knowledge-verification cells pass via -r / -rs.
# Set MEG_KB_YUTONG_DIR to point at another copy.
yutong_base_dir = os.environ.get("MEG_KB_YUTONG_DIR", "/home/ubuntu/users/yutong")

# data_ac / data_pt are assigned in the dataset-selection cell under
# "Extract Key Phrases"; they are intentionally not defined here.

In [2]:
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [3]:
from tqdm.notebook import tqdm
import argparse
import re
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, entropy, gmean
import random
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import json
from collections import defaultdict
import time

import logging
from sklearn.cluster import KMeans, AgglomerativeClustering
import pandas as pd
import os
import sys
import math
from annoy import AnnoyIndex

# Build a blank English spaCy pipeline, attach a sentencizer component, and
# expose the pipeline's tokenizer under the name `spacy_tokenizer`.
# NOTE(review): passing the object returned by `create_pipe(...)` to `add_pipe`
# is the spaCy v2 calling convention; spaCy v3 expects `nlp.add_pipe('sentencizer')`.
# Confirm which spaCy version this environment pins before upgrading.
from spacy.lang.en import English
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
spacy_tokenizer = nlp.tokenizer

from compute_concept_clusters import knn
from compute_keyphrase_embeddings import ensure_tensor_on_device, mean_pooling

from utils import LMProbe, LMProbe_GPT2
from utils import load_embeddings, load_seed_aligned_concepts, load_seed_aligned_relations, load_benchmark
from utils import get_masked_contexts

from roberta_ses.interface import Roberta_SES_Entailment

In [4]:
# Benchmark seed files, resolved under the shared checkout (base_dir).
# Plain string literals: the paths contain no placeholders, so no f-prefix is needed.
seed_aligned_concepts_path = os.path.join(base_dir, 'data/indeed-benchmark/seed_aligned_concepts.csv')
seed_aligned_relations_path = os.path.join(base_dir, 'data/indeed-benchmark/seed_aligned_relations.csv')

# Data Preprocessing

In [2]:
# Input: text corpus
# step 1: extract key phrases (autophrase)
# step 2: generate embeddings

## Extract Key Phrases

In [5]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/keyword_extraction

In [6]:
#change to keyword extractor directory
%cd $base_dir/src/keyword_extraction/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction


In [7]:
!chmod +x ./corpusProcess.sh

In [10]:
# Select the datasets and the thread count used by the shell cells below.
# data_ac: dataset directory name; interpolated as data/$data_ac/... by the
#          corpus-processing, "-et ac" embedding, clustering, relation-extraction
#          and knowledge-verification cells.
# data_pt: dataset directory name read by the "-et pt" embedding cell
#          (../../data/$data_pt/intermediate).
# thread:  passed as the trailing argument to corpusProcess.sh and
#          extractBertEmbedding.py.
data_ac = 'indeeda-meg-ac'
data_pt = 'indeeda-meg-pt'
thread = 1

In [11]:
# process corpus and generate key phrases (long time! ~90min)
!./corpusProcess.sh $data_ac $thread

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction
[32m===Corpus Name: indeeda-meg-ac===[m
[32m===Current Path: /mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction===[m
[32m===Cleaning input corpus===[m
[32m===Running AutoPhrase===[m
make: Nothing to be done for 'all'.
[32m===RAW_TRAIN: ../../../data/indeeda-meg-ac/source/corpus.clean.txt===[m
auto_phrase.sh parameters: indeeda-meg-ac ../../../data/indeeda-meg-ac/source/corpus.clean.txt 10 data/EN/wiki_quality.txt 1
[32m===Compilation===[m
[32m===Tokenization===[m
Current step: Tokenizing input file...[0K
real	0m12.787s
user	0m19.848s
sys	0m2.588s
Detected Language: EN[0K
Current step: Tokenizing wikipedia phrases...[0K
No provided expert labels.[0K
[32m===Part-Of-Speech Tagging===[m
[32m===AutoPhrasing===[m
=== Current Settings ===
Iterations = 2
Minimum Support Threshold = 10
Maximum Length Threshold = 6
POS-Tagging Mode Disabled
Discard Ratio = 0.050000
Number of threads = 1
Labeling 

In [12]:
# copy these results to the data_pt dataset directory (../../data/$data_pt)
# !cp -r ../../data/$data_ac ../../data/$data_pt

## Corpus with company names

In [13]:
# dataset_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/question_answers.csv'
# company_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/fccid-companyName.csv'
# entity_emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum+seeds.txt')
# out_corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences_with_company.json')

In [15]:
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [16]:
# Use script
!python build_corpus_with_companies.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-rd /home/ubuntu/users/nikita/data/indeed/indeedQA


Processing lines: 100%|████████████████| 307122/307122 [11:54<00:00, 430.02it/s]


# Generate Embeddings

In [17]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [18]:
#change to concept learning directory
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## Sentence Embedding

In [19]:
!python compute_keyphrase_embeddings.py -m bert-base-uncased -et ac -d ../../data/$data_ac/intermediate -c 750

loading corpus: 100%|███████████████| 901796/901796 [00:03<00:00, 267569.33it/s]
computing entity-wise embedding: 100%|██████| 8028/8028 [07:01<00:00, 19.03it/s]
Saving embedding


## Concatenated Token Embedding

In [20]:
!python compute_keyphrase_embeddings.py -m bert-base-uncased -et pt -d ../../data/$data_pt/intermediate -c 750

loading corpus: 100%|███████████████| 465226/465226 [00:01<00:00, 257167.06it/s]
computing entity-wise embedding:   4%|▏    | 311/7973 [01:59<1:01:48,  2.07it/s]####NOT FOUND#####
computing entity-wise embedding:   4%|▎      | 316/7973 [02:01<56:16,  2.27it/s]####NOT FOUND#####
computing entity-wise embedding:  14%|▊     | 1136/7973 [04:04<07:51, 14.51it/s]####NOT FOUND#####
computing entity-wise embedding:  26%|█▌    | 2103/7973 [04:58<08:12, 11.93it/s]####NOT FOUND#####
computing entity-wise embedding:  29%|█▊    | 2340/7973 [05:10<02:54, 32.25it/s]####NOT FOUND#####
computing entity-wise embedding:  68%|████  | 5457/7973 [06:27<00:49, 50.69it/s]####NOT FOUND#####
computing entity-wise embedding: 100%|██████| 7973/7973 [07:15<00:00, 18.32it/s]
Saving embedding


## Token Embedding

In [21]:
# change directory to autophrase
%cd $base_dir/src/tools/AutoPhrase

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/tools/AutoPhrase


In [22]:
data_corel = 'sample-indeeda-corel'

In [23]:
!CUDA_VISIBLE_DEVICES=0 python extractBertEmbedding.py ../../../data/$data_corel/intermediate/ $thread

1
2021-07-07 00:47:48,693 : INFO : loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/ubuntu/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
2021-07-07 00:47:49,035 : INFO : loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ubuntu/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
2021-07-07 00:47:49,036 : INFO : Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embedding

## Add embeddings for seed instances

In [24]:
# corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences.json')
# seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')

# orig_bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed.txt')
# orig_bert_emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum.txt')

# new_bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')
# new_bert_emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum+seeds.txt')

In [26]:
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [27]:
# Using script

!python add_seed_instances_embeddings.py -m bert-base-uncased -et ac -d $base_dir/data/$data_ac/intermediate -b $base_dir/data/indeed-benchmark -c 750


Seed instances: ['walmart', 'amazon', 'subway', 'microsoft', 'target', 'business casual', 'uniform', 'hair color', 'tattoos', 'facial hair', 'shoes', 'piercings', 'delivery driver', 'store manager', 'cashier', 'package handler', 'sales associate', 'barista', 'dishwasher', 'weekly', 'biweekly', 'friday', 'saturday', 'health insurance', 'flexible schedule', '401k', 'paid vacation', 'sick leave', 'vision insurance', 'base pay', 'stock options', 'benefits', 'overtime pay', 'bonus', 'checks', 'direct deposit', 'prepaid card', 'drug test', 'criminal background check', 'employment verification', 'felons', 'criminals', 'disabled', 'drug addicts', 'high schoolers', 'misdemeanor', 'pregnant', 'students', 'seniors', 'hiring age', 'bachelors degree', 'prior experience', 'working permit', 'heavy lifting', 'night shift', 'dinner shift', 'early morning shift', '8 hour shift', 'christmas eve', 'early morning', 'hoilday', '7 days', 'saturday', 'sunday', 'weekend', 'full time', 'part time', 'seasonal', 

# Expand Seed Entities (clustering)

In [28]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [29]:
#change to concept learning directory
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## knn sentence-embedding

In [30]:
# Settings interpolated into the compute_concept_clusters.py call in the next cell.
dim = 768        # passed as -dim
clusters = 100   # passed as -s
# CSV destination passed as -o; relative to the current directory
# (src/concept_learning after the %cd above).
output = f'../../data/{data_ac}/intermediate/knn_{clusters}.csv'

In [31]:
!python compute_concept_clusters.py -d ../../data/$data_ac/intermediate/ -ca knn -s $clusters -dim $dim -o $output

building entity index: 100%|██████████████| 8028/8028 [00:01<00:00, 6489.66it/s]
finding nearest neighbors by entity: 100%|█| 8028/8028 [00:17<00:00, 449.07it/s]


## Analyzing Clustering Results

In [36]:
#Visit here: /meg_shared_scripts/meg-kb/src/analysis/concept_learning-test.ipynb

## Seed instances clustering
(using all seed instances of a concept to find neighbors)

In [37]:
# bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')
# seed_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_concepts.csv')
# seed_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_relations.csv')
# concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/concept_knn_{cluster_size}.csv')

In [38]:
# Run the per-concept seed kNN script (per the heading above: all seed
# instances of a concept are used together to find neighbours).
# The size given to -s (100) is also baked into the -o filename. Later cells
# (entity evaluation, relation extraction) read concept_knn_100.csv by that
# exact name, so change the two together.
# cluster_size = 100
!python compute_concept_seeds_knn.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-s 100 \
--kdt \
-o $base_dir/data/$data_ac/intermediate/concept_knn_100.csv

building entity index: 100%|██████████████| 8064/8064 [00:01<00:00, 6412.04it/s]
finding nearest neighbors by concept: 14it [00:00, 334.07it/s]


## Entity expansion evaluation
Now using benchmark entities, mean reciprocal rank

In [39]:
# seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')
# seed_aligned_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_relations_nodup.csv')
# benchmark_path = os.path.join(base_dir, f'data/indeed-benchmark/benchmark_evidence_clean.csv')
# concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/concept_knn_1000.csv')

In [41]:
# Use script
!python eval_entities.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/concept_knn_100.csv

Concept: company / company
seeds: ['walmart', 'amazon', 'subway', 'microsoft', 'target']
{
    "united states postal service": NaN,
    "wendys": NaN,
    "whataburger": NaN,
    "marriott international, inc.": NaN,
    "geico": 30,
    "foot locker": NaN,
    "the wendy's company": NaN,
    "mcdonald": NaN,
    "home depot": 5,
    "allied universal security services, systems and solutions": NaN,
    "goodwill industries": NaN,
    "american eagle outfitters": NaN,
    "chipotle mexican grill": NaN,
    "chick-fil-a": NaN,
    "heb": NaN,
    "sonic drive-in": NaN,
    "frito": NaN,
    "jcpenney": 21,
    "aldi": NaN,
    "electric": NaN,
    "pepsi": 39,
    "sitel": 42,
    "cvs health": NaN,
    "dick's sporting goods": NaN,
    "costco wholesale": NaN,
    "burlington stores": NaN,
    "dd": 84,
    "olive garden": 12,
    "fedex": 36,
    "dunkin donuts": 43,
    "primark": NaN,
    "best buy": 13,
    "frito lay": 33,
    "marshalls": 35,
    "domino's": NaN,
    "tim hortons":

# Relation Extraction Baselines
Currently only for single relation. TODO: include all relations

## Null baseline - Cartesian product

In [226]:
# seed_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_concepts.csv')
# seed_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_relations.csv')
# seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')
# seed_aligned_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_relations_nodup.csv')
# # knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/knn_{cluster_size}.csv')
# concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/concept_knn_1000.csv')

# relation = 'has_benefits'
# cartesian_re_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/rel_cartesian-{relation}.csv')

In [74]:
!python relation_extraction_cartesian.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-o $base_dir/data/$data_ac/intermediate/rel_extraction-has_pay_schedule-RE=Ct.csv \
-r has_pay_schedule \
-cknn $base_dir/data/$data_ac/intermediate/concept_knn_100.csv
# -topk 60

company 	 pay_schedule
seed_heads: ['walmart', 'amazon', 'subway', 'microsoft', 'target']
seed_tails: ['weekly', 'biweekly', 'friday', 'saturday']
cand_heads: ('walmart', 'amazon', 'subway', 'microsoft', 'target', 'wal mart', 'costco', 'dollar general', 'apple', 'home depot', 'walgreens', 'kroger', 'publix', 'pizza hut', 'starbucks', 'panera bread', 'olive garden', 'best buy', 'mcdonalds', 'family dollar', 'old navy', 'dollar tree', 'spectrum', 'kfc', 'cracker barrel', 'jcpenney', 'safeway', 'whole foods', 'menards', 'cvs', 'ihop', 'center', 'burlington', "sam 's club", 'geico', 'usps', 'burger king', 'frito lay', 'chipotle', 'marshalls', 'fedex', 'taco bell', 'verizon', 'pepsi', 'petsmart', 'at&t', 'sitel', 'dunkin donuts', 'g4s', 'company', 'warehouse', 'jcp', 'retail', 'tj maxx', 'wells fargo', 'pharmacy', 'post office', 'pepsico', 'management', "macy 's", 't mobile', 'chick fil a', 'call center', 'union', 'cashier', 'department', 'home office', 'asset protection', 'corporate off

In [75]:
# Use script 
!python relation_extraction_cartesian.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-o $base_dir/data/$data_ac/intermediate/rel_extraction-has_background_screening-RE=Ct.csv \
-r has_background_screening \
-cknn $base_dir/data/$data_ac/intermediate/concept_knn_100.csv
# -topk 60

company 	 background_screening
seed_heads: ['walmart', 'amazon', 'subway', 'microsoft', 'target']
seed_tails: ['drug test', 'criminal background check', 'employment verification']
cand_heads: ('walmart', 'amazon', 'subway', 'microsoft', 'target', 'wal mart', 'costco', 'dollar general', 'apple', 'home depot', 'walgreens', 'kroger', 'publix', 'pizza hut', 'starbucks', 'panera bread', 'olive garden', 'best buy', 'mcdonalds', 'family dollar', 'old navy', 'dollar tree', 'spectrum', 'kfc', 'cracker barrel', 'jcpenney', 'safeway', 'whole foods', 'menards', 'cvs', 'ihop', 'center', 'burlington', "sam 's club", 'geico', 'usps', 'burger king', 'frito lay', 'chipotle', 'marshalls', 'fedex', 'taco bell', 'verizon', 'pepsi', 'petsmart', 'at&t', 'sitel', 'dunkin donuts', 'g4s', 'company', 'warehouse', 'jcp', 'retail', 'tj maxx', 'wells fargo', 'pharmacy', 'post office', 'pepsico', 'management', "macy 's", 't mobile', 'chick fil a', 'call center', 'union', 'cashier', 'department', 'home office', '

## Relation Extraction - scores weighted average

In [None]:
# seed_aligned_concepts_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts.csv')
# seed_aligned_relations_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_relations_nodup.csv')
# # knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/knn_{cluster_size}.csv')
# concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/concept_knn_100.csv')
# bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')

# templates_path = 'templates_manual.json'

# extraction_save_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/rel_extraction.csv')

In [45]:
# Use script 
!python relation_extraction_avg_scores.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-o $base_dir/data/$data_ac/intermediate/rel_extraction-has_benefits-RE=avg.csv \
-r has_benefits \
-cknn $base_dir/data/$data_ac/intermediate/concept_knn_100.csv \
-dim 768
# -topk 300


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
company 	 benefits
seed_heads: ['walmart', 'amazon', 'subway', 'microsoft', 'target']
seed_tails: ['health insurance', 'flexible schedule', '401k', 'paid vacation', 'sick leave', 'vision insurance']
100%|█████████████████████████████████████████| 102/102 [00:37<00:00,  2.71it/s]


In [46]:
# Use script 
!python relation_extraction_avg_scores.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-o $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-RE=avg.csv \
-r has_dress_code \
-cknn $base_dir/data/$data_ac/intermediate/concept_knn_100.csv \
-dim 768
# -topk 300


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
company 	 dress_code
seed_heads: ['walmart', 'amazon', 'subway', 'microsoft', 'target']
seed_tails: ['business casual', 'uniform', 'hair color', 'tattoos', 'facial hair', 'shoes', 'piercings']
100%|█████████████████████████████████████████| 102/102 [00:30<00:00,  3.31it/s]


# Knowledge Verification baseline
(finding co-occurrences of head / tail from corpus)

## has_dress_code

In [47]:
# Knowledge verification for has_dress_code over the Cartesian-baseline predictions.
# Reads the RE=Ct relation file (-in); writes an evidence JSON (-o_kv) and a
# filtered relation file (-o_re) that the evaluation section scores later.
# NOTE(review): the "KV=0.9" in the -o_re filename appears to mirror -p_re 0.9;
# presumably -p_kv / -p_re are score thresholds. Confirm in the script and keep
# the filename in sync if the value changes.
# The model (-r) and checkpoint (-rs) are loaded from under $yutong_base_dir.
# Slow: the recorded run took about 1h12m for 10302 candidate relations.
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-in $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-RE=Ct.csv \
-o_kv $base_dir/data/$data_ac/intermediate/kv_evidences-has_dress_code-RE=Ct.json \
-o_re $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-RE=Ct+KV=0.9.csv \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p_kv 0.7 \
-p_re 0.9 \
--fast_skip

Loading files...
Finding evidence for rels: 100%|████████| 10302/10302 [1:11:38<00:00,  2.40it/s]


In [49]:
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-in $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-RE=avg.csv \
-o_kv $base_dir/data/$data_ac/intermediate/kv_evidences-has_dress_code-RE=avg.json \
-o_re $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-RE=avg+KV=0.9.csv \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p_kv 0.7 \
-p_re 0.9 \
--fast_skip

Loading files...
Finding evidence for rels: 100%|██████████████| 300/300 [01:58<00:00,  2.52it/s]


## has_benefits

In [48]:
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-in $base_dir/data/$data_ac/intermediate/rel_extraction-has_benefits-RE=Ct.csv \
-o_kv $base_dir/data/$data_ac/intermediate/kv_evidences-has_benefits-RE=Ct.json \
-o_re $base_dir/data/$data_ac/intermediate/rel_extraction-has_benefits-RE=Ct+KV=0.9.csv \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p_kv 0.7 \
-p_re 0.9 \
--fast_skip

Loading files...
Finding evidence for rels: 100%|████████| 10404/10404 [2:20:15<00:00,  1.24it/s]


In [50]:
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-in $base_dir/data/$data_ac/intermediate/rel_extraction-has_benefits-RE=avg.csv \
-o_kv $base_dir/data/$data_ac/intermediate/kv_evidences-has_benefits-RE=avg.json \
-o_re $base_dir/data/$data_ac/intermediate/rel_extraction-has_benefits-RE=avg+KV=0.9.csv \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p_kv 0.7 \
-p_re 0.9 \
--fast_skip

Loading files...
Finding evidence for rels: 100%|██████████████| 300/300 [02:53<00:00,  1.73it/s]


## has_pay_schedule

In [76]:
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-in $base_dir/data/$data_ac/intermediate/rel_extraction-has_pay_schedule-RE=Ct.csv \
-o_kv $base_dir/data/$data_ac/intermediate/kv_evidences-has_pay_schedule-RE=Ct.json \
-o_re $base_dir/data/$data_ac/intermediate/rel_extraction-has_pay_schedule-RE=Ct+KV=0.9.csv \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p_kv 0.7 \
-p_re 0.9 \
--fast_skip

Loading files...
Finding evidence for rels: 100%|████████| 10302/10302 [1:33:26<00:00,  1.84it/s]


## has_background_screening

In [78]:
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-in $base_dir/data/$data_ac/intermediate/rel_extraction-has_background_screening-RE=Ct.csv \
-o_kv $base_dir/data/$data_ac/intermediate/kv_evidences-has_background_screening-RE=Ct.json \
-o_re $base_dir/data/$data_ac/intermediate/rel_extraction-has_background_screening-RE=Ct+KV=0.9.csv \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p_kv 0.7 \
-p_re 0.9 \
--fast_skip

Loading files...
Finding evidence for rels: 100%|████████| 10302/10302 [7:21:16<00:00,  2.57s/it]


# Full Pipeline Evaluation (on relations)

## has_dress_code

In [65]:
!python eval_relations.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-RE=Ct.csv \
-r has_dress_code

--- RE Results ---
Benchmark relations: 107
Predicted relations: 10302
Intersection: 15
P = 0.0015, R = 0.1402, F1 = 0.0029

Intersection:
('at&t', 'has_dress_code', 'uniform')
('best buy', 'has_dress_code', 'uniform')
('costco', 'has_dress_code', 'hair color')
('dd', 'has_dress_code', 'facial hair')
('dollar general', 'has_dress_code', 'strict dress code')
('dollar tree', 'has_dress_code', 'professional')
('dollar tree', 'has_dress_code', 'uniform')
('family dollar', 'has_dress_code', 'facial hair')
('jcpenney', 'has_dress_code', 'uniform policy')
('marshalls', 'has_dress_code', 'color hair')
('olive garden', 'has_dress_code', 'facial hair')
('subway', 'has_dress_code', 'piercings')
('taco bell', 'has_dress_code', 'nose rings')
('walgreens', 'has_dress_code', 'hair color')
('walmart', 'has_dress_code', 'uniform')



In [66]:
!python eval_relations.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-RE=Ct+KV=0.9.csv \
-r has_dress_code

--- RE Results ---
Benchmark relations: 107
Predicted relations: 2052
Intersection: 14
P = 0.0068, R = 0.1308, F1 = 0.0130

Intersection:
('at&t', 'has_dress_code', 'uniform')
('best buy', 'has_dress_code', 'uniform')
('costco', 'has_dress_code', 'hair color')
('dd', 'has_dress_code', 'facial hair')
('dollar tree', 'has_dress_code', 'professional')
('dollar tree', 'has_dress_code', 'uniform')
('family dollar', 'has_dress_code', 'facial hair')
('jcpenney', 'has_dress_code', 'uniform policy')
('marshalls', 'has_dress_code', 'color hair')
('olive garden', 'has_dress_code', 'facial hair')
('subway', 'has_dress_code', 'piercings')
('taco bell', 'has_dress_code', 'nose rings')
('walgreens', 'has_dress_code', 'hair color')
('walmart', 'has_dress_code', 'uniform')



In [67]:
!python eval_relations.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-RE=avg.csv \
-r has_dress_code

--- RE Results ---
Benchmark relations: 107
Predicted relations: 300
Intersection: 0
P = 0.0000, R = 0.0000, F1 = 0.0000

Intersection:



## has_benefits

In [71]:
!python eval_relations.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-has_benefits-RE=Ct.csv \
-r has_benefits

--- RE Results ---
Benchmark relations: 56
Predicted relations: 10404
Intersection: 9
P = 0.0009, R = 0.1607, F1 = 0.0017

Intersection:
('burger king', 'has_benefits', 'health')
('burger king', 'has_benefits', 'sick leave')
('dollar general', 'has_benefits', 'health insurance')
('g4s', 'has_benefits', 'sick leave')
('starbucks', 'has_benefits', 'health')
('target', 'has_benefits', 'health insurance')
('walmart', 'has_benefits', '401k')
('walmart', 'has_benefits', 'life insurance')
('walmart', 'has_benefits', 'paid vacations')



In [72]:
!python eval_relations.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-has_benefits-RE=Ct+KV=0.9.csv \
-r has_benefits

--- RE Results ---
Benchmark relations: 56
Predicted relations: 3384
Intersection: 9
P = 0.0027, R = 0.1607, F1 = 0.0052

Intersection:
('burger king', 'has_benefits', 'health')
('burger king', 'has_benefits', 'sick leave')
('dollar general', 'has_benefits', 'health insurance')
('g4s', 'has_benefits', 'sick leave')
('starbucks', 'has_benefits', 'health')
('target', 'has_benefits', 'health insurance')
('walmart', 'has_benefits', '401k')
('walmart', 'has_benefits', 'life insurance')
('walmart', 'has_benefits', 'paid vacations')



In [70]:
!python eval_relations.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-has_benefits-RE=avg.csv \
-r has_benefits

--- RE Results ---
Benchmark relations: 56
Predicted relations: 300
Intersection: 0
P = 0.0000, R = 0.0000, F1 = 0.0000

Intersection:



In [None]:
# Discussions:
# coherence clustering / ensemble models?
# trying for other relations or entities
# using entities in sub-categories
# fine-tuning
# ambiguous samples (high for pos and neg)
# quantitative-evaluation

# Mine Prompts

In [108]:
# Explore various techniques
# Get prompts "between" entities
# Get prompts by syntactic parsing
# Get prompts by paraphrasing
# Get prompts using AutoPrompt

In [109]:
# visit here: /meg-kb/src/analysis/pattern_mining.ipynb

# Retrieve Prompt Evidence

In [109]:
# visit here: /meg-kb/src/analysis/lm_probing.ipynb

# Suggest Quality Prompts