In [1]:
import os

# Root locations -- edit these for your environment. yutong_base_dir (home of the
# roberta_ses checkpoint) normally does not need to change.
base_dir = "/mnt/efs/shared/meg_shared_scripts/meg-kb"
data_ac = "indeeda-meg-ac"
data_pt = "indeeda-meg-pt"
yutong_base_dir = "/home/ubuntu/users/yutong"

# Benchmark directory and the three benchmark files that live directly inside it.
benchmark_dir = os.path.join(base_dir, 'data/indeed-benchmark')
seed_aligned_concepts_path, seed_aligned_relations_path, benchmark_path = (
    os.path.join(benchmark_dir, _fname)
    for _fname in (
        'seed_aligned_concepts.csv',
        'seed_aligned_relations_nodup.csv',
        'benchmark_evidence_clean.csv',
    )
)

# Seed files with auxiliary concepts / relations
# seed_aux_concepts_path = os.path.join(benchmark_dir, f'seed_aligned_concepts_aux.csv')
# seed_aux_relations_path = os.path.join(benchmark_dir, f'seed_aligned_relations_aux.csv')


In [2]:
# Switch the kernel's working directory to the concept_learning sources.
# NOTE(review): presumably required so the project-local imports in the next cell resolve -- confirm.
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [3]:
from tqdm.notebook import tqdm
import argparse
import re
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, entropy, gmean
import random
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import json
from collections import defaultdict, Counter
import time
import importlib

import logging
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os
import sys
import math
from annoy import AnnoyIndex
import matplotlib
from matplotlib import pyplot as plt
import networkx as nx

import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English
# Blank English pipeline with a sentencizer component added to it.
# NOTE(review): `create_pipe` followed by `add_pipe(component)` is the spaCy v2 calling
# convention; spaCy v3 expects `nlp.add_pipe('sentencizer')` -- confirm the pinned version.
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# Tokenizer of the blank pipeline, exposed under its own name.
spacy_tokenizer = nlp.tokenizer

# Pretrained `en_core_web_sm` model, loaded separately from the blank pipeline above.
nlp_full = spacy.load('en_core_web_sm')

from compute_concept_clusters import knn
from compute_keyphrase_embeddings import ensure_tensor_on_device, mean_pooling

from lm_probes import LMProbe, LMProbe_GPT2, LMProbe_Joint, LMProbe_PMI, LMProbe_PMI_greedy
from utils import load_embeddings, load_seed_aligned_concepts, load_seed_aligned_relations, load_benchmark
from utils import load_EE_labels
from utils import get_masked_contexts, bert_untokenize
from utils import learn_patterns

from roberta_ses.interface import Roberta_SES_Entailment

In [72]:
## Reload modules if their code has changed

# import utils
# importlib.reload(utils)
# from utils import load_embeddings, load_seed_aligned_concepts, load_seed_aligned_relations, load_benchmark
# from utils import load_EE_labels
# from utils import get_masked_contexts, bert_untokenize
# from utils import learn_patterns

# import lm_probes
# importlib.reload(lm_probes)
# from lm_probes import LMProbe, LMProbe_GPT2, LMProbe_Joint, LMProbe_PMI, LMProbe_PMI_greedy

# Data Preprocessing

In [2]:
# Input: text corpus
# step 1: extract key phrases (autophrase)
# step 2: generate embeddings

## Extract Key Phrases

In [3]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/keyword_extraction

In [8]:
# Change to the keyword extraction directory; the shell cells that follow use
# relative paths (./corpusProcess.sh) and therefore depend on this.
%cd $base_dir/src/keyword_extraction/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction


In [9]:
# Make the corpus processing script executable so it can be invoked directly as ./corpusProcess.sh
!chmod +x ./corpusProcess.sh

In [5]:
# Thread count handed to corpusProcess.sh as its second argument.
# The dataset names (data_ac / data_pt) are defined in the first configuration cell;
# the commented-out assignments below only record their values.
# data_ac = 'indeeda-meg-ac'
# data_pt = 'indeeda-meg-pt'
thread = 8

In [None]:
# Process the corpus and generate key phrases.
# Arguments: dataset name (data_ac) and thread count (thread), both set in earlier cells.
!./corpusProcess.sh $data_ac $thread

In [464]:
# Copy the key phrase extraction results of data_ac into the data_pt data directory.
# NOTE(review): if the data_pt directory already exists, `cp -r` places the copy
# *inside* it instead of overwriting -- remove the target first when re-running.
!cp -r $base_dir/data/$data_ac $base_dir/data/$data_pt

# Generate Embeddings

In [465]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [11]:
# Change back to the concept learning directory; the embedding scripts below are
# invoked by bare file name and are expected to be found here.
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## Sentence Embedding

In [None]:
## Using BERT
# Runs compute_keyphrase_embeddings.py with bert-base-uncased on the data_ac
# intermediate directory; CUDA_VISIBLE_DEVICES=2 restricts the process to GPU 2.
# NOTE(review): `-et ac` presumably selects the sentence-embedding variant (this cell
# sits under "Sentence Embedding") and `-c 750` looks like a size cap -- confirm
# both against the script's argument parser.

!CUDA_VISIBLE_DEVICES=2 python compute_keyphrase_embeddings.py \
-m bert-base-uncased \
-et ac \
-d $base_dir/data/$data_ac/intermediate \
-c 750

In [86]:
## Using RoBERTa 

# !CUDA_VISIBLE_DEVICES=2 python compute_keyphrase_embeddings.py \
# -m roberta-base \
# -et ac \
# -ename RoBERTa \
# -d $base_dir/data/$data_ac/intermediate \
# -c 750

loading corpus: 100%|███████████████| 901796/901796 [00:03<00:00, 274660.12it/s]
computing entity-wise embedding: 100%|██████| 8028/8028 [04:55<00:00, 27.21it/s]
Saving embedding


## Concatenated Token Embedding

In [None]:
# Same script and model as the sentence-embedding run, but with `-et pt` and the
# data_pt intermediate directory as input.
# NOTE(review): `-et pt` presumably selects the concatenated-token variant named in the
# section heading -- confirm against the script's argument parser.
!CUDA_VISIBLE_DEVICES=2 python compute_keyphrase_embeddings.py \
-m bert-base-uncased \
-et pt \
-d $base_dir/data/$data_pt/intermediate \
-c 750

## Token Embedding
not used for now

In [None]:
# change directory to autophrase
# %cd $base_dir/src/tools/AutoPhrase

In [None]:
# data_corel = 'sample-indeeda-corel'

In [None]:
# !CUDA_VISIBLE_DEVICES=0 python extractBertEmbedding.py ../../../data/$data_corel/intermediate/ $thread

## Add embeddings for seed instances
Skipped: this assumes that the seed entities already appear in the AutoPhrase output. If they do not, additional work is needed.

In [None]:
# !CUDA_VISIBLE_DEVICES=2 python add_seed_instances_embeddings.py \
# -m roberta-base \
# -et ac \
# -ename RoBERTa \
# -d $base_dir/data/$data_ac/intermediate \
# -b $base_dir/data/indeed-benchmark \
# -c 750

## Check embeddings

In [None]:
# bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed.txt')

# embeddings = load_embeddings(bert_emb_path, 768)
# len(embeddings)

# Expand Seed Entities (clustering)

In [99]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [299]:
# Change to the concept learning directory (the entity expansion scripts are run from here)
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## EE-emb (seed instances k-NN)
(using all seed instances of a concept to find neighbors)

In [51]:
# Ensure the working directory is concept_learning before running the k-NN script.
# NOTE(review): repeats the %cd issued at the start of this section; likely redundant.
%cd $base_dir/src/concept_learning

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [89]:
# Run the seed-instance k-NN expansion script.
# Inputs: the data_ac intermediate directory (-d), the benchmark directory (-b) and the
# BERT embedding file BERTembed.txt (-e); the ranking is written to
# ee_concept_knn_k=None.csv (-o), which later cells read back.
# NOTE(review): the meaning of `-kdt` is not visible here -- check the script's help.
!python compute_concept_seeds_knn.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-e $base_dir/data/$data_ac/intermediate/BERTembed.txt \
-o $base_dir/data/$data_ac/intermediate/ee_concept_knn_k=None.csv \
-kdt


building entity index: 100%|██████████████| 8063/8063 [00:01<00:00, 4398.92it/s]
100%|███████████████████████████████████████████| 14/14 [00:01<00:00, 13.38it/s]


In [None]:
# Sanity check: show the first 10 k-NN rows stored for the `job_position` concept
concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_knn_k=None.csv')

df = pd.read_csv(concept_knn_path)
_is_job_position = df['concept'] == 'job_position'
df.loc[_is_job_position].head(10)

## EE-LM-probe (prompt)

In [None]:
# Run the LM-probe entity expansion with bert-base-uncased.
# Output goes to ee_LM_bert_k=None.csv (-o), which the contrastive post-processing
# and rank combination cells read back.
# NOTE(review): `-lm mlm` presumably selects a masked-LM probe -- confirm in the script.
!python compute_EE_LM_probe.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-lm mlm \
-lm_model bert-base-uncased \
-o $base_dir/data/$data_ac/intermediate/ee_LM_bert_k=None.csv

In [78]:
# Use RoBERTa 

# !python compute_EE_LM_probe.py \
# -d $base_dir/data/$data_ac/intermediate \
# -b $base_dir/data/indeed-benchmark \
# -lm mlm \
# -lm_model roberta-base \
# -o $base_dir/data/$data_ac/intermediate/ee_LM_roberta_k=None.csv


Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████| 14/14 [00:58<00:00,  4.20s/it]


In [8]:
## Domain-adapted BERT, avg probs across templates (default is max)

# !python compute_EE_LM_probe.py \
# -d $base_dir/data/$data_ac/intermediate \
# -b $base_dir/data/indeed-benchmark \
# -lm bert \
# -lm_model /home/ubuntu/users/nikita/models/bert_finetuned_lm/indeed_reviews_ques_ans \
# -agg avg \
# -o $base_dir/data/$data_ac/intermediate/ee_LM_bert_DA_avg_k=None.csv


100%|███████████████████████████████████████████| 14/14 [00:54<00:00,  3.88s/it]


## Postproc contrastive EE

In [91]:
def postproc_contrastive_EE(ee_in_path, ee_contr_out_path, score_col, seed_concepts_path,
                            seed_score=0.0, keep_all_entities=False):
    """Re-score entity-expansion (EE) candidates contrastively across concepts.

    For every entity in the input, its score under a concept is compared with its
    best score under any *other* concept ("obest"); the difference is the margin.
    The output is grouped by concept and, within each concept, sorted by
    ``score + margin`` in descending order.

    Concepts and entities are processed in order of first appearance in the input
    file, so the output (group order and the order of tied rows) is reproducible
    from run to run.

    Args:
        ee_in_path: CSV with at least the columns ``concept``, ``neighbor`` and
            ``score_col``.
        ee_contr_out_path: Destination CSV. Columns: ``concept``, ``obest_concept``,
            ``neighbor``, ``<score_col>``, ``obest_<score_col>``, ``margin``,
            ``<score_col>+margin``.
        score_col: Name of the score column used for ranking.
        seed_concepts_path: Passed to ``load_seed_aligned_concepts``; the returned
            frame must provide ``alignedCategoryName`` and ``seedInstances``.
        seed_score: Score substituted whenever an entity has no row for a concept.
            This is how seed instances, whose scores EE methods do not compute,
            get a value -- but it applies to any missing (entity, concept) pair.
        keep_all_entities: If False, an entity is emitted only for its
            best-scoring concept (contr); if True, it is emitted for every
            concept (acontr). In both modes an entity is never emitted for a
            concept it is a seed instance of.

    Raises:
        ValueError: If the input contains exactly one concept, since a contrastive
            margin needs a second concept to compare against.
    """
    seed_concepts_df = load_seed_aligned_concepts(seed_concepts_path)
    seed_instances_dict = dict(zip(
        seed_concepts_df['alignedCategoryName'].tolist(),
        seed_concepts_df['seedInstances'].tolist()
    ))

    ee_in_df = pd.read_csv(ee_in_path)

    # First-appearance order instead of set() order: iteration order of a set of
    # strings changes between interpreter runs, which made the output order unstable.
    cc_list = ee_in_df['concept'].drop_duplicates().tolist()
    all_ent_list = ee_in_df['neighbor'].drop_duplicates().tolist()

    if len(cc_list) == 1:
        raise ValueError(
            f'Contrastive scoring needs at least two concepts, '
            f'but {ee_in_path} only contains {cc_list[0]!r}'
        )

    # concept -> {entity: score}
    cc_scores_dict = dict()
    for cc in cc_list:
        cc_df = ee_in_df[ee_in_df['concept'] == cc]
        cc_scores_dict[cc] = dict(zip(
            cc_df['neighbor'].tolist(),
            cc_df[score_col].tolist()
        ))

    all_cc_ids = range(len(cc_list))
    cands_for_concepts = [[] for _ in cc_list]
    for _e in all_ent_list:
        _scores = [cc_scores_dict[cc].get(_e, seed_score) for cc in cc_list]
        _cc_ranking = np.argsort(_scores).tolist()[::-1]
        _max_cc_id = _cc_ranking[0]
        _second_cc_id = _cc_ranking[1]

        # Only the best concept can receive the entity unless all are kept.
        _target_cc_ids = all_cc_ids if keep_all_entities else (_max_cc_id,)
        for cc_id in _target_cc_ids:
            cc = cc_list[cc_id]
            # A concept without an entry in the seed file simply has no seeds to exclude.
            if _e in seed_instances_dict.get(cc, ()):
                continue
            # "Other best": the runner-up for the top concept, the top concept otherwise.
            obest_cc_id = _second_cc_id if cc_id == _max_cc_id else _max_cc_id
            _score = _scores[cc_id]
            _obest_score = _scores[obest_cc_id]
            _margin = _score - _obest_score
            cands_for_concepts[cc_id].append({
                'concept': cc,
                'obest_concept': cc_list[obest_cc_id],
                'neighbor': _e,
                score_col: _score,
                f'obest_{score_col}': _obest_score,
                'margin': _margin,
                f'{score_col}+margin': _score + _margin
            })

    out_records = []
    for cands in cands_for_concepts:
        # sorted() is stable, so tied rows keep their first-appearance order.
        out_records.extend(
            sorted(cands, key=lambda d: d[f'{score_col}+margin'], reverse=True)
        )

    pd.DataFrame(out_records).to_csv(ee_contr_out_path, index=False)
    

In [None]:
## Contrastive re-scoring of the EE-emb (k-NN) predictions, ranked on the `sim` column

ee_in_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_knn_k=None.csv')
ee_contr_out_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_contr_knn_k=None.csv')

# Each entity is kept only for its best concept (keep_all_entities=False).
postproc_contrastive_EE(
    ee_in_path,
    ee_contr_out_path,
    'sim',
    seed_aligned_concepts_path,
    keep_all_entities=False,
)

In [None]:
## Contrastive re-scoring of the EE-LM (BERT probe) predictions, ranked on the `lm_score` column

ee_in_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_k=None.csv')
ee_contr_out_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_contr_k=None.csv')

# Each entity is kept only for its best concept (keep_all_entities=False).
postproc_contrastive_EE(
    ee_in_path,
    ee_contr_out_path,
    'lm_score',
    seed_aligned_concepts_path,
    keep_all_entities=False,
)

## MRR combination

In [5]:
# Inputs of the rank combination: contrastive EE-emb ranking and the plain EE-LM ranking.
ee_contr_emb_path, ee_LM_path = (
    os.path.join(base_dir, f'data/{data_ac}/intermediate/{_fname}')
    for _fname in ('ee_concept_contr_knn_k=None.csv', 'ee_LM_bert_k=None.csv')
)

# ee_labels_path = os.path.join(benchmark_dir, 'ee-labels-2.csv')

In [6]:
# Load both rankings that will be combined.
ee_contr_emb_df = pd.read_csv(ee_contr_emb_path)
ee_LM_df = pd.read_csv(ee_LM_path)

# Distinct concepts of the LM ranking, in order of first appearance.
concept_list = ee_LM_df['concept'].unique().tolist()
concept_list

['company',
 'dress_code',
 'job_position',
 'pay_schedule',
 'benefits',
 'compensation',
 'payment_option',
 'background_screening',
 'person',
 'hire_prerequisite',
 'shifts',
 'schedule',
 'employee_type',
 'onboarding_steps']

In [7]:
## Combine the two rankings: each entity's score is the sum of its reciprocal ranks

ee_mrr_combine_list = []

for _cc in sorted(concept_list):
    # Embedding-based ranking: explicitly ordered by the contrastive score.
    _emb_rows = ee_contr_emb_df[ee_contr_emb_df['concept'] == _cc]
    _ee_contr_emb_list = _emb_rows.sort_values(by='sim+margin', ascending=False)['neighbor'].tolist()

    # LM-based ranking: taken in the row order of the file.
    _ee_LM_list = ee_LM_df[ee_LM_df['concept'] == _cc]['neighbor'].tolist()

    _all_entities_mrr = defaultdict(float)
    for _ranking in (_ee_contr_emb_list, _ee_LM_list):
        for _rank, _e in enumerate(_ranking, start=1):
            _all_entities_mrr[_e] += 1.0 / _rank

    _all_entities_mrr_list = sorted(_all_entities_mrr.items(), key=lambda p: p[-1], reverse=True)
    ee_mrr_combine_list.extend((_cc, _ent, _score) for _ent, _score in _all_entities_mrr_list)

len(ee_mrr_combine_list)

112324

In [8]:
# Persist the combined ranking as (concept, neighbor, MRR) rows, without an index column.
ee_mrr_combine_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_mrr_combine_k=None.csv')
ee_mrr_combine_df = pd.DataFrame(ee_mrr_combine_list, columns=['concept', 'neighbor', 'MRR'])
ee_mrr_combine_df.to_csv(ee_mrr_combine_path, index=False)

## Entity expansion evaluation
Evaluation is currently done against the benchmark entities, using mean reciprocal rank.

In [None]:
# Evaluate the plain EE-emb (seed k-NN) ranking; -o names the *_eval.csv file for this run.
!python eval_entities.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/ee_concept_knn_k=None.csv \
-o $base_dir/data/$data_ac/intermediate/ee_concept_knn_k=None_eval.csv

In [None]:
# Evaluate the contrastive EE-emb ranking (output of postproc_contrastive_EE on the k-NN file).
!python eval_entities.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/ee_concept_contr_knn_k=None.csv \
-o $base_dir/data/$data_ac/intermediate/ee_concept_contr_knn_k=None_eval.csv

In [None]:
# Evaluate the plain EE-LM (BERT probe) ranking.
!python eval_entities.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/ee_LM_bert_k=None.csv \
-o $base_dir/data/$data_ac/intermediate/ee_LM_bert_k=None_eval.csv

In [None]:
# Evaluate the contrastive EE-LM ranking (output of postproc_contrastive_EE on the LM file).
!python eval_entities.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/ee_LM_bert_contr_k=None.csv \
-o $base_dir/data/$data_ac/intermediate/ee_LM_bert_contr_k=None_eval.csv

In [None]:
# Evaluate the combined ranking written by the "MRR combination" section.
!python eval_entities.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/ee_mrr_combine_k=None.csv \
-o $base_dir/data/$data_ac/intermediate/ee_mrr_combine_k=None_eval.csv

# Relation Extraction Baselines
Currently this supports only a single relation. TODO: include all relations.

## Null baseline - Cartesian product

In [None]:
# Run the Cartesian-product relation extraction baseline on the combined EE ranking (-cknn).
# The output file name encodes the settings (EE=mrr_combine, K=100); keep "K=100" in
# sync with the value given to -topk.
# NOTE(review): --exclude_aux presumably drops the auxiliary concepts / relations -- confirm.
!python relation_extraction_cartesian.py \
-d $base_dir/data/$data_ac/intermediate \
-b $benchmark_dir \
-o $base_dir/data/$data_ac/intermediate/rel_extraction-EE=mrr_combine-K=100-RE=Ct.csv \
-cknn $base_dir/data/$data_ac/intermediate/ee_mrr_combine_k=None.csv \
-topk 100 \
--exclude_aux

## Relation Extraction Evaluation

In [None]:
# Evaluate the Cartesian-product baseline predictions against the benchmark.
!python eval_relations.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-EE=mrr_combine-K=100-RE=Ct.csv

In [None]:
# !python eval_relations.py \
# -b $base_dir/data/indeed-benchmark \
# -pred $base_dir/data/$data_ac/intermediate/rel_extraction-EE=mrr_combine-K=100-RE=Ct+KV=0.9.csv \
# -r has_dress_code

# Knowledge Verification baseline
(finding co-occurrences of head / tail from corpus)

## Knowledge Verification

In [165]:
# Run entailment-based knowledge verification on the Cartesian-product candidates (-in).
# Writes the collected evidences to a JSON file (-o_kv) and the filtered relations to a
# CSV (-o_re); the "KV=0.9" in that CSV name mirrors the -p_re value, so keep them in sync.
# -r / -rs point at the roberta-large model directory and the Roberta_SES checkpoint
# under yutong_base_dir.
# NOTE(review): -p_kv / -p_re look like probability thresholds and --fast_skip like a
# speed-up option -- confirm their exact meaning in the script.
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-in $base_dir/data/$data_ac/intermediate/rel_extraction-EE=mrr_combine-K=100-RE=Ct.csv \
-o_kv $base_dir/data/$data_ac/intermediate/kv_evidences-EE=mrr_combine-K=100-RE=Ct.json \
-o_re $base_dir/data/$data_ac/intermediate/rel_extraction-EE=mrr_combine-K=100-RE=Ct+KV=0.9.csv \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p_kv 0.7 \
-p_re 0.9 \
--fast_skip 3

Loading files...
Finding evidence for rels: 100%|████████████████| 68/68 [00:08<00:00,  8.16it/s]


In [None]:
# Evaluate the relations that remain after knowledge verification (the -o_re output above).
# Note: -b spells out the benchmark directory; it is the same location as benchmark_dir.
!python eval_relations.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-EE=mrr_combine-K=100-RE=Ct+KV=0.9.csv