In [1]:
base_dir="/mnt/efs/shared/meg_shared_scripts/meg-kb"
data_ac="indeeda-meg-ac"
data_pt="indeeda-meg-pt"
yutong_base_dir="/home/ubuntu/users/yutong"

In [2]:
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [3]:
from tqdm.notebook import tqdm
import argparse
import re
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, entropy, gmean
import random
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import json
from collections import defaultdict, Counter
import time
import importlib
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger

import logging
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd
import os
import sys
import math
from annoy import AnnoyIndex
import matplotlib
from matplotlib import pyplot as plt
import networkx as nx
from glob import glob
import ahocorasick

import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
spacy_tokenizer = nlp.tokenizer

nlp_full = spacy.load('en_core_web_sm')

from compute_concept_clusters import knn
from compute_keyphrase_embeddings import ensure_tensor_on_device, mean_pooling

from compute_multi_view_embeddings import get_lm_probe_concept_embeddings

from lm_probes import LMProbe, LMProbe_GPT2, LMProbe_Joint, LMProbe_PMI, LMProbe_PMI_greedy
from utils import load_seed_aligned_concepts, load_seed_aligned_relations, load_benchmark
from utils import load_embeddings, load_embeddings_dict, load_EE_labels
from utils import load_EE_labels
from utils import get_masked_contexts, bert_untokenize
from utils import learn_patterns

from roberta_ses.interface import Roberta_SES_Entailment

from multiview_EE_datasets import Wiki_EE_Dataset, Wiki_EE_Dataset_2, Indeed_EE_Dataset, Indeed_EE_Dataset_2
from multiview_EE_models import EE_Classifier

In [4]:
import utils
importlib.reload(utils)
from utils import load_seed_aligned_concepts, load_seed_aligned_relations, load_benchmark
from utils import load_embeddings, load_embeddings_dict, load_EE_labels
from utils import load_EE_labels
from utils import get_masked_contexts, bert_untokenize
from utils import learn_patterns

import lm_probes
importlib.reload(lm_probes)
from lm_probes import LMProbe, LMProbe_GPT2, LMProbe_Joint, LMProbe_PMI, LMProbe_PMI_greedy

import compute_multi_view_embeddings
importlib.reload(compute_multi_view_embeddings)
from compute_multi_view_embeddings import get_lm_probe_concept_embeddings

In [5]:
benchmark_dir = os.path.join(base_dir, f'data/indeed-benchmark')

seed_aligned_concepts_path = os.path.join(benchmark_dir, f'seed_aligned_concepts.csv')
seed_aux_concepts_path = os.path.join(benchmark_dir, f'seed_aligned_concepts_aux.csv')
seed_aligned_relations_path = os.path.join(benchmark_dir, f'seed_aligned_relations_nodup.csv')
seed_aux_relations_path = os.path.join(benchmark_dir, f'seed_aligned_relations_aux.csv')
benchmark_path = os.path.join(benchmark_dir, f'benchmark_evidence_clean.csv')

In [6]:
wiki_data_dir = os.path.join(base_dir, f'data/wiki-meg-ac/intermediate')
wiki_gt_dir = os.path.join(base_dir, f'data/wiki-meg-ac/gt')

In [7]:
wiki_data_dir

'/mnt/efs/shared/meg_shared_scripts/meg-kb/data/wiki-meg-ac/intermediate'

# Data Preprocessing

In [2]:
# Input: text corpus
# step 1: extract key phrases (autophrase)
# step 2: generate embeddings

## Extract Key Phrases

In [3]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/keyword_extraction

In [8]:
#change to keyword extractor directory
%cd $base_dir/src/keyword_extraction/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/keyword_extraction


In [9]:
!chmod +x ./corpusProcess.sh

In [5]:
# select the dataset and thread no
data_ac = 'indeeda-meg-ac'
data_pt = 'indeeda-meg-pt'
thread = 8

In [None]:
# process corpus and generate key prhases
!./corpusProcess.sh $data_ac $thread

In [464]:
# copy these results to sample-meg-pt
!cp -r $base_dir/data/$data_ac $base_dir/data/$data_pt

## Corpus with company names

In [33]:
dataset_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/question_answers.csv'
company_path = '/home/ubuntu/users/nikita/data/indeed/indeedQA/fccid-companyName.csv'
entity_emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum+seeds.txt')
out_corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences_with_company.json')

In [161]:
# Use script
!python build_corpus_with_companies.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-rd /home/ubuntu/users/nikita/data/indeed/indeedQA


Processing lines: 100%|████████████████| 307122/307122 [11:31<00:00, 444.12it/s]


In [114]:
!cp $base_dir/data/$data_ac/intermediate/sentences_with_company.json $base_dir/data/$data_pt/intermediate/

# Generate Embeddings

In [465]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [11]:
#change to concept learning directory
%cd $base_dir/src/concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## Sentence Embedding

In [86]:
!CUDA_VISIBLE_DEVICES=2 python compute_keyphrase_embeddings.py \
-m roberta-base \
-et ac \
-ename RoBERTa \
-d $base_dir/data/$data_ac/intermediate \
-c 750

loading corpus: 100%|███████████████| 901796/901796 [00:03<00:00, 274660.12it/s]
computing entity-wise embedding: 100%|██████| 8028/8028 [04:55<00:00, 27.21it/s]
Saving embedding


## Concatenated Token Embedding

In [None]:
!CUDA_VISIBLE_DEVICES=3 python compute_keyphrase_embeddings.py -m bert-base-uncased -et pt -d $base_dir/data/$data_pt/intermediate -c 750

## Token Embedding

In [None]:
# change directory to autophrase
%cd $base_dir/src/tools/AutoPhrase

In [None]:
data_corel = 'sample-indeeda-corel'

In [None]:
!CUDA_VISIBLE_DEVICES=0 python extractBertEmbedding.py ../../../data/$data_corel/intermediate/ $thread

## Add embeddings for seed instances

In [None]:
# Using script

!CUDA_VISIBLE_DEVICES=2 python add_seed_instances_embeddings.py \
-m roberta-base \
-et ac \
-ename RoBERTa \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-c 750

## Check embeddings

In [None]:
bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed.txt')

embeddings = load_embeddings(bert_emb_path, 768)
len(embeddings)

# Expand Seed Entities (clustering)

In [99]:
# details here: https://github.com/rit-git/meg-kb/tree/main/src/concept_learning

In [299]:
#change to concept learning directory
%cd ../../concept_learning/

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


## knn sentence-embedding

In [365]:
clusters = 100
output = '../../data/'+data_ac+'/intermediate/knn_'+str(clusters)+'.csv'
dim = 768

In [366]:
!python compute_concept_clusters.py -d ../../data/$data_ac/intermediate/ -ca knn -s $clusters -dim $dim -o $output

building entity index: 100%|████████████████| 177/177 [00:00<00:00, 5435.26it/s]
finding nearest neighbors by entity: 100%|██| 177/177 [00:00<00:00, 2001.57it/s]


## knn token concatenated

In [308]:
clusters = 20
output = '../../data/'+data_pt+'/intermediate/knn_'+str(clusters)+'.csv'
dim = 3072

In [309]:
!python compute_concept_clusters.py -d ../../data/$data_pt/intermediate/ -ca knn -s $clusters -dim $dim -o $output

building entity index: 100%|████████████████| 177/177 [00:00<00:00, 3661.18it/s]
finding nearest neighbors by entity: 100%|██| 177/177 [00:00<00:00, 4052.00it/s]


## knn token

In [None]:
clusters = 20
output = '../../data/'+data_pt+'/intermediate/knn_'+str(clusters)+'.csv'
dim = 768

In [None]:
!python compute_concept_clusters.py -d ../../data/$data_corel/intermediate/ -ca knn -s $clusters -dim $dim -o $output

## Seed instances clustering (EE-emb)
(using all seed instances of a concept to find neighbors)

In [51]:
%cd $base_dir/src/concept_learning

/mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning


In [185]:
# Use script
!python compute_concept_seeds_knn.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-e $base_dir/data/$data_ac/intermediate/BERTembed+seeds.txt \
-o $base_dir/data/$data_ac/intermediate/ee_concept_knn_k=None-2.csv \
-kdt


building entity index: 100%|██████████████| 8064/8064 [00:01<00:00, 6444.07it/s]
100%|███████████████████████████████████████████| 14/14 [00:00<00:00, 16.47it/s]


In [None]:
# check results 
concept_knn_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_knn_k=None.csv')

df = pd.read_csv(concept_knn_path)
df[df['concept'] == 'company'].head(10)

In [None]:
df = pd.read_csv(concept_knn_path)
df[df['concept'] == 'pay_schedule'].head(10)

## EE-LM-probe (prompt)

In [None]:
lm_probe = LMProbe(model_name='bert-base-uncased')

In [78]:
# Use scripts 
!python compute_EE_LM_probe.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-lm mlm \
-lm_model roberta-base \
-o $base_dir/data/$data_ac/intermediate/ee_LM_roberta_k=None.csv


Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████████████████████| 14/14 [00:58<00:00,  4.20s/it]


In [8]:
!python compute_EE_LM_probe.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-lm bert \
-lm_model /home/ubuntu/users/nikita/models/bert_finetuned_lm/indeed_reviews_ques_ans \
-agg avg \
-o $base_dir/data/$data_ac/intermediate/ee_LM_bert_DA_avg_k=None.csv


100%|███████████████████████████████████████████| 14/14 [00:54<00:00,  3.88s/it]


In [184]:
!python compute_EE_LM_probe.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-lm mlm \
-lm_model bert-base-uncased \
-o $base_dir/data/$data_ac/intermediate/ee_LM_bert_k=None-2.csv


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|███████████████████████████████████████████| 14/14 [00:52<00:00,  3.78s/it]


## Class name prediction

In [203]:
lm_probe = LMProbe()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
_input_txt = '[MASK] such as bachelor degree is required to work at Walmart.'
lm_probe.score_candidates(_input_txt, ['benefits', 'roles', 'persons', 'schedules', 'dresses', 'departments', 'checks', 'qualifications'])

[{'cand': 'qualifications', 'score': 0.010107847861945629},
 {'cand': 'benefits', 'score': 1.0626394214341422e-05},
 {'cand': 'roles', 'score': 1.079390585800865e-06},
 {'cand': 'checks', 'score': 5.771184419245406e-07},
 {'cand': 'persons', 'score': 3.987514958225805e-07},
 {'cand': 'departments', 'score': 2.8143759323029387e-07},
 {'cand': 'schedules', 'score': 2.6243087702937383e-08},
 {'cand': 'dresses', 'score': 1.1577242275961901e-08}]

In [28]:
_input_txt = '[MASK] such as bachelor degree is required to work at the company.'
lm_probe.score_candidates(_input_txt, ['benefits', 'roles', 'persons', 'schedules', 'dresses', 'departments', 'checks', 'qualifications'])

[{'cand': 'qualifications', 'score': 0.014812425710260875},
 {'cand': 'benefits', 'score': 9.158307875622997e-06},
 {'cand': 'persons', 'score': 9.146327784037573e-07},
 {'cand': 'roles', 'score': 6.875428653074773e-07},
 {'cand': 'checks', 'score': 2.984555749208085e-07},
 {'cand': 'departments', 'score': 1.602442836201591e-07},
 {'cand': 'dresses', 'score': 8.730975054049843e-09},
 {'cand': 'schedules', 'score': 7.1630932296784555e-09}]

In [25]:
_input_txt = 'They have [MASK] such as bachelor degree.'
lm_probe.score_candidates(_input_txt, ['benefits', 'roles', 'persons', 'schedules', 'dresses', 'departments', 'checks', 'qualifications'])

[{'cand': 'qualifications', 'score': 0.06600436568260193},
 {'cand': 'benefits', 'score': 0.0003170075651723892},
 {'cand': 'departments', 'score': 0.00019293477816972865},
 {'cand': 'roles', 'score': 0.00011820000509032974},
 {'cand': 'persons', 'score': 9.001246326079126e-06},
 {'cand': 'checks', 'score': 8.256243745563545e-06},
 {'cand': 'schedules', 'score': 3.3704397992551064e-06},
 {'cand': 'dresses', 'score': 8.162938911482339e-07}]

In [205]:
emb_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum+seeds.txt')
with open(emb_num_path, 'r') as f:
    all_entities = [l.rsplit(' ', 1)[0] for l in f]
len(all_entities), all_entities[:3]

(8064, ['multiple times', 'upper', 'management'])

In [30]:
# cn_scores_dict = dict()  # Dict[cn, Dict[e, score]]
cn_scores_dict_qual = dict()

for _e in tqdm(all_entities):
    _input_txt = f'[MASK] such as {_e} is required to work at the company.'
    _d = lm_probe.score_candidates(_input_txt, ['qualifications'])
    cn_scores_dict_qual[_e] = _d[0]['score']

HBox(children=(FloatProgress(value=0.0, max=8064.0), HTML(value='')))




In [31]:
_l = sorted(cn_scores_dict_qual.items(), key=lambda p: p[1], reverse=True)
_l[:100]

[('national background', 0.18993115425109863),
 ('excellent service', 0.17657598853111267),
 ('excellent training', 0.14337441325187683),
 ('election', 0.13679932057857513),
 ('civil service examination', 0.13190346956253055),
 ('latitude', 0.117889866232872),
 ('grade point average', 0.11734356731176378),
 ('473 exam', 0.11213560402393344),
 ('higher position', 0.11185302585363385),
 ('skill level', 0.10884014517068863),
 ('citizenship', 0.10140147805213928),
 ('advancement opportunity', 0.09752645343542099),
 ('career opportunity', 0.09613030403852464),
 ('great place', 0.09332876652479172),
 ('entry level', 0.09109833091497423),
 ('great opportunity', 0.08629722893238066),
 ('teaching experience', 0.08455906808376311),
 ('drivers license', 0.0844525247812271),
 ('excellent customer', 0.07997493445873259),
 ('perfect attendance', 0.07865294814109802),
 ('pay grade', 0.07696734368801117),
 ('pse position', 0.0760236829519272),
 ('highschool diploma', 0.07536540925502776),
 ('diploma',

In [209]:
cn_scores_dict_qual_2 = dict()

for _e in tqdm(all_entities):
    _input_txt = f'[MASK] such as {_e}.'
    _d = lm_probe.score_candidates(_input_txt, ['qualifications'])
    cn_scores_dict_qual_2[_e] = _d[0]['score']

HBox(children=(FloatProgress(value=0.0, max=8064.0), HTML(value='')))




In [None]:
_l = sorted(cn_scores_dict_qual_2.items(), key=lambda p: p[1], reverse=True)
_l[:100]

## Postproc contrastive EE

In [186]:
def postproc_contrastive_EE(ee_in_path, ee_contr_out_path, score_col, seed_concepts_path,
                            seed_score=0.0, keep_all_entities=False):
    
    # score_col: column name of score to use for ranking 
    # seed_score: default score for seeds, since their scores are not computed in EE methods
    # keep_all_entities: if False, only assign entities to best concepts (contr);
    #   if True, keep all entities for each concept (acontr)
    
    seed_concepts_df = load_seed_aligned_concepts(seed_concepts_path)
    seed_instances_dict = dict(zip(
        seed_concepts_df['alignedCategoryName'].tolist(),
        seed_concepts_df['seedInstances'].tolist()
    ))
    all_seed_instances = set([_e for _seeds in seed_instances_dict.values() for _e in _seeds])
    print(sorted(list(all_seed_instances)))
    ee_in_df = pd.read_csv(ee_in_path)
    
    cc_list = list(set(ee_in_df['concept'].tolist()))
    all_ent_list = list(set(ee_in_df['neighbor'].tolist()))
    
    cc_scores_dict = dict()
    for cc in cc_list:
        cc_df = ee_in_df[ee_in_df['concept'] == cc]
        cc_scores_dict[cc] = dict(zip(
            cc_df['neighbor'].tolist(),
            cc_df[score_col].tolist()
        ))
    
    cands_for_concepts = [[] for _ in range(len(cc_list))]
    for _e in all_ent_list:
#         if _e in all_seed_instances:
#             continue
        _scores = [cc_scores_dict[cc].get(_e, seed_score) for cc in cc_list]
        _cc_ranking = np.argsort(_scores).tolist()[::-1]
        _max_cc_id = _cc_ranking[0]
        _second_cc_id = _cc_ranking[1]
#         _score = _scores[_max_cc_id]
#         _2nd_score = _scores[_second_cc_id]
#         _margin = _score - _2nd_score
        
        for cc_id, cc in enumerate(cc_list):
            if _e in seed_instances_dict[cc]:
                continue
            if not keep_all_entities and cc_id != _max_cc_id:
                continue
            obest_cc_id = _second_cc_id if cc_id == _max_cc_id else _max_cc_id
            _score = _scores[cc_id]
            _obest_score = _scores[obest_cc_id]
            _margin = _score - _obest_score
            cands_for_concepts[cc_id].append({
                'concept': cc,
                'obest_concept': cc_list[obest_cc_id],
                'neighbor': _e,
                score_col: _score,
                f'obest_{score_col}': _obest_score,
                'margin': _margin,
                f'{score_col}+margin': _score + _margin
            })

    out_records = []
    for cc_id, cands in enumerate(cands_for_concepts):
        cands_sorted = sorted(cands, key=lambda d: d[f'{score_col}+margin'], reverse=True)
        out_records.extend(cands_sorted)
    
    pd.DataFrame(out_records).to_csv(ee_contr_out_path, index=None)
    

In [187]:
# ee_in_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_roberta_k=None.csv')
# ee_contr_out_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_roberta_contr_k=None.csv')
ee_in_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_knn_k=None-2.csv')
ee_contr_out_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_contr_knn_k=None-2.csv')
# ee_in_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_k=None.csv')
# ee_contr_out_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_acontr_k=None.csv')
# seed_concepts_aux_path = os.path.join(base_dir, f'data/indeed-benchmark/seed_aligned_concepts_aux.csv')

postproc_contrastive_EE(ee_in_path=ee_in_path,
                        ee_contr_out_path=ee_contr_out_path,
                        score_col='sim',
                        seed_concepts_path=seed_aligned_concepts_path,
                        keep_all_entities=False)

['401k', '7 days', '8 hour shift', 'amazon', 'bachelors degree', 'barista', 'base pay', 'biweekly', 'bonus', 'business casual', 'cashier', 'checks', 'christmas eve', 'criminal background check', 'criminals', 'delivery driver', 'dinner shift', 'direct deposit', 'disabled', 'dishwasher', 'drug addicts', 'drug test', 'early morning', 'early morning shift', 'employment verification', 'facial hair', 'felons', 'flexible schedule', 'friday', 'full time', 'hair color', 'health insurance', 'heavy lifting', 'high schoolers', 'hiring age', 'hoilday', 'introduction', 'microsoft', 'misdemeanor', 'night shift', 'orientation', 'overtime pay', 'package handler', 'paid vacation', 'part time', 'piercings', 'pregnant', 'prepaid card', 'prior experience', 'sales associate', 'saturday', 'seasonal', 'seniors', 'shoes', 'sick leave', 'stock options', 'store manager', 'students', 'subway', 'sunday', 'target', 'tattoos', 'team lunch', 'training', 'uniform', 'vision insurance', 'walmart', 'weekend', 'weekly', '

## Iterative EE

In [None]:
class IterativeEntityExpander:
    def __init__(self,
                 step_K,
                 total_steps,
                 out_dir,
                 bert_emb_path,
                 bert_emb_dim,
                 benchmark_dir,
                 entity_emb_dict=None):
        
        self.step_K = step_K
        self.total_steps = total_steps
        self.out_dir = out_dir
        self.bert_emb_path = bert_emb_path
        self.bert_emb_dim = bert_emb_dim
        
        if entity_emb_dict is None:
            _entity_embeddings = load_embeddings(bert_emb_path, bert_emb_dim)
            entity_emb_dict = dict(zip(_entity_embeddings['entity'].tolist(),
                                       _entity_embeddings['embedding'].tolist()))
        self.entity_emb_dict = entity_emb_dict
        
        self.benchmark_dir = benchmark_dir
        _seed_concepts_path = os.path.join(benchmark_dir, 'seed_aligned_concepts.csv')
        _concepts_df = load_seed_aligned_concepts(_seed_concepts_path)
        self.init_seeds_dict = dict(zip(_concepts_df['alignedCategoryName'].tolist(),
                                        _concepts_df['seedInstances'].tolist()))
        self.seeds_dict = dict(self.init_seeds_dict)
    
    def expand(self, start_step=0, end_step=None):
        if end_step is None:
            end_step = self.total_steps
        for step_id in tqdm(range(start_step, end_step)):
            self.expand_step(step_id)
            
    def expand_step(self, step_id):
        raise NotImplementedError

        

In [88]:
class IterativeEE_Emb_Contrastive(IterativeEntityExpander):        
    def expand_step(self, step_id):
        entities, embeddings = zip(*self.entity_emb_dict.items())
        
        neighbors = []
        
        concept_emb_dict = dict()
        for a_concept, seed_instances in self.seeds_dict.items():
            embs = []
            for inst in seed_instances:
                try:
                    embs.append(self.entity_emb_dict[inst])
                except KeyError:
                    print(f"{inst} not found in entity_emb_dict??")
                    continue
            if len(embs) == 0:
                continue
            concept_emb = np.mean(embs, axis=0)
            concept_emb_dict[a_concept] = concept_emb

        concepts = list(concept_emb_dict.keys())
        concept_embs = list(concept_emb_dict.values())

        # (n_concepts, n_entities)
        cos_matrix = cosine_similarity(concept_embs, embeddings)

        cands_for_concepts = [[] for _ in range(len(concepts))]
        for e_id, e in enumerate(entities):
            _scores = cos_matrix[:, e_id]
            _cc_ranking = np.argsort(-_scores)
            _max_cc_id = _cc_ranking[0]
            _second_cc_id = _cc_ranking[1]
            _score = _scores[_max_cc_id]
            _2nd_score = _scores[_second_cc_id]
            _margin = _score - _2nd_score
            
            if e in self.seeds_dict[concepts[_max_cc_id]]:
                continue
            
            cands_for_concepts[_max_cc_id].append({
                'concept': concepts[_max_cc_id],
                '2nd_concept': concepts[_second_cc_id],
                'neighbor': e,
                'sim': _score,
                '2nd_sim': _2nd_score,
                'margin': _margin,
                'sim+margin': _score + _margin
            })

        for cc_id, cands in enumerate(cands_for_concepts):
            cands_sorted = sorted(cands, key=lambda d: d['sim+margin'], reverse=True)
            neighbors.extend(cands_sorted)
            
            _cc = concepts[cc_id]
            _orig_seeds = self.seeds_dict[_cc]
            # _ranked_cands = [d['neighbor'] for d in cands_sorted if d['neighbor'] not in _orig_seeds]
            _ranked_cands = [d['neighbor'] for d in cands_sorted]
            _add_seeds = _ranked_cands[:self.step_K]
            _new_seeds = list(set(_orig_seeds + _add_seeds))
            self.seeds_dict[_cc] = _new_seeds
    
        _rankings_dest = os.path.join(self.out_dir, f'rankings-step={step_id}.csv')
        c_df = pd.DataFrame(neighbors)
        c_df.to_csv(_rankings_dest, index=None)
        
        _EE_dest = os.path.join(self.out_dir, f'expanded-step={step_id}.csv')
        e_list = []
        for _cc, _e_list in self.seeds_dict.items():
            for _e in _e_list:
                e_list.append((_cc, _e))
        e_df = pd.DataFrame(e_list, columns=['concept', 'neighbor'])
        e_df.to_csv(_EE_dest, index=None)
    
    

In [93]:
class IterativeEE_Emb(IterativeEntityExpander):        
    def expand_step(self, step_id):
        entities, embeddings = zip(*self.entity_emb_dict.items())
        
        neighbors = []
        
        concept_emb_dict = dict()
        for a_concept, seed_instances in self.seeds_dict.items():
            embs = []
            for inst in seed_instances:
                try:
                    embs.append(self.entity_emb_dict[inst])
                except KeyError:
                    print(f"{inst} not found in entity_emb_dict??")
                    continue
            if len(embs) == 0:
                continue
            concept_emb = np.mean(embs, axis=0)
            concept_emb_dict[a_concept] = concept_emb

        concepts = list(concept_emb_dict.keys())
        concept_embs = list(concept_emb_dict.values())

        # (n_concepts, n_entities)
        cos_matrix = cosine_similarity(concept_embs, embeddings)

        cands_for_concepts = [[] for _ in range(len(concepts))]
        for e_id, e in enumerate(entities):
            _scores = cos_matrix[:, e_id]
            _max_cc_id = np.argmax(_scores)
            _score = _scores[_max_cc_id]
            
            if e in self.seeds_dict[concepts[_max_cc_id]]:
                continue
            
            cands_for_concepts[_max_cc_id].append({
                'concept': concepts[_max_cc_id],
                'neighbor': e,
                'sim': _score
            })

        for cc_id, cands in enumerate(cands_for_concepts):
            cands_sorted = sorted(cands, key=lambda d: d['sim'], reverse=True)
            neighbors.extend(cands_sorted)
            
            _cc = concepts[cc_id]
            _orig_seeds = self.seeds_dict[_cc]
            # _ranked_cands = [d['neighbor'] for d in cands_sorted if d['neighbor'] not in _orig_seeds]
            _ranked_cands = [d['neighbor'] for d in cands_sorted]
            _add_seeds = _ranked_cands[:self.step_K]
            _new_seeds = list(set(_orig_seeds + _add_seeds))
            self.seeds_dict[_cc] = _new_seeds
    
        _rankings_dest = os.path.join(self.out_dir, f'rankings-step={step_id}.csv')
        c_df = pd.DataFrame(neighbors)
        c_df.to_csv(_rankings_dest, index=None)
        
        _EE_dest = os.path.join(self.out_dir, f'expanded-step={step_id}.csv')
        e_list = []
        for _cc, _e_list in self.seeds_dict.items():
            for _e in _e_list:
                e_list.append((_cc, _e))
        e_df = pd.DataFrame(e_list, columns=['concept', 'neighbor'])
        e_df.to_csv(_EE_dest, index=None)
    
    

In [86]:
bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')
bert_emb_dim = 768
benchmark_dir = os.path.join(base_dir, f'data/indeed-benchmark')


In [87]:
_entity_embeddings = load_embeddings(bert_emb_path, bert_emb_dim)
entity_emb_dict = dict(zip(_entity_embeddings['entity'].tolist(),
                           _entity_embeddings['embedding'].tolist()))
len(entity_emb_dict)

8064

In [94]:
emb_contr_out_dir = os.path.join(base_dir, f'data/{data_ac}/intermediate/EE_emb_contr-K=3')
os.makedirs(emb_contr_out_dir, exist_ok=True)

In [34]:
emb_contr_expander = IterativeEE_Emb_Contrastive(
    step_K=3,
    total_steps=20,
    out_dir=emb_contr_out_dir,
    bert_emb_path=bert_emb_path,
    bert_emb_dim=bert_emb_dim,
    benchmark_dir=benchmark_dir
)

In [35]:
emb_contr_expander.expand()

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [95]:
emb_out_dir = os.path.join(base_dir, f'data/{data_ac}/intermediate/EE_emb-K=3')
os.makedirs(emb_out_dir, exist_ok=True)

In [98]:
emb_expander = IterativeEE_Emb(
    step_K=3,
    total_steps=20,
    out_dir=emb_out_dir,
    bert_emb_path=bert_emb_path,
    bert_emb_dim=bert_emb_dim,
    benchmark_dir=benchmark_dir,
    entity_emb_dict=entity_emb_dict
)

In [99]:
emb_expander.expand()

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




## Different methods overlapping

In [None]:
# ee_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_knn_k=None.csv')
# ee_contr_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_contr_knn_k=None.csv')
# ee_emb_aux_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_acontr_knn-aux-k=None.csv')
# ee_LM_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_k=None.csv')
# ee_LM_contr_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_contr_k=None.csv')
# ee_LM_aux_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_acontr-aux-k=None.csv')

# ee_labels_path = os.path.join(base_dir, f'data/indeed-benchmark/ee-labels-2.csv')

In [188]:
ee_contr_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_contr_knn_k=None-2.csv')
ee_LM_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_k=None-2.csv')

ee_labels_path = os.path.join(base_dir, f'data/indeed-benchmark/ee-labels-2.csv')

In [189]:
ee_labels_dict = load_EE_labels(ee_labels_path)
concept_list = sorted(list(ee_labels_dict.keys()))

In [190]:
for _cc, _e_list in ee_labels_dict.items():
    print(_cc, len(_e_list))

background_screening 104
person 67
schedule 69
company 155
employee_type 30
job_position 152
dress_code 268
payment_option 30
compensation 47
shifts 87
onboarding_steps 30
benefits 116
pay_schedule 37
hire_prerequisite 137


In [191]:
_concept = 'dress_code'
ee_contr_emb_df = pd.read_csv(ee_contr_emb_path)
# ee_LM_df = pd.read_csv(ee_LM_path)
ee_LM_df = pd.read_csv(ee_LM_path)

ee_contr_emb_list = ee_contr_emb_df[ee_contr_emb_df['concept'] == _concept]['neighbor'].tolist()
ee_LM_list = ee_LM_df[ee_LM_df['concept'] == _concept]['neighbor'].tolist()
ee_contr_emb_list[:5], ee_LM_list[:5]

(['leggings', 'logos', 'flip flops', 'black jeans', 'jeans'],
 ['hair', 'body piercings', 'lipstick', 'ear piercings', 'face piercings'])

In [192]:
for K in [5, 10, 20, 50, 100, 300, 1000]:
    _s1 = set(ee_contr_emb_list[:K])
    _s2 = set(ee_LM_list[:K])
    
    _s1_corr = [_e for _e in _s1 if _e in ee_labels_dict[_concept]]
    _s2_corr = [_e for _e in _s2 if _e in ee_labels_dict[_concept]]

    _ints = _s1 & _s2
    _ints_corr = [_e for _e in _ints if _e in ee_labels_dict[_concept]]
    
    _union = _s1 | _s2
    _union_corr = [_e for _e in _union if _e in ee_labels_dict[_concept]]
    
    print(f"K = {K:<4d} Emb = {len(_s1)}({len(_s1_corr)})\
    LM = {len(_s2)}({len(_s2_corr)})\
    Intersect = {len(_ints)}({len(_ints_corr)})\
    Union = {len(_union)}({len(_union_corr)})")

K = 5    Emb = 5(5)    LM = 5(5)    Intersect = 0(0)    Union = 10(10)
K = 10   Emb = 10(10)    LM = 10(10)    Intersect = 2(2)    Union = 18(18)
K = 20   Emb = 20(20)    LM = 20(20)    Intersect = 5(5)    Union = 35(35)
K = 50   Emb = 50(49)    LM = 50(48)    Intersect = 12(12)    Union = 88(85)
K = 100  Emb = 100(94)    LM = 100(87)    Intersect = 31(31)    Union = 169(150)
K = 300  Emb = 300(228)    LM = 298(151)    Intersect = 135(121)    Union = 463(258)
K = 1000 Emb = 387(239)    LM = 996(204)    Intersect = 237(183)    Union = 1146(260)


In [193]:
for _cc in sorted(concept_list):
    print(f'Concept: {_cc}({len(ee_labels_dict[_cc])})')
    _ee_contr_emb_list = ee_contr_emb_df[ee_contr_emb_df['concept'] == _cc]['neighbor'].tolist()
    _ee_LM_list = ee_LM_df[ee_LM_df['concept'] == _cc]['neighbor'].tolist()
    
    for K in [5, 10, 20, 50, 100, 300, 1000]:
        _s1 = set(_ee_contr_emb_list[:K])
        _s2 = set(_ee_LM_list[:K])

        _s1_corr = [_e for _e in _s1 if _e in ee_labels_dict[_cc]]
        _s2_corr = [_e for _e in _s2 if _e in ee_labels_dict[_cc]]

        _ints = _s1 & _s2
        _ints_corr = [_e for _e in _ints if _e in ee_labels_dict[_cc]]

        _union = _s1 | _s2
        _union_corr = [_e for _e in _union if _e in ee_labels_dict[_cc]]

        print(f"K = {K:<4d} Emb = {len(_s1)}({len(_s1_corr)})    " + \
            f"LM = {len(_s2)}({len(_s2_corr)})    " + \
            f"Intersect = {len(_ints)}({len(_ints_corr)})    " + \
            f"Union = {len(_union)}({len(_union_corr)})")
    print()

Concept: background_screening(104)
K = 5    Emb = 5(5)    LM = 5(5)    Intersect = 1(1)    Union = 9(9)
K = 10   Emb = 10(9)    LM = 10(10)    Intersect = 2(2)    Union = 18(17)
K = 20   Emb = 20(11)    LM = 20(16)    Intersect = 3(2)    Union = 37(25)
K = 50   Emb = 50(24)    LM = 50(33)    Intersect = 8(6)    Union = 92(51)
K = 100  Emb = 100(41)    LM = 100(56)    Intersect = 14(10)    Union = 186(87)
K = 300  Emb = 300(54)    LM = 300(75)    Intersect = 52(27)    Union = 548(102)
K = 1000 Emb = 932(61)    LM = 999(87)    Intersect = 194(46)    Union = 1737(102)

Concept: benefits(116)
K = 5    Emb = 5(5)    LM = 5(5)    Intersect = 0(0)    Union = 10(10)
K = 10   Emb = 10(10)    LM = 10(10)    Intersect = 4(4)    Union = 16(16)
K = 20   Emb = 20(15)    LM = 20(20)    Intersect = 7(7)    Union = 33(28)
K = 50   Emb = 50(31)    LM = 50(42)    Intersect = 17(15)    Union = 83(58)
K = 100  Emb = 100(42)    LM = 100(66)    Intersect = 29(26)    Union = 171(82)
K = 300  Emb = 107(45)    

### MRR combination

In [194]:
## Using MRR to combine ranking 

ee_mrr_combine_list = []

for _cc in sorted(concept_list):
#     print(f'Concept: {_cc}({len(ee_labels_dict[_cc])})')
    _ce_df = ee_contr_emb_df[ee_contr_emb_df['concept'] == _cc].sort_values(by='sim+margin', ascending=False)
    _ee_contr_emb_list = _ce_df['neighbor'].tolist()
    
    _ee_LM_list = ee_LM_df[ee_LM_df['concept'] == _cc]['neighbor'].tolist()
    
#     _all_entities = set(_ee_contr_emb_list) & set(_ee_LM_list)
    
    _all_entities_mrr = defaultdict(float)
    for i, _e in enumerate(_ee_contr_emb_list):
        _all_entities_mrr[_e] += 1.0 / (i+1)
    for i, _e in enumerate(_ee_LM_list):
        _all_entities_mrr[_e] += 1.0 / (i+1)

    _all_entities_mrr_list = sorted(list(_all_entities_mrr.items()), key=lambda p: p[-1], reverse=True)
    
    for _e, _mrr in _all_entities_mrr_list:
        ee_mrr_combine_list.append((_cc, _e, _mrr))

len(ee_mrr_combine_list)

112260

In [195]:
ee_mrr_combine_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_mrr_combine_k=None.csv')
pd.DataFrame(ee_mrr_combine_list, columns=['concept', 'neighbor', 'MRR']).to_csv(ee_mrr_combine_path, index=None)

In [200]:
## Weighted MRR 

LM_weight = 3.0
ee_mrr_combine_list = []

for _cc in sorted(concept_list):
#     print(f'Concept: {_cc}({len(ee_labels_dict[_cc])})')
    _ce_df = ee_contr_emb_df[ee_contr_emb_df['concept'] == _cc].sort_values(by='sim+margin', ascending=False)
    _ee_contr_emb_list = _ce_df['neighbor'].tolist()
    
    _ee_LM_list = ee_LM_df[ee_LM_df['concept'] == _cc]['neighbor'].tolist()
    
#     _all_entities = set(_ee_contr_emb_list) & set(_ee_LM_list)
    
    _all_entities_mrr = defaultdict(float)
    for i, _e in enumerate(_ee_contr_emb_list):
        _all_entities_mrr[_e] += 1.0 / (i+1)
    for i, _e in enumerate(_ee_LM_list):
        _all_entities_mrr[_e] += LM_weight / (i+1)

    _all_entities_mrr_list = sorted(list(_all_entities_mrr.items()), key=lambda p: p[-1], reverse=True)
    
    for _e, _mrr in _all_entities_mrr_list:
        ee_mrr_combine_list.append((_cc, _e, _mrr))

len(ee_mrr_combine_list)

112260

In [201]:
ee_mrr_combine_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_mrr_combine-lm*3-k=None.csv')
pd.DataFrame(ee_mrr_combine_list, columns=['concept', 'neighbor', 'MRR']).to_csv(ee_mrr_combine_path, index=None)

## Methods difference - rank difference

In [160]:
ee_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_contr_knn_k=None.csv')
ee_LM_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_k=None.csv')

In [161]:
emb_df = pd.read_csv(ee_emb_path)
lm_df = pd.read_csv(ee_LM_path)

concept_list = lm_df['concept'].drop_duplicates().to_list()
entity_list = list(set(lm_df['neighbor'].to_list() + emb_df['neighbor'].to_list()))
len(concept_list), len(entity_list)

(14, 8111)

In [162]:
seed_concepts_df = load_seed_aligned_concepts(seed_aligned_concepts_path)
seed_instances_dict = dict(zip(
    seed_concepts_df['alignedCategoryName'].tolist(),
    seed_concepts_df['seedInstances'].tolist()
))

In [163]:
ee_labels_path = os.path.join(base_dir, f'data/indeed-benchmark/ee-labels-2.csv')
ee_labels_dict = load_EE_labels(ee_labels_path)

In [164]:
## Only consider entities in top-K of at least 1 method 
K = 200

## Diff: emb_rank - lm_rank, higher = more favored by lm, vice versa 
## diff = 9999: entity not assigned to this concept in emb 

rank_records = []

for _cc in concept_list:
    _emb_preds = emb_df[emb_df['concept'] == _cc]['neighbor'].to_list()
    _lm_preds = lm_df[lm_df['concept'] == _cc]['neighbor'].to_list()
    
    _e2ranks = dict([(_e, [-1, -1]) for _e in entity_list])
    for i, _e in enumerate(_emb_preds):
        _e2ranks[_e][0] = i + 1
    for i, _e in enumerate(_lm_preds):
        _e2ranks[_e][1] = i + 1
    
    for _e in seed_instances_dict[_cc]:
        if _e in _e2ranks:
            del _e2ranks[_e]
    
    _records = []
    for _e, (_r1, _r2) in _e2ranks.items():
        # emb, lm 
        if _r1 not in range(K) and _r2 not in range(K):
            continue
        if _r1 == -1:
            _diff = 9999
        else:
            _diff = _r1 - _r2
        _records.append({
            'concept': _cc,
            'neighbor': _e,
            'emb_rank': _r1,
            'lm_rank': _r2,
            'diff': _diff,
            'human_label': int(_e in ee_labels_dict[_cc])
        })
    
    _records.sort(key=lambda d: (d['diff'], d['lm_rank']))
    rank_records.extend(_records)

In [165]:
rank_df = pd.DataFrame(rank_records)

In [166]:
rank_out_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/rank_diff.csv')
rank_df.to_csv(rank_out_path, index=None)

## Score calibration

In [165]:
# def scalar_features(X, normalize=False):
#     f1 = np.array(X)
#     f2 = np.log(f1)
#     f3 = np.log(1-f1)
#     if normalize:
#         f1 = (f1 - f1.min()) / (f1.max() - f1.min())
#         f2 = (f2 - f2.min()) / (f2.max() - f2.min())
#         f3 = (f3 - f3.min()) / (f3.max() - f3.min())
#     return np.stack([f1, f2, f3], axis=-1)


In [None]:
# scalar_features([1e-5, 1e-4, 0.1, 0.5, 0.99])

In [213]:
ee_labels_path = os.path.join(base_dir, f'data/indeed-benchmark/ee-labels-2.csv')
ee_labels_dict = load_EE_labels(ee_labels_path)

In [214]:
ee_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_contr_knn_k=None.csv')
ee_LM_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_k=None.csv')

In [215]:
seed_concepts_df = load_seed_aligned_concepts(seed_aligned_concepts_path)
seed_instances_dict = dict(zip(
    seed_concepts_df['alignedCategoryName'].tolist(),
    seed_concepts_df['seedInstances'].tolist()
))

### V1: feats, neg sampling

In [169]:
_concept = 'job_position'

_df = pd.read_csv(ee_emb_path)
_emb_preds = _df[_df['concept'] == _concept].to_dict('records')[:100]

_df = pd.read_csv(ee_LM_path)
_lm_preds = _df[_df['concept'] == _concept].to_dict('records')[:100]

_ints = set([d['neighbor'] for d in _emb_preds]) & set([d['neighbor'] for d in _lm_preds])

_emb_pos_list = [d for d in _emb_preds if d['neighbor'] in _ints]
_emb_neg_list = [d for d in _emb_preds if d['neighbor'] not in _ints]

len(_emb_pos_list), len(_emb_neg_list)

(24, 76)

In [227]:
_ints

{'assistant manager',
 'assistant store manager',
 'bakery',
 'bartender',
 'courtesy clerk',
 'crew member',
 'district manager',
 'door greeter',
 'driver helper',
 'family member',
 'general manager',
 'grocery store',
 'line cook',
 'mail carrier',
 'manager',
 'night stocker',
 'packer',
 'pharmacy',
 'sales rep',
 'sales representative',
 'salon manager',
 'seasonal worker',
 'shift supervisor',
 'store manger'}

In [None]:
_emb_pos_list, _emb_neg_list

In [248]:
# N: pos | neg (total = 2N)
train_N = 20
test_N = 4
N = train_N + test_N

random.seed(123)

# pos_samples = random.sample(_emb_pos_list[:N], k=N)
# neg_samples = random.sample(_emb_neg_list[:N], k=N)
# samples = pos_samples + neg_samples

n_pos = len(_emb_pos_list)
n_neg = len(_emb_neg_list)

all_samples = _emb_pos_list + _emb_neg_list
all_labels = [1] * n_pos + [0] * n_neg

pos_ids = random.sample(range(n_pos), k=N)
neg_ids = random.sample(range(n_pos, n_pos+n_neg), k=N)
pos_train_ids = pos_ids[:train_N]
neg_train_ids = neg_ids[:train_N]
pos_test_ids = pos_ids[train_N:]
neg_test_ids = neg_ids[train_N:]

sim_feats = np.array([d['sim'] for d in all_samples])
sim2_feats = np.array([d['2nd_sim'] for d in all_samples])
margin_feats = np.array([d['margin'] for d in all_samples])
feats_X = np.stack([
    sim_feats,
    np.log(1 - sim_feats),
    sim2_feats,
    np.log(1 - sim2_feats),
    margin_feats,
    np.log(margin_feats)
], axis=-1)

labels_Y = np.array(all_labels)

feats_X.shape, labels_Y.shape

((100, 6), (100,))

In [249]:
train_X = feats_X[pos_train_ids + neg_train_ids]
train_Y = labels_Y[pos_train_ids + neg_train_ids]
test_X = feats_X[pos_test_ids + neg_test_ids]
test_Y = labels_Y[pos_test_ids + neg_test_ids]
train_X.shape, test_Y.shape

((40, 6), (8,))

In [250]:
clf = LogisticRegression(tol=1e-4, C=1e4)
clf.fit(train_X, train_Y)

# coef for [sim, log(1-sim), 2nd_sim, log(1-2nd_sim), margin, log(margin)]
clf.coef_

array([[-37.98891136,   2.49464195, -26.57782132,  -3.99934953,
        -11.41109004,   1.60053595]])

In [251]:
_pred_proba = clf.predict_proba(test_X)[:, 1]
_pred_proba

array([0.67116313, 0.62589717, 0.5465702 , 0.45832449, 0.53638333,
       0.30278473, 0.67782842, 0.13473516])

In [252]:
_df = pd.DataFrame(pos_samples[train_N:])
_df['pred_proba'] = _pred_proba[:test_N]
_df

Unnamed: 0,concept,2nd_concept,neighbor,sim,2nd_sim,margin,sim+margin,pred_proba
0,job_position,employee_type,assistant store manager,0.984076,0.978733,0.005343,0.989419,0.671163
1,job_position,employee_type,shift supervisor,0.990097,0.982083,0.008014,0.998112,0.625897
2,job_position,company,family member,0.985568,0.982632,0.002936,0.988504,0.54657
3,job_position,company,district manager,0.98735,0.985278,0.002072,0.989421,0.458324


In [253]:
_df = pd.DataFrame(neg_samples[train_N:])
_df['pred_proba'] = _pred_proba[test_N:]
_df

Unnamed: 0,concept,2nd_concept,neighbor,sim,2nd_sim,margin,sim+margin,pred_proba
0,job_position,employee_type,barista,0.993921,0.986723,0.007198,1.001119,0.536383
1,job_position,employee_type,student,0.991679,0.987705,0.003974,0.995652,0.302785
2,job_position,employee_type,lead,0.991533,0.986845,0.004689,0.996222,0.677828
3,job_position,employee_type,package handler,0.994812,0.987993,0.006818,1.00163,0.134735


In [254]:
_pred_proba_all = clf.predict_proba(feats_X)[:, 1]
_preds_all = clf.predict(feats_X)
assert np.allclose(_preds_all, (_pred_proba_all > 0.5).astype(int))

In [272]:
_df = pd.DataFrame(all_samples)
_df['pred_proba'] = _pred_proba_all
_df['pred'] = _preds_all
_df['label'] = labels_Y
_df['human_label'] = _df.apply(lambda d : d[2] in ee_labels_dict['job_position'], axis=1).astype(int)

_split = ['Test'] * len(all_samples)
for i in pos_train_ids + neg_train_ids:
    _split[i] = 'Train'
# for i in pos_test_ids + neg_test_ids:
#     _split[i] = 'Test'
_df['split'] = _split

_df.sort_values(by='sim+margin', ascending=False, inplace=True)

In [None]:
# pd.set_option("display.max_rows", 100)

_df[['neighbor', 'sim', 'margin', 'sim+margin', 'pred_proba', 'split', 'pred', 'label', 'human_label']].head(100)

In [None]:
_df.sort_values(by='pred_proba', ascending=False, inplace=False)[['neighbor', 'sim', 'margin', 'sim+margin', 'pred_proba', 'split', 'pred', 'label', 'human_label']].head(100)

In [266]:
sum(_df['pred'] == _df['human_label']), sum(_df['label'] == _df['human_label'])

(65, 62)

In [268]:
# TP, TN 
sum(_df['pred'] * _df['human_label']), sum((1 - _df['pred']) * (1 - _df['human_label']))

(41, 24)

In [277]:
pearsonr(_df['sim+margin'], _df['human_label'])

(0.5250346692711781, 2.0483942338052392e-08)

In [278]:
pearsonr(_df['pred_proba'], _df['human_label'])

(0.34894826201089824, 0.0003734901873738247)

### V2: scalar, pos/neg intersection, all concepts

In [76]:
# [1:K1] = pos; [K1:K2] or missing_one = neg 
K1 = 200
K2 = 500

In [77]:
emb_df = pd.read_csv(ee_emb_path)
lm_df = pd.read_csv(ee_LM_path)

concept_list = emb_df['concept'].drop_duplicates().to_list()

cc_pos_ents = dict()
cc_neg_ents = dict()

for _cc in concept_list:
    _emb_df_cc = emb_df[emb_df['concept'] == _cc]
    _lm_df_cc = lm_df[lm_df['concept'] == _cc]
    
    _emb_preds = _emb_df_cc['neighbor'].to_list()
    _lm_preds = _lm_df_cc['neighbor'].to_list()
    
    _pos_ents = []
    for _e in _lm_preds[:K1]:
        if _e in seed_instances_dict[_cc]:
            continue
        if _e in _emb_preds[:K1]:
            _pos_ents.append(_e)
            
    _neg_ents = []
    for _e in _lm_preds[K1:K2]:
        if _e in seed_instances_dict[_cc]:
            continue
        if _e not in _emb_preds[:K1]:
            # emb_contr prediction is not complete 
            _neg_ents.append(_e)
    
    cc_pos_ents[_cc] = _pos_ents
    cc_neg_ents[_cc] = _neg_ents

In [78]:
for _cc in concept_list:
    print(_cc, len(cc_pos_ents[_cc]), len(cc_neg_ents[_cc]))

company 16 279
dress_code 78 265
job_position 60 270
pay_schedule 7 296
benefits 35 292
compensation 36 287
payment_option 32 291
background_screening 31 268
person 24 284
hire_prerequisite 25 279
shifts 18 296
schedule 47 268
employee_type 20 282
onboarding_steps 19 285


#### For Emb-contrastive

In [139]:
# n_pos = sum([len(cc_pos_samples[_cc]) for _cc in concept_list])
# n_neg = sum([len(cc_neg_samples[_cc]) for _cc in concept_list])

random.seed(123)

all_samples = emb_df.to_dict('record')

pos_ids = []
neg_ids = []
for i, d in enumerate(all_samples):
    _e = d['neighbor']
    _cc = d['concept']
    if _e in seed_instances_dict[_cc]:
        continue
    if _e in cc_pos_ents[_cc]:
        pos_ids.append(i)
    elif _e in cc_neg_ents[_cc]:
        neg_ids.append(i)

n_pos = len(pos_ids)
n_neg = len(neg_ids)
train_samples = [all_samples[i] for i in pos_ids + neg_ids]
train_labels = [1] * n_pos + [0] * n_neg

all_score_feats = np.array([d['sim+margin'] for d in all_samples])
all_X = np.expand_dims(np.array(all_score_feats), axis=-1)

train_X = all_X[pos_ids + neg_ids]
train_Y = np.array(train_labels)

train_X.shape, train_Y.shape

((681, 1), (681,))

In [140]:
n_pos, n_neg

(443, 238)

In [141]:
clf = LogisticRegression(tol=1e-4, C=1e4)
clf.fit(train_X, train_Y)

clf.coef_, clf.intercept_

(array([[24.76929231]]), array([-22.69912982]))

In [142]:
_pred_proba_all = clf.predict_proba(all_X)[:, 1]
_preds_all = clf.predict(all_X)
assert np.allclose(_preds_all, (_pred_proba_all > 0.5).astype(int))

In [143]:
_df = pd.DataFrame(all_samples)
_df['pred_proba'] = _pred_proba_all
_df['pred'] = _preds_all
_labels = np.ones(len(all_samples)) * float('nan')
_labels[pos_ids] = 1
_labels[neg_ids] = 0
_df['label'] = _labels
_df['human_label'] = _df.apply(lambda d : d[2] in ee_labels_dict[d[0]], axis=1).astype(int)

_df.sort_values(by='sim+margin', ascending=False, inplace=True)

In [144]:
pearsonr(train_X[:, 0], train_Y[:])

(0.5968390040264266, 5.96968367874835e-67)

In [145]:
pearsonr(_df['sim+margin'][_df['concept'] == 'job_position'],
         _df['label'][_df['concept'] == 'job_position'].fillna(0.5))

(0.24228485244317408, 1.3310348799954575e-05)

In [146]:
pearsonr(_df['sim+margin'][_df['concept'] == 'job_position'],
         _df['human_label'][_df['concept'] == 'job_position'])

(0.23925277614705454, 1.7173308981842377e-05)

In [147]:
pearsonr(_df['label'][_df['concept'] == 'job_position'].fillna(0.5),
         _df['human_label'][_df['concept'] == 'job_position'])

(0.4830851384005777, 6.991380128318044e-20)

In [148]:
pearsonr(_df['pred_proba'][_df['concept'] == 'job_position'],
         _df['human_label'][_df['concept'] == 'job_position'])

(0.2526710578373066, 5.421364437118311e-06)

In [149]:
emb_calib_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_calib-emb_contr-k=None.csv')
_df.to_csv(emb_calib_path, index=None)

#### For LM-bert

In [119]:
# n_pos = sum([len(cc_pos_samples[_cc]) for _cc in concept_list])
# n_neg = sum([len(cc_neg_samples[_cc]) for _cc in concept_list])

random.seed(123)

all_samples = lm_df.to_dict('record')

pos_ids = []
neg_ids = []
for i, d in enumerate(all_samples):
    _e = d['neighbor']
    _cc = d['concept']
    if _e in seed_instances_dict[_cc]:
        continue
    if _e in cc_pos_ents[_cc]:
        pos_ids.append(i)
    elif _e in cc_neg_ents[_cc]:
        neg_ids.append(i)

n_pos = len(pos_ids)
n_neg = len(neg_ids)
train_samples = [all_samples[i] for i in pos_ids + neg_ids]
train_labels = [1] * n_pos + [0] * n_neg

all_score_feats = np.array([d['lm_score'] for d in all_samples])
all_X = np.expand_dims(np.array(all_score_feats), axis=-1)

train_X = all_X[pos_ids + neg_ids]
train_Y = np.array(train_labels)

train_X.shape, train_Y.shape

((4390, 1), (4390,))

In [120]:
n_pos, n_neg

(448, 3942)

In [121]:
clf = LogisticRegression(tol=1e-4, C=1e4)
clf.fit(train_X, train_Y)

clf.coef_, clf.intercept_

(array([[1493.69497205]]), array([-3.51509008]))

In [122]:
_pred_proba_all = clf.predict_proba(all_X)[:, 1]
_preds_all = clf.predict(all_X)
assert np.allclose(_preds_all, (_pred_proba_all > 0.5).astype(int))

In [131]:
_df = pd.DataFrame(all_samples)
_df['pred_proba'] = _pred_proba_all
_df['pred'] = _preds_all
_labels = np.ones(len(all_samples)) * float('nan')
_labels[pos_ids] = 1
_labels[neg_ids] = 0
_df['label'] = _labels
_df['human_label'] = _df.apply(lambda d : d[1] in ee_labels_dict[d[0]], axis=1).astype(int)

_df.sort_values(by='lm_score', ascending=False, inplace=True)

In [132]:
pearsonr(train_X[:, 0], train_Y[:])

(0.3241716503231114, 5.970425324525125e-108)

In [133]:
pearsonr(_df['lm_score'][_df['concept'] == 'job_position'],
         _df['label'][_df['concept'] == 'job_position'].fillna(0.5))

(0.1086978156747781, 1.5008556345975551e-22)

In [134]:
pearsonr(_df['lm_score'][_df['concept'] == 'job_position'],
         _df['human_label'][_df['concept'] == 'job_position'])

(0.3542178935505932, 3.4462133219041016e-236)

In [135]:
pearsonr(_df['label'][_df['concept'] == 'job_position'].fillna(0.5),
         _df['human_label'][_df['concept'] == 'job_position'])

(0.22232557138073736, 1.5034103372699886e-90)

In [136]:
pearsonr(_df['pred_proba'][_df['concept'] == 'job_position'],
         _df['human_label'][_df['concept'] == 'job_position'])

(0.6764257188525772, 0.0)

In [138]:
lm_calib_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert-calib-k=None.csv')
_df.to_csv(lm_calib_path, index=None)

#### Combine: mean calibrated prob

In [154]:
emb_calib_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_calib-emb_contr-k=None.csv')
lm_calib_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_calib-LM_bert-k=None.csv')

merge_calib_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_calib-merged-k=None.csv')

In [151]:
emb_calib_df = pd.read_csv(emb_calib_path)
lm_calib_df = pd.read_csv(lm_calib_path)

emb_calib_scores = dict()
for d in emb_calib_df.to_dict('record'):
    _k = (d['concept'], d['neighbor'])
    _v = d['pred_proba']
    emb_calib_scores[_k] = _v

lm_calib_scores = dict()
for d in lm_calib_df.to_dict('record'):
    _k = (d['concept'], d['neighbor'])
    _v = d['pred_proba']
    lm_calib_scores[_k] = _v

len(emb_calib_scores), len(lm_calib_scores)

(8064, 112166)

In [153]:
merged_records = []

_keys = list(set(emb_calib_scores.keys()) | set(lm_calib_scores.keys()))
for _k in _keys:
    _cc, _e = _k
    _emb_proba = emb_calib_scores.get(_k, 0)
    _lm_proba = lm_calib_scores.get(_k, 0)
    merged_records.append({
        'concept': _cc,
        'neighbor': _e,
        'emb_p': _emb_proba,
        'lm_p': _lm_proba,
        'avg': (_emb_proba + _lm_proba) / 2,
        'lm*3_avg': (_emb_proba + 3 * _lm_proba) / 4,
        'harmonic': (2 * _emb_proba * _lm_proba) / (_emb_proba + _lm_proba),
        'min': min(_emb_proba, _lm_proba),
        'human_label': int(_e in ee_labels_dict[_cc])
    })

merged_records.sort(key=lambda d: (d['concept'], -d['avg']))
len(merged_records)

112324

In [155]:
pd.DataFrame(merged_records).to_csv(merge_calib_path, index=None)

### V3: joint

In [216]:
# [1:K1] = pos; [K1:K2] or missing_one = neg 
K1 = 200
K2 = 500

In [217]:
emb_df = pd.read_csv(ee_emb_path)
lm_df = pd.read_csv(ee_LM_path)

concept_list = emb_df['concept'].drop_duplicates().to_list()

cc_pos_ents = dict()
cc_neg_ents = dict()
all_samples = []

for _cc in concept_list:
    _emb_df_cc = emb_df[emb_df['concept'] == _cc]
    _lm_df_cc = lm_df[lm_df['concept'] == _cc]
    
    _emb_preds = _emb_df_cc['neighbor'].to_list()
    _lm_preds = _lm_df_cc['neighbor'].to_list()
    
    _e2emb_record = dict([(d['neighbor'], d) for d in _emb_df_cc.to_dict('record')])
    _e2lm_record = dict([(d['neighbor'], d) for d in _lm_df_cc.to_dict('record')])
    for _e in (set(_emb_preds) | set(_lm_preds)):
        _emb_d = _e2emb_record.get(_e, None)
        _lm_d = _e2lm_record.get(_e, None)
        
        _emb_sim = 0 if _emb_d is None else _emb_d['sim']
        _emb_sim2 = 0 if _emb_d is None else _emb_d['2nd_sim']
        _lm_score = 0 if _lm_d is None else _lm_d['lm_score']
        all_samples.append({
            'concept': _cc,
            'neighbor': _e,
            'emb_sim': _emb_sim,
            'emb_sim2': _emb_sim2,
            'lm_score': _lm_score
        })
    
    _pos_ents = []
    for _e in _lm_preds[:K1]:
        if _e in seed_instances_dict[_cc]:
            continue
        if _e in _emb_preds[:K1]:
            _pos_ents.append(_e)
            
    _neg_ents = []
    for _e in _lm_preds[K1:K2]:
        if _e in seed_instances_dict[_cc]:
            continue
        if _e not in _emb_preds[:K1]:
            # emb_contr prediction is not complete 
            _neg_ents.append(_e)
    
    cc_pos_ents[_cc] = _pos_ents
    cc_neg_ents[_cc] = _neg_ents

In [218]:
# n_pos = sum([len(cc_pos_samples[_cc]) for _cc in concept_list])
# n_neg = sum([len(cc_neg_samples[_cc]) for _cc in concept_list])

random.seed(123)

pos_ids = []
neg_ids = []
for i, d in enumerate(all_samples):
    _e = d['neighbor']
    _cc = d['concept']
    if _e in seed_instances_dict[_cc]:
        continue
    if _e in cc_pos_ents[_cc]:
        pos_ids.append(i)
    elif _e in cc_neg_ents[_cc]:
        neg_ids.append(i)

n_pos = len(pos_ids)
n_neg = len(neg_ids)
train_samples = [all_samples[i] for i in pos_ids + neg_ids]
train_labels = [1] * n_pos + [0] * n_neg

all_sim_feats = np.array([d['emb_sim'] for d in all_samples])
all_sim2_feats = np.array([d['emb_sim2'] for d in all_samples])
all_lm_score_feats = np.array([d['lm_score'] for d in all_samples])

all_X = np.stack([
    all_sim_feats,
    all_sim2_feats,
    all_lm_score_feats,
], axis=-1)

train_X = all_X[pos_ids + neg_ids]
train_Y = np.array(train_labels)

train_X.shape, train_Y.shape

((4377, 3), (4377,))

In [219]:
n_pos, n_neg

(443, 3934)

In [231]:
clf = LogisticRegression(tol=1e-4, C=1e4, class_weight='balanced')
clf.fit(train_X, train_Y)

# coef_: sim, sim2, lm_score 
clf.coef_, clf.intercept_

(array([[ 26.78777647, -10.75430862, 685.03393977]]), array([-13.31463886]))

In [232]:
_pred_proba_all = clf.predict_proba(all_X)[:, 1]
_preds_all = clf.predict(all_X)
assert np.allclose(_preds_all, (_pred_proba_all > 0.5).astype(int))

In [233]:
_df = pd.DataFrame(all_samples)
_df['pred_proba'] = _pred_proba_all
_df['pred'] = _preds_all
_labels = np.ones(len(all_samples)) * float('nan')
_labels[pos_ids] = 1
_labels[neg_ids] = 0
_df['label'] = _labels
_df['human_label'] = _df.apply(lambda d : d[1] in ee_labels_dict[d[0]], axis=1).astype(int)

_df.sort_values(by=['concept', 'pred_proba'], ascending=False, inplace=True)

In [234]:
pearsonr(_df['label'][_df['concept'] == 'job_position'].fillna(0.5),
         _df['human_label'][_df['concept'] == 'job_position'])

(0.21812334653501303, 5.8811988683958415e-87)

In [235]:
pearsonr(_df['pred_proba'][_df['concept'] == 'job_position'],
         _df['human_label'][_df['concept'] == 'job_position'])

(0.4322842625273697, 0.0)

In [236]:
joint_calib_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_calib-joint-k=None.csv')
_df.to_csv(joint_calib_path, index=None)

### Calibration upper bound

In [289]:
emb_df = pd.read_csv(ee_emb_path)
lm_df = pd.read_csv(ee_LM_path)

concept_list = emb_df['concept'].drop_duplicates().to_list()

cc_all_samples = defaultdict(list)

for _cc in concept_list:
    _emb_df_cc = emb_df[emb_df['concept'] == _cc]
    _lm_df_cc = lm_df[lm_df['concept'] == _cc]
    
    _emb_preds = _emb_df_cc['neighbor'].to_list()
    _lm_preds = _lm_df_cc['neighbor'].to_list()
    
    _e2emb_record = dict([(d['neighbor'], (d, i+1)) for i, d in enumerate(_emb_df_cc.to_dict('record'))])
    _e2lm_record = dict([(d['neighbor'], (d, i+1)) for i, d in enumerate(_lm_df_cc.to_dict('record'))])
    for _e in (set(_emb_preds) | set(_lm_preds)):
        _emb_d, _emb_r = _e2emb_record.get(_e, (None, np.inf))
        _lm_d, _lm_r = _e2lm_record.get(_e, (None, np.inf))
        
        _emb_sim = 0 if _emb_d is None else _emb_d['sim']
        _emb_sim2 = 0 if _emb_d is None else _emb_d['2nd_sim']
        _lm_score = 0 if _lm_d is None else _lm_d['lm_score']
        cc_all_samples[_cc].append({
            'concept': _cc,
            'neighbor': _e,
            'emb_sim': _emb_sim,
            'emb_sim2': _emb_sim2,
            'lm_score': _lm_score,
            'emb_rank': _emb_r,
            'lm_rank': _lm_r
        })
        

In [333]:
# n_pos = sum([len(cc_pos_samples[_cc]) for _cc in concept_list])
# n_neg = sum([len(cc_neg_samples[_cc]) for _cc in concept_list])

random.seed(123)

cc_df = None

for _cc, _all_samples in cc_all_samples.items():
    pos_ids = []
    neg_ids = []
    for i, d in enumerate(_all_samples):
        _e = d['neighbor']
        _cc = d['concept']
        if _e in seed_instances_dict[_cc]:
            continue
        if _e in ee_labels_dict[_cc]:
            pos_ids.append(i)
        else:
            neg_ids.append(i)

    n_pos = len(pos_ids)
    n_neg = len(neg_ids)
    train_samples = [_all_samples[i] for i in pos_ids + neg_ids]
    train_labels = [1] * n_pos + [0] * n_neg

    _all_sim_feats = np.array([d['emb_sim'] for d in _all_samples])
    _all_sim2_feats = np.array([d['emb_sim2'] for d in _all_samples])
    _all_lm_score_feats = np.array([d['lm_score'] for d in _all_samples])
    _all_emb_ranks = np.array([d['emb_rank'] for d in _all_samples])
    _all_lm_ranks = np.array([d['lm_rank'] for d in _all_samples])

    _all_X = np.stack([
        _all_sim_feats,
        np.log(1 - _all_sim_feats),
        _all_sim2_feats,
        np.log(1 - _all_sim2_feats),
        _all_lm_score_feats,
        np.log(_all_lm_score_feats + 1e-9),
        1 / _all_emb_ranks,
        1 / _all_lm_ranks
    ], axis=-1)

    train_X = _all_X[pos_ids + neg_ids]
    train_Y = np.array(train_labels)

    print(_cc, n_pos, n_neg)
#     print(train_X.shape, train_Y.shape)
    
#     clf = make_pipeline(
#         StandardScaler(),
#         LogisticRegression(tol=1e-12, C=1e12, max_iter=1000)
#     )
    clf = LogisticRegression(tol=1e-8, C=1e4, max_iter=1000)
    clf.fit(train_X, train_Y)

    # coef_: sim, sim2, lm_score 
#     _lr = clf.steps[1][1]
#     print(_lr.coef_, _lr.intercept_)
    
    _pred_proba_all = clf.predict_proba(_all_X)[:, 1]
    _preds_all = clf.predict(_all_X)
    assert np.allclose(_preds_all, (_pred_proba_all > 0.5).astype(int))
    
    _df = pd.DataFrame(_all_samples)
    _df['pred_proba'] = _pred_proba_all
    _df['pred'] = _preds_all
    _labels = np.ones(len(_all_samples)) * float('nan')
    _labels[pos_ids] = 1
    _labels[neg_ids] = 0
    _df['human_label'] = _labels

    _df.sort_values(by='pred_proba', ascending=False, inplace=True)
    
    if cc_df is None:
        cc_df = _df
    else:
        cc_df = cc_df.append(_df)

company 150 7873
dress_code 261 7751
job_position 145 7865
pay_schedule 34 7979
benefits 110 7902
compensation 43 7970
payment_option 27 7988
background_screening 102 7927
person 58 7954
hire_prerequisite 132 7884
shifts 86 7943
schedule 62 7949
employee_type 27 7987
onboarding_steps 28 8023


In [334]:
sum((cc_df['pred'] == 1) & (cc_df['human_label'] == 1)), sum(cc_df['human_label'] == 1)

(725, 1265)

In [335]:
sum((cc_df['pred'] == 1) & (cc_df['human_label'] == 0)), sum((cc_df['pred'] == 0) & (cc_df['human_label'] == 1))

(169, 540)

In [321]:
ub_calib_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_calib-ub-k=None-2.csv')
cc_df.to_csv(ub_calib_path, index=None)

## Co-training

### Getting vector representations

In [None]:
import compute_multi_view_embeddings
importlib.reload(compute_multi_view_embeddings)
from compute_multi_view_embeddings import get_lm_probe_concept_embeddings, get_lm_probe_entity_embeddings

In [None]:
_res = get_lm_probe_concept_embeddings(model_path='bert-base-uncased',
                                       seed_concepts_path=seed_aligned_concepts_path)
_res.keys()

In [None]:
_tmp_ccs = ['benefits', 'schedule', 'pay_schedule']

cosine_similarity([_res[_cc] for _cc in _tmp_ccs])

In [None]:
_ent_res = get_lm_probe_entity_embeddings(
    model_path='bert-base-uncased',
    embed_num_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum+seeds.txt'))
len(_ent_res.keys())

In [None]:
_tmp_ents = ['starbucks', '401k', 'sales associate']
_tmp_ccs = ['company', 'benefits', 'job_position']

cosine_similarity([_res[_cc] for _cc in _tmp_ccs],
                  [_ent_res[_e] for _e in _tmp_ents])

In [None]:
# Script: getting vector embeddings : indeedQA 

!python compute_multi_view_embeddings.py \
-d $base_dir/data/$data_ac/intermediate \
-scp $seed_aligned_concepts_path \
-enp $base_dir/data/$data_ac/intermediate/BERTembednum+seeds.txt \
-es $base_dir/data/$data_ac/intermediate/BERTembed+seeds.txt \
-m bert-base-uncased \
--lm_ent_hearst

In [185]:
# Wiki 
!CUDA_VISIBLE_DEVICES=2 python compute_multi_view_embeddings.py \
-d $wiki_data_dir \
-scp $wiki_gt_dir/seed_aligned_concepts.csv \
-enp $wiki_data_dir/BERTembednum_gt.txt \
-es $wiki_data_dir/BERTembed_gt.txt \
-m bert-base-uncased \
-ename gt \
--lm_ent_hearst

  res_values = method(rvalues)
lm_probe entity embeddings: 100%|███████████| 2694/2694 [20:05<00:00,  2.24it/s]
Saving lm entity embeddings


In [541]:
with open(os.path.join(wiki_data_dir, 'BERTembednum_gt.txt'), 'r') as f:
    _entities = [l.strip().rsplit(' ', 1)[0] for l in f]
len(_entities)

2694

In [542]:
_seeds = load_seed_aligned_concepts(os.path.join(wiki_gt_dir, 'seed_aligned_concepts.csv'))
_seed_insts = [_e for _l in _seeds['seedInstances'].tolist() for _e in _l]
len(_seed_insts)

  res_values = method(rvalues)


63

In [543]:
set(_seed_insts) - set(_entities)

set()

In [544]:
_gt_entities = set()
for f_path in glob(f'{wiki_gt_dir}/*.txt'):
    with open(f_path, 'r') as f:
        _gt_entities.update([l.lower().strip().split('\t')[1] for l in f])
len(_gt_entities)

2776

In [545]:
## gt entities having no context 
len(set(_gt_entities) - set(_entities))

82

### Data preproc

In [30]:
import multiview_EE_datasets, multiview_EE_models
importlib.reload(multiview_EE_datasets) 
importlib.reload(multiview_EE_models)
from multiview_EE_datasets import Wiki_EE_Dataset, Wiki_EE_Dataset_2, Indeed_EE_Dataset, Indeed_EE_Dataset_2
from multiview_EE_models import EE_Classifier

In [169]:
def preproc_wiki_data(wiki_gt_dir, wiki_embednum_path, wiki_seeds_path=None, dest=None):
    if wiki_seeds_path is None:
        wiki_seeds_path = os.path.join(wiki_gt_dir, 'seed_aligned_concepts.csv')

    wiki_seeds_df = pd.read_csv(wiki_seeds_path)
    concept_list = wiki_seeds_df['alignedCategoryName'].tolist()
    
    with open(wiki_embednum_path, 'r') as f:
        valid_ents = set([l.strip().rsplit(' ', 1)[0] for l in f])

    cc2ent_records = dict()
    for cc in concept_list:
        _cc_filepath = os.path.join(wiki_gt_dir, f'{cc}.txt')
        _ent_records = []
        with open(_cc_filepath, 'r') as f:
            for l in f:
                _id, _e, _label = l.strip().split('\t')
                if _e.lower() not in valid_ents:
                    continue
                _ent_records.append((_e.lower(), int(_label)))
        cc2ent_records[cc] = _ent_records

    all_samples = []
    for cc, _ent_records in cc2ent_records.items():
        for _e, _label in _ent_records:
            all_samples.append({
                'concept': cc,
                'neighbor': _e,
                'label': _label
            })
    
    if dest is not None:
        pd.DataFrame(all_samples).to_csv(dest, index=None)
        
    return all_samples

def split_wiki_data(all_path, train_path, dev_path, test_path):
    all_samples_df = pd.read_csv(all_path)
    concept_list = all_samples_df['concept'].drop_duplicates().to_list()
    dev_C = test_C = max(int(len(concept_list) / 4), 2)
    train_C = len(concept_list) - dev_C - test_C
    
#     random.seed(123)
#     random.shuffle(concept_list)
#     train_concepts = concept_list[:train_C]
#     dev_concepts = concept_list[train_C:train_C+dev_C]
#     test_concepts = concept_list[train_C+dev_C:]

    ## Manually balanced split 
    train_concepts = ['companies', 'countries', 'diseases', 'parties']
    dev_concepts = ['china_provinces', 'sportsleagues']
    test_concepts = ['tv_channels' ,'us_states']
    print(train_concepts)
    print(dev_concepts)
    print(test_concepts)
    
    all_samples = all_samples_df.to_dict('records')
    train_concepts = [d for d in all_samples if d['concept'] in train_concepts]
    dev_concepts = [d for d in all_samples if d['concept'] in dev_concepts]
    test_concepts = [d for d in all_samples if d['concept'] in test_concepts]
    
    pd.DataFrame(train_concepts).to_csv(train_path, index=None)
    pd.DataFrame(dev_concepts).to_csv(dev_path, index=None)
    pd.DataFrame(test_concepts).to_csv(test_path, index=None)
    
# def load_wiki_data_tensors(ds_paths,
#                            emb_ent_path, 
#                            emb_cc_path, 
#                            lm_ent_path, 
#                            lm_cc_path,
#                            embeddings_dim):
#     # ds_paths = Dict[ds, path]
#     ## Test todo 
    
#     emb_ent_dict = load_embeddings_dict(emb_ent_path, embeddings_dim)
#     emb_cc_dict = load_embeddings_dict(emb_cc_path, embeddings_dim)
#     lm_ent_dict = load_embeddings_dict(lm_ent_path, embeddings_dim)
#     lm_cc_dict = load_embeddings_dict(lm_cc_path, embeddings_dim)
    
#     ds_tensors = dict()  # Dict[ds, Dict[tensor_name, tensor]]
#     for ds, path in ds_paths.items():
#         samples = pd.read_csv(path).to_dict('record')
#         emb_ent = []
#         emb_cc = []
#         lm_ent = []
#         lm_cc = []
#         labels = []
#         for d in samples:
#             _e, _cc, _lbl = d['neighbor'], d['concept'], d['label']
#             emb_ent.append(emb_ent_dict[_e])
#             emb_cc.append(emb_cc_dict[_cc])
#             lm_ent.append(lm_ent_dict[_e])
#             lm_cc.append(lm_cc_dict[_cc])
#             labels.append(_lbl)
#         ds_tensors[ds] = {
#             'emb_ent': torch.tensor(emb_ent, dtype=torch.float32),
#             'emb_cc': torch.tensor(emb_cc, dtype=torch.float32),
#             'lm_ent': torch.tensor(lm_ent, dtype=torch.float32),
#             'lm_cc': torch.tensor(lm_cc, dtype=torch.float32),
#             'label': torch.tensor(label, dtype=torch.int),
#         }
    
#     return ds_tensors

In [547]:
_ = preproc_wiki_data(wiki_gt_dir=wiki_gt_dir,
                  wiki_embednum_path=os.path.join(wiki_data_dir, 'BERTembednum_gt.txt'),
                  dest=os.path.join(wiki_data_dir, 'wiki_ee_all.csv'))

In [170]:
split_wiki_data(all_path=os.path.join(wiki_data_dir, 'wiki_ee_all.csv'),
                train_path=os.path.join(wiki_data_dir, 'wiki_ee_train.csv'),
                dev_path=os.path.join(wiki_data_dir, 'wiki_ee_dev.csv'),
                test_path=os.path.join(wiki_data_dir, 'wiki_ee_test.csv'))

['companies', 'countries', 'diseases', 'parties']
['china_provinces', 'sportsleagues']
['tv_channels', 'us_states']


In [None]:
'''
TODO: preproc of indeed

training: seeds as positive, seeds-mismatch as negative (using 3.15.3 Concepts overlapping to avoid wrong negatives)

valid/test: labed positive & negative (sample negatives to have the same amount)

'''

In [243]:
seed_concepts_df = load_seed_aligned_concepts(seed_aligned_concepts_path)
seed_instances_dict = dict(zip(seed_concepts_df['alignedCategoryName'].tolist(),
                               seed_concepts_df['seedInstances'].tolist()))

In [266]:
indeed_training_records = [] ## concept, neighbor, label 

random.seed(123)
_neg_cnt = Counter()  # do not use one entity >2 times as negative sample
for _cc, _seeds in seed_instances_dict.items():
    _non_overlap_ccs = [c for c in seed_instances_dict.keys() if c != _cc and c not in cc_overlap_dict[_cc]]
    _neg_cands = [e for c in _non_overlap_ccs for e in seed_instances_dict[c] if _neg_cnt[e] < 2]
    _neg_samples = random.sample(_neg_cands, k=len(_seeds))
    
    for _e in _seeds:
        indeed_training_records.append({
            'concept': _cc,
            'neighbor': _e,
            'label': 1
        })
        
    for _e in _neg_samples:
        indeed_training_records.append({
            'concept': _cc,
            'neighbor': _e,
            'label': 0
        })
        _neg_cnt[_e] += 1
        
indeed_training_records.sort(key=lambda d: (d['concept'], -d['label']))

In [267]:
indeed_train_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/indeed_ee_seed_train.csv')
pd.DataFrame(indeed_training_records).to_csv(indeed_train_path, index=None)

In [257]:
raw_ee_label_file = os.path.join(base_dir, f'data/indeed-benchmark/ee-increment-labels-2.csv')

raw_ee_df = pd.read_csv(raw_ee_label_file).dropna()

indeed_test_records = []
random.seed(123)
for _cc, _seeds in seed_instances_dict.items():
    _cc_df = raw_ee_df[raw_ee_df['concept'] == _cc]
    _pos_df = _cc_df[_cc_df['Majority'] == 1]
    _neg_df = _cc_df[_cc_df['Majority'] == 0]
    _pos_list = [e for e in _pos_df['neighbor'].to_list() if e not in _seeds]
    _neg_list = [e for e in _neg_df['neighbor'].to_list() if e not in _seeds]
    
    _n = min(len(_pos_list), len(_neg_list))
    print(_cc, len(_pos_list), len(_neg_list), _n)
    
    _pos_samples = random.sample(_pos_list, k=_n)
    _neg_samples = random.sample(_neg_list, k=_n)
    
    for e in _pos_samples:
        indeed_test_records.append({
            'concept': _cc,
            'neighbor': _e,
            'label': 1
        })
    for e in _neg_samples:
        indeed_test_records.append({
            'concept': _cc,
            'neighbor': _e,
            'label': 0
        })


company 150 124 124
dress_code 261 56 56
job_position 145 108 108
pay_schedule 34 87 34
benefits 110 97 97
compensation 43 164 43
payment_option 27 197 27
background_screening 102 123 102
person 58 198 58
hire_prerequisite 133 157 133
shifts 88 170 88
schedule 62 179 62
employee_type 27 200 27
onboarding_steps 28 232 28


In [259]:
indeed_test_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/indeed_ee_test.csv')
pd.DataFrame(indeed_test_records).to_csv(indeed_test_path, index=None)

### Data loading

In [114]:
wiki_train_set = Wiki_EE_Dataset(
    ds_path=os.path.join(wiki_data_dir, 'wiki_ee_train.csv'), 
    emb_ent_path=os.path.join(wiki_data_dir, 'BERTembed_gt.txt'), 
    emb_cc_path=os.path.join(wiki_data_dir, 'BERTembed_gt_concepts.txt'), 
    lm_ent_path=os.path.join(wiki_data_dir, 'BERTembed_gt_lm_entities.txt'), 
    lm_cc_path=os.path.join(wiki_data_dir, 'BERTembed_gt_lm_concepts.txt'),
)
wiki_train_loader = DataLoader(wiki_train_set, batch_size=4, shuffle=True)

In [115]:
wiki_dev_set = Wiki_EE_Dataset(
    ds_path=os.path.join(wiki_data_dir, 'wiki_ee_dev.csv'), 
    emb_ent_path=os.path.join(wiki_data_dir, 'BERTembed_gt.txt'), 
    emb_cc_path=os.path.join(wiki_data_dir, 'BERTembed_gt_concepts.txt'), 
    lm_ent_path=os.path.join(wiki_data_dir, 'BERTembed_gt_lm_entities.txt'), 
    lm_cc_path=os.path.join(wiki_data_dir, 'BERTembed_gt_lm_concepts.txt'),
)
wiki_dev_loader = DataLoader(wiki_dev_set, batch_size=4, shuffle=False)

In [132]:
wiki_test_set = Wiki_EE_Dataset(
    ds_path=os.path.join(wiki_data_dir, 'wiki_ee_test.csv'), 
    emb_ent_path=os.path.join(wiki_data_dir, 'BERTembed_gt.txt'), 
    emb_cc_path=os.path.join(wiki_data_dir, 'BERTembed_gt_concepts.txt'), 
    lm_ent_path=os.path.join(wiki_data_dir, 'BERTembed_gt_lm_entities.txt'), 
    lm_cc_path=os.path.join(wiki_data_dir, 'BERTembed_gt_lm_concepts.txt'),
)
wiki_test_loader = DataLoader(wiki_test_set, batch_size=1, shuffle=False)

In [133]:
wiki_test_set_2 = Wiki_EE_Dataset_2(
    ds_path=os.path.join(wiki_data_dir, 'wiki_ee_test.csv'), 
    emb_ent_path=os.path.join(wiki_data_dir, 'BERTembed_gt.txt'), 
    emb_cc_path=os.path.join(wiki_data_dir, 'BERTembed_gt_concepts.txt'), 
    lm_ent_path=os.path.join(wiki_data_dir, 'BERTembed_gt_lm_entities_hearst.csv'), 
    lm_cc_path=os.path.join(wiki_data_dir, 'BERTembed_gt_lm_concepts.txt'),
)
wiki_test_loader_2 = DataLoader(wiki_test_set_2, batch_size=1, shuffle=False)

In [None]:
indeed_test_set = Indeed_EE_Dataset(
    ds_path=os.path.join(benchmark_dir, 'ee-labels-2.csv'), 
    emb_ent_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt'), 
    emb_cc_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_concepts.txt'), 
    lm_ent_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_lm_entities.txt'), 
    lm_cc_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_lm_concepts.txt'),
)
indeed_test_loader = DataLoader(indeed_test_set, batch_size=1, shuffle=False)
len(indeed_test_set)

In [40]:
indeed_test_set_2 = Indeed_EE_Dataset_2(
    ds_path=os.path.join(benchmark_dir, 'ee-labels-2.csv'), 
    emb_ent_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt'), 
    emb_cc_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_concepts.txt'), 
    lm_ent_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_lm_entities_hearst.csv'), 
    lm_cc_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_lm_concepts.txt'),
    emb_ent_dict=indeed_ft_train_set.emb_ent_dict,
    emb_cc_dict=indeed_ft_train_set.emb_cc_dict,
    lm_ent_dict=indeed_ft_train_set.lm_ent_dict,
    lm_cc_dict=indeed_ft_train_set.lm_cc_dict,
)
indeed_test_loader_2 = DataLoader(indeed_test_set_2, batch_size=1, shuffle=False)
len(indeed_test_set_2)

112896

In [138]:
wiki_test_set.sample_records[20]

{'concept': 'us_states', 'neighbor': 'iowa', 'label': 1}

In [200]:
wiki_test_set_2.sample_records[10]

{'concept': 'us_states', 'neighbor': 'indiana', 'label': 1}

In [None]:
wiki_test_set_2[10]

In [None]:
!head -n 2 $wiki_data_dir/BERTembed_gt_lm_entities_hearst.csv

### Model

In [33]:
ee_clf = EE_Classifier(
    embeddings_dim=768,
    ff_dims=[8, 4]
)

In [36]:
trainer = pl.Trainer(gpus=None, max_epochs=1)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(


In [None]:
trainer.fit(ee_clf,
            train_dataloaders=wiki_train_loader,
            val_dataloaders=wiki_dev_loader)

In [38]:
wiki_dev_set.sample_records[10]

{'concept': 'companies', 'neighbor': 'siemens', 'label': 1}

In [None]:
trainer.predict(ee_clf, dataloaders=wiki_test_loader)

In [None]:
print(trainer.test.__doc__)

In [35]:
ee_clf(wiki_dev_set[10])

{'emb_logits': tensor(0.6159, grad_fn=<SqueezeBackward1>),
 'lm_logits': tensor(0.5063, grad_fn=<SqueezeBackward1>),
 'joint_logits': tensor(0.5611, grad_fn=<MeanBackward1>)}

In [None]:
# _emb_ent = torch.randn(4, 16)
# _emb_cc = torch.randn(4, 16)
# _lm_ent = torch.randn(4, 16)
# _lm_cc = torch.randn(4, 16)
# _labels = torch.randint(high=2, size=(4,)).to(torch.float32)

# _in_batch = {
#     'emb_ent': _emb_ent,
#     'emb_cc': _emb_cc,
#     'lm_ent': _lm_ent,
#     'lm_cc': _lm_cc,
#     'labels': _labels,
# }

# ee_clf.forward(_in_batch)

# loss = ee_clf.training_step(_in_batch, 0)
# loss

In [None]:
# Script 
!CUDA_VISIBLE_DEVICES=2 python multiview_EE_train.py \
-d $wiki_data_dir \
-o $wiki_data_dir/ee_clf_test.csv \
-ep 100 

### Fine-tuning

In [35]:
# Should rename dataset classes...

indeed_ft_train_set = Wiki_EE_Dataset_2(
    ds_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/indeed_ee_seed_train.csv'), 
    emb_ent_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt'), 
    emb_cc_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_concepts.txt'), 
    lm_ent_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_lm_entities_hearst.csv'), 
    lm_cc_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_lm_concepts.txt'),
)

indeed_ft_train_loader = DataLoader(indeed_ft_train_set, batch_size=4, shuffle=True)

In [36]:
indeed_ft_val_set = Wiki_EE_Dataset_2(
    ds_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/indeed_ee_test.csv'), 
    emb_ent_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt'), 
    emb_cc_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_concepts.txt'), 
    lm_ent_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_lm_entities_hearst.csv'), 
    lm_cc_path=os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed_lm_concepts.txt'),
    emb_ent_dict=indeed_ft_train_set.emb_ent_dict,
    emb_cc_dict=indeed_ft_train_set.emb_cc_dict,
    lm_ent_dict=indeed_ft_train_set.lm_ent_dict,
    lm_cc_dict=indeed_ft_train_set.lm_cc_dict,
)

indeed_ft_val_loader = DataLoader(indeed_ft_val_set, batch_size=1, shuffle=False)

In [37]:
trainer = pl.Trainer(gpus=None, 
                     max_epochs=10,
                     callbacks=ModelCheckpoint(save_top_k=-1),
                     logger=TensorBoardLogger(save_dir='lightning_logs', version='14.2'))

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(


In [38]:
ckpt_path = 'lightning_logs/default/version_14/checkpoints/epoch=999-step=385999.ckpt'
# hparams_file = 'lightning_logs/version_8/hparams.yaml'
ee_clf_pretrained = EE_Classifier.load_from_checkpoint(ckpt_path,
                                                       embeddings_dim=768,
                                                       optim_type='sgd',
                                                       init_lr=1e-5)

In [39]:
trainer.fit(ee_clf_pretrained,
            train_dataloaders=indeed_ft_train_loader,
            val_dataloaders=indeed_ft_val_loader)


  | Name            | Type       | Params
-----------------------------------------------
0 | views_clf_heads | ModuleDict | 1.6 M 
-----------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.426     Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




### Post-training analysis 

In [None]:
## TODO: evaluate each finetuned ckpt

In [42]:
def eval_pretrained_ckpt(ckpt_path, test_loader, ee_pred_out_path=None, trainer=None):
    ee_clf_trained = EE_Classifier.load_from_checkpoint(ckpt_path, embeddings_dim=768)
    test_set = test_loader.dataset
    if trainer is None:
        trainer = pl.Trainer(gpus=None)
    
    preds = trainer.predict(ee_clf_trained, dataloaders=test_loader)
    
    pred_records = []
    for i in range(len(test_set)):
        _raw_d = test_set.sample_records[i]
        _pred_d = preds[i]

        pred_records.append({
            'concept': _raw_d['concept'],
            'neighbor': _raw_d['neighbor'],
            'label': _raw_d['label'],
            'emb_logits': _pred_d['emb_logits'][0],
            'lm_logits': _pred_d['lm_logits'][0],
            'joint_logits': _pred_d['joint_logits'][0],
            'emb_pred': _pred_d['emb_pred'][0],
            'lm_pred': _pred_d['lm_pred'][0],
            'joint_pred': _pred_d['joint_pred'][0]
        })

    if ee_pred_out_path is not None:
        pd.DataFrame(pred_records).to_csv(ee_pred_out_path, index=None)
    
#     print('Emb acc:', sum([d['emb_pred'][0] == d['label'][0] for d in preds]) / len(preds))
#     print('LM acc:', sum([d['lm_pred'][0] == d['label'][0] for d in preds]) / len(preds))
#     print('Joint acc:', sum([d['joint_pred'][0] == d['label'][0] for d in preds]) / len(preds))
    
    print_results(preds)
    
    print('Joint stats:')
    _labels = [d['label'][0] for d in preds]
    _preds = [d['joint_pred'][0] for d in preds]
    print('correct pos', sum([(l == 1) & (p == 1) for l, p in zip(_labels, _preds)]))
    print('correct neg', sum([(l == 0) & (p == 0) for l, p in zip(_labels, _preds)]))
    print('correct all', sum([l == p for l, p in zip(_labels, _preds)]))
    print()
    
    

In [None]:
eval_pretrained_ckpt(ckpt_path='lightning_logs/default/version_14/checkpoints/epoch=999-step=385999.ckpt',
                     test_loader=indeed_ft_val_loader)

In [44]:
def print_results(preds):
    _labels = [d['label'][0] for d in preds]
    
    for m in ['emb', 'lm', 'joint']:
        _preds = [d[f'{m}_pred'][0] for d in preds]
        
        TP = sum([(l == 1) & (p == 1) for l, p in zip(_labels, _preds)])
        FP = sum([(l == 0) & (p == 1) for l, p in zip(_labels, _preds)])
        FN = sum([(l == 1) & (p == 0) for l, p in zip(_labels, _preds)])
        TN = sum([(l == 0) & (p == 0) for l, p in zip(_labels, _preds)])
        acc = (TP + TN) / len(_preds)
        prec = TP / (TP + FP)
        rec = TP / (TP + FN)
        f1 = 2 * prec * rec / (prec + rec + 1e-9)
        
        print(m)
        print(f'Acc: {acc:.4f}')
        print(f'Prec: {prec:.4f}')
        print(f'Recall: {rec:.4f}')
        print(f'F1: {f1:.4f}')
        print()

In [45]:
for _ep in range(10):
    print(f'=== epoch = {_ep} ===')
    _ckpt_path = glob(f'lightning_logs/default/14.2/checkpoints/epoch={_ep}-*')[0]
    _ee_out_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_cotraining_14.2_ep={_ep}.csv')

    eval_pretrained_ckpt(ckpt_path=_ckpt_path,
                         test_loader=indeed_test_loader_2,
                         ee_pred_out_path=_ee_out_path)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


=== epoch = 0 ===


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


emb
Acc: 0.2988
Prec: 0.0142
Recall: 0.8678
F1: 0.0279

lm
Acc: 0.6393
Prec: 0.0189
Recall: 0.5928
F1: 0.0367

joint
Acc: 0.6505
Prec: 0.0214
Recall: 0.6501
F1: 0.0413

Joint stats:
correct pos 851
correct neg 72583


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


correct all 73434

=== epoch = 1 ===


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


emb
Acc: 0.3102
Prec: 0.0143
Recall: 0.8587
F1: 0.0281

lm
Acc: 0.6701
Prec: 0.0194
Recall: 0.5539
F1: 0.0375

joint
Acc: 0.6753
Prec: 0.0223
Recall: 0.6310
F1: 0.0431

Joint stats:
correct pos 826
correct neg 75411


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


correct all 76237

=== epoch = 2 ===


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


emb
Acc: 0.3110
Prec: 0.0143
Recall: 0.8571
F1: 0.0280

lm
Acc: 0.7186
Prec: 0.0205
Recall: 0.4973
F1: 0.0394

joint
Acc: 0.7111
Prec: 0.0234
Recall: 0.5882
F1: 0.0451

Joint stats:
correct pos 770
correct neg 79510


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


correct all 80280

=== epoch = 3 ===


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


emb
Acc: 0.3114
Prec: 0.0143
Recall: 0.8571
F1: 0.0281

lm
Acc: 0.7296
Prec: 0.0208
Recall: 0.4851
F1: 0.0399

joint
Acc: 0.7192
Prec: 0.0237
Recall: 0.5775
F1: 0.0455

Joint stats:
correct pos 756
correct neg 80442


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


correct all 81198

=== epoch = 4 ===


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


emb
Acc: 0.3118
Prec: 0.0143
Recall: 0.8571
F1: 0.0281

lm
Acc: 0.7246
Prec: 0.0208
Recall: 0.4927
F1: 0.0398

joint
Acc: 0.7152
Prec: 0.0235
Recall: 0.5821
F1: 0.0452

Joint stats:
correct pos 762
correct neg 79978


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


correct all 80740

=== epoch = 5 ===


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


emb
Acc: 0.3121
Prec: 0.0143
Recall: 0.8571
F1: 0.0281

lm
Acc: 0.7793
Prec: 0.0216
Recall: 0.4072
F1: 0.0410

joint
Acc: 0.7588
Prec: 0.0250
Recall: 0.5202
F1: 0.0476

Joint stats:
correct pos 681
correct neg 84980


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


correct all 85661

=== epoch = 6 ===


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


emb
Acc: 0.3128
Prec: 0.0143
Recall: 0.8571
F1: 0.0281

lm
Acc: 0.8176
Prec: 0.0216
Recall: 0.3331
F1: 0.0406

joint
Acc: 0.7925
Prec: 0.0259
Recall: 0.4622
F1: 0.0491

Joint stats:
correct pos 605
correct neg 88870


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


correct all 89475

=== epoch = 8 ===


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


emb
Acc: 0.3135
Prec: 0.0143
Recall: 0.8571
F1: 0.0281

lm
Acc: 0.8212
Prec: 0.0218
Recall: 0.3285
F1: 0.0409

joint
Acc: 0.7954
Prec: 0.0261
Recall: 0.4591
F1: 0.0495

Joint stats:
correct pos 601
correct neg 89197


GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


correct all 89798

=== epoch = 9 ===


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [188]:
ckpt_path = 'lightning_logs/default/version_14/checkpoints/epoch=999-step=385999.ckpt'
# hparams_file = 'lightning_logs/version_8/hparams.yaml'
ee_clf_trained = EE_Classifier.load_from_checkpoint(ckpt_path, embeddings_dim=768)

In [None]:
ee_clf_trained(wiki_test_set_2[20])

In [190]:
# preds = trainer.predict(ee_clf_trained, dataloaders=wiki_test_loader)
preds = pd.read_csv(os.path.join(base_dir, f'{wiki_data_dir}/ee_clf_test_v=14.csv')).to_dict('record')
len(preds)

665

In [191]:
sum([d['joint_pred'] == d['label'] for d in preds]) / len(preds)

0.6992481203007519

In [201]:
# Test on indeed 
indeed_test_set_2.sample_records[80]

{'concept': 'background_screening', 'neighbor': 'case report', 'label': 1}

In [202]:
ee_clf_trained(indeed_test_set_2[80])

{'emb_logits': tensor(0., grad_fn=<SqueezeBackward1>),
 'lm_logits': tensor(1.5716e-11, grad_fn=<SqueezeBackward1>),
 'joint_logits': tensor(7.8581e-12, grad_fn=<MeanBackward1>)}

In [None]:
trainer = pl.Trainer(gpus=None)

In [None]:
indeed_preds = trainer.predict(ee_clf_trained, dataloaders=indeed_test_loader_2)

In [212]:
indeed_pred_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_cotraining_14.csv')

In [213]:
indeed_pred_records = []
for i in range(len(indeed_test_set)):
    _raw_d = indeed_test_set.sample_records[i]
    _pred_d = indeed_preds[i]
    
    indeed_pred_records.append({
        'concept': _raw_d['concept'],
        'neighbor': _raw_d['neighbor'],
        'label': _raw_d['label'],
        'emb_logits': _pred_d['emb_logits'][0],
        'lm_logits': _pred_d['lm_logits'][0],
        'joint_logits': _pred_d['joint_logits'][0],
        'emb_pred': _pred_d['emb_pred'][0],
        'lm_pred': _pred_d['lm_pred'][0],
        'joint_pred': _pred_d['joint_pred'][0]
    })
pd.DataFrame(indeed_pred_records).to_csv(indeed_pred_path, index=None)

In [214]:
print(sum([d['emb_pred'][0] == d['label'][0] for d in indeed_preds]) / len(indeed_preds))
print(sum([d['lm_pred'][0] == d['label'][0] for d in indeed_preds]) / len(indeed_preds))
print(sum([d['joint_pred'][0] == d['label'][0] for d in indeed_preds]) / len(indeed_preds))

0.28762755102040816
0.5712691326530612
0.5988786139455783


In [215]:
sum([d['emb_pred'][0] == d['label'][0] for d in indeed_preds])

32472

In [217]:
_labels = [d['label'][0] for d in indeed_preds]
_preds = [d['joint_pred'][0] for d in indeed_preds]
print(sum([(l == 1) & (p == 1) for l, p in zip(_labels, _preds)]))
print(sum([(l == 0) & (p == 0) for l, p in zip(_labels, _preds)]))
print(sum([l == p for l, p in zip(_labels, _preds)]))

914
66697
67611


In [None]:
ee_clf_trained.views_clf_heads['lm']

In [None]:
wiki_test_set.sample_records[400]

In [None]:
_s = wiki_test_set[400]
_ev = _s['lm_ent']
_cv = _s['lm_cc']
_v = torch.cat([_ev, _cv, _ev * _cv, _ev - _cv]).unsqueeze(0)

print(_v)
for _layer in ee_clf_trained.views_clf_heads['lm']:
    _v = _layer(_v)
    print(_layer)
    print(_v)

In [None]:
wiki_test_set[20].keys()

In [None]:
# temp

bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')

In [None]:
bert_model.embeddings.word_embeddings.weight.size()

In [None]:
bert_mlm_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

In [None]:
bert_mlm_model.cls.predictions.decoder.weight.size()

In [None]:
torch.allclose(bert_model.embeddings.word_embeddings.weight, bert_mlm_model.cls.predictions.decoder.weight)

## Sub-clusters analysis

In [75]:
ee_LM_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_k=None.csv')

In [14]:
bert_emb_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembed+seeds.txt')

_entity_embeddings = load_embeddings(bert_emb_path, 768)
_entity_emb_dict = dict(zip(_entity_embeddings['entity'].tolist(),
                            _entity_embeddings['embedding'].tolist()))
len(_entity_emb_dict)

8064

In [76]:
_concept = 'benefits'
ee_LM_df = pd.read_csv(ee_LM_path)
ee_LM_list = ee_LM_df[ee_LM_df['concept'] == _concept]['neighbor'].tolist()
len(ee_LM_list)

8036

In [None]:
K = 100
_entities = ee_LM_list[:K]
_feats_arr = [_entity_emb_dict[_e] for _e in _entities]

In [33]:
C = 20
A_cls = AgglomerativeClustering(n_clusters=C, affinity='euclidean')
_cls_res = A_cls.fit_predict(_feats_arr)

In [None]:
for i in range(C):
    print(f'CLUSTER {i}:')
    print(' | '.join(_entities[j] for j in range(K) if _cls_res[j] == i))

## Frequency analysis

In [178]:
def _get_bin_name(log_freq, base=2):
    return f'{base**log_freq}~{base**(log_freq+1)-1}' if log_freq > 0 else '1'

### on wiki

In [86]:
# Indeed entities in Wiki
wiki_corpus_path = f'{wiki_data_dir}/sentences.json'
indeed_corpus_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/sentences.json')
indeed_embed_num_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/BERTembednum+seeds.txt')

In [20]:
with open(indeed_embed_num_path, 'r') as f:
    indeed_ents = [l.strip().rsplit(' ', 1)[0] for l in f]
len(indeed_ents)

8064

In [43]:
A = ahocorasick.Automaton()
for e in indeed_ents:
    A.add_word(f' {e} ', e)
A.make_automaton()
A.get_stats()

{'nodes_count': 51325,
 'words_count': 8064,
 'longest_word': 44,
 'links_count': 51324,
 'sizeof_node': 32,
 'total_size': 2052992}

In [88]:
ent_freqs_wiki = Counter()
ent_sent_freqs_wiki = Counter()

with open(wiki_corpus_path, 'r') as f:
    for l in tqdm(f, total=1180171):
        d = json.loads(l)
        _l = ' ' + ' '.join(d['tokens']).lower() + ' '
        A_matches = [e for _pos, e in A.iter(_l)]
        ent_freqs_wiki.update(A_matches)
        ent_sent_freqs_wiki.update(set(A_matches))

HBox(children=(FloatProgress(value=0.0, max=1180171.0), HTML(value='')))




In [89]:
len(ent_freqs_wiki), len(ent_sent_freqs_wiki)

(6368, 6368)

In [90]:
freq_records_wiki = []
for _e, _f in ent_freqs_wiki.most_common():
    _sf = ent_sent_freqs_wiki[_e]
    freq_records_wiki.append({
        'entity': _e,
        'freq': _f,
        'sent_freq': _sf
    })

In [91]:
freq_records_wiki_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ent_freqs_in_wiki.csv')
pd.DataFrame(freq_records_wiki).to_csv(freq_records_wiki_path, index=None)

In [92]:
freq_bins_wiki = Counter()
for _e, _f in ent_freqs_wiki.most_common():
    _exp = np.floor(np.log2(_f)).astype(int)
    _bin_name = f'{2**_exp}~{2**(_exp+1)-1}' if _exp > 0 else '1'
    freq_bins_wiki[_bin_name] += 1

In [93]:
freq_bins

Counter({'16384~32767': 8,
         '8192~16383': 28,
         '4096~8191': 70,
         '2048~4095': 134,
         '1024~2047': 217,
         '512~1023': 279,
         '256~511': 399,
         '128~255': 490,
         '64~127': 494,
         '32~63': 490,
         '16~31': 472,
         '8~15': 448,
         '4~7': 492,
         '2~3': 414,
         '1': 421})

### on indeed corpus 

In [79]:
ent_freqs_indeed = Counter()
ent_sent_freqs_indeed = Counter()

with open(indeed_corpus_path, 'r') as f:
    for l in tqdm(f, total=901796):
        d = json.loads(l)
        _l = ' ' + ' '.join(d['tokens']).lower() + ' '
        A_matches = [e for _pos, e in A.iter(_l)]
        ent_freqs_indeed.update(A_matches)
        ent_sent_freqs_indeed.update(set(A_matches))

HBox(children=(FloatProgress(value=0.0, max=901796.0), HTML(value='')))




In [80]:
len(ent_freqs_indeed), len(ent_sent_freqs_indeed)

(8064, 8064)

In [82]:
freq_records_indeed = []
for _e, _f in ent_freqs_indeed.most_common():
    _sf = ent_sent_freqs_indeed[_e]
    freq_records_indeed.append({
        'entity': _e,
        'freq': _f,
        'sent_freq': _sf
    })

In [83]:
freq_records_indeed_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ent_freqs_in_indeed.csv')
pd.DataFrame(freq_records_indeed).to_csv(freq_records_indeed_path, index=None)

In [84]:
freq_bins_indeed = Counter()
for _e, _f in ent_freqs_indeed.most_common():
    _exp = np.floor(np.log2(_f)).astype(int)
    _bin_name = f'{2**_exp}~{2**(_exp+1)-1}' if _exp > 0 else '1'
    freq_bins_indeed[_bin_name] += 1

In [85]:
freq_bins_indeed

Counter({'32768~65535': 1,
         '16384~32767': 3,
         '8192~16383': 6,
         '4096~8191': 16,
         '2048~4095': 44,
         '1024~2047': 67,
         '512~1023': 118,
         '256~511': 182,
         '128~255': 326,
         '64~127': 444,
         '32~63': 787,
         '16~31': 1172,
         '8~15': 788,
         '4~7': 1385,
         '2~3': 2359,
         '1': 366})

### accuracy on subsets

In [94]:
raw_ee_label_file = os.path.join(base_dir, f'data/indeed-benchmark/ee-increment-labels-2.csv')

raw_ee_df = pd.read_csv(raw_ee_label_file).dropna()
raw_ee_df.shape

(3423, 7)

In [95]:
raw_ee_df.columns

Index(['concept', 'neighbor', 'label-Nikita', 'label-Sajjadur', 'label-Yutong',
       'Disagree', 'Majority'],
      dtype='object')

In [183]:
seed_concepts_df = load_seed_aligned_concepts(seed_aligned_concepts_path)
seed_instances_dict = dict(zip(seed_concepts_df['alignedCategoryName'],
                               seed_concepts_df['seedInstances']))

In [186]:
gold_k_dict = Counter()
for d in raw_ee_df.to_dict('record'):
    cc = d['concept']
    if d['neighbor'] not in seed_instances_dict[cc] and d['Majority'] == 1:
        gold_k_dict[cc] += 1

In [187]:
method_pred_paths = {
    'emb_contr': os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_contr_knn_k=None-2.csv'),
    'lm': os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_bert_k=None-2.csv'),
    'mrr': os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_mrr_combine_k=None-2.csv'),
}

method_preds = dict()
for m, p in method_pred_paths.items():
    _df = pd.read_csv(p)
    _cc_dict = dict()
    for cc, _seeds in seed_instances_dict.items():
        _sub_df = _df[_df['concept'] == cc]
        _preds = [e for e in _sub_df['neighbor'] if e not in _seeds]
        _preds = _preds[:gold_k_dict[cc]]
        _cc_dict[cc] = _preds
    method_preds[m] = _cc_dict

In [229]:
# Dump to csv

_output_csv = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee-dim-analysis.csv')

_output_records = []
for raw_d in raw_ee_df.to_dict('record'):
    d = dict(raw_d)
    cc = d['concept']
    e = d['neighbor']
    if e in seed_instances_dict[cc]:
        continue
    
    d['votes'] = np.round(d['label-Nikita'] + d['label-Sajjadur'] + d['label-Yutong']).astype(int)
    
    _freq_wiki = ent_sent_freqs_wiki[e]
    _bin_wiki = np.floor(np.log2(_freq_wiki + 0.9)).astype(int) + 1
    _freq_indeed = ent_sent_freqs_indeed[e]
    _bin_indeed = np.floor(np.log2(_freq_indeed + 0.9)).astype(int) + 1
    ## these are log(freq) + 1
    d['log_freq_wiki'] = _bin_wiki
    d['log_freq_indeed'] = _bin_indeed

    for _m, _cc_preds in method_preds.items():
        d[f'{_m}_pred'] = int(e in _cc_preds[cc])
    
    _output_records.append(d)

pd.DataFrame(_output_records).to_csv(_output_csv, index=None)

In [230]:
joint_info_df = pd.read_csv(_output_csv)
joint_info_df.columns

Index(['concept', 'neighbor', 'label-Nikita', 'label-Sajjadur', 'label-Yutong',
       'Disagree', 'Majority', 'votes', 'log_freq_wiki', 'log_freq_indeed',
       'emb_contr_pred', 'lm_pred', 'mrr_pred'],
      dtype='object')

In [None]:
# by agreement 

by_agreement_records = []

for _agree in [0, 1]:
    _agree_df = joint_info_df[joint_info_df['Disagree'] == 1 - _agree]
    print(f'Agree = {_agree} samples:')
    for m in method_pred_paths.keys():
        print(f'{m}:')
        for cc in seed_instances_dict.keys():
            _cc_df = _agree_df[_agree_df['concept'] == cc]
            _corr = (_cc_df['Majority'] * _cc_df[f'{m}_pred']).sum()
            _all_pred = _cc_df[f'{m}_pred'].sum()
            _all_corr = _cc_df['Majority'].sum()
            _P = _corr / _all_pred
            _R = _corr / _all_corr
            _F1 = 2 * _P * _R / (_P + _R + 1e-9)
            print(f'{cc:<20s} >> P = {_P:.4f} | R = {_R:.4f} | F1 = {_F1:.4f}')
            print(f'{"":<20s} >> corr = {_corr} | all_p = {_all_pred} | all_r = {_all_corr}')
            by_agreement_records.append({
                'agreement': _agree,
                'method': m,
                'concept': cc,
                'P': _P,
                'R': _R,
                'F1': _F1,
                'corr': _corr,
                'all_p': _all_pred,
                'all_r': _all_corr,
            })
        print()

In [234]:
_agreement_csv = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee-agreement-analysis.csv')
pd.DataFrame(by_agreement_records).to_csv(_agreement_csv, index=None)

In [None]:
# by indeed frequency 

by_indeed_freq_records = []

for _freq_bin in sorted(joint_info_df['log_freq_indeed'].drop_duplicates().to_list()):
    _freq_df = joint_info_df[joint_info_df['log_freq_indeed'] == _freq_bin]
    print(f'Freq bin {_freq_bin}:')
    for m in method_pred_paths.keys():
        print(f'{m}:')
        for cc in seed_instances_dict.keys():
            _cc_df = _freq_df[_freq_df['concept'] == cc]
            _corr = (_cc_df['Majority'] * _cc_df[f'{m}_pred']).sum()
            _all_pred = _cc_df[f'{m}_pred'].sum()
            _all_corr = _cc_df['Majority'].sum()
            _P = _corr / _all_pred
            _R = _corr / _all_corr
            _F1 = 2 * _P * _R / (_P + _R + 1e-9)
            print(f'{cc:<20s} >> P = {_P:.4f} | R = {_R:.4f} | F1 = {_F1:.4f}')
            print(f'{"":<20s} >> corr = {_corr} | all_p = {_all_pred} | all_r = {_all_corr}')
            by_indeed_freq_records.append({
                'freq_bin': _freq_bin,
                'method': m,
                'concept': cc,
                'P': _P,
                'R': _R,
                'F1': _F1,
                'corr': _corr,
                'all_p': _all_pred,
                'all_r': _all_corr,
            })
        print()

In [239]:
_freq_indeed_csv = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee-freq-indeed-analysis.csv')
pd.DataFrame(by_indeed_freq_records).to_csv(_freq_indeed_csv, index=None)

In [None]:
# by wiki frequency 

by_wiki_freq_records = []

for _freq_bin in sorted(joint_info_df['log_freq_wiki'].drop_duplicates().to_list()):
    _freq_df = joint_info_df[joint_info_df['log_freq_wiki'] == _freq_bin]
    print(f'Freq bin {_freq_bin}:')
    for m in method_pred_paths.keys():
        print(f'{m}:')
        for cc in seed_instances_dict.keys():
            _cc_df = _freq_df[_freq_df['concept'] == cc]
            _corr = (_cc_df['Majority'] * _cc_df[f'{m}_pred']).sum()
            _all_pred = _cc_df[f'{m}_pred'].sum()
            _all_corr = _cc_df['Majority'].sum()
            _P = _corr / _all_pred
            _R = _corr / _all_corr
            _F1 = 2 * _P * _R / (_P + _R + 1e-9)
            print(f'{cc:<20s} >> P = {_P:.4f} | R = {_R:.4f} | F1 = {_F1:.4f}')
            print(f'{"":<20s} >> corr = {_corr} | all_p = {_all_pred} | all_r = {_all_corr}')
            by_wiki_freq_records.append({
                'freq_bin': _freq_bin,
                'method': m,
                'concept': cc,
                'P': _P,
                'R': _R,
                'F1': _F1,
                'corr': _corr,
                'all_p': _all_pred,
                'all_r': _all_corr,
            })
        print()

In [241]:
_freq_wiki_csv = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee-freq-wiki-analysis.csv')
pd.DataFrame(by_wiki_freq_records).to_csv(_freq_wiki_csv, index=None)

In [None]:
pair2votes = dict()

# for cc in raw_ee_df['concept'].drop_duplicate().to_list():
for cc in ['job_position']:
    _sub_df = raw_ee_df[raw_ee_df['concept'] == cc]
    for d in _sub_df.to_dict('record'):
        e = d['neighbor']
        maj = d['Majority']
        votes = d['label-Nikita'] + d['label-Sajjadur'] + d['label-Yutong']
        pair2votes[(cc, e)] = np.round(votes).astype(int)

pair2votes[('job_position', 'field tech')]

In [179]:
wiki_freq_stats = defaultdict(Counter)
indeed_freq_stats = defaultdict(Counter)
freq_records = []

for (cc, e), votes in pair2votes.items():
    _freq_wiki = ent_sent_freqs_wiki[e]
    if _freq_wiki == 0: continue
    _bin_wiki = np.floor(np.log2(_freq_wiki)).astype(int)
    _bin_name_wiki = _get_bin_name(_bin_wiki)
    _freq_indeed = ent_sent_freqs_indeed[e]
    _bin_indeed = np.floor(np.log2(_freq_indeed)).astype(int)
    _bin_name_indeed = _get_bin_name(_bin_indeed)
    # print(cc, e, (_freq_wiki, _bin_wiki, _bin_name_wiki), (_freq_indeed, _bin_indeed, _bin_name_indeed))
    wiki_freq_stats[_bin_name_wiki][votes] += 1
    indeed_freq_stats[_bin_name_indeed][votes] += 1
    freq_records.append({
        'concept': cc,
        'neighbor': e,
        # ?
    })
    

In [None]:
wiki_freq_stats

In [None]:
indeed_freq_stats

## EE labels preprocessing

### Extract labeled entities

In [233]:
in_EE_label_file = os.path.join(base_dir, f'data/indeed-benchmark/ee-increment-labels-2.csv')
out_EE_label_file = os.path.join(base_dir, f'data/indeed-benchmark/ee-labels-2.csv')

In [234]:
in_ee_df = pd.read_csv(in_EE_label_file)

gold_labels_dict = dict()
for d in in_ee_df.to_dict('records'):
    if d['Majority'] != 1:
        continue
    _cc = d['concept']
    _e = d['neighbor']
    gold_labels_dict[_cc] = gold_labels_dict.get(_cc, []) + [_e]


In [235]:
for _cc, _e_list in gold_labels_dict.items():
    print(f'{_cc}: {len(_e_list)}')

company: 155
dress_code: 268
job_position: 152
pay_schedule: 37
benefits: 116
compensation: 47
payment_option: 30
background_screening: 104
person: 67
hire_prerequisite: 137
shifts: 89
schedule: 69
employee_type: 30
onboarding_steps: 30


In [236]:
gold_labels_list = []
for _cc, _e_list in gold_labels_dict.items():
    for _e in _e_list:
        gold_labels_list.append((_cc, _e))
pd.DataFrame(gold_labels_list, columns=['concept', 'neighbor']).to_csv(out_EE_label_file, index=False)

### Increment label file

In [103]:
ee_incre_label_path = os.path.join(base_dir, f'data/indeed-benchmark/ee-increment-labels-2.csv')
ee_pred_paths = [
    os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_knn_roberta_k=None.csv'),
    os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_contr_knn_roberta_k=None.csv'),
    os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_roberta_k=None.csv'),
    os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_LM_roberta_contr_k=None.csv'),
]
# ee_pred_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_concept_contr_knn-aux-k=None.csv')

In [104]:
# Self-check and dedup current label file 

ee_label_df = pd.read_csv(ee_incre_label_path)
ee_label_dict = dict()  # Dict[(cc, e), maj]
for _d in ee_label_df.to_dict('record'):
    _key = (_d['concept'], _d['neighbor'])
    _label = ee_label_dict.get(_key, None)
    if _label is None:
        ee_label_dict[_key] = _d['Majority']
    elif _label == 'ERR':
        pass
    else:
        if _label != _d['Majority']:
            print(f"Contradict: {_key}")
            ee_label_dict[_key] = 'ERR'

In [105]:
len(ee_label_dict)

3435

In [106]:
_concepts = ee_label_df['concept'].drop_duplicates(keep='first').tolist()
_concepts_order_dict = dict([(_concepts[i], i) for i in range(len(_concepts))])
_concepts_order_dict

{'company': 0,
 'dress_code': 1,
 'job_position': 2,
 'pay_schedule': 3,
 'benefits': 4,
 'compensation': 5,
 'payment_option': 6,
 'background_screening': 7,
 'person': 8,
 'hire_prerequisite': 9,
 'shifts': 10,
 'schedule': 11,
 'employee_type': 12,
 'onboarding_steps': 13}

In [107]:
K = 100

for ee_pred_path in ee_pred_paths:
    ee_new_df = pd.read_csv(ee_pred_path)

    for _cc in _concepts:
        _new_ents = ee_new_df[ee_new_df['concept'] == _cc]['neighbor'].tolist()[:K]
        for _e in _new_ents:
            _key = (_cc, _e)
            if _key in ee_label_dict:
                continue
            ee_label_dict[_key] = None

len(ee_label_dict)

5620

In [109]:
_out_records = []
_added_keys = set()

# remove existing errors 
for _d in ee_label_df.to_dict('record'):
    _cc = _d['concept']
    _e = _d['neighbor']
    if (_cc, _e) in _added_keys:
        continue
        
    if ee_label_dict[(_cc, _e)] == 'ERR':
        _d['label-Nikita'] = _d['label-Sajjadur'] = _d['label-Yutong'] = \
        _d['Disagree'] = _d['Majority'] = np.nan
    _out_records.append(_d)   
    _added_keys.add((_cc, _e))

for (_cc, _e), _label in ee_label_dict.items():
    if (_cc, _e) in _added_keys:
        continue
        
    _out_records.append({
        'concept': _cc,
        'neighbor': _e,
        'label-Nikita': np.nan,
        'label-Sajjadur': np.nan,
        'label-Yutong': np.nan,
        'Disagree': np.nan,
        'Majority': np.nan
    })
    _added_keys.add((_cc, _e))

_out_records.sort(key=lambda d : (np.isnan(d['Majority']), _concepts_order_dict[d['concept']]))
_out_records[:3], _out_records[-3:]

([{'concept': 'company',
   'neighbor': 'wal mart',
   'label-Nikita': 1,
   'label-Sajjadur': 1.0,
   'label-Yutong': 1,
   'Disagree': 0,
   'Majority': 1},
  {'concept': 'company',
   'neighbor': 'walmart',
   'label-Nikita': 1,
   'label-Sajjadur': 1.0,
   'label-Yutong': 1,
   'Disagree': 0,
   'Majority': 1},
  {'concept': 'company',
   'neighbor': 'costco',
   'label-Nikita': 1,
   'label-Sajjadur': 1.0,
   'label-Yutong': 1,
   'Disagree': 0,
   'Majority': 1}],
 [{'concept': 'onboarding_steps',
   'neighbor': 'changeup',
   'label-Nikita': nan,
   'label-Sajjadur': nan,
   'label-Yutong': nan,
   'Disagree': nan,
   'Majority': nan},
  {'concept': 'onboarding_steps',
   'neighbor': 'personal issues',
   'label-Nikita': nan,
   'label-Sajjadur': nan,
   'label-Yutong': nan,
   'Disagree': nan,
   'Majority': nan},
  {'concept': 'onboarding_steps',
   'neighbor': 'basic information',
   'label-Nikita': nan,
   'label-Sajjadur': nan,
   'label-Yutong': nan,
   'Disagree': nan,
  

In [110]:
# ee_incre_new_path: the new unlabeled file 
ee_incre_new_path = os.path.join(base_dir, f'data/indeed-benchmark/ee-increment-labels-NEW3.csv')
pd.DataFrame(_out_records).to_csv(ee_incre_new_path, index=None)

### Concepts overlapping

In [245]:
EE_label_file = os.path.join(base_dir, f'data/indeed-benchmark/ee-labels-2.csv')

In [246]:
_df = pd.read_csv(EE_label_file)
_e_dict = dict([(_cc, _df[_df['concept'] == _cc]['neighbor'].tolist())
                    for _cc in set(_df['concept'].tolist())])

In [247]:
_ints = set(_e_dict['benefits']) & set(_e_dict['compensation'])
len(_e_dict['benefits']), len(_e_dict['compensation']), len(_ints), _ints

(116, 47, 2, {'stock options', 'unemployment compensation'})

In [None]:
_overlap_ccs = [
    ('benefits', 'compensation'),
    ('hire_prerequisite', 'background_screening'),
    ('hire_prerequisite', 'person'),
    ('person', 'background_screening'),
    ('schedule', 'pay_schedule'),
    ('schedule', 'shifts'),
    ('shifts', 'pay_schedule'),
]

In [None]:
for _c1, _c2 in _overlap_ccs:
    _ints = sorted(list(set(_e_dict[_c1]) & set(_e_dict[_c2])))
    print(f'{_c1} -- {_c2}')
    print(f'{_c1}: {len(_e_dict[_c1])} ')
    print(f'{_c2}: {len(_e_dict[_c2])} ')
    print(f'Intersection: {_ints} {len(_ints)}')
    print()

In [248]:
_e2cc = dict([(_e, tuple(sorted(_df[_df['neighbor'] == _e]['concept'].tolist())))
                  for _e in set(_df['neighbor'].tolist())])

_ccs2e = dict([(_ccs, [_e for _e, _l in _e2cc.items() if _l == _ccs]) for _ccs in sorted(_e2cc.values())])

In [249]:
for _ccs, _e_list in _ccs2e.items():
    if len(_ccs) > 1:
        print('Concepts:', ' | '.join(_ccs))
        print(f'Intersection: {_e_list} {len(_e_list)}')
        print()

Concepts: background_screening | benefits
Intersection: ['college education', 'medical diagnosis', 'unemployment', 'medical marijuana card', 'medical marijuana', 'health'] 6

Concepts: background_screening | benefits | hire_prerequisite
Intersection: ['education'] 1

Concepts: background_screening | benefits | hire_prerequisite | payment_option
Intersection: ['medical card'] 1

Concepts: background_screening | hire_prerequisite
Intersection: ['social security number', 'driver license', 'social security card', 'marriage license', 'employment contract', 'medical test', 'passport', 'citizenship', 'security card', 'physical exam', 'marriage certificate'] 11

Concepts: benefits | compensation
Intersection: ['unemployment compensation', 'stock options'] 2

Concepts: benefits | dress_code
Intersection: ['hair'] 1

Concepts: benefits | hire_prerequisite
Intersection: ['car insurance', 'high school education', 'health insurance', 'dental insurance', 'insurance', 'disability insurance', 'physica

In [250]:
cc_overlap_dict = defaultdict(set)
for _ccs in _ccs2e.keys():
    for i in range(len(_ccs)):
        for j in range(i+1, len(_ccs)):
            _c1 = _ccs[i]
            _c2 = _ccs[j]
            cc_overlap_dict[_c1].add(_c2)
            cc_overlap_dict[_c2].add(_c1)

In [None]:
cc_overlap_dict

In [160]:
for _e, _cc_list in _e2cc.items():
    if tuple(sorted(_cc_list)) == ('shifts', 'shifts'):
        print(_e, _cc_list)

In [None]:
for _e, _cc_list in _e2cc.items():
    if len(_cc_list) >= 3:
        print(_e, _cc_list)

## Entity expansion evaluation
Now using benchmark entities, mean reciprocal rank

In [None]:
!python eval_entities.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/ee_concept_knn_k=None-2.csv \
-o $base_dir/data/$data_ac/intermediate/ee_concept_knn_k=None-2_eval.csv \
-rank sim \
-rev

In [None]:
!python eval_entities.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/ee_LM_bert_k=None-2.csv \
-o $base_dir/data/$data_ac/intermediate/ee_LM_bert_k=None-2_eval.csv

In [None]:
!python eval_entities.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/ee_mrr_combine-lm*3-k=None.csv \
-o $base_dir/data/$data_ac/intermediate/ee_mrr_combine-lm*3-k=None_eval.csv

In [158]:
for COMB in ['avg', 'lm*3_avg', 'harmonic', 'min']:
    !python eval_entities.py \
    -b $benchmark_dir \
    -pred $base_dir/data/$data_ac/intermediate/ee_calib-merged-k=None.csv \
    -o $base_dir/data/$data_ac/intermediate/ee_calib-merge=$COMB-k=None_eval.csv \
    -rank $COMB \
    -rev

In [9]:
!python eval_entities.py \
-b $benchmark_dir \
-pred $base_dir/data/$data_ac/intermediate/ee_cotraining_14.csv \
-o $base_dir/data/$data_ac/intermediate/ee_cotraining_14_eval.csv \
-rank joint_logits \
-rev

--- Summary ---
Concept                  Max K   Gold K   P@K     R@K     P@20    R@20   P@100   R@100  
company                   8059    150    0.5533  0.5533  1.0000  0.1333  0.8300  0.5533 
dress_code                8057    261    0.5977  0.5977  1.0000  0.0766  1.0000  0.3831 
job_position              8057    145    0.0414  0.0414  0.1500  0.0207  0.0400  0.0276 
pay_schedule              8060     34    0.1765  0.1765  0.3000  0.1765  0.0600  0.1765 
benefits                  8058    110    0.3727  0.3727  1.0000  0.1818  0.4100  0.3727 
compensation              8060     43    0.0000  0.0000  0.0000  0.0000  0.0000  0.0000 
payment_option            8061     27    0.8519  0.8519  1.0000  0.7407  0.2300  0.8519 
background_screening      8061    102    0.3333  0.3333  1.0000  0.1961  0.3400  0.3333 
person                    8055     58    0.0690  0.0690  0.2000  0.0690  0.0800  0.1379 
hire_prerequisite         8059    133    0.4812  0.4812  1.0000  0.1504  0.6400  0.4812 
shift

In [46]:
for ep in range(10):
    !python eval_entities.py \
    -b $benchmark_dir \
    -pred $base_dir/data/$data_ac/intermediate/ee_cotraining_14.2_ep={ep}.csv \
    -o $base_dir/data/$data_ac/intermediate/ee_cotraining_14.2_ep={ep}_eval.csv \
    -rank joint_logits \
    -rev

--- Summary ---
Concept                  Max K   Gold K   P@K     R@K     P@20    R@20   P@100   R@100  
company                   8059    150    0.5467  0.5467  1.0000  0.1333  0.8200  0.5467 
dress_code                8057    261    0.4444  0.4444  1.0000  0.0766  1.0000  0.3831 
job_position              8057    145    0.0414  0.0414  0.1000  0.0138  0.0500  0.0345 
pay_schedule              8060     34    0.0294  0.0294  0.0500  0.0294  0.0100  0.0294 
benefits                  8058    110    0.3364  0.3364  1.0000  0.1818  0.3700  0.3364 
compensation              8060     43    0.0000  0.0000  0.0000  0.0000  0.0000  0.0000 
payment_option            8061     27    0.8148  0.8148  1.0000  0.7407  0.2200  0.8148 
background_screening      8061    102    0.2745  0.2745  1.0000  0.1961  0.2800  0.2745 
person                    8055     58    0.0690  0.0690  0.2000  0.0690  0.0800  0.1379 
hire_prerequisite         8059    133    0.3459  0.3459  1.0000  0.1504  0.4600  0.3459 
shift

--- Summary ---
Concept                  Max K   Gold K   P@K     R@K     P@20    R@20   P@100   R@100  
company                   8059    150    0.5067  0.5067  1.0000  0.1333  0.7600  0.5067 
dress_code                8057    261    0.0805  0.0805  1.0000  0.0766  0.2100  0.0805 
job_position              8057    145    0.1241  0.1241  0.0000  0.0000  0.0500  0.0345 
pay_schedule              8060     34    0.0294  0.0294  0.0500  0.0294  0.0100  0.0294 
benefits                  8058    110    0.2909  0.2909  1.0000  0.1818  0.3200  0.2909 
compensation              8060     43    0.0000  0.0000  0.0000  0.0000  0.0000  0.0000 
payment_option            8061     27    0.4444  0.4444  0.6000  0.4444  0.1200  0.4444 
background_screening      8061    102    0.1275  0.1275  0.6500  0.1275  0.1300  0.1275 
person                    8055     58    0.1034  0.1034  0.2000  0.0690  0.1300  0.2241 
hire_prerequisite         8059    133    0.1429  0.1429  0.9500  0.1429  0.1900  0.1429 
shift

## EE results visualization

In [296]:
def create_tidy_data_gold_k(filepath_dict):
    record_ls = []
    
    for method, f in filepath_dict.items():
        res_df = pd.read_csv(f)
        for d in res_df.to_dict('records'):
            record_ls.append({
                'method': method,
                'concept': d['concept'],
                'acc': d['R@K']
            })
    df = pd.DataFrame(record_ls)
    return df

def pivot_df(df_, measure_y, dimension_x, group):
    df_pivot = pd.pivot_table(
        df_,
        values=measure_y,
        index=dimension_x,
        columns=group,
        aggfunc=np.mean
    )
    return df_pivot

In [396]:
ee_eval_filepath_dicts = {
    'emb': f'{base_dir}/data/{data_ac}/intermediate/ee_concept_knn_k=None_eval.csv',
    'emb-contr': f'{base_dir}/data/{data_ac}/intermediate/ee_concept_contr_knn_k=None_eval.csv',
    'emb-contr-aux': f'{base_dir}/data/{data_ac}/intermediate/ee_concept_contr_knn-aux-k=None_eval.csv',
    'lm': f'{base_dir}/data/{data_ac}/intermediate/ee_LM_bert_k=None_eval.csv',
    'lm-contr': f'{base_dir}/data/{data_ac}/intermediate/ee_LM_bert_contr_k=None_eval.csv',
    'lm-contr-aux': f'{base_dir}/data/{data_ac}/intermediate/ee_LM_bert_contr-aux-k=None_eval.csv',
    'mrr-combine': f'{base_dir}/data/{data_ac}/intermediate/ee_mrr_combine_k=None_eval.csv',
    'mrr-combine-aux': f'{base_dir}/data/{data_ac}/intermediate/ee_mrr_combine_3-k=None_eval.csv',
}

_eval_df = create_tidy_data_gold_k(ee_eval_filepath_dicts)

In [397]:
_eval_df.head(3)

Unnamed: 0,method,concept,acc
0,emb,company,0.413333
1,emb,dress_code,0.295019
2,emb,job_position,0.22069


In [347]:
# easy = ['company','dress_code','benefits','job_position','employee_type']
# hard = ['background_screening', 'compensation', 'hire_prerequisite','onboarding_steps','payment_option','person','schedule','shifts']

# cc_groups = {
#     '1-easy': ['company', 'dress_code', 'job_position', 'benefits', 'payment_option', 'employee_type'],
#     '2-medium': ['hire_prerequisite', 'background_screening', 'shifts', 'pay_schedule', 'schedule'],
#     '3-hard': ['person', 'onboarding_steps', 'compensation']
# }

cc_groups = {
    '1-non-overlap': ['company', 'dress_code', 'job_position', 'payment_option', 'employee_type', 'onboarding_steps'],
    '2-overlap': ['hire_prerequisite', 'background_screening', 'person', 'benefits', 'compensation', 'shifts', 'pay_schedule', 'schedule']
}

cc2group = dict([(cc, g) for g, cc_list in cc_groups.items() for cc in cc_list])

In [348]:
cc2group

{'company': '1-non-overlap',
 'dress_code': '1-non-overlap',
 'job_position': '1-non-overlap',
 'payment_option': '1-non-overlap',
 'employee_type': '1-non-overlap',
 'onboarding_steps': '1-non-overlap',
 'hire_prerequisite': '2-overlap',
 'background_screening': '2-overlap',
 'person': '2-overlap',
 'benefits': '2-overlap',
 'compensation': '2-overlap',
 'shifts': '2-overlap',
 'pay_schedule': '2-overlap',
 'schedule': '2-overlap'}

In [398]:
_eval_df['group'] = _eval_df.apply(lambda row: cc2group[row['concept']], axis=1)
_eval_df_2 = _eval_df.copy()
_eval_df_2['group'] = '0-all'
_eval_df_a = pd.concat([_eval_df_2, _eval_df])
_eval_df_a.head(3)

Unnamed: 0,method,concept,acc,group
0,emb,company,0.413333,0-all
1,emb,dress_code,0.295019,0-all
2,emb,job_position,0.22069,0-all


In [None]:
# _df_plot = _eval_df_a[_eval_df_a.apply(lambda row: row['method'] != 'mrr-combine', axis=1)]
_df_plot = _eval_df_a
df_pivot_difficulty = pivot_df(_df_plot, "acc", "group", "method")
df_pivot_difficulty.plot(kind="bar", figsize=(8,6)).legend(loc='upper left',ncol=1,bbox_to_anchor=(1.05, 1))

## Dump (expanded) entities

In [110]:
ee_pred_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/ee_mrr_combine_k=None.csv')
ee_df = pd.read_csv(ee_pred_path)
ee_df.shape

(112322, 3)

In [None]:
concepts_df = load_seed_aligned_concepts(seed_aligned_concepts_path)
instances_dict = dict(zip(concepts_df['alignedCategoryName'].tolist(),
                      concepts_df['seedInstances'].tolist()))
instances_dict

In [114]:
K = 20
for cc, e_list in instances_dict.items():
    e_list = list(set(e_list + ee_df[ee_df['concept'] == cc]['neighbor'].tolist()[:K]))
    instances_dict[cc] = e_list
[(cc, len(e_list)) for cc, e_list in instances_dict.items()]

[('company', 23),
 ('dress_code', 25),
 ('job_position', 22),
 ('pay_schedule', 21),
 ('benefits', 23),
 ('compensation', 21),
 ('payment_option', 21),
 ('background_screening', 21),
 ('person', 25),
 ('hire_prerequisite', 22),
 ('shifts', 23),
 ('schedule', 25),
 ('employee_type', 20),
 ('onboarding_steps', 25)]

In [117]:
instances_dict_path = os.path.join(base_dir, f'data/{data_ac}/intermediate/expanded_mrr_combine_k=20.json')
with open(instances_dict_path, 'w') as f:
    json.dump(instances_dict, f, indent=4)

# Relation Extraction Baselines
Currently only for single relation. TODO: include all relations

## Null baseline - Cartesian product

In [164]:
# Use script 
!python relation_extraction_cartesian.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-o $base_dir/data/$data_ac/intermediate/rel_extraction-aux-EE=emb_contr-K=100-RE=Ct.csv \
-cknn $base_dir/data/$data_ac/intermediate/ee_concept_contr_knn_k=None.csv \
-topk 100

company 	 pay_schedule
company 	 dress_code
company 	 background_screening
company 	 benefits
company 	 person
company 	 compensation
company 	 hire_prerequisite
company 	 schedule
company 	 employee_type
company 	 onboarding_steps
company 	 shifts
company 	 job_position
company 	 payment_option
job_position 	 background_screening
job_position 	 hire_prerequisite
job_position 	 employee_type
job_position 	 onboarding_steps


In [249]:
# Use script 
!python relation_extraction_cartesian.py \
-d $base_dir/data/$data_ac/intermediate \
-b $base_dir/data/indeed-benchmark \
-o $base_dir/data/$data_ac/intermediate/rel_extraction-aux-EE=mrr_combine-K=20-RE=Ct.csv \
-cknn $base_dir/data/$data_ac/intermediate/ee_mrr_combine_k=None.csv \
-topk 20

company 	 pay_schedule
company 	 dress_code
company 	 background_screening
company 	 benefits
company 	 person
company 	 compensation
company 	 hire_prerequisite
company 	 schedule
company 	 employee_type
company 	 onboarding_steps
company 	 shifts
company 	 job_position
company 	 payment_option
job_position 	 background_screening
job_position 	 hire_prerequisite
job_position 	 employee_type
job_position 	 onboarding_steps


## Relation Extraction Evaluation

### has_dress_code

In [171]:
!python eval_relations.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-EE=LM_bert-RE=Ct.csv \
-r has_dress_code

--- RE Results ---
Benchmark relations: 107
Predicted relations: 10000
Intersection: 6
P = 0.0006, R = 0.0561, F1 = 0.0012

Intersection:
('subway', 'has_dress_code', 'piercings')
('walmart', 'has_dress_code', 'black jeans')
('walmart', 'has_dress_code', 'blue collar')
('walmart', 'has_dress_code', 'face tattoos')
('walmart', 'has_dress_code', 'jeans')
('walmart', 'has_dress_code', 'uniform')



In [170]:
!python eval_relations.py \
-b $base_dir/data/indeed-benchmark \
-pred $base_dir/data/$data_ac/intermediate/rel_extraction-has_dress_code-EE=LM_bert-RE=Ct+KV=0.9.csv \
-r has_dress_code

--- RE Results ---
Benchmark relations: 107
Predicted relations: 203
Intersection: 4
P = 0.0197, R = 0.0374, F1 = 0.0258

Intersection:
('subway', 'has_dress_code', 'piercings')
('walmart', 'has_dress_code', 'black jeans')
('walmart', 'has_dress_code', 'jeans')
('walmart', 'has_dress_code', 'uniform')



# Knowledge Verification baseline
(finding co-occurrences of head / tail from corpus)

## Knowledge Verification

In [165]:
# Use script 
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-in $base_dir/data/$data_ac/intermediate/rel_extraction-all-EE=mrr_combine-RE=Ct-MINI.csv \
-o_kv $base_dir/data/$data_ac/intermediate/kv_evidences-all-EE=mrr_combine-RE=Ct-MINI.json \
-o_re $base_dir/data/$data_ac/intermediate/rel_extraction-all-EE=mrr_combine-RE=Ct+KV=0.9-MINI.csv \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p_kv 0.7 \
-p_re 0.9 \
--fast_skip 3

Loading files...
Finding evidence for rels: 100%|████████████████| 68/68 [00:08<00:00,  8.16it/s]


In [None]:
# Use script 
!python knowledge_verification_entail.py \
-d $base_dir/data/$data_ac/intermediate \
-in $base_dir/data/$data_ac/intermediate/rel_extraction-aux-EE=mrr_combine-K=100-RE=Ct.csv \
-o_kv $base_dir/data/$data_ac/intermediate/kv_evidences-aux-EE=mrr_combine-K=100-RE=Ct.json \
-o_re $base_dir/data/$data_ac/intermediate/rel_extraction-aux-EE=mrr_combine-K=100-RE=Ct+KV=0.9.csv \
-r $yutong_base_dir/models/roberta-large \
-rs $yutong_base_dir/repos/Roberta_SES/checkpoints/epoch=2-valid_loss=-0.2620-valid_acc_end=0.9223.ckpt \
-p_kv 0.7 \
-p_re 0.9 \
--fast_skip 3

In [None]:
!grep "\"pharmacy\"" $base_dir/data/$data_ac/intermediate/kv_evidences-aux-EE=mrr_combine-RE=Ct.json

# Retrieve Prompt Evidence

In [109]:
# visit here: /meg-kb/src/analysis/lm_probing.ipynb

# Suggest Quality Prompts

# Discussions

In [249]:
# Discussions:
# coherence clustering / ensemble models?
# trying for other relations or entities
# using entities in sub-categories
# fine-tuning
# ambiguous samples (high for pos and neg)
# quantitative-evaluation

In [None]:
%tensorboard --logdir lightning_logs/version_8 --port 6008 --host 0.0.0.0

In [168]:
!kill 76798