# Corpus Processing

In [1]:
import pandas as pd
import json
from tqdm import tqdm
tqdm.pandas()
import ast

  from pandas import Panel


In [5]:
# directory with corpus, one sentence per line
data_ac="data/jobs"

# Keyword Extraction

In [3]:
# src directory
kb_base_dir="/mnt/efs/shared/meg-kb"

In [4]:
%cd $kb_base_dir/src/keyword_extraction/

[Errno 2] No such file or directory: '/mnt/efs/shared/meg-kb/src/keyword_extraction/'
/mnt/efs/shared/meg_shared_scripts/knowledge-hub


In [None]:
!chmod +x ./corpusProcess.sh

In [None]:
thread = 8
!./corpusProcess.sh $data_ac $thread

## Generate Embeddings

In [None]:
%cd $kb_base_dir/src/concept_learning/

In [None]:
!CUDA_VISIBLE_DEVICES=3 python compute_keyphrase_embeddings.py -m bert-base-uncased -et ac -d $data_ac/intermediate -c 750

# Define Taxonomy

In [6]:
concepts = [
    {
        "alignedCategoryName": "programming_language",
        "unalignedCategoryName": "programming language",
        "generalizations": None,
        "seedInstances": ["python", "sql", "java", "html", "perl", "javascript", "php"]
    },
    {
        "alignedCategoryName": "technology",
        "unalignedCategoryName": "technology",
        "generalizations": None,
        "seedInstances": ["distributed systems", "load balancing", "network monitoring", "data structures", "virtualization technologies", "search engine optimization", "network communications", "version control"]
    },
    {
        "alignedCategoryName": "database_type",
        "unalignedCategoryName": "database type",
        "generalizations": None,
        "seedInstances": ["nosql databases", "oracle", "microsoft sql server", "mongo db", "mysql databases"]
    },
    {
        "alignedCategoryName": "software_development_type",
        "unalignedCategoryName": "software development type",
        "generalizations": None,
        "seedInstances": ["fullstack", "server side", "game development", "enterprise architecture", "mobile application development"]
    },
    {
        "alignedCategoryName": "system_type",
        "unalignedCategoryName": "system type",
        "generalizations": None,
        "seedInstances": ["linux", "android", "unix", "windows"]
    },
    {
        "alignedCategoryName": "job_position",
        "unalignedCategoryName": "job position",
        "generalizations": None,
        "seedInstances": ["architect", "software engineer", "senior software engineer", "junior level", "entry level"]
    },
        {
        "alignedCategoryName": "development_tool",
        "unalignedCategoryName": "development tool",
        "generalizations": None,
        "seedInstances": ["github", "maven", "eclipse", "jenkins", "cuda", "visual studio", "svn"]
    },
]

seed_df = pd.DataFrame(concepts)

In [None]:
# save taxonomy to csv
import os
benchmark_dir = "data/jobs"
benchmark_filename = 'seed_aligned_concepts.csv'
seed_df.to_csv(os.path.join(benchmark_dir, benchmark_filename), index=None)

# Expand Taxonomy

In [None]:
# add embeddings for any seed instances that have not been already computed
%cd /mnt/efs/shared/meg_shared_scripts/meg-kb/src/concept_learning/
!CUDA_VISIBLE_DEVICES=3 python add_seed_instances_embeddings.py -m bert-base-uncased -et ac -d $data_ac/intermediate -b $benchmark_dir -c 750

In [None]:
from tqdm.notebook import tqdm
import argparse
import re
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, entropy, gmean
import random
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import json
from collections import defaultdict, Counter
import time
import importlib

import logging
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os
import sys
import math
from annoy import AnnoyIndex
import matplotlib
from matplotlib import pyplot as plt
import networkx as nx

import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
spacy_tokenizer = nlp.tokenizer

nlp_full = spacy.load('en_core_web_sm')

from compute_concept_clusters import knn
from compute_keyphrase_embeddings import ensure_tensor_on_device, mean_pooling

from lm_probes import LMProbe, LMProbe_GPT2, LMProbe_Joint, LMProbe_PMI, LMProbe_PMI_greedy
from utils import load_embeddings, load_seed_aligned_concepts, load_seed_aligned_relations, load_benchmark
from utils import load_EE_labels
from utils import get_masked_contexts, bert_untokenize
from utils import learn_patterns

from roberta_ses.interface import Roberta_SES_Entailment

In [None]:
import utils
importlib.reload(utils)
from utils import load_embeddings, load_seed_aligned_concepts, load_seed_aligned_relations, load_benchmark
from utils import load_EE_labels
from utils import get_masked_contexts, bert_untokenize
from utils import learn_patterns

import lm_probes
importlib.reload(lm_probes)
from lm_probes import LMProbe, LMProbe_GPT2, LMProbe_Joint, LMProbe_PMI, LMProbe_PMI_greedy

In [None]:
bert_emb_path = os.path.join(data_ac, 'intermediate/BERTembed+seeds.txt')
if not os.path.exists(bert_emb_path):
    bert_emb_path = os.path.join(data_ac, 'intermediate/BERTembed.txt')
embeddings = load_embeddings(bert_emb_path, 768)
len(embeddings)

## Expand Seed Entities (clustering)
### Seed instances clustering (EE-emb)
(using all seed instances of a concept to find neighbors)

In [None]:
cluster_size = None

!python compute_concept_seeds_knn.py \
-d $data_ac/intermediate \
-e $data_ac/intermediate/BERTembed+seeds.txt \
-b $benchmark_dir \
-o $data_ac/intermediate/ee_concept_knn_k=None.csv \
-kdt

In [None]:
concept_knn_path = os.path.join(data_ac, 'intermediate/ee_concept_knn_k=None.csv')
df = pd.read_csv(concept_knn_path)
df[df['concept'] == 'programming_language'].head(10)

### EE-LM-probe (prompt)


In [None]:
lm_probe = LMProbe(model_name='bert-base-uncased')

In [None]:
!python compute_EE_LM_probe.py \
-d $data_ac/intermediate \
-b $benchmark_dir \
-e $data_ac/intermediate/BERTembed+seeds.txt \
-lm bert \
-o $data_ac/intermediate/ee_LM_bert_k=None.csv

### MRR

In [None]:
ee_emb_path = os.path.join(data_ac, 'intermediate/ee_concept_knn_k=None.csv')
ee_LM_path = os.path.join(data_ac, 'intermediate/ee_LM_bert_k=None.csv')
ee_LM_df = pd.read_csv(ee_LM_path)
ee_emb_df = pd.read_csv(ee_emb_path)
concept_list = ee_LM_df['concept'].drop_duplicates().tolist()

## Using MRR to combine ranking 

ee_mrr_combine_list = []

for _cc in sorted(concept_list):
    _ce_df = ee_emb_df[ee_emb_df['concept'] == _cc].sort_values(by='sim', ascending=False)
    _ee_emb_list = _ce_df['neighbor'].tolist()
    _ee_LM_list = ee_LM_df[ee_LM_df['concept'] == _cc]['neighbor'].tolist()
        
    _all_entities_mrr = defaultdict(float)
    for i, _e in enumerate(_ee_emb_list):
        _all_entities_mrr[_e] += 1.0 / (i+1)
    for i, _e in enumerate(_ee_LM_list):
        _all_entities_mrr[_e] += 1.0 / (i+1)

    _all_entities_mrr_list = sorted(list(_all_entities_mrr.items()), key=lambda p: p[-1], reverse=True)
    
    for _e, _mrr in _all_entities_mrr_list:
        ee_mrr_combine_list.append((_cc, _e, _mrr))

len(ee_mrr_combine_list)

In [None]:
ee_mrr_combine_path = os.path.join(data_ac, 'intermediate/ee_mrr_combine_bert_k=None.csv')
df = pd.DataFrame(ee_mrr_combine_list, columns=['concept', 'neighbor', 'MRR'])
df = df.merge(ee_LM_df, how='left', on=['concept', 'neighbor'])
df = df.merge(ee_emb_df, how='left', on=['concept', 'neighbor'])
df.to_csv(ee_mrr_combine_path, index=None)
df = pd.read_csv(ee_mrr_combine_path)
mrr = df.groupby('concept').head(200)
mrr.to_csv(os.path.join(data_ac, 'intermediate/ee_mrr_combine_bert_k=200.csv'), index=None)

In [None]:
for concept, grp in mrr.groupby('concept'):
    print(concept)
    print(grp['neighbor'].tolist()[:20])
    print()
    print()

In [None]:
lm_preds = []
for concept, grp in ee_LM_df.groupby('concept'):
        grp = grp.reset_index()
        grp = grp.sort_values(by='lm_score', ascending=False).head(200)
        lm_preds.append(grp)
lm_preds = pd.concat(lm_preds)
lm_preds.to_csv(os.path.join(data_ac, 'intermediate/ee_lm_bert_k=200.csv'), index=None)
lm_preds

for concept, grp in lm_preds.groupby('concept'):
    print(concept)
    print(grp['neighbor'].tolist()[:20])
    print()
    print()

In [None]:
emb_preds = []
for concept, grp in ee_emb_df.groupby('concept'):
        grp = grp.reset_index()
        grp = grp.sort_values(by='sim', ascending=False).head(200)
        emb_preds.append(grp)
emb_preds = pd.concat(emb_preds)
emb_preds.to_csv(os.path.join(data_ac, 'intermediate/ee_emb_bert_k=200.csv'), index=None)
emb_preds

for concept, grp in emb_preds.groupby('concept'):
    print(concept)
    print(grp['neighbor'].tolist()[:20])
    print()
    print()