In [16]:
import pandas as pd
from tabulate import tabulate

# Experimental Settings

**Datasets**: The three datasets share the same corpus and keyphrase list but differ in embeddings.
1. Indeeda-corel: Has corel embeddings. 
2. Indeeda-meg-ac: Has average sentence embeddings for each keyword (where the keyword has been masked in each sentence).
3. Indeeda-meg-pt: Has average token embeddings from the last 4 layers for each keyword (where the keyword has been masked in each sentence).

**Embedding parameters**:

Number of sentences sampled per keyword: 750

bert-model: /home/ubuntu/users/nikita/models/bert_finetuned_lm/indeed_reviews_ques_ans


**Clustering algorithms**:

KMeans: dim = 768 (3072 for Indeeda-meg-pt), clusters = 500, output_filename=kmeans.csv

KNN: dim = 768 (3072 for Indeeda-meg-pt), neighbors = 5, output_filename=knn.csv

Agglomerative clustering: dim = 768 (3072 for Indeeda-meg-pt), clusters = 500, output_filename=agg.csv

# KMeans

In [57]:
# Load the KMeans output (kmeans.csv) for the corel and meg-ac datasets.
# get_cluster_kmeans reads the `entity` and `clus_id` columns of these frames.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
corel_kmeans = pd.read_csv('/home/ubuntu/users/nikita/src/meg-kb/data/indeeda-corel/intermediate/kmeans.csv')
meg_ac_kmeans = pd.read_csv('/home/ubuntu/users/nikita/src/meg-kb/data/indeeda-meg-ac/intermediate/kmeans.csv')

In [27]:
def get_cluster_kmeans(query, dfs):
    """Print, side by side, the entities that share a cluster with ``query``.

    Parameters
    ----------
    query : str
        Value looked up in the ``entity`` column of every frame.
    dfs : dict
        Maps a column label (used as the table header) to a DataFrame that
        has ``entity`` and ``clus_id`` columns.

    For each frame, the ``clus_id`` of the first row matching ``query`` selects
    the cluster; every other entity in that cluster is listed. A frame that
    does not contain ``query`` contributes an empty column.

    Returns
    -------
    None
        The query line and the table are printed to stdout.
    """
    print('query: {}'.format(query))
    all_neighbors = {}
    for encoding, df in dfs.items():
        is_query = df['entity'] == query
        entities = []
        if is_query.any():
            clus_id = df.loc[is_query, 'clus_id'].iloc[0]
            in_cluster = df['clus_id'] == clus_id
            # Filter with a mask rather than list.remove(): this drops every
            # copy of the query (not just the first) and cannot raise
            # ValueError when the cluster selection is empty (e.g. NaN clus_id).
            entities = df.loc[in_cluster & ~is_query, 'entity'].tolist()
        all_neighbors[encoding] = entities
    print(tabulate(all_neighbors, headers='keys'))

In [28]:
get_cluster_kmeans('drug test', {"mask_vec": corel_kmeans, "avg_context_vec": meg_ac_kmeans})

query: drug test
mask_vec               avg_context_vec
---------------------  ---------------------------
flat rate              random tests
gallon                 back ground checks
permit                 back ground check
livable wage           testes
retirement plan        dollar generals
class b                urine tests
shift differential     convicted felons
hand book              new hires
speeding ticket        dod
half pay               pregnant women
swab test              maintenance workers
pat test               assistance program
salaried position      random testing
temporary job          illegal drugs
strict dress code      severance pay
driver position        employment
yearly raise           sex change
language barrier       swab test
paying job             bi lingual
seasonal position      previous employers
higher salary          background checks
job fair               extensive background check
15 minute break        bluefield
perm                   nation wide

In [29]:
get_cluster_kmeans('dress code', {"mask_vec": corel_kmeans, "avg_context_vec": meg_ac_kmeans})

query: dress code
mask_vec                     avg_context_vec
---------------------------  -----------------
age requirements             turtle
starting salary              car hops
rehire policy                proper uniform
parade                       hawaiian
barometer                    school uniform
starting wage                strict dress code
youngest age                 jumpsuit
payscale                     plain dress
commission structure         dress clothes
exact amount                 parrot
hourly rate                  watch
age limit                    cap
hourly wage                  slate
legal age                    casual clothing
pay scale                    conservative
company 's policy            headphones
base pay                     sports bra
pay grade                    casual wear
range                        casual clothes
salary range                 covered
curriculum                   dress casual
kicker                       cover
minimum age     

In [30]:
# NOTE(review): the mask_vec column printed below repeats the listing shown for
# 'dress code', so with the corel embeddings both queries appear to fall in the
# same KMeans cluster. Confirm against the full (untruncated) output.
get_cluster_kmeans('hiring age', {"mask_vec": corel_kmeans, "avg_context_vec": meg_ac_kmeans})

query: hiring age
mask_vec                     avg_context_vec
---------------------------  -------------------------
age requirements             baylor
starting salary              youngest age
rehire policy                baling
parade                       age range
barometer                    legal working age
starting wage                harlingen
youngest age                 ols
payscale                     legal age
commission structure         retirement age
exact amount                 civil service examination
hourly rate                  age limit
age limit                    workers permit
hourly wage                  minimal age
legal age                    brazil
pay scale                    alcoholic beverages
company 's policy            sixteen years old
base pay                     17 year olds
pay grade                    brandenburg kentucky
range                        policy states
salary range                 minimum age
curriculum                   mascot
kick

In [54]:
get_cluster_kmeans('dental benefits', {"mask_vec": corel_kmeans, "avg_context_vec": meg_ac_kmeans})

query: dental benefits
mask_vec                   avg_context_vec
-------------------------  -------------------------
lifelong learning          domestic partners
summer camps               kaiser permanente
monongalia county          thrift savings plan
401 k                      vision insurance
nascar                     civil unions
dress nice                 medicaid
heat stroke                medical insurance
chemo                      health
ad&d                       retirement plan
grt                        health benefits
adoption                   disability insurance
medical                    sodexo
hmo                        hmo
health benefits            sick leave
disability insurance       medicare
dental and vision          excellent benefits
intellectual disabilities  aflac
automobile                 health dental
dental insurance           cigna
health                     ft employees
recreational drug          health insurance
military police            atm card

In [50]:
get_cluster_kmeans('company', {"mask_vec": corel_kmeans, "avg_context_vec": meg_ac_kmeans})

query: company
mask_vec             avg_context_vec
-------------------  ----------------------------
department store     economy
dinning room         technology
king                 aetna
department           fortune 500
plant                excellent company
commissary           food industry
aisle                consumer
college town         australia
restaurant           hotel
beach                retail
federal records      advertising
warehouse            congress
met team             population
caf                  major
university           construction
town                 supermarket
fitting room         age groups
factory              elementary
manor                christian
drive thru           postal service
pressure cooker      licensed
meat department      independent
cash register        usps
career website       china
higher level         growing company
community            wide range
revolution           amazons
bakery               investment
center               

# KNN

In [59]:
# Load the KNN output (knn.csv) for all three datasets.
# get_cluster_knn reads the `entity` and `neighbor` columns of these frames.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
corel_knn = pd.read_csv('/home/ubuntu/users/nikita/src/meg-kb/data/indeeda-corel/intermediate/knn.csv')
meg_ac_knn = pd.read_csv('/home/ubuntu/users/nikita/src/meg-kb/data/indeeda-meg-ac/intermediate/knn.csv')
meg_pt_knn = pd.read_csv('/home/ubuntu/users/nikita/src/meg-kb/data/indeeda-meg-pt/intermediate/knn.csv')

In [42]:
def get_cluster_knn(query, dfs):
    """Print a table of the stored neighbours of ``query``, one column per frame.

    ``dfs`` maps a column label to a DataFrame with ``entity`` and ``neighbor``
    columns. Every row whose ``entity`` equals ``query`` contributes its
    ``neighbor`` value; a frame without such rows yields an empty column.
    Output goes to stdout and nothing is returned.
    """
    print('query: {}'.format(query))
    neighbors_by_label = {}
    for label, frame in dfs.items():
        hits = frame[frame['entity'] == query]
        # Only touch the neighbor column when at least one row matched.
        neighbors_by_label[label] = hits['neighbor'].tolist() if len(hits) > 0 else []
    print(tabulate(neighbors_by_label, headers='keys'))

In [60]:
get_cluster_knn('drug test', {"mask_vec": corel_knn, "avg_context_vec": meg_ac_knn, "avg_concat_mask_vec": meg_pt_knn})

query: drug test
mask_vec           avg_context_vec    avg_concat_mask_vec
-----------------  -----------------  ---------------------
back ground check  background check   back ground check
background check   back ground check  background check
random drug test   test               urine test
credit check       urine test         backround check
backround check    credit check       test


In [61]:
get_cluster_knn('dress code', {"mask_vec": corel_knn, "avg_context_vec": meg_ac_knn, "avg_concat_mask_vec": meg_pt_knn})

query: dress code
mask_vec             avg_context_vec    avg_concat_mask_vec
-------------------  -----------------  ---------------------
pay scale            uniform policy     uniform policy
uniform policy       casual wear        pay scale
pay rate             clothing           attendance policy
work / life balance  professional       culture
attendance policy    strict dress code  dress attire


In [62]:
get_cluster_knn('hiring age', {"mask_vec": corel_knn, "avg_context_vec": meg_ac_knn, "avg_concat_mask_vec": meg_pt_knn})

query: hiring age
mask_vec           avg_context_vec    avg_concat_mask_vec
-----------------  -----------------  ---------------------
age requirement    age requirement    age requirement
minimum age        minimum age        age range
age range          youngest age       minimum age
legal working age  minimal age        age limit
youngest age       legal working age  youngest age


In [63]:
get_cluster_knn('dental benefits', {"mask_vec": corel_knn, "avg_context_vec": meg_ac_knn, "avg_concat_mask_vec": meg_pt_knn})

query: dental benefits
mask_vec          avg_context_vec    avg_concat_mask_vec
----------------  -----------------  ---------------------
dental insurance  dental insurance   dental insurance
vision insurance  life insurance     life insurance
life insurance    medical            vision insurance
healthcare        health             dental vision
education         health insurance   paid vacations


In [64]:
get_cluster_knn('company', {"mask_vec": corel_knn, "avg_context_vec": meg_ac_knn, "avg_concat_mask_vec": meg_pt_knn})

query: company
mask_vec        avg_context_vec    avg_concat_mask_vec
--------------  -----------------  ---------------------
corporation     usps               post office
organization    publix             organization
post office     walgreens          postal service
usps            government         usps
postal service  private            corporation
                starbucks


# Agglomerative Clustering

In [46]:
# Load the agglomerative-clustering output (agg.csv).
# The corel load is left commented out, so only the meg-ac frame is available below.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
# corel_agg = pd.read_csv('/home/ubuntu/users/nikita/src/meg-kb/data/indeeda-corel/intermediate/agg.csv')
meg_ac_agg = pd.read_csv('/home/ubuntu/users/nikita/src/meg-kb/data/indeeda-meg-ac/intermediate/agg.csv')

In [47]:
def get_cluster_agg(query, dfs):
    """Print the other members of the cluster that contains ``query``.

    Parameters
    ----------
    query : str
        Value looked up in the ``entity`` column of every frame.
    dfs : dict
        Maps a column label (used as the table header) to a DataFrame that
        has ``entity`` and ``clus_id`` columns.

    The cluster is chosen by the ``clus_id`` of the first row whose ``entity``
    equals ``query``. Frames that do not contain ``query`` produce an empty
    column. Prints the query line and the table; returns None.
    """
    # NOTE(review): the lookup logic is the same as get_cluster_kmeans;
    # consider consolidating the two helpers.
    print('query: {}'.format(query))
    all_neighbors = {}
    for encoding, df in dfs.items():
        query_rows = df['entity'] == query
        if query_rows.any():
            clus_id = df.loc[query_rows, 'clus_id'].iloc[0]
            cluster_rows = df['clus_id'] == clus_id
            # Exclude the query with a mask instead of list.remove(): removes
            # all duplicates of the query and never raises ValueError when the
            # cluster selection comes back empty (e.g. a NaN clus_id).
            all_neighbors[encoding] = df.loc[cluster_rows & ~query_rows, 'entity'].tolist()
        else:
            all_neighbors[encoding] = []
    print(tabulate(all_neighbors, headers='keys'))

In [48]:
get_cluster_agg('drug test', {"avg_context_vec": meg_ac_agg})

query: drug test
avg_context_vec
------------------------------------
arrest record
tick
punks
classified information
minuet
wall street
unicorn
south florida
wolf
sustainable
terminal illness
troutdale
exile
pergola
lunar
grindstone
enron
red jacket
baptism
at home advisor
roller skates
eos
landscape design
hearsay
fruits
bonus structure
sexism
dod
macon ga
covina
hyderabad india
plymouth meeting
shield
little ceasars
someone elses
acronyms
in store shopper
buddy / buddy
hot topic
native
henry ford
pay rent
north american english
chatsworth
podiatrist
el segundo
nice shirt
pear
button up
labor camps
septa
red light
carpool
cta
treadmill
warner cable
owl
bay city
ants
tendonitis
two face
lifting heavy
workstation
cheaters
humidity
mass hysteria
phone operator
butter
turtle
making rate
tentacles
super fast
hydraulic fluid
bipolar disorder
self love
lansing
reverse discrimination
mezzanine
autism
angola
pts
training courses
lille
apopka
social networking
rhetoric
autism spectrum
cancerou

In [49]:
# NOTE(review): the listing printed below matches the 'drug test' output line for
# line, so both queries appear to sit in one very large agglomerative cluster.
# Confirm the cluster-size distribution in agg.csv.
get_cluster_agg('dress code', {"avg_context_vec": meg_ac_agg})

query: dress code
avg_context_vec
------------------------------------
arrest record
tick
punks
classified information
minuet
wall street
unicorn
south florida
wolf
sustainable
terminal illness
troutdale
exile
pergola
lunar
grindstone
enron
red jacket
baptism
at home advisor
roller skates
eos
landscape design
hearsay
fruits
bonus structure
sexism
dod
macon ga
covina
hyderabad india
plymouth meeting
shield
little ceasars
someone elses
acronyms
in store shopper
buddy / buddy
hot topic
native
henry ford
pay rent
north american english
chatsworth
podiatrist
el segundo
nice shirt
pear
button up
labor camps
septa
red light
carpool
cta
treadmill
warner cable
owl
bay city
ants
tendonitis
two face
lifting heavy
workstation
cheaters
humidity
mass hysteria
phone operator
butter
turtle
making rate
tentacles
super fast
hydraulic fluid
bipolar disorder
self love
lansing
reverse discrimination
mezzanine
autism
angola
pts
training courses
lille
apopka
social networking
rhetoric
autism spectrum
cancero