# Imports

In [75]:
import pandas as pd
from tabulate import tabulate

# Experimental Settings

**Datasets**: Yelp reviews (yelp), Indeed answers (indeed), Amazon Electronics (electronics), Amazon Groceries (grocery)

**Embeddings**: Each dataset has three different embeddings: mask_vec, avg_context_vec, avg_concat_mask_vec. These embeddings share the same corpus and keyphrase list. For example, for _indeeda_
1. **mask_vec**: Has corel embeddings. dataset -> Indeeda-corel. 
2. **avg_context_vec**: Has average sentence embeddings for keywords (where keyword has been masked in a sentence). dataset -> Indeeda-meg-ac
3. **avg_concat_mask_vec**: Has average tokenized embeddings from last 4 layers for keywords (where keyword has been masked in a sentence). dataset -> Indeeda-meg-pt

**Embedding parameters**:

no. of sentences sampled per keyword: 750

bert-model: 
/home/ubuntu/users/nikita/models/bert_finetuned_lm/indeed_reviews_ques_ans
or,
bert-base-uncased


**Clustering algorithms**:

KMeans: dim = 768 (3072 for avg_concat_mask_vec), cluster size = {800, 100}, output_filename=kmeans_{size}.csv

KNN: dim = 768 (3072 for avg_concat_mask_vec), neighbors = {5, 10, 15, 20, 25}, output_filename=knn_{neighbors}.csv

# KMeans

In [10]:
def tabulate_results_kmeans(dataset, query, size):
    print('query: {}, cluster size: {}'.format(query, size))
    corel_kmeans = pd.read_csv('../../data/'+dataset+'-corel/intermediate/kmeans_'+str(size)+'.csv')
    meg_ac_kmeans = pd.read_csv('../../data/'+dataset+'-meg-ac/intermediate/kmeans_'+str(size)+'.csv')
    meg_pt_kmeans = pd.read_csv('../../data/'+dataset+'-meg-pt/intermediate/kmeans_'+str(size)+'.csv')
    get_cluster_kmeans(query, {"mask_vec": corel_kmeans, "avg_context_vec": meg_ac_kmeans, "avg_concat_mask_vec": meg_pt_kmeans})

In [241]:
def get_cluster_kmeans(query, dfs):
    all_neighbors = {}
    for encoding, df in dfs.items():
        entities = []
        match = df[df['entity'] == query]
        if len(match) > 0:
            clus_id = match.iloc[0]['clus_id']
            cluster = df[df['clus_id'] == clus_id]
            entities = cluster['entity'].tolist()
            entities.remove(query)
        all_neighbors[encoding] = entities
    print(tabulate(all_neighbors, headers='keys'))

## Indeed Answers

In [25]:
dataset = 'indeeda'

### Drug Test

In [26]:
query = 'drug test'

In [246]:
tabulate_results_kmeans(dataset, query, 800)

query: drug test, cluster size: 800
mask_vec                   avg_context_vec             avg_concat_mask_vec
-------------------------  --------------------------  -------------------------
ua                         desecration                 driving test
rotating shift             sex change                  onsite interview
cafeteria                  school district             urine test
sales rep                  criminal background check   473 exam
backround check            final paycheck              teller position
fork lift                  tps                         role play
mystery                    review eligible             math test
half pay                   employment agency           personality test
medical emergency          truck driver                appeal
criminal background check  review ready                puzzle
bee                        sine                        ua
temp service               airline industry            assessment test
dictatorship

In [247]:
tabulate_results_kmeans(dataset, query, 1000)

query: drug test, cluster size: 1000
mask_vec                     avg_context_vec        avg_concat_mask_vec
---------------------------  ---------------------  ------------------------
test                         tps                    dot physical
backround check              test                   performance review
saliva                       5 panel                math test
white paper                  pensacola florida      bg check
commission structure         rectal                 back ground check
retirement plan              finger prints          back round check
hand book                    urination              online application
swab test                    cheek swab             annual review
lunch break                  oral drug test         final interview
uniform policy               oral swab              probation period
cafeteria                    urine test             assessment test
fingerprint                  urine                  grace period
finger pr

### Dress Code

In [248]:
query = 'dress code'

In [249]:
tabulate_results_kmeans(dataset, query, 800)

query: dress code, cluster size: 800
mask_vec                     avg_context_vec        avg_concat_mask_vec
---------------------------  ---------------------  ---------------------------
typing speed                 proper uniform         typing speed
initial training             business casual        kinks
performance review           comfortable clothing   quarterly review
assessment test              suspenders             constitution
exact amount                 tunic                  rehire process
90 day probation period      professional           uniform policy
dress attire                 casual clothes         catapult
goldfish                     dress clothes          maximum age
pat test                     casual attire          open door policy
3rd interview                parrot                 customer base
pension plan                 leopard                solution
overnight shift              plain dress            bureaucracy
friday / saturday            jumpsu

In [250]:
tabulate_results_kmeans(dataset, query, 1000)

query: dress code, cluster size: 1000
mask_vec              avg_context_vec      avg_concat_mask_vec
--------------------  -------------------  ---------------------------
art                   dress clothes        uniform policy
main reason           cap                  exact date
sake                  cart attendants      sales goals
adjectives            deer valley          employee assistance program
adventure             watch                nature
golden rule           hard hat             company policy
attendance policy     uniform policy       pay rate
shopping experience   caps                 working conditions
organization          horace               hourly wages
culture               sps                  hourly rate
actress               car hops             legislation
gravity               uniform shirts       producer
economy               personal appearance  vacation policy
worst place           dress casual         minimum wages
brand extension       cover       

### Hiring Age

In [251]:
query = 'hiring age'

In [252]:
tabulate_results_kmeans(dataset, query, 800)

query: hiring age, cluster size: 800
mask_vec             avg_context_vec       avg_concat_mask_vec
-------------------  --------------------  ---------------------
minimum age          ova                   starting pay
minimal age          minimum age           legal working age
density              youngest age          starting wage
age range            deptford              parade
youngest age         minimal age           minimum age
legal age            age requirement       starting salary
age requirement      17 year olds          decimal
legal working age    hazardous material    starting rate
succession planning  baling                age requirement
                     age limit             starting wages
                     years old             youngest age
                     age range             starting point
                     legal working age     maximum wage
                     legal age             minimal age
                     yrs old               age 

In [253]:
tabulate_results_kmeans(dataset, query, 1000)

query: hiring age, cluster size: 1000
mask_vec             avg_context_vec       avg_concat_mask_vec
-------------------  --------------------  ---------------------
youngest age         years old             minimal age
wild card            brandenburg kentucky  age requirement
rehire process       age requirement       age range
legal age            age limit             legal age
age limit            youngest age          youngest age
age range            minimal age           legal working age
legal working age    legal age             age limit
moreno valley        age range             minimum age
minimum age          legal working age
age requirement      mascot
succession planning  ova
kicker               retirement age
minimal age          minimum age
maximum age          deptford
office lady          shelton washington


### Dental Benefits

In [254]:
query = 'dental benefits'

In [255]:
tabulate_results_kmeans(dataset, query, 800)

query: dental benefits, cluster size: 800
mask_vec                  avg_context_vec            avg_concat_mask_vec
------------------------  -------------------------  ---------------------
adp                       commuter                   carpool
credit history            dental insurance           dental insurance
excellent service         ad&d                       dental / vision
previous experience       domestic partners          mining industry
aflac                     blue shield                psychiatrist
special needs             life insurance             disability insurance
food stamps               united healthcare          savings accounts
managerial positions      blue cross blue shield     401 k
transport                 health care insurance      surrogacy
medical marijuana         dentist                    vision insurance
state law                 profit sharing             stock market
great customer service    dental and vision          dental vision
finger

In [256]:
tabulate_results_kmeans(dataset, query, 1000)

query: dental benefits, cluster size: 1000
mask_vec                     avg_context_vec              avg_concat_mask_vec
---------------------------  ---------------------------  ---------------------
cross training               obama                        thrift savings plan
quality service              arizona state                401 k
design                       blue cross blue shield       ad&d
customer experience          telecommuting                disability insurance
customer service             benefits package             life insurance
customer relations           lifetime                     psychological testing
talent management            ivf                          metlife
medical dental               domestic partners            aflac
basic math                   tuition reimbursement        savings accounts
team building                maternity leave              welding
customer satisfaction        healthcare                   federal records
genesis          

### Company

In [257]:
query = 'company'

In [258]:
tabulate_results_kmeans(dataset, query, 800)

query: company, cluster size: 800
mask_vec                      avg_context_vec               avg_concat_mask_vec
----------------------------  ----------------------------  ----------------------------
city                          door dash                     fair labor standards act
convenience store             jurisdiction                  erc
bones                         corporation                   washington post
department store              meijer                        post office
restaurant                    young age                     private sector
commercial banking            pepsico                       district managers
super center                  distribution centers          lgbt community
local store                   casino                        united states postal service
headquarters                  united states                 eeoc
county                        dc                            home office
channel                       g4s             

In [259]:
tabulate_results_kmeans(dataset, query, 1000)

query: company, cluster size: 1000
mask_vec              avg_context_vec               avg_concat_mask_vec
--------------------  ----------------------------  -------------------------
bakery                amazon fulfillment            middle class
restaurant            hiring practices              cia
commissary            independent                   fair labor standards act
department            aldi                          postal service
house                 insurance company             private sector
city                  mayor                         sweatshops
dressing room         advertising                   team dynamics
garden                capital                       federal government
pharmacy              co   workers                  bottom line
milford connecticut   private                       millennium
warehouse             trump                         stereo
rain                  doral                         economy
department store      amazons        

## Yelp Reviews

In [260]:
dataset = 'yelp'

### Food

In [261]:
query = 'food'

In [262]:
tabulate_results_kmeans(dataset, query, 800)

query: food, cluster size: 800
mask_vec                    avg_context_vec               avg_concat_mask_vec
--------------------------  ----------------------------  ---------------------
nasi kuning                 authentic korean food         wine selection
portion size                southern_comfort food         whiskey selection
concept                     so many choices               d cor
tap beer selection          local bakery                  draft_beer list
packaging                   shitty customer_service       outdoor setting
whiskey list                draft_beer selection          caesar_salad dressing
vocals                      new york bagels               turnover rate
graphic design              lebanese food                 dining environment
cigarette smell             whiskey selection             draft selection
typeface                    decent_sized portions         customer services
salsa music                 hard_working people           ordering syst

In [264]:
tabulate_results_kmeans(dataset, query, 1000)

query: food, cluster size: 1000
mask_vec                    avg_context_vec                  avg_concat_mask_vec
--------------------------  -------------------------------  ------------------------
ambient music               draft_beer list                  monolith
vantage point               waiter andrew                    pharmacy staff
vibes                       first class                      color and texture
view                        symbiotic                        ventilation system
user interface              exceptionally busy               weekend_brunch menu
wine list                   somewhat slow                    atmosphere
stunning view               vandal                           four star rating
window view                 octagon                          value proposition
garden centre               standing_room only               enema
soundproofing               overly attentive                 store layout
diner vibe                  prestige         

### Restaurant (C)

In [408]:
query = 'restaurant'

In [266]:
tabulate_results_kmeans(dataset, query, 800)

query: restaurant, cluster size: 800
mask_vec                 avg_context_vec           avg_concat_mask_vec
-----------------------  ------------------------  ----------------------------
gallery                  luxor                     mammal
wsm                      twilight                  independent business
newest location          beaten path               local dive_bar
shopping centre          gondola                   donut shop
chevron                  rio buffet                bakery
swap meet                palazzo hotel             brazilian steakhouse
meat dept                hell hole                 diner
arcade                   carnival buffet           local sports_bar
general store            music factory             steakhouse
mansion                  hotel                     british pub
vieux montr al           strip hotels              lichen
lobby casino             texas station             style diner
gate                     riviera                   et

In [267]:
tabulate_results_kmeans(dataset, query, 1000)

query: restaurant, cluster size: 1000
mask_vec              avg_context_vec                  avg_concat_mask_vec
--------------------  -------------------------------  ---------------------
grocer                completely different             bakery
college town          steak n shake                    mall
hogwarts              yacht                            bake shop
nightclub             dominoes                         nightclub
pawn shop             authentic_italian pizza          motel
museum                donut shop                       theater
coffee roaster        sushi samba                      movie theatre
retirement community  miyako                           casino resort
taqueria              hangover cure                    italian bistro
music festival        tehran                           casino hotel
cookhouse             hookah                           library
mansion               patchouli                        feline
hyatt hotel           hot_dog joi

### Atmosphere

In [268]:
query = 'atmosphere'

In [269]:
tabulate_results_kmeans(dataset, query, 800)

query: atmosphere, cluster size: 800
mask_vec                    avg_context_vec               avg_concat_mask_vec
--------------------------  ----------------------------  ---------------------
nasi kuning                 gastropub                     wine selection
portion size                quick and painless            whiskey selection
concept                     beautiful space               food
tap beer selection          healthy food choices          d cor
packaging                   decent wine_list              draft_beer list
whiskey list                authentic vietnamese_cuisine  outdoor setting
vocals                      cozy setting                  caesar_salad dressing
graphic design              enjoyable dining_experience   turnover rate
cigarette smell             neighbourhood pub             dining environment
typeface                    an adorable                   draft selection
salsa music                 infrastructure                customer services
op

In [270]:
tabulate_results_kmeans(dataset, query, 1000)

query: atmosphere, cluster size: 1000
mask_vec                    avg_context_vec               avg_concat_mask_vec
--------------------------  ----------------------------  ------------------------
ambient music               minimalism                    monolith
vantage point               free wifi                     pharmacy staff
vibes                       impeccable service            color and texture
view                        lovely ambience               ventilation system
user interface              fun environment               food
wine list                   family_oriented restaurant    weekend_brunch menu
food                        authentic korean food         four star rating
stunning view               simple fare                   value proposition
window view                 vintage decor                 enema
garden centre               unique dining_experience      store layout
soundproofing               stiff drinks                  spp
diner vibe         

### Service (C)

In [271]:
query = 'customer service'

In [272]:
tabulate_results_kmeans(dataset, query, 800)

query: customer service, cluster size: 800
mask_vec                avg_context_vec    avg_concat_mask_vec
----------------------  -----------------  ---------------------
order accuracy
vandalism
enlightenment
digestion
sentience
service recovery
rude behavior
wisdom
intoxication
health standards
sexual orientation
tenure
yelp ratings
moustache
sanitation
cruise control
social media marketing
unprofessionalism
problem solving
exploration
body language
poor attitudes
locality
parenting
jihad
rudeness
communication
breast feeding
programming
laundry service
ethics
friendliness
bad behavior
lack thereof
hygiene
muscle memory
honesty
quality assurance
market share
personal space
critical mass
higher standards
quality control
behavior
diligence
torque
grammar
employment
etiquette
judgement
leadership
customer loyalty
productivity
convenience
turnover
puerto ricans
giant order
economics
gross negligence
product placement
safety
philanthropy
sustainability
discrimination
mastication
employee 

In [273]:
tabulate_results_kmeans(dataset, query, 1000)

query: customer service, cluster size: 1000
mask_vec                   avg_context_vec    avg_concat_mask_vec
-------------------------  -----------------  ---------------------
plumbing
action
marketing
negligence
poor attitudes
productivity
foot traffic
order accuracy
success
vandalism
programming
background noise
convenience
availability
ganja
glitz
customer interaction
intercourse
reservation policy
communication
pacing
innovation
punctuation
sustainability
orientation
parenting
inflated prices
locality
drunk munchies
culture shock
crowds
revenue
cloning
leadership
citizenship
gentrification
laundry service
damage
cruise control
electricity
chrysler
exploration
dated decor
proof
costumer service
customer engagement
yelp ratings
testosterone
greatness
traffic
punishment
nostalgia
late night grub
anesthesia
animation
teamwork
breast feeding
negative space
crowd control
relaxation
profit
turnover
employment
smoke smell
high turnover
street cred
capacity
big business
lowest common deno

### Mexican

In [274]:
query = 'mexican'

In [275]:
tabulate_results_kmeans(dataset, query, 800)

query: mexican, cluster size: 800
mask_vec                avg_context_vec           avg_concat_mask_vec
----------------------  ------------------------  ----------------------
indian                  old fashioned             buddhist
palestinian             savoy                     customer_service 101
isan                    venezuelan                american fusion
deli style              chipotle                  indian
southern soul           fraser                    michelin
traditional japanese    ramen                     caribbean
mexican inspired        omnivore                  americas
tibetan                 oktoberfest               european
pakistani               korean                    americanized chinese
bodybuilding            asian                     industrial
uyghur                  grass                     americanized mexican
thai                    caribbean                 redneck
middle eastern          argentina                 polish
authentic jamai

In [276]:
tabulate_results_kmeans(dataset, query, 1000)

query: mexican, cluster size: 1000
mask_vec             avg_context_vec               avg_concat_mask_vec
-------------------  ----------------------------  ----------------------------
flange               stoner                        pakistan
bellwether           less pretentious              tailgating
southern american    argentine                     lebanese
scandinavian         superlative                   japanese korean
palestinian          galway                        real italian
central american     your typical                  laotian
iranian              usual fare                    piccadilly
balkan               oslo                          ukrainian
salvadoran           traditional irish             peasant
austrian             irish                         authentic hawaiian
slovenian            typical mexican restaurant    mexican american
northern             new age                       guyanese
ole                  british                       french
cana

### Dessert

In [277]:
query = 'dessert'

In [278]:
tabulate_results_kmeans(dataset, query, 800)

query: dessert, cluster size: 800
mask_vec                   avg_context_vec            avg_concat_mask_vec
-------------------------  -------------------------  ---------------------
ny style pizza             an arnold_palmer           turkish tea
lebanese cuisine           cabernet                   chocolate custard
south indian cuisine       honeysuckle                snow cream
eating healthy             soju                       yakiniku
italian american cuisine   jasmine                    barilla
cuisine                    190 octane                 chaat
fake meat                  latte                      aged beef
italian fare               chrysanthemum tea          bottom shelf
middle eastern cuisine     unlimited beer             biosphere
brewed beer                merlot                     polish dog
standard breakfast fare    an iced_americano          nestea
american fare              peach bellini              hitachi
junk                       old fashions      

In [279]:
tabulate_results_kmeans(dataset, query, 1000)

query: dessert, cluster size: 1000
mask_vec                     avg_context_vec            avg_concat_mask_vec
---------------------------  -------------------------  -----------------------------
red velvet cake              clan                       republic ramen
unlimited champagne          extra bonus                french_press coffee
tap water                    chilly day                 pistachio cake
cider                        pizza                      banana tempura
craft beer                   topos                      naan bread
shave ice                    henna                      fresh pressed_juice
secret recipe                old fashioneds             mushroom soup
sangria                      coffee                     tteokbokki
french pressed coffee        some tweaking              roasted green_tea
bubble milk tea              dogs                       butter chicken
bottomless coffee            beer                       bibimbap
cold pizza              

# KNN

In [80]:
def tabulate_results_knn(dataset, query, size, word=None):
    filename = ""
    if word is None:
        filename = 'knn_'+str(size)+'.csv'
    else:
        filename = 'knn_'+str(size)+'.'+str(word)+'.csv'

    print('query: {}, cluster size: {}'.format(query, size))
    corel_knn = pd.read_csv('../../data/'+dataset+'-corel/intermediate/'+filename)
    meg_ac_knn = pd.read_csv('../../data/'+dataset+'-meg-ac/intermediate/'+filename)
    meg_pt_knn = pd.read_csv('../../data/'+dataset+'-meg-pt/intermediate/'+filename)
    get_cluster_knn(query, {"mask_vec": corel_knn, "avg_context_vec": meg_ac_knn, "avg_concat_mask_vec": meg_pt_knn})

def tabulate_results_knn_sim(dataset, query, size, word=None):
    filename = ""
    if word is None:
        filename = 'knn_'+str(size)+'.csv'
    else:
        filename = 'knn_'+str(size)+'.'+str(word)+'.csv'

    print('query: {}, cluster size: {}'.format(query, size))
    corel_knn = pd.read_csv('../../data/'+dataset+'-corel/intermediate/'+filename)
    meg_ac_knn = pd.read_csv('../../data/'+dataset+'-meg-ac/intermediate/'+filename)
    meg_pt_knn = pd.read_csv('../../data/'+dataset+'-meg-pt/intermediate/'+filename)
    get_cluster_knn_sim(query, {"mask_vec": corel_knn, "avg_context_vec": meg_ac_knn, "avg_concat_mask_vec": meg_pt_knn})

def tabulate_results_knn_sim2(dataset, query, size, word=None):
    filename = ""
    if word is None:
        filename = 'knn_'+str(size)+'.csv'
    else:
        filename = 'knn_'+str(size)+'.'+str(word)+'.csv'

    print('query: {}, cluster size: {}'.format(query, size))
    corel_knn = pd.read_csv('../../data/'+dataset+'-corel/intermediate/'+filename)
    #meg_ac_knn = pd.read_csv('../../data/'+dataset+'-meg-ac/intermediate/'+filename)
    meg_pt_knn = pd.read_csv('../../data/'+dataset+'-meg-pt/intermediate/'+filename)
    get_cluster_knn_sim(query, {"mask_vec": corel_knn, "avg_concat_mask_vec": meg_pt_knn})

    
    
def tabulate_results_knn_meg(dataset, query, size):
    print('query: {}, cluster size: {}'.format(query, size))
    meg_ac_knn = pd.read_csv('../../data/'+dataset+'-meg-ac/intermediate/knn_'+str(size)+'.csv')
    meg_pt_knn = pd.read_csv('../../data/'+dataset+'-meg-pt/intermediate/knn_'+str(size)+'.csv')
    get_cluster_knn(query, {"avg_context_vec": meg_ac_knn, "avg_concat_mask_vec": meg_pt_knn})
    
def get_cluster_knn(query, dfs):
    all_neighbors = {}
    for encoding, df in dfs.items():
        entities = []
        match = df[df['entity'] == query]
        if len(match) > 0:
            entities = match['neighbor'].tolist()
        all_neighbors[encoding] = entities
    print(tabulate(all_neighbors, headers='keys'))
    
def get_cluster_knn_sim(query, dfs):
    all_neighbors = {}
    formatter = "{:.4f}"
    for encoding, df in dfs.items():
        entity_sim = []
        match = df[df['entity'] == query]
        if len(match) > 0:
            entities = match['neighbor'].tolist()
            sims = match['sim'].tolist()
            for i in range(len(entities)):
                entity_sim.append(entities[i]+"<=>"+formatter.format(sims[i]))
        all_neighbors[encoding] = entity_sim
    print(tabulate(all_neighbors, headers='keys'))
    df = pd.DataFrame(all_neighbors) 
    df.to_csv(query+'.csv', index=False) 

## Indeed Answers

In [27]:
dataset = 'indeeda'

### Drug Test

In [28]:
query = 'drug test'

In [61]:
tabulate_results_knn(dataset, query, 20)

query: drug test, cluster size: 20
mask_vec           avg_context_vec      avg_concat_mask_vec
-----------------  -------------------  ---------------------
back ground check  background check     back ground check
background check   back ground check    background check
random drug test   test                 urine test
credit check       urine test           backround check
backround check    credit check         test
swab test          back round check     random drug test
math test          drug tests           urinalysis
ua                 urine drug test      swab test
job offer          urinalysis           ua
back round check   random drug test     math test
typing test        backround check      saliva test
pension            ua                   back round check
group interview    swab test            credit check
lunch break        assessment test      assessment test
drug tests         pass                 urine drug test
401k plan          random drug testing  personality

In [62]:
tabulate_results_knn(dataset, query, 50, 1)

query: drug test, cluster size: 50
mask_vec                 avg_context_vec       avg_concat_mask_vec
-----------------------  --------------------  ---------------------
back ground check        background check      back ground check
background check         back ground check     background check
random drug test         test                  urine test
credit check             urine test            backround check
backround check          credit check          test
swab test                back round check      random drug test
math test                drug tests            urinalysis
ua                       urine drug test       swab test
job offer                urinalysis            ua
test                     random drug test      math test
back round check         backround check       saliva test
typing test              ua                    back round check
pension                  swab test             credit check
group interview          assessment test       assessment 

In [29]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: drug test, cluster size: 100
mask_vec                          avg_context_vec                  avg_concat_mask_vec
--------------------------------  -------------------------------  ---------------------------------
back ground check<=>0.9889        background check<=>0.9970        back ground check<=>0.9881
background check<=>0.9869         back ground check<=>0.9969       background check<=>0.9877
random drug test<=>0.9802         test<=>0.9954                    urine test<=>0.9820
credit check<=>0.9795             urine test<=>0.9943              backround check<=>0.9801
backround check<=>0.9789          credit check<=>0.9935            test<=>0.9800
swab test<=>0.9776                back round check<=>0.9930        random drug test<=>0.9798
math test<=>0.9749                drug tests<=>0.9927              urinalysis<=>0.9795
ua<=>0.9738                       urine drug test<=>0.9926         swab test<=>0.9783
job offer<=>0.9729                urinalysis<=>0.9922          

#### Urine Test

In [30]:
query = 'urine test'

In [64]:
tabulate_results_knn(dataset, query, 15)

query: urine test, cluster size: 15
mask_vec                  avg_context_vec    avg_concat_mask_vec
------------------------  -----------------  ---------------------
urine drug test           urine drug test    urine drug test
saliva test               swab test          saliva test
urinalysis                urinalysis         swab test
cheek swab                drug test          urinalysis
blood test                saliva test        back ground check
saliva drug test          back ground check  ua
standard urine test       test               drug test
weekly pay                background check   background check
better job opportunity    ua                 cheek swab
casual wear               cheek swab         group interview
formal wear               backround check    backround check
federal background check  back round check   personality test
lol                       pass               typing test
regular clothes           random drug test   back round check
flexible schedul

In [31]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: urine test, cluster size: 100
mask_vec                               avg_context_vec                      avg_concat_mask_vec
-------------------------------------  -----------------------------------  ----------------------------------
urine drug test<=>0.9916               urine drug test<=>0.9972             urine drug test<=>0.9916
saliva test<=>0.9886                   swab test<=>0.9962                   saliva test<=>0.9913
urinalysis<=>0.9881                    urinalysis<=>0.9952                  swab test<=>0.9906
cheek swab<=>0.9865                    drug test<=>0.9943                   urinalysis<=>0.9863
blood test<=>0.9813                    saliva test<=>0.9940                 back ground check<=>0.9827
saliva drug test<=>0.9807              back ground check<=>0.9939           ua<=>0.9820
standard urine test<=>0.9790           test<=>0.9936                        drug test<=>0.9820
weekly pay<=>0.9782                    background check<=>0.9931            backg

#### Oral Test

In [67]:
query = 'oral test'

In [68]:
tabulate_results_knn(dataset, query, 15)

query: oral test, cluster size: 15
mask_vec              avg_context_vec    avg_concat_mask_vec
--------------------  -----------------  ---------------------
customer services     urine test         urinalysis
common sense          swab test          swab test
drug test             back ground check  test
online application    drug test          urine test
direct deposit        test               drug test
back ground check     background check   saliva test
oral drug test        urine drug test    back ground check
criminal history      urinalysis         background check
aptitude test         ua                 oral drug test
sun                   saliva test        ad
dl                    pass               internship
urinalysis            hiring process     report
sales representative  back round check   ua
flexible schedule     math test          assessment test
internship            matter             training program


#### Swab Test

In [69]:
query = 'swab test'

In [70]:
tabulate_results_knn(dataset, query, 15)

query: swab test, cluster size: 15
mask_vec            avg_context_vec    avg_concat_mask_vec
------------------  -----------------  ---------------------
back ground check   urine test         saliva test
saliva test         saliva test        urine test
drug test           urinalysis         urinalysis
background check    urine drug test    cheek swab
backround check     cheek swab         drug test
random drug test    test               urine drug test
seasonal position   drug test          back ground check
urine drug test     back ground check  ua
ua                  ua                 background check
cotton swab         urine              test
group interview     background check   random drug test
temporary job       urine tests        back round check
credit check        pass               hair follicle test
hair follicle test  cotton swab        backround check
flexible schedule   oral test          group interview


#### background check

In [71]:
query = 'background check'

In [72]:
tabulate_results_knn(dataset, query, 15)

query: background check, cluster size: 15
mask_vec           avg_context_vec      avg_concat_mask_vec
-----------------  -------------------  ---------------------
back ground check  back ground check    back ground check
drug test          drug test            backround check
backround check    backround check      drug test
back round check   test                 back round check
job offer          back round check     urine test
group interview    assessment test      urinalysis
typing test        application process  test
test               urine test           bg check
swab test          urine drug test      swab test
phone interview    hiring process       saliva test
credit check       results              ua
2nd interview      urinalysis           backgroud check
bg check           credit check         2nd interview
ua                 job offer            job offer
job interview      driving record       assessment test


#### Very Large K

In [73]:
query = 'drug test'

In [74]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: drug test, cluster size: 100
mask_vec                          avg_context_vec                  avg_concat_mask_vec
--------------------------------  -------------------------------  ---------------------------------
back ground check<=>0.9889        background check<=>0.9970        back ground check<=>0.9881
background check<=>0.9869         back ground check<=>0.9969       background check<=>0.9877
random drug test<=>0.9802         test<=>0.9954                    urine test<=>0.9820
credit check<=>0.9795             urine test<=>0.9943              backround check<=>0.9801
backround check<=>0.9789          credit check<=>0.9935            test<=>0.9800
swab test<=>0.9776                back round check<=>0.9930        random drug test<=>0.9798
math test<=>0.9749                drug tests<=>0.9927              urinalysis<=>0.9795
ua<=>0.9738                       urine drug test<=>0.9926         swab test<=>0.9783
job offer<=>0.9729                urinalysis<=>0.9922          

In [75]:
tabulate_results_knn_sim(dataset, query, 500, 1)

query: drug test, cluster size: 500
mask_vec                              avg_context_vec                       avg_concat_mask_vec
------------------------------------  ------------------------------------  ------------------------------------
back ground check<=>0.9889            background check<=>0.9970             back ground check<=>0.9881
background check<=>0.9869             back ground check<=>0.9969            background check<=>0.9877
random drug test<=>0.9802             test<=>0.9954                         urine test<=>0.9820
credit check<=>0.9795                 urine test<=>0.9943                   backround check<=>0.9801
backround check<=>0.9789              credit check<=>0.9935                 test<=>0.9800
swab test<=>0.9776                    back round check<=>0.9930             random drug test<=>0.9798
math test<=>0.9749                    drug tests<=>0.9927                   urinalysis<=>0.9795
ua<=>0.9738                           urine drug test<=>0.9926   

In [76]:
tabulate_results_knn_sim(dataset, query, 1000, 1)

query: drug test, cluster size: 1000
mask_vec                              avg_context_vec                        avg_concat_mask_vec
------------------------------------  -------------------------------------  -------------------------------------
back ground check<=>0.9889            background check<=>0.9970              back ground check<=>0.9881
background check<=>0.9869             back ground check<=>0.9969             background check<=>0.9877
random drug test<=>0.9802             test<=>0.9954                          urine test<=>0.9820
credit check<=>0.9795                 urine test<=>0.9943                    backround check<=>0.9801
backround check<=>0.9789              credit check<=>0.9935                  test<=>0.9800
swab test<=>0.9776                    back round check<=>0.9930              random drug test<=>0.9798
math test<=>0.9749                    drug tests<=>0.9927                    urinalysis<=>0.9795
ua<=>0.9738                           urine drug test<

### Dress Code

In [32]:
query = 'dress code'

In [80]:
tabulate_results_knn(dataset, query, 20)

query: dress code, cluster size: 20
mask_vec              avg_context_vec    avg_concat_mask_vec
--------------------  -----------------  ---------------------
pay scale             uniform policy     uniform policy
uniform policy        casual wear        pay scale
pay rate              clothing           attendance policy
work / life balance   professional       culture
attendance policy     strict dress code  dress attire
pay structure         footwear           bonus structure
commission structure  dress codes        commission structure
point system          business casual    pay rate
dress attire          brand              training program
bonus structure       gym                open door policy
training program      cap                pay structure
work life balance     watch              point system
contract              casual dress       interview process
union                 caps               union
atmosphere            sport              dress codes
base salary       

In [81]:
tabulate_results_knn(dataset, query, 50, 1)

query: dress code, cluster size: 50
mask_vec                     avg_context_vec    avg_concat_mask_vec
---------------------------  -----------------  ---------------------------
pay scale                    uniform policy     uniform policy
uniform policy               casual wear        pay scale
pay rate                     clothing           attendance policy
culture                      professional       culture
work / life balance          strict dress code  dress attire
attendance policy            footwear           bonus structure
pay structure                dress codes        commission structure
commission structure         business casual    pay rate
point system                 brand              training program
dress attire                 gym                open door policy
bonus structure              cap                pay structure
training program             watch              point system
work life balance            casual dress       interview process
contrac

In [33]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: dress code, cluster size: 100
mask_vec                              avg_context_vec               avg_concat_mask_vec
------------------------------------  ----------------------------  ------------------------------------
pay scale<=>0.9752                    uniform policy<=>0.9876       uniform policy<=>0.9674
uniform policy<=>0.9729               casual wear<=>0.9876          pay scale<=>0.9665
pay rate<=>0.9721                     clothing<=>0.9873             attendance policy<=>0.9642
culture<=>0.9672                      professional<=>0.9864         culture<=>0.9641
work / life balance<=>0.9656          strict dress code<=>0.9859    dress attire<=>0.9612
attendance policy<=>0.9656            footwear<=>0.9834             bonus structure<=>0.9602
pay structure<=>0.9653                dress codes<=>0.9832          commission structure<=>0.9598
commission structure<=>0.9642         business casual<=>0.9830      pay rate<=>0.9593
point system<=>0.9636                 brand<

#### casual wear

In [82]:
query = 'casual wear'

In [83]:
tabulate_results_knn(dataset, query, 15)

query: casual wear, cluster size: 15
mask_vec                avg_context_vec    avg_concat_mask_vec
----------------------  -----------------  ---------------------
weekly pay              dress code         business casual
business casual         professional       casual clothing
casual attire           business casual    weekly pay
casual clothing         clothing           casual clothes
regular clothes         footwear           business professional
casual clothes          casual dress       casual attire
blue cross blue shield  casual clothing    footwear
lol                     proper attire      security
contact hr              light              common sense
ninety days             cover              3rd shift
formal wear             sports             dd
answer phones           sport              asset protection
dress casual            cap                management
weekly paychecks        gym                clothing
urinalysis              proper uniform     formal wear


#### footwear

In [84]:
query = 'footwear'

In [85]:
tabulate_results_knn(dataset, query, 15)

query: footwear, cluster size: 15
mask_vec          avg_context_vec    avg_concat_mask_vec
----------------  -----------------  ---------------------
clothing          clothing           clothing
cosmetics         professional       facial hair
customer service  dress code         casual wear
insurance         gym                sports
loss prevention   casual wear        casual clothing
asset protection  sports             security
facial hair       business casual    scrubs
tech support      casual dress       business casual
dress code        sport              customer service
proper uniform    caps               fashion
electronics       light              food
health insurance  conservative       asset protection
scrubs            golf               comfortable shoes
healthcare        nike               loss prevention
crew members      nice               customer services


#### red khaki

In [34]:
query = 'khaki' #red khaki

In [89]:
tabulate_results_knn(dataset, query, 15)

query: khaki, cluster size: 15
mask_vec       avg_context_vec       avg_concat_mask_vec
-------------  --------------------  ---------------------
navy blue      black                 black
black          navy blue             black khaki
white          black polo            dark colored
navy           black collared shirt  navy blue
blue           blue polo shirt       tan or black
dark colored   dark blue             cargo
red            green polo            yoga
dark blue      dark colored          solid black
plain black    polo                  blue jean
black dress    light blue            dark blue
tan or black   collard shirt         solid colored
solid colored  white                 plain black
green          solid color           capri
cargo          blue                  white
blue jean      dark jeans            solid color


In [35]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: khaki, cluster size: 100
mask_vec                        avg_context_vec                avg_concat_mask_vec
------------------------------  -----------------------------  ------------------------------
navy blue<=>0.9779              black<=>0.9949                 black<=>0.9793
black<=>0.9720                  navy blue<=>0.9949             black khaki<=>0.9780
white<=>0.9660                  black polo<=>0.9935            dark colored<=>0.9644
navy<=>0.9653                   black collared shirt<=>0.9926  navy blue<=>0.9621
blue<=>0.9638                   blue polo shirt<=>0.9917       tan or black<=>0.9609
dark colored<=>0.9626           dark blue<=>0.9916             cargo<=>0.9564
red<=>0.9626                    green polo<=>0.9916            yoga<=>0.9561
dark blue<=>0.9624              dark colored<=>0.9914          solid black<=>0.9546
plain black<=>0.9571            polo<=>0.9911                  blue jean<=>0.9534
black dress<=>0.9568            light blue<=>0.9911     

#### blue jeans

In [91]:
query = 'blue jeans'

In [92]:
tabulate_results_knn(dataset, query, 15)

query: blue jeans, cluster size: 15
mask_vec           avg_context_vec    avg_concat_mask_vec
-----------------  -----------------  ---------------------
black dress pants  black dress pants  dress pants
dark jeans         jeans              dark jeans
jeans              dark jeans         jeans
black jeans        dress pants        black jeans
dress pants        black jeans        brown pants
dark blue jeans    polo               dark blue jeans
brown pants        dress shoes        dress slacks
dress shoes        black              black dress pants
dark pants         dark pants         dark pants
comfortable shoes  dark blue jeans    t shirts
nonslip shoes      brown pants        nice jeans
leggings           white              dress shoes
dress slacks       polo shirt         beige pants
polo               solid color        comfortable shoes
t shirts           black polo         athletic shoes


#### t shirt

In [36]:
query = 't shirt'

In [95]:
tabulate_results_knn(dataset, query, 15)

query: t shirt, cluster size: 15
mask_vec          avg_context_vec    avg_concat_mask_vec
----------------  -----------------  ---------------------
polo shirt        polo shirt         polo shirt
uniform shirt     tee shirt          polo
polo              uniform shirt      tee shirt
collar shirt      polo               uniform shirt
tee shirt         collared shirt     dress shirt
collard shirt     jeans              panera shirt
dress shirt       t shirts           blouse
black t shirt     collar shirt       collar shirt
white polo shirt  black dress pants  collared shirt
collared shirt    black jeans        black t shirt
blouse            smock              smock
name tag          red polo           dress pants
jeans             black              blazer
hoodie            dress pants        jeans
smock             collard shirt      white polo shirt


In [37]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: t shirt, cluster size: 100
mask_vec                       avg_context_vec                avg_concat_mask_vec
-----------------------------  -----------------------------  -----------------------------
polo shirt<=>0.9933            polo shirt<=>0.9969            polo shirt<=>0.9933
uniform shirt<=>0.9871         tee shirt<=>0.9950             polo<=>0.9845
polo<=>0.9839                  uniform shirt<=>0.9948         tee shirt<=>0.9840
collar shirt<=>0.9809          polo<=>0.9939                  uniform shirt<=>0.9833
tee shirt<=>0.9803             collared shirt<=>0.9938        dress shirt<=>0.9793
collard shirt<=>0.9792         jeans<=>0.9934                 panera shirt<=>0.9786
dress shirt<=>0.9767           t shirts<=>0.9912              blouse<=>0.9741
black t shirt<=>0.9760         collar shirt<=>0.9908          collar shirt<=>0.9740
white polo shirt<=>0.9741      black dress pants<=>0.9907     collared shirt<=>0.9735
collared shirt<=>0.9726        black jeans<=>0.9907  

#### Very Large K

In [96]:
query = 'dress code'

In [97]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: dress code, cluster size: 100
mask_vec                              avg_context_vec               avg_concat_mask_vec
------------------------------------  ----------------------------  ------------------------------------
pay scale<=>0.9752                    uniform policy<=>0.9876       uniform policy<=>0.9674
uniform policy<=>0.9729               casual wear<=>0.9876          pay scale<=>0.9665
pay rate<=>0.9721                     clothing<=>0.9873             attendance policy<=>0.9642
culture<=>0.9672                      professional<=>0.9864         culture<=>0.9641
work / life balance<=>0.9656          strict dress code<=>0.9859    dress attire<=>0.9612
attendance policy<=>0.9656            footwear<=>0.9834             bonus structure<=>0.9602
pay structure<=>0.9653                dress codes<=>0.9832          commission structure<=>0.9598
commission structure<=>0.9642         business casual<=>0.9830      pay rate<=>0.9593
point system<=>0.9636                 brand<

In [98]:
tabulate_results_knn_sim(dataset, query, 500, 1)

query: dress code, cluster size: 500
mask_vec                              avg_context_vec                       avg_concat_mask_vec
------------------------------------  ------------------------------------  ------------------------------------
pay scale<=>0.9752                    uniform policy<=>0.9876               uniform policy<=>0.9674
uniform policy<=>0.9729               casual wear<=>0.9876                  pay scale<=>0.9665
pay rate<=>0.9721                     clothing<=>0.9873                     attendance policy<=>0.9642
culture<=>0.9672                      professional<=>0.9864                 culture<=>0.9641
work / life balance<=>0.9656          strict dress code<=>0.9859            dress attire<=>0.9612
attendance policy<=>0.9656            footwear<=>0.9834                     bonus structure<=>0.9602
pay structure<=>0.9653                dress codes<=>0.9832                  commission structure<=>0.9598
commission structure<=>0.9642         business casual<=>0.

In [99]:
tabulate_results_knn_sim(dataset, query, 1000, 1)

query: dress code, cluster size: 1000
mask_vec                               avg_context_vec                       avg_concat_mask_vec
-------------------------------------  ------------------------------------  -------------------------------------
pay scale<=>0.9752                     uniform policy<=>0.9876               uniform policy<=>0.9674
uniform policy<=>0.9729                casual wear<=>0.9876                  pay scale<=>0.9665
pay rate<=>0.9721                      clothing<=>0.9873                     attendance policy<=>0.9642
culture<=>0.9672                       professional<=>0.9864                 culture<=>0.9641
work / life balance<=>0.9656           strict dress code<=>0.9859            dress attire<=>0.9612
attendance policy<=>0.9656             footwear<=>0.9834                     bonus structure<=>0.9602
pay structure<=>0.9653                 dress codes<=>0.9832                  commission structure<=>0.9598
commission structure<=>0.9642          business

### Hiring Age

In [100]:
query = 'hiring age'

In [102]:
tabulate_results_knn(dataset, query, 20)

query: hiring age, cluster size: 20
mask_vec           avg_context_vec    avg_concat_mask_vec
-----------------  -----------------  ---------------------
age requirement    age requirement    age requirement
minimum age        minimum age        age range
age range          youngest age       minimum age
legal working age  minimal age        age limit
youngest age       legal working age  youngest age
starting wage      age range          starting wage
pay rate           legal age          legal working age
starting pay       age limit          starting pay
age limit          years old          starting rate
legal age          17 year olds       starting salary
pay scale          starting wage      pay rate
starting salary    starting pay       legal age
maximum age        starting salary    hourly rate
hourly rate        starting rate      minimal age
starting rate      average            pay scale
hourly pay         retirement age     hourly pay
base pay           pay rate           

In [103]:
tabulate_results_knn(dataset, query, 50, 1)

query: hiring age, cluster size: 50
mask_vec              avg_context_vec     avg_concat_mask_vec
--------------------  ------------------  ---------------------
age requirement       age requirement     age requirement
minimum age           minimum age         age range
age range             youngest age        minimum age
legal working age     minimal age         age limit
youngest age          legal working age   youngest age
starting wage         age range           starting wage
pay rate              legal age           legal working age
starting pay          age limit           starting pay
age limit             years old           starting rate
legal age             17 year olds        starting salary
pay scale             starting wage       pay rate
starting salary       starting pay        legal age
maximum age           starting salary     hourly rate
hourly rate           starting rate       minimal age
starting rate         average             pay scale
hourly pay         

#### age limit

In [122]:
query = 'age limit'

In [123]:
tabulate_results_knn(dataset, query, 15)

query: age limit, cluster size: 15
mask_vec              avg_context_vec     avg_concat_mask_vec
--------------------  ------------------  ---------------------
vacation policy       age requirement     age requirement
rehire policy         pay rate            maximum age
maximum age           legal age           hiring age
age requirement       maximum age         pay rate
dress code            legal working age   age requirements
work / life balance   hiring age          rehire policy
commission structure  state law           starting wage
pay rate              minimum age         pay scale
time frame            time frame          legal working age
graveyard shift       shift differential  starting salary
overnight shift       youngest age        salary range
starting wage         17 year olds        dress code
hourly pay            delivery drivers    minimum age
4th shift             crew member         vacation policy
drug test             dishwasher          age range


#### years old

In [124]:
query = 'years old'

In [125]:
tabulate_results_knn(dataset, query, 15)

query: years old, cluster size: 15
mask_vec          avg_context_vec     avg_concat_mask_vec
----------------  ------------------  ---------------------
hour shifts       legal working age   yrs old
20 minutes        legal age           hour shifts
days tops         hiring age          yr old
min break         minimum age         stars
percent discount  age requirement     mile radius
plus tips         youngest age        plus tips
12 hour shifts    17 year olds        17 year olds
p / h             minimal age         minute walk
45 minutes        age limit           percent discount
30 minute breaks  yrs old             min break
hundred dollars   age range           bucks
8 hour shifts     working permit      pts
sec               worker 's permit    p / h
plus commission   parents permission  12 hour shifts
cpm               workers permit      m f


#### Very Large K

In [104]:
query = 'hiring age'

In [105]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: hiring age, cluster size: 100
mask_vec                       avg_context_vec                avg_concat_mask_vec
-----------------------------  -----------------------------  -----------------------------
age requirement<=>0.9892       age requirement<=>0.9957       age requirement<=>0.9904
minimum age<=>0.9760           minimum age<=>0.9944           age range<=>0.9693
age range<=>0.9653             youngest age<=>0.9915          minimum age<=>0.9692
legal working age<=>0.9586     minimal age<=>0.9845           age limit<=>0.9587
youngest age<=>0.9584          legal working age<=>0.9837     youngest age<=>0.9568
starting wage<=>0.9516         age range<=>0.9824             starting wage<=>0.9557
pay rate<=>0.9421              legal age<=>0.9817             legal working age<=>0.9500
starting pay<=>0.9393          age limit<=>0.9742             starting pay<=>0.9481
age limit<=>0.9363             years old<=>0.9724             starting rate<=>0.9457
legal age<=>0.9337            

In [106]:
tabulate_results_knn_sim(dataset, query, 500, 1)

query: hiring age, cluster size: 500
mask_vec                               avg_context_vec                     avg_concat_mask_vec
-------------------------------------  ----------------------------------  ------------------------------------
age requirement<=>0.9892               age requirement<=>0.9957            age requirement<=>0.9904
minimum age<=>0.9760                   minimum age<=>0.9944                age range<=>0.9693
age range<=>0.9653                     youngest age<=>0.9915               minimum age<=>0.9692
legal working age<=>0.9586             minimal age<=>0.9845                age limit<=>0.9587
youngest age<=>0.9584                  legal working age<=>0.9837          youngest age<=>0.9568
starting wage<=>0.9516                 age range<=>0.9824                  starting wage<=>0.9557
pay rate<=>0.9421                      legal age<=>0.9817                  legal working age<=>0.9500
starting pay<=>0.9393                  age limit<=>0.9742                  

In [107]:
tabulate_results_knn_sim(dataset, query, 1000, 1)

query: hiring age, cluster size: 1000
mask_vec                               avg_context_vec                     avg_concat_mask_vec
-------------------------------------  ----------------------------------  -------------------------------------
age requirement<=>0.9892               age requirement<=>0.9957            age requirement<=>0.9904
minimum age<=>0.9760                   minimum age<=>0.9944                age range<=>0.9693
age range<=>0.9653                     youngest age<=>0.9915               minimum age<=>0.9692
legal working age<=>0.9586             minimal age<=>0.9845                age limit<=>0.9587
youngest age<=>0.9584                  legal working age<=>0.9837          youngest age<=>0.9568
starting wage<=>0.9516                 age range<=>0.9824                  starting wage<=>0.9557
pay rate<=>0.9421                      legal age<=>0.9817                  legal working age<=>0.9500
starting pay<=>0.9393                  age limit<=>0.9742                

### Dental Benefits

In [38]:
query = 'dental benefits'

In [109]:
tabulate_results_knn(dataset, query, 20)

query: dental benefits, cluster size: 20
mask_vec                avg_context_vec        avg_concat_mask_vec
----------------------  ---------------------  ---------------------
dental insurance        dental insurance       dental insurance
vision insurance        life insurance         life insurance
life insurance          medical                vision insurance
healthcare              health                 dental vision
401 k                   health insurance       paid vacations
education               sick leave             sick leave
car insurance           healthcare             car insurance
health insurance        401k plan              healthcare
insurance               paid vacations         insurance
criminal records        health care            medical insurance
finger printing         insurance              health insurance
back ground checks      heath                  401 k
social security         medical insurance      health care
disability              paid vacati

In [110]:
tabulate_results_knn(dataset, query, 50, 1)

query: dental benefits, cluster size: 50
mask_vec                avg_context_vec        avg_concat_mask_vec
----------------------  ---------------------  ---------------------
dental insurance        dental insurance       dental insurance
vision insurance        life insurance         life insurance
life insurance          medical                vision insurance
healthcare              health                 dental vision
health care             health insurance       paid vacations
401 k                   sick leave             sick leave
education               healthcare             car insurance
car insurance           401k plan              healthcare
health insurance        paid vacations         insurance
insurance               health care            medical insurance
dental vision           insurance              health insurance
criminal records        heath                  401 k
finger printing         medical insurance      health care
back ground checks      paid vacati

In [39]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: dental benefits, cluster size: 100
mask_vec                            avg_context_vec                  avg_concat_mask_vec
----------------------------------  -------------------------------  ------------------------------------
dental insurance<=>0.9493           dental insurance<=>0.9769        dental insurance<=>0.9518
vision insurance<=>0.9308           life insurance<=>0.9701          life insurance<=>0.9208
life insurance<=>0.9288             medical<=>0.9701                 vision insurance<=>0.9119
healthcare<=>0.9181                 health<=>0.9671                  dental vision<=>0.8958
health care<=>0.9157                health insurance<=>0.9662        paid vacations<=>0.8954
401 k<=>0.9143                      sick leave<=>0.9653              sick leave<=>0.8914
education<=>0.9127                  healthcare<=>0.9651              car insurance<=>0.8893
car insurance<=>0.9124              401k plan<=>0.9649               healthcare<=>0.8887
health insurance<=>0.9107

#### 401k plan

In [40]:
query = '401k plan'

In [129]:
tabulate_results_knn(dataset, query, 15)

query: 401k plan, cluster size: 15
mask_vec                avg_context_vec    avg_concat_mask_vec
----------------------  -----------------  ---------------------------
retirement plan         insurance          pension
criminal record         health insurance   retirement plan
drivers license         discount card      flexible schedule
pension                 medical insurance  discount card
payroll card            sick leave         insurance
discount card           paid vacation      pay increase
back ground check       medical            medical marijuana card
college degree          maternity leave    health insurance
set schedule            health             drivers license
cdl license             life insurance     employee discount
flexible schedule       dental insurance   paid vacation
high school education   employee discount  set schedule
drug test               sick days          tuition reimbursement
medical marijuana card  disability         payroll card
background che

In [41]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: 401k plan, cluster size: 100
mask_vec                              avg_context_vec                       avg_concat_mask_vec
------------------------------------  ------------------------------------  ------------------------------------
retirement plan<=>0.9798              insurance<=>0.9896                    pension<=>0.9711
criminal record<=>0.9741              health insurance<=>0.9896             retirement plan<=>0.9705
drivers license<=>0.9717              retirement plan<=>0.9869              flexible schedule<=>0.9622
pension<=>0.9709                      discount card<=>0.9868                discount card<=>0.9620
payroll card<=>0.9703                 medical insurance<=>0.9855            insurance<=>0.9602
discount card<=>0.9699                pension<=>0.9850                      pay increase<=>0.9600
back ground check<=>0.9698            sick leave<=>0.9843                   medical marijuana card<=>0.9574
college degree<=>0.9686               healthcare<=>0.9838 

#### Vacation Days

In [130]:
query = 'vacation days'

In [131]:
tabulate_results_knn(dataset, query, 15)

query: vacation days, cluster size: 15
mask_vec           avg_context_vec    avg_concat_mask_vec
-----------------  -----------------  ---------------------
sick days          sick days          sick days
holiday pay        paid vacation      holiday pay
health insurance   sick leave         sick leave
medical insurance  holiday pay        health insurance
lunch breaks       health insurance   insurance
sick leave         insurance          medical insurance
insurance          maternity leave    paid vacation
seasonal jobs      paid vacations     annual raises
extra hours        discount card      employee discounts
paid vacation      medical insurance  maternity leave
gift cards         extra hours        paid vacations
maternity leave    leaves             yearly raises
overnight shifts   90 days            extra hours
cash tips          mandatory          pension
annual raises      part timers        lunch breaks


#### discount card

In [42]:
query = 'discount card'

In [133]:
tabulate_results_knn(dataset, query, 15)

query: discount card, cluster size: 15
mask_vec                avg_context_vec     avg_concat_mask_vec
----------------------  ------------------  ---------------------
drivers license         employee discount   employee discount
pension                 insurance           red card
401k plan               health insurance    insurance
employee discount       paid vacation       pension
social security number  401k plan           pay increase
retirement plan         sick leave          credit card
job offer               medical insurance   401k plan
back ground check       card                bank card
background check        pay increase        health insurance
social security         cards               gift card
drug test               holiday pay         yearly raise
paycheck                sick days           payment card
health insurance        employee discounts  card
credit card             profit sharing      paycheck
card                    vacation days       debit card


In [43]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: discount card, cluster size: 100
mask_vec                              avg_context_vec                       avg_concat_mask_vec
------------------------------------  ------------------------------------  ------------------------------------
drivers license<=>0.9703              employee discount<=>0.9905            employee discount<=>0.9766
pension<=>0.9701                      insurance<=>0.9894                    red card<=>0.9665
401k plan<=>0.9699                    health insurance<=>0.9890             insurance<=>0.9657
employee discount<=>0.9685            paid vacation<=>0.9869                pension<=>0.9654
social security number<=>0.9682       401k plan<=>0.9868                    pay increase<=>0.9634
retirement plan<=>0.9682              sick leave<=>0.9845                   credit card<=>0.9625
job offer<=>0.9676                    medical insurance<=>0.9844            401k plan<=>0.9620
back ground check<=>0.9655            pension<=>0.9841                      

#### profit sharing

In [134]:
query = 'profit sharing'

In [135]:
tabulate_results_knn(dataset, query, 15)

query: profit sharing, cluster size: 15
mask_vec            avg_context_vec     avg_concat_mask_vec
------------------  ------------------  ---------------------
health insurance    healthcare          healthcare
health care         health care         health insurance
healthcare          medical insurance   health care
life insurance      health insurance    insurance
medical insurance   life insurance      medical insurance
higher pay          health              life insurance
employee discount   paid vacation       pension
min wage            insurance           sick leave
holiday pay         discount card       employee discount
social security     medical             401 k
direct deposit      dental insurance    holiday pay
pension             pension             paid vacation
career advancement  paid vacations      tuition reimbursement
low pay             employee discount   tuition assistance
tech support        employee discounts  dental insurance


#### stock options

In [140]:
query = 'stock market'

In [141]:
tabulate_results_knn(dataset, query, 15)

query: stock market, cluster size: 15
mask_vec           avg_context_vec     avg_concat_mask_vec
-----------------  ------------------  ---------------------
healthcare         401 k               life insurance
health care        dental and vision   vision insurance
life insurance     vision insurance    401 k
vision insurance   life insurance      dental insurance
401 k              dental vision       healthcare
profit sharing     health care         health care
health insurance   dental insurance    profit sharing
employee discount  healthcare          medical insurance
pension            profit sharing      paid vacations
social security    health              ad&d
cobra              excellent benefits  aflac
banking            hmo                 insurance
medical insurance  ad&d                health insurance
aflac              medical insurance   hotels
car insurance      medicaid            tuition reimbursement


#### Very Large K

In [111]:
query = 'dental benefits'

In [112]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: dental benefits, cluster size: 100
mask_vec                            avg_context_vec                  avg_concat_mask_vec
----------------------------------  -------------------------------  ------------------------------------
dental insurance<=>0.9493           dental insurance<=>0.9769        dental insurance<=>0.9518
vision insurance<=>0.9308           life insurance<=>0.9701          life insurance<=>0.9208
life insurance<=>0.9288             medical<=>0.9701                 vision insurance<=>0.9119
healthcare<=>0.9181                 health<=>0.9671                  dental vision<=>0.8958
health care<=>0.9157                health insurance<=>0.9662        paid vacations<=>0.8954
401 k<=>0.9143                      sick leave<=>0.9653              sick leave<=>0.8914
education<=>0.9127                  healthcare<=>0.9651              car insurance<=>0.8893
car insurance<=>0.9124              401k plan<=>0.9649               healthcare<=>0.8887
health insurance<=>0.9107

In [113]:
tabulate_results_knn_sim(dataset, query, 500, 1)

query: dental benefits, cluster size: 500
mask_vec                               avg_context_vec                       avg_concat_mask_vec
-------------------------------------  ------------------------------------  ------------------------------------
dental insurance<=>0.9493              dental insurance<=>0.9769             dental insurance<=>0.9518
vision insurance<=>0.9308              life insurance<=>0.9701               life insurance<=>0.9208
life insurance<=>0.9288                medical<=>0.9701                      vision insurance<=>0.9119
healthcare<=>0.9181                    health<=>0.9671                       dental vision<=>0.8958
health care<=>0.9157                   health insurance<=>0.9662             paid vacations<=>0.8954
401 k<=>0.9143                         sick leave<=>0.9653                   sick leave<=>0.8914
education<=>0.9127                     healthcare<=>0.9651                   car insurance<=>0.8893
car insurance<=>0.9124                 401

In [114]:
tabulate_results_knn_sim(dataset, query, 1000, 1)

query: dental benefits, cluster size: 1000
mask_vec                               avg_context_vec                       avg_concat_mask_vec
-------------------------------------  ------------------------------------  -------------------------------------
dental insurance<=>0.9493              dental insurance<=>0.9769             dental insurance<=>0.9518
vision insurance<=>0.9308              life insurance<=>0.9701               life insurance<=>0.9208
life insurance<=>0.9288                medical<=>0.9701                      vision insurance<=>0.9119
healthcare<=>0.9181                    health<=>0.9671                       dental vision<=>0.8958
health care<=>0.9157                   health insurance<=>0.9662             paid vacations<=>0.8954
401 k<=>0.9143                         sick leave<=>0.9653                   sick leave<=>0.8914
education<=>0.9127                     healthcare<=>0.9651                   car insurance<=>0.8893
car insurance<=>0.9124                 4

### Company

In [115]:
query = 'company'

In [117]:
tabulate_results_knn(dataset, query, 20)

query: company, cluster size: 20
mask_vec          avg_context_vec    avg_concat_mask_vec
----------------  -----------------  ----------------------------
corporation       postal service     post office
organization      usps               organization
post office       union              postal service
usps              management         usps
postal service    business           corporation
restaurant        publix             restaurant
ceo               aldi               corporate office
union             walgreens          community
warehouse         government         united states postal service
gm                private            hr department
plant             starbucks          call center
store manager     macy 's            home office
department        public             ceo
manager           store level        union
community         apple              franchise owner
district manager  kroger             warehouse
hr department     sam 's club        home depot
public

In [118]:
tabulate_results_knn(dataset, query, 50, 1)

query: company, cluster size: 50
mask_vec           avg_context_vec    avg_concat_mask_vec
-----------------  -----------------  ----------------------------
corporation        postal service     post office
organization       usps               organization
post office        post office        postal service
usps               union              usps
postal service     organization       corporation
restaurant         amazon             restaurant
ceo                management         corporate office
union              business           community
warehouse          publix             united states postal service
gm                 aldi               hr department
plant              home depot         call center
store manager      walgreens          home office
department         government         ceo
manager            wal mart           union
community          costco             franchise owner
district manager   private            warehouse
hr department      contract         

#### management

In [142]:
query = 'management'

In [143]:
tabulate_results_knn(dataset, query, 15)

query: management, cluster size: 15
mask_vec          avg_context_vec    avg_concat_mask_vec
----------------  -----------------  ---------------------
upper management  upper management   upper management
loss prevention   team members       mgmt
mgmt              mgmt               lp
asset protection  union              leadership
ap                store manager      loss prevention
g4s               gm                 asset protection
lp                crew members       human resources
usps              manager            ap
security          leadership         security
customer service  district           customer support
spectrum          staff members      union
union             department         geek squad
geek squad        district manager   customer service
business          lp                 tech support
family            ceo                senior management


#### department

In [146]:
query = 'department'

In [147]:
tabulate_results_knn(dataset, query, 15)

query: department, cluster size: 15
mask_vec      avg_context_vec    avg_concat_mask_vec
------------  -----------------  ---------------------
area          area               area
region        management         division
division      pharmacy           region
restaurant    retail             state
organization  restaurant         station
station       hotel              specific location
city          asset protection   restaurant
plant         production         city
district      home office        plant
industry      union              warehouse
state         gm                 center
warehouse     neighborhood       hotel
company       ap                 call center
center        electronics        organization
business      member             county


#### hr department

In [148]:
query = 'hr department'

In [149]:
tabulate_results_knn(dataset, query, 15)

query: hr department, cluster size: 15
mask_vec                    avg_context_vec             avg_concat_mask_vec
--------------------------  --------------------------  --------------------------
hiring manager              human resources             human resources department
district manager            human resources department  hiring manager
store manager               hiring manager              district manager
human resources department  manager                     hr dept
general manager             postmaster                  personnel office
hr dept                     district manager            store manager
post master                 post master                 general manager
gm                          local store                 gm
ceo                         support team                home office
postal service              phone number                ceo
regional manager            main office                 postmaster
post office                 personnel offi

#### location

In [152]:
query = 'specific location'

In [153]:
tabulate_results_knn(dataset, query, 15)

query: specific location, cluster size: 15
mask_vec      avg_context_vec    avg_concat_mask_vec
------------  -----------------  ---------------------
department    area               department
region        hr department      area
restaurant    hiring manager     region
area          manager            restaurant
company       department         company
organization  home office        station
plant         restaurant         division
home office   time frame         hotel
hotel         gm                 state
division      ad                 call center
warehouse     station            orientation class
corporation   human resources    desired location
call center   book               training class
city          local store        organization
manager       main office        home office


#### business

In [144]:
query = 'business'

In [145]:
tabulate_results_knn(dataset, query, 15)

query: business, cluster size: 15
mask_vec          avg_context_vec    avg_concat_mask_vec
----------------  -----------------  ---------------------
union             restaurant         production
customer service  department         restaurant
security          production         union
contract          public             organization
management        organization       contract
organization      area               banking
family            team members       hotel
restaurant        customer service   customer service
loss prevention   daily              loss prevention
banking           safety             brand
department        model              leadership
fashion           major              management
company           book               military
warehouse         matter             community
brand             function           warehouse


#### Very Large K

In [119]:
query = 'company'

In [120]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: company, cluster size: 100
mask_vec                      avg_context_vec            avg_concat_mask_vec
----------------------------  -------------------------  -------------------------------------
corporation<=>0.9870          postal service<=>0.9958    post office<=>0.9849
organization<=>0.9853         usps<=>0.9951              organization<=>0.9828
post office<=>0.9832          post office<=>0.9949       postal service<=>0.9828
usps<=>0.9828                 union<=>0.9948             usps<=>0.9780
postal service<=>0.9826       organization<=>0.9944      corporation<=>0.9735
restaurant<=>0.9813           corporation<=>0.9940       restaurant<=>0.9728
ceo<=>0.9782                  amazon<=>0.9936            corporate office<=>0.9667
union<=>0.9781                management<=>0.9934        community<=>0.9653
warehouse<=>0.9765            business<=>0.9934          united states postal service<=>0.9650
gm<=>0.9764                   publix<=>0.9930            hr department<=>0.9

In [121]:
tabulate_results_knn_sim(dataset, query, 500, 1)

query: company, cluster size: 500
mask_vec                               avg_context_vec                        avg_concat_mask_vec
-------------------------------------  -------------------------------------  -------------------------------------
corporation<=>0.9870                   postal service<=>0.9958                post office<=>0.9849
organization<=>0.9853                  usps<=>0.9951                          organization<=>0.9828
post office<=>0.9832                   post office<=>0.9949                   postal service<=>0.9828
usps<=>0.9828                          union<=>0.9948                         usps<=>0.9780
postal service<=>0.9826                organization<=>0.9944                  corporation<=>0.9735
restaurant<=>0.9813                    corporation<=>0.9940                   restaurant<=>0.9728
ceo<=>0.9782                           amazon<=>0.9936                        corporate office<=>0.9667
union<=>0.9781                         management<=>0.9934

In [107]:
tabulate_results_knn_sim(dataset, query, 1000, 1)

query: hiring age, cluster size: 1000
mask_vec                               avg_context_vec                     avg_concat_mask_vec
-------------------------------------  ----------------------------------  -------------------------------------
age requirement<=>0.9892               age requirement<=>0.9957            age requirement<=>0.9904
minimum age<=>0.9760                   minimum age<=>0.9944                age range<=>0.9693
age range<=>0.9653                     youngest age<=>0.9915               minimum age<=>0.9692
legal working age<=>0.9586             minimal age<=>0.9845                age limit<=>0.9587
youngest age<=>0.9584                  legal working age<=>0.9837          youngest age<=>0.9568
starting wage<=>0.9516                 age range<=>0.9824                  starting wage<=>0.9557
pay rate<=>0.9421                      legal age<=>0.9817                  legal working age<=>0.9500
starting pay<=>0.9393                  age limit<=>0.9742                

## Yelp Reviews

In [44]:
dataset = 'yelp'

### Restaurant (C)

In [10]:
query = 'restaurant'

In [23]:
tabulate_results_knn(dataset, query, 15)

query: restaurant, cluster size: 15
mask_vec       avg_context_vec     avg_concat_mask_vec
-------------  ------------------  ---------------------
resturant      hotel               buffet
restaraunt     buffet              hotel
restraunt      keg                 brewery
resteraunt     diner               bakery
store          chinese restaurant  donut shop
establishment  theater             hotel casino
resturaunt     brewery             truck
location       pub                 steakhouse
joint          area                resort
buffet         steak house         property
hotel          steakhouse          ice_cream shop
bakery         starbucks           japanese restaurant
cafe           neighbourhood       steak house
brewery        company             coffeehouse
salon          mexican restaurant  theater


In [20]:
tabulate_results_knn(dataset, query, 20)

query: restaurant, cluster size: 20
mask_vec       avg_context_vec        avg_concat_mask_vec
-------------  ---------------------  ---------------------
resturant      hotel                  buffet
restaraunt     buffet                 hotel
restraunt      keg                    brewery
resteraunt     diner                  bakery
store          chinese restaurant     donut shop
establishment  theater                hotel casino
resturaunt     brewery                truck
location       pub                    steakhouse
joint          area                   resort
buffet         steak house            property
hotel          steakhouse             ice_cream shop
bakery         starbucks              japanese restaurant
cafe           neighbourhood          steak house
brewery        company                coffeehouse
salon          mexican restaurant     theater
pizzeria       japanese restaurant    irish pub
food truck     vietnamese restaurant  casino
wine bar       downtown        

In [11]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: restaurant, cluster size: 100
mask_vec                    avg_context_vec                  avg_concat_mask_vec
--------------------------  -------------------------------  --------------------------------
resturant<=>0.9981          hotel<=>0.9973                   buffet<=>0.9929
restaraunt<=>0.9961         buffet<=>0.9972                  hotel<=>0.9925
restraunt<=>0.9959          keg<=>0.9965                     brewery<=>0.9921
resteraunt<=>0.9929         diner<=>0.9964                   bakery<=>0.9900
store<=>0.9928              chinese restaurant<=>0.9964      donut shop<=>0.9889
establishment<=>0.9924      casino<=>0.9963                  hotel casino<=>0.9886
resturaunt<=>0.9918         theater<=>0.9963                 truck<=>0.9879
location<=>0.9902           property<=>0.9962                steakhouse<=>0.9878
joint<=>0.9901              brewery<=>0.9960                 resort<=>0.9877
buffet<=>0.9900             pub<=>0.9960                     property<=>0.9877
hot

#### Mexican

In [12]:
query = 'mexican'

In [157]:
tabulate_results_knn(dataset, query, 20)

query: mexican, cluster size: 20
mask_vec        avg_context_vec    avg_concat_mask_vec
--------------  -----------------  ---------------------
chinese         greek              chinese
japanese        italian            french
korean          southern           indian
vietnamese      korean             peruvian
italian         asian              japanese
brazilian       brazilian          asian
asian           chinese            italian
filipino        vietnamese         greek
kosher          caribbean          kosher
hakka           island             filipino
thai            thai               brazilian
tex mex         barbecue           hawaiian
french          polish             lebanese
middle eastern  filipino           korean
colombian       mediterranean      thai
caribbean       gourmet            polish
hawaiian        hawaiian           vietnamese
indian          american           ethiopian
peruvian        french             persian
western         persian            rus

In [13]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: mexican, cluster size: 100
mask_vec                       avg_context_vec              avg_concat_mask_vec
-----------------------------  ---------------------------  ----------------------------
chinese<=>0.9976               greek<=>0.9985               chinese<=>0.9971
japanese<=>0.9938              italian<=>0.9985             french<=>0.9958
korean<=>0.9936                southern<=>0.9983            indian<=>0.9953
vietnamese<=>0.9934            korean<=>0.9983              peruvian<=>0.9950
italian<=>0.9928               asian<=>0.9981               japanese<=>0.9949
brazilian<=>0.9921             brazilian<=>0.9981           asian<=>0.9945
asian<=>0.9917                 chinese<=>0.9981             italian<=>0.9945
filipino<=>0.9913              indian<=>0.9979              greek<=>0.9945
kosher<=>0.9912                vietnamese<=>0.9978          kosher<=>0.9944
hakka<=>0.9900                 caribbean<=>0.9977           filipino<=>0.9941
thai<=>0.9899                  

In [417]:
query = 'mexican restaurant'

In [418]:
tabulate_results_knn(dataset, query, 15)

query: mexican restaurant, cluster size: 15
mask_vec    avg_context_vec        avg_concat_mask_vec
----------  ---------------------  ---------------------
            vietnamese restaurant  vietnamese restaurant
            italian restaurant     chinese restaurant
            donut shop             japanese restaurant
            indian restaurant      filipino restaurant
            steak house            donut shop
            filipino restaurant    boba shop
            chinese restaurant     steak house
            japanese restaurant    sub shop
            steakhouse             italian restaurant
            sub shop               steakhouse
            brazilian steakhouse   chinese buffet
            chinese buffet         indian restaurant
            starbucks              brazilian steakhouse
            boba shop              french bakery
            keg                    irish pub


#### Bistro

In [14]:
query = 'bistro'

In [410]:
tabulate_results_knn(dataset, query, 15)

query: bistro, cluster size: 15
mask_vec     avg_context_vec    avg_concat_mask_vec
-----------  -----------------  ---------------------
pub          tavern             tavern
grille       pub                brasserie
deli         village            caf
hut          diner              pub
village      inn                noodle house
shack        steakhouse         garden
izakaya      steak house        izakaya
smokehouse   izakaya            village
supermarket  bakery             inn
legend       gastropub          brewery
snack bar    urban              paradise
garden       villa              factory
paradise     paris              diner
bay          restaurant         creamery
cottage      brewery            winery


In [15]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: bistro, cluster size: 100
mask_vec                avg_context_vec               avg_concat_mask_vec
----------------------  ----------------------------  ------------------------
cafe<=>0.9920           tavern<=>0.9981               tavern<=>0.9952
tavern<=>0.9896         pub<=>0.9981                  brasserie<=>0.9936
pub<=>0.9890            village<=>0.9973              caf<=>0.9933
cantina<=>0.9887        diner<=>0.9971                pub<=>0.9915
pizzeria<=>0.9884       inn<=>0.9970                  noodle house<=>0.9911
brasserie<=>0.9878      steakhouse<=>0.9969           garden<=>0.9905
grille<=>0.9869         steak house<=>0.9966          izakaya<=>0.9894
deli<=>0.9866           izakaya<=>0.9962              bakery<=>0.9893
caf<=>0.9865            bakery<=>0.9961               village<=>0.9892
bakery<=>0.9859         gastropub<=>0.9960            inn<=>0.9882
taqueria<=>0.9853       paradise<=>0.9960             brewery<=>0.9879
resto<=>0.9851          urban<=>0.9958   

#### Steakhouse

In [411]:
query = 'steakhouse'

In [412]:
tabulate_results_knn(dataset, query, 15)

query: steakhouse, cluster size: 15
mask_vec       avg_context_vec       avg_concat_mask_vec
-------------  --------------------  ---------------------
pizzeria       steak house           steak house
diner          mexican restaurant    japanese restaurant
pizza joint    buffet                chinese restaurant
burger joint   chinese restaurant    brewery
bbq joint      italian restaurant    irish pub
buffet         bistro                mexican restaurant
brewery        diner                 donut shop
wine bar       pub                   coffeehouse
pub            keg                   italian restaurant
mexican joint  japanese restaurant   chinese buffet
taqueria       tavern                french bakery
food truck     indian restaurant     diner
sushi joint    restaurant            buffet
gastropub      brazilian steakhouse  sub shop
bakery         chinese buffet        vietnamese restaurant


In [415]:
query = 'brazilian steakhouse'

In [416]:
tabulate_results_knn(dataset, query, 15)

query: brazilian steakhouse, cluster size: 15
mask_vec    avg_context_vec         avg_concat_mask_vec
----------  ----------------------  ---------------------
            mexican restaurant      mexican restaurant
            steakhouse              japanese restaurant
            steak house             chinese restaurant
            vietnamese restaurant   vietnamese restaurant
            indian restaurant       chinese buffet
            japanese restaurant     filipino restaurant
            filipino restaurant     steak house
            keg                     steakhouse
            italian restaurant      boba shop
            chinese restaurant      donut shop
            buffet                  french bakery
            restaurant              sub shop
            benihana                brewery
            fine_dining experience  indian restaurant
            ethiopian restaurant    italian restaurant


#### Very Large K

In [38]:
query = 'restaurant'

In [39]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: restaurant, cluster size: 100
mask_vec                    avg_context_vec                  avg_concat_mask_vec
--------------------------  -------------------------------  --------------------------------
resturant<=>0.9981          hotel<=>0.9973                   buffet<=>0.9929
restaraunt<=>0.9961         buffet<=>0.9972                  hotel<=>0.9925
restraunt<=>0.9959          keg<=>0.9965                     brewery<=>0.9921
resteraunt<=>0.9929         diner<=>0.9964                   bakery<=>0.9900
store<=>0.9928              chinese restaurant<=>0.9964      donut shop<=>0.9889
establishment<=>0.9924      casino<=>0.9963                  hotel casino<=>0.9886
resturaunt<=>0.9918         theater<=>0.9963                 truck<=>0.9879
location<=>0.9902           property<=>0.9962                steakhouse<=>0.9878
joint<=>0.9901              brewery<=>0.9960                 resort<=>0.9877
buffet<=>0.9900             pub<=>0.9960                     property<=>0.9877
hot

In [40]:
tabulate_results_knn_sim(dataset, query, 500, 1)

query: restaurant, cluster size: 500
mask_vec                         avg_context_vec                    avg_concat_mask_vec
-------------------------------  ---------------------------------  -------------------------------------
resturant<=>0.9981               hotel<=>0.9973                     buffet<=>0.9929
restaraunt<=>0.9961              buffet<=>0.9972                    hotel<=>0.9925
restraunt<=>0.9959               keg<=>0.9965                       brewery<=>0.9921
resteraunt<=>0.9929              diner<=>0.9964                     bakery<=>0.9900
store<=>0.9928                   chinese restaurant<=>0.9964        donut shop<=>0.9889
establishment<=>0.9924           casino<=>0.9963                    hotel casino<=>0.9886
resturaunt<=>0.9918              theater<=>0.9963                   truck<=>0.9879
location<=>0.9902                property<=>0.9962                  steakhouse<=>0.9878
joint<=>0.9901                   brewery<=>0.9960                   resort<=>0.9877


In [41]:
tabulate_results_knn_sim(dataset, query, 1000, 1)

query: restaurant, cluster size: 1000
mask_vec                         avg_context_vec                           avg_concat_mask_vec
-------------------------------  ----------------------------------------  --------------------------------------
resturant<=>0.9981               hotel<=>0.9973                            buffet<=>0.9929
restaraunt<=>0.9961              buffet<=>0.9972                           hotel<=>0.9925
restraunt<=>0.9959               keg<=>0.9965                              brewery<=>0.9921
resteraunt<=>0.9929              diner<=>0.9964                            bakery<=>0.9900
store<=>0.9928                   chinese restaurant<=>0.9964               donut shop<=>0.9889
establishment<=>0.9924           casino<=>0.9963                           hotel casino<=>0.9886
resturaunt<=>0.9918              theater<=>0.9963                          truck<=>0.9879
location<=>0.9902                property<=>0.9962                         steakhouse<=>0.9878
joint<=>0.99

### Drinks

In [47]:
query = 'drinks'

In [46]:
tabulate_results_knn(dataset, query, 100, 1)

query: drink, cluster size: 100
mask_vec            avg_context_vec    avg_concat_mask_vec
------------------  -----------------  ---------------------
beverage
mixed drink
cocktail
margarita
mimosa
beer
brew
fountain drink
shake
smoothie
draft beer
kids meal
blizzard
soft drink
replacement
martini
hot dog
snack
custom pizza
poke bowl
burger
diet coke
cold brew
hotdog
sushi roll
drip coffee
donut
michelada
flat white
personal pizza
coffee
pizza
diet pepsi
lunch special
breakfast sandwich
doughnut
bento box
boba milk tea
manhattan
feast
gin and tonic
ticket
bloody mary
birthday cake
package
substitution
negroni
meal
bubble tea
mai tai
deli sandwich
coke
cupcake
vegan option
cone
cortado
cookie
sashimi platter
wine flight
shandy
fountain soda
breakfast burrito
mixer
specialty roll
latte
caramel macchiato
wing
house margarita
whiskey sour
shooter
small plate
double double
marg
nosh
macchiato
fish fry
fortune cookie
omakase
chai tea latte
chocolate bar
cold brew coffee
street taco
flight
d

### Food (C)

In [16]:
query = 'food'

In [36]:
tabulate_results_knn(dataset, query, 15)

query: food, cluster size: 15
mask_vec          avg_context_vec       avg_concat_mask_vec
----------------  --------------------  ---------------------
wine list         sushi                 thin crust pizza
product           beer                  oxtail soup
pricing           coffee                pizza
beer list         buffet                table side guacamole
portion size      happy_hour prices     atmosphere
food quality      hookah                patio seating
pizza             energy                sangria
cocktail list     table side guacamole  customer services
atmosphere        diner                 deep_dish pizza
ambiance          indian food           coffee
sushi             entertainment         katsu sauce
salad bar         draft selection       almond croissant
coffee            saki                  music
hookah            cocktails             banchan
wood fired pizza  middle_eastern food   latte art


In [37]:
tabulate_results_knn(dataset, query, 20)

query: food, cluster size: 20
mask_vec          avg_context_vec       avg_concat_mask_vec
----------------  --------------------  ---------------------
wine list         sushi                 thin crust pizza
product           beer                  oxtail soup
pricing           coffee                pizza
beer list         buffet                table side guacamole
portion size      happy_hour prices     atmosphere
food quality      hookah                patio seating
pizza             energy                sangria
cocktail list     table side guacamole  customer services
atmosphere        diner                 deep_dish pizza
ambiance          indian food           coffee
sushi             entertainment         katsu sauce
salad bar         draft selection       almond croissant
coffee            saki                  music
hookah            cocktails             banchan
wood fired pizza  middle_eastern food   latte art
plating           restaurant            d cor
serving size      p

In [17]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: food, cluster size: 100
mask_vec                       avg_context_vec                  avg_concat_mask_vec
-----------------------------  -------------------------------  --------------------------------------
wine list<=>0.9872             sushi<=>0.9962                   thin crust pizza<=>0.9866
product<=>0.9846               beer<=>0.9953                    oxtail soup<=>0.9863
pricing<=>0.9843               coffee<=>0.9952                  pizza<=>0.9862
beer list<=>0.9840             buffet<=>0.9952                  table side guacamole<=>0.9860
portion size<=>0.9837          happy_hour prices<=>0.9949       atmosphere<=>0.9859
food quality<=>0.9836          hookah<=>0.9948                  patio seating<=>0.9858
customer service<=>0.9834      energy<=>0.9947                  sangria<=>0.9856
pizza<=>0.9834                 table side guacamole<=>0.9946    customer services<=>0.9853
cocktail list<=>0.9826         pizza<=>0.9939                   deep_dish pizza<=>0.9853
at

#### buffet

In [18]:
query = 'buffet'

In [432]:
tabulate_results_knn(dataset, query, 15)

query: buffet, cluster size: 15
mask_vec          avg_context_vec     avg_concat_mask_vec
----------------  ------------------  ---------------------
buffett           keg                 bakery
bakery            restaurant          restaurant
food truck        steakhouse          seafood buffet
club              steak house         brewery
resturant         chinese buffet      steakhouse
steakhouse        chinese restaurant  steak house
restaurant        diner               supermarket
deli              sushi               pub
diner             bakery              diner
cafe              pub                 truck
breakfast buffet  seafood buffet      donut shop
pizzeria          subway              bar lounge
juice bar         taco truck          co op
brewery           outback             irish pub
franchise         company             nightclub


In [19]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: buffet, cluster size: 100
mask_vec                   avg_context_vec                 avg_concat_mask_vec
-------------------------  ------------------------------  ------------------------------
buffett<=>0.9939           keg<=>0.9974                    bakery<=>0.9940
bakery<=>0.9922            restaurant<=>0.9972             restaurant<=>0.9929
food truck<=>0.9914        steakhouse<=>0.9971             seafood buffet<=>0.9929
club<=>0.9914              steak house<=>0.9969            brewery<=>0.9921
resturant<=>0.9914         chinese buffet<=>0.9968         steakhouse<=>0.9915
steakhouse<=>0.9913        chinese restaurant<=>0.9967     steak house<=>0.9896
restaurant<=>0.9900        diner<=>0.9966                  supermarket<=>0.9891
deli<=>0.9899              sushi<=>0.9965                  pub<=>0.9887
diner<=>0.9894             bakery<=>0.9963                 diner<=>0.9879
cafe<=>0.9892              pub<=>0.9961                    truck<=>0.9877
breakfast buffet<=>0.9887 

#### sushi

In [23]:
query = 'sushi'

In [435]:
tabulate_results_knn(dataset, query, 15)

query: sushi, cluster size: 15
mask_vec        avg_context_vec    avg_concat_mask_vec
--------------  -----------------  ---------------------
poke            shawarma           shawarma
dim sum         barbecue           barbecue
pho             coffee             ramen
bubble tea      tapas              hookah
frozen yogurt   sushi sashimi      sushi sashimi
boba tea        buffet             tapas
soul food       teppanyaki         pizza
shabu shabu     beer               japanese curry
pizza           food               coffee
coffee          sake               deep_dish pizza
fro yo          sea food           dosa
hookah          island             beer
frozen custard  bakery             japanese cheesecake
ayce            polish             spanish tapas
chinese food    boat               desert


In [24]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: sushi, cluster size: 100
mask_vec                      avg_context_vec               avg_concat_mask_vec
----------------------------  ----------------------------  --------------------------------
poke<=>0.9971                 shawarma<=>0.9975             shawarma<=>0.9950
dim sum<=>0.9964              pizza<=>0.9974                barbecue<=>0.9940
pho<=>0.9964                  barbecue<=>0.9972             ramen<=>0.9936
bubble tea<=>0.9949           coffee<=>0.9969               hookah<=>0.9929
ramen<=>0.9937                tapas<=>0.9969                sushi sashimi<=>0.9926
frozen yogurt<=>0.9933        desert<=>0.9969               tapas<=>0.9917
boba tea<=>0.9921             sushi sashimi<=>0.9967        sea food<=>0.9907
soul food<=>0.9917            ramen<=>0.9967                teppanyaki<=>0.9898
shabu shabu<=>0.9917          buffet<=>0.9965               pizza<=>0.9894
tapas<=>0.9916                teppanyaki<=>0.9965           japanese curry<=>0.9888
pizza<=>0.991

#### hookah

In [20]:
query = 'hookah'

In [438]:
tabulate_results_knn(dataset, query, 15)

query: hookah, cluster size: 15
mask_vec        avg_context_vec     avg_concat_mask_vec
--------------  ------------------  ---------------------
bubble tea      beer                sushi
coffee          coffee              coffee
sushi           saki                ramen
boba tea        entertainment       beer
beer            food                pizza
dim sum         karaoke             deep_dish pizza
poke            sushi               german beer
booze           buffet              japanese curry
frozen yogurt   pub                 absinthe
fro yo          live entertainment  desert
pho             keg                 sangria
frozen custard  brunch              japanese cheesecake
pizza           dogs                dosa
gaming          energy              poutine
saki            bear                thin crust pizza


#### beer

In [21]:
query = 'beer'

In [440]:
tabulate_results_knn(dataset, query, 15)

query: beer, cluster size: 15
mask_vec      avg_context_vec    avg_concat_mask_vec
------------  -----------------  ---------------------
draft beer    wine               wine
coffee        coffee             coffee
craft beer    sake               tea
wine          hookah             hookah
bubble tea    sushi              gelato
hookah        saki               pizza
saki          cocktails          saki
cold brew     food               sangria
pizza         buffet             cocktail
sake          specialty drinks   ramen
beverage      dogs               juice
drip coffee   pub                mimosa
bottled beer  top shelf          pepsi
booze         java               poutine
sushi         bakery             ice tea


In [22]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: beer, cluster size: 100
mask_vec                    avg_context_vec                avg_concat_mask_vec
--------------------------  -----------------------------  -----------------------------
draft beer<=>0.9946         wine<=>0.9982                  wine<=>0.9941
coffee<=>0.9917             coffee<=>0.9976                coffee<=>0.9938
craft beer<=>0.9912         sake<=>0.9967                  tea<=>0.9912
wine<=>0.9895               cocktail<=>0.9966              hookah<=>0.9902
liquor<=>0.9891             hookah<=>0.9964                gelato<=>0.9900
bubble tea<=>0.9888         sushi<=>0.9963                 pizza<=>0.9892
tequila<=>0.9886            saki<=>0.9963                  saki<=>0.9890
hookah<=>0.9882             cocktails<=>0.9962             pastry<=>0.9886
cider<=>0.9879              tapas<=>0.9960                 sangria<=>0.9883
saki<=>0.9877               pizza<=>0.9958                 cocktail<=>0.9880
soda<=>0.9876               alcohol<=>0.9957            

#### pizza

In [50]:
query = 'pizza'

In [51]:
tabulate_results_knn(dataset, query, 15)

query: pizza, cluster size: 15
mask_vec          avg_context_vec    avg_concat_mask_vec
----------------  -----------------  ---------------------
burger            shawarma           poutine
ramen             desert             ramen
pho               barbecue           dosa
dosa              cheesesteak        deep_dish pizza
wood fired pizza  sushi              cheesesteak
gelato            popcorn            thin crust pizza
poke bowl         ramen              paella
sushi             thin crust pizza   hamburger
pie               gyros              biryani
pan roast         house              schnitzel
hotdog            coffee             shawarma
banh mi           baklava            chowder
hot pot           gelato             pastry
bubble tea        bagels             milk tea
coffee            pasty              teriyaki bowl


In [52]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: pizza, cluster size: 100
mask_vec                    avg_context_vec                   avg_concat_mask_vec
--------------------------  --------------------------------  --------------------------------
poutine<=>0.9955            shawarma<=>0.9984                 poutine<=>0.9970
burger<=>0.9941             desert<=>0.9979                   ramen<=>0.9962
ramen<=>0.9939              barbecue<=>0.9979                 dosa<=>0.9958
pho<=>0.9939                cheesesteak<=>0.9978              deep_dish pizza<=>0.9947
dosa<=>0.9935               sushi<=>0.9974                    cheesesteak<=>0.9941
wood fired pizza<=>0.9920   popcorn<=>0.9974                  thin crust pizza<=>0.9940
paella<=>0.9917             ramen<=>0.9973                    paella<=>0.9940
gelato<=>0.9917             taco<=>0.9972                     gelato<=>0.9938
poke bowl<=>0.9914          poutine<=>0.9971                  hamburger<=>0.9936
sushi<=>0.9911              thin crust pizza<=>0.9970         bi

#### price

In [445]:
query = 'happy_hour prices'

In [446]:
tabulate_results_knn(dataset, query, 15)

query: happy_hour prices, cluster size: 15
mask_vec    avg_context_vec       avg_concat_mask_vec
----------  --------------------  -------------------------
            food                  hh prices
            cocktails             cocktails
            draft selection       late_night specials
            specialty drinks      happy_hour drink_specials
            hookah                customer services
            dogs                  low prices
            craft_beer selection  entertainment
            draft_beer selection  room rates
            table side guacamole  kids meals
            sushi                 specialty drinks
            so many options       results
            so many choices       late_night hours
            coffee                dogs
            free popcorn          latte art
            late_night specials   veggie options


#### Dessert (C)

In [53]:
query = 'dessert'

In [421]:
tabulate_results_knn(dataset, query, 20)

query: dessert, cluster size: 20
mask_vec        avg_context_vec      avg_concat_mask_vec
--------------  -------------------  ---------------------
desert          desert               desert
breakfast       cannoli              brunch
pho             baklava              gelato
dim sum         popcorn              ramen
bubble tea      mimosa               sake
frozen custard  pizza                tea
takeaway        coconut cake         poutine
afternoon tea   smoothie             pizza
coffee          milkshake            coffee
halo halo       tea                  an appetizer
sake            gelato               dosa
pizza           cappuccino           soft_serve ice_cream
sushi           butter cake          tapas
takeout         champagne            appetizer
brunch          sake                 sicilian pizza
frozen yogurt   banana split         baklava
happy hour      an app               gluten_free pasta
dosa            coffee               deep_dish pizza
shave ice       

In [54]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: dessert, cluster size: 100
mask_vec                     avg_context_vec                   avg_concat_mask_vec
---------------------------  --------------------------------  -----------------------------
desert<=>0.9938              desert<=>0.9987                   desert<=>0.9929
breakfast<=>0.9879           cannoli<=>0.9972                  brunch<=>0.9854
pho<=>0.9851                 baklava<=>0.9971                  gelato<=>0.9846
dim sum<=>0.9842             popcorn<=>0.9969                  ramen<=>0.9844
bubble tea<=>0.9841          cheesecake<=>0.9968               sake<=>0.9837
frozen custard<=>0.9839      mimosa<=>0.9966                   tea<=>0.9837
takeaway<=>0.9838            pizza<=>0.9964                    poutine<=>0.9836
afternoon tea<=>0.9835       coconut cake<=>0.9963             pizza<=>0.9830
gelato<=>0.9834              monkey bread<=>0.9962             coffee<=>0.9830
boba<=>0.9832                smoothie<=>0.9961                 high tea<=>0.9825
coff

#### butter cake

In [58]:
query = 'cake'

In [59]:
tabulate_results_knn(dataset, query, 20)

query: cake, cluster size: 20
mask_vec       avg_context_vec    avg_concat_mask_vec
-------------  -----------------  ---------------------
crepe          muffin             pancake
pancake        cheesecake         muffin
pie            pastry             fondue
cheesecake     chocolate          cheesecake
souffle        napoleon           empanada
sundae         carrot cake        milk tea
cookie         scone              flatbread
bread pudding  cherry             sandwich
pudding        pancake            strudel
croissant      sundae             pastry
scone          sprinkles          chowder
fondue         churro             ravioli
roll           custard            scone
cupcake        croissant          tamale
smoothie       turtle             smoothie
panna cotta    biscotti           panini
salad          danish             danish
soup           strudel            sorbet
ice cream      truffles           arepa
macaron        oatmeal            roti


In [60]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: cake, cluster size: 100
mask_vec                 avg_context_vec                 avg_concat_mask_vec
-----------------------  ------------------------------  ------------------------
crepe<=>0.9938           muffin<=>0.9982                 pancake<=>0.9956
pancake<=>0.9937         cheesecake<=>0.9981             muffin<=>0.9953
pie<=>0.9935             pastry<=>0.9977                 fondue<=>0.9937
cheesecake<=>0.9932      chocolate<=>0.9975              cheesecake<=>0.9936
souffle<=>0.9929         napoleon<=>0.9974               empanada<=>0.9934
sundae<=>0.9916          carrot cake<=>0.9973            milk tea<=>0.9932
cookie<=>0.9916          scone<=>0.9973                  flatbread<=>0.9931
bread pudding<=>0.9904   cherry<=>0.9973                 sandwich<=>0.9930
pudding<=>0.9901         pancake<=>0.9971                strudel<=>0.9925
croissant<=>0.9900       sundae<=>0.9970                 pastry<=>0.9923
scone<=>0.9895           sprinkles<=>0.9970              chowder<

#### Very Large K

In [29]:
query = 'food'

In [32]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: food, cluster size: 100
mask_vec                       avg_context_vec                  avg_concat_mask_vec
-----------------------------  -------------------------------  --------------------------------------
wine list<=>0.9872             sushi<=>0.9962                   thin crust pizza<=>0.9866
product<=>0.9846               beer<=>0.9953                    oxtail soup<=>0.9863
pricing<=>0.9843               coffee<=>0.9952                  pizza<=>0.9862
beer list<=>0.9840             buffet<=>0.9952                  table side guacamole<=>0.9860
portion size<=>0.9837          happy_hour prices<=>0.9949       atmosphere<=>0.9859
food quality<=>0.9836          hookah<=>0.9948                  patio seating<=>0.9858
customer service<=>0.9834      energy<=>0.9947                  sangria<=>0.9856
pizza<=>0.9834                 table side guacamole<=>0.9946    customer services<=>0.9853
cocktail list<=>0.9826         pizza<=>0.9939                   deep_dish pizza<=>0.9853
at

In [33]:
tabulate_results_knn_sim(dataset, query, 500, 1)

query: food, cluster size: 500
mask_vec                              avg_context_vec                           avg_concat_mask_vec
------------------------------------  ----------------------------------------  --------------------------------------
wine list<=>0.9872                    sushi<=>0.9962                            thin crust pizza<=>0.9866
product<=>0.9846                      beer<=>0.9953                             oxtail soup<=>0.9863
pricing<=>0.9843                      coffee<=>0.9952                           pizza<=>0.9862
beer list<=>0.9840                    buffet<=>0.9952                           table side guacamole<=>0.9860
portion size<=>0.9837                 happy_hour prices<=>0.9949                atmosphere<=>0.9859
food quality<=>0.9836                 hookah<=>0.9948                           patio seating<=>0.9858
customer service<=>0.9834             energy<=>0.9947                           sangria<=>0.9856
pizza<=>0.9834                        

In [34]:
tabulate_results_knn_sim(dataset, query, 1000, 1)

query: food, cluster size: 1000
mask_vec                              avg_context_vec                           avg_concat_mask_vec
------------------------------------  ----------------------------------------  --------------------------------------
wine list<=>0.9872                    sushi<=>0.9962                            thin crust pizza<=>0.9866
product<=>0.9846                      beer<=>0.9953                             oxtail soup<=>0.9863
pricing<=>0.9843                      coffee<=>0.9952                           pizza<=>0.9862
beer list<=>0.9840                    buffet<=>0.9952                           table side guacamole<=>0.9860
portion size<=>0.9837                 happy_hour prices<=>0.9949                atmosphere<=>0.9859
food quality<=>0.9836                 hookah<=>0.9948                           patio seating<=>0.9858
customer service<=>0.9834             energy<=>0.9947                           sangria<=>0.9856
pizza<=>0.9834                       

### Employee (C)

In [375]:
query = 'employee' #staff/staff member

In [376]:
tabulate_results_knn(dataset, query, 15)

query: employee, cluster size: 15
mask_vec           avg_context_vec    avg_concat_mask_vec
-----------------  -----------------  ---------------------
older lady
guy
girl
lady
older man
woman
gal
staff member
barista
attendant
clerk
gentleman
young man
young lady
assistant manager


#### Manager

In [366]:
query = 'manager'

In [367]:
tabulate_results_knn(dataset, query, 15)

query: manager, cluster size: 15
mask_vec           avg_context_vec    avg_concat_mask_vec
-----------------  -----------------  ---------------------
cashier            mgr                mgr
store manager      gm                 bar tender
mgr                bar tender         bartender
general manager    female server      sushi chef
gm                 male server        barista
owner              an employee        gm
hostess            female owner       busboy
bartender          security guard     pharmacist
delivery driver    front desk         sommelier
barista            cashier lady       chef
shift manager      bartender          female owner
clerk              busboy             security guard
assistant manager  barista            district manager
bus boy            delivery man       delivery man
receptionist       sushi chef         cashier lady


In [368]:
tabulate_results_knn(dataset, query, 20)

query: manager, cluster size: 20
mask_vec           avg_context_vec    avg_concat_mask_vec
-----------------  -----------------  ---------------------
cashier            mgr                mgr
store manager      gm                 bar tender
mgr                bar tender         bartender
general manager    female server      sushi chef
gm                 male server        barista
owner              an employee        gm
hostess            female owner       busboy
bartender          security guard     pharmacist
delivery driver    front desk         sommelier
barista            cashier lady       chef
shift manager      bartender          female owner
clerk              busboy             security guard
assistant manager  barista            district manager
bus boy            delivery man       delivery man
receptionist       sushi chef         cashier lady
greeter            front counter      concierge
pharmacist         young guy          male server
driver             security   

#### An Employee

In [377]:
query = 'an employee'

In [378]:
tabulate_results_knn(dataset, query, 15)

query: an employee, cluster size: 15
mask_vec    avg_context_vec       avg_concat_mask_vec
----------  --------------------  ---------------------
            another employee      another employee
            manager               another patron
            another patron        another staff_member
            security guard        an older_woman
            security              an older_gentleman
            female server         his boss
            mgr                   multiple employees
            an attitude           an older_lady
            an older_woman        her boss
            poor girl             an older_man
            bar tender            my entire_family
            an older_gentleman    security
            young guy             my aunt
            another staff_member  my grandfather
            front desk            jesus


In [379]:
tabulate_results_knn(dataset, query, 20)

query: an employee, cluster size: 20
mask_vec    avg_context_vec       avg_concat_mask_vec
----------  --------------------  ---------------------
            another employee      another employee
            manager               another patron
            another patron        another staff_member
            security guard        an older_woman
            security              an older_gentleman
            female server         his boss
            mgr                   multiple employees
            an attitude           an older_lady
            an older_woman        her boss
            poor girl             an older_man
            bar tender            my entire_family
            an older_gentleman    security
            young guy             my aunt
            another staff_member  more than one person
            front desk            my grandfather
            his attention         jesus
            front counter         kerry
            speaker               my cousi

#### Very Large K

In [42]:
query = 'manager'

In [43]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: manager, cluster size: 100
mask_vec                       avg_context_vec                avg_concat_mask_vec
-----------------------------  -----------------------------  ---------------------------
cashier<=>0.9944               mgr<=>0.9961                   mgr<=>0.9964
store manager<=>0.9930         gm<=>0.9960                    bar tender<=>0.9955
mgr<=>0.9930                   bar tender<=>0.9957            bartender<=>0.9947
general manager<=>0.9929       female server<=>0.9950         sushi chef<=>0.9937
gm<=>0.9929                    male server<=>0.9948           barista<=>0.9934
owner<=>0.9926                 an employee<=>0.9948           gm<=>0.9931
hostess<=>0.9922               female owner<=>0.9946          busboy<=>0.9929
bartender<=>0.9922             security guard<=>0.9945        pharmacist<=>0.9926
delivery driver<=>0.9921       front desk<=>0.9943            sommelier<=>0.9918
barista<=>0.9919               cashier lady<=>0.9942          chef<=>0.9907
shif

In [44]:
tabulate_results_knn_sim(dataset, query, 500, 1)

query: manager, cluster size: 500
mask_vec                       avg_context_vec                    avg_concat_mask_vec
-----------------------------  ---------------------------------  --------------------------------------
cashier<=>0.9944               mgr<=>0.9961                       mgr<=>0.9964
store manager<=>0.9930         gm<=>0.9960                        bar tender<=>0.9955
mgr<=>0.9930                   bar tender<=>0.9957                bartender<=>0.9947
general manager<=>0.9929       female server<=>0.9950             sushi chef<=>0.9937
gm<=>0.9929                    male server<=>0.9948               barista<=>0.9934
owner<=>0.9926                 an employee<=>0.9948               gm<=>0.9931
hostess<=>0.9922               female owner<=>0.9946              busboy<=>0.9929
bartender<=>0.9922             security guard<=>0.9945            pharmacist<=>0.9926
delivery driver<=>0.9921       front desk<=>0.9943                sommelier<=>0.9918
barista<=>0.9919         

In [45]:
tabulate_results_knn_sim(dataset, query, 1000, 1)

query: manager, cluster size: 1000
mask_vec                            avg_context_vec                        avg_concat_mask_vec
----------------------------------  -------------------------------------  --------------------------------------
cashier<=>0.9944                    mgr<=>0.9961                           mgr<=>0.9964
store manager<=>0.9930              gm<=>0.9960                            bar tender<=>0.9955
mgr<=>0.9930                        bar tender<=>0.9957                    bartender<=>0.9947
general manager<=>0.9929            female server<=>0.9950                 sushi chef<=>0.9937
gm<=>0.9929                         male server<=>0.9948                   barista<=>0.9934
owner<=>0.9926                      an employee<=>0.9948                   gm<=>0.9931
hostess<=>0.9922                    female owner<=>0.9946                  busboy<=>0.9929
bartender<=>0.9922                  security guard<=>0.9945                pharmacist<=>0.9926
delivery driver<=>0

### Atmosphere (C)

In [96]:
query = 'atmosphere'

In [99]:
tabulate_results_knn(dataset, query, 15)

query: atmosphere, cluster size: 15
mask_vec             avg_context_vec       avg_concat_mask_vec
-------------------  --------------------  ---------------------
ambiance             laid back atmosphere  d cor
interior design      ambient               interior decor
scenery              interior decor        craft_beer selection
vibe                 cool decor            patio seating
decore               d cor                 food
wine list            fun vibe              interior decoration
deco                 fun atmosphere        draft selection
food                 patio seating         draft_beer selection
music choice         design                beverage selection
beer list            craft_beer selection  music
cocktail list        romantic atmosphere   design
seating arrangement  retro decor           architecture
concept              chill environment     outdoor_seating area
outdoor space        fun environment       store layout
interior             modern decor    

In [100]:
tabulate_results_knn(dataset, query, 20)

query: atmosphere, cluster size: 20
mask_vec             avg_context_vec        avg_concat_mask_vec
-------------------  ---------------------  ---------------------
ambiance             laid back atmosphere   d cor
interior design      ambient                interior decor
scenery              interior decor         craft_beer selection
vibe                 cool decor             patio seating
decore               d cor                  food
wine list            fun vibe               interior decoration
deco                 fun atmosphere         draft selection
food                 patio seating          draft_beer selection
music choice         design                 beverage selection
beer list            craft_beer selection   music
cocktail list        romantic atmosphere    design
seating arrangement  retro decor            architecture
concept              chill environment      outdoor_seating area
outdoor space        fun environment        store layout
interior             

#### Fun Atmosphere

In [380]:
query = 'fun atmosphere'

In [381]:
tabulate_results_knn(dataset, query, 15)

query: fun atmosphere, cluster size: 15
mask_vec    avg_context_vec             avg_concat_mask_vec
----------  --------------------------  ---------------------
            fun environment             fun environment
            cool atmosphere             charming atmosphere
            laid back atmosphere        fun vibe
            chill environment           chill environment
            excellent customer_service  cool atmosphere
            impeccable service          welcoming atmosphere
            fast and friendly service   beautiful space
            fast friendly service       clean environment
            clean environment           pleasant environment
            casual environment          laid back atmosphere
            ice_cold beer               casual environment
            decent prices               romantic atmosphere
            atmosphere                  comfortable ambience
            charming atmosphere         fun concept
            craft_beer selecti

In [382]:
tabulate_results_knn(dataset, query, 20)

query: fun atmosphere, cluster size: 20
mask_vec    avg_context_vec             avg_concat_mask_vec
----------  --------------------------  ------------------------
            fun environment             fun environment
            cool atmosphere             charming atmosphere
            laid back atmosphere        fun vibe
            chill environment           chill environment
            excellent customer_service  cool atmosphere
            impeccable service          welcoming atmosphere
            fast and friendly service   beautiful space
            fast friendly service       clean environment
            clean environment           pleasant environment
            casual environment          laid back atmosphere
            ice_cold beer               casual environment
            decent prices               romantic atmosphere
            atmosphere                  comfortable ambience
            charming atmosphere         fun concept
            craft_beer sele

#### Poor Atmosphere

In [391]:
query = 'poor vibe' # bad atmosphere/poor atmosphere/poor vibe

In [390]:
tabulate_results_knn(dataset, query, 15)

query: poor vibe, cluster size: 15
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


#### Very Large K

In [50]:
query = 'atmosphere'

In [51]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: atmosphere, cluster size: 100
mask_vec                       avg_context_vec                      avg_concat_mask_vec
-----------------------------  -----------------------------------  -----------------------------
ambiance<=>0.9980              laid back atmosphere<=>0.9951        d cor<=>0.9933
interior design<=>0.9890       ambient<=>0.9945                     interior decor<=>0.9920
scenery<=>0.9887               interior decor<=>0.9944              craft_beer selection<=>0.9873
vibe<=>0.9878                  cool decor<=>0.9944                  patio seating<=>0.9868
decore<=>0.9870                d cor<=>0.9943                       food<=>0.9859
wine list<=>0.9837             fun vibe<=>0.9929                    interior decoration<=>0.9850
deco<=>0.9824                  fun atmosphere<=>0.9928              draft selection<=>0.9846
food<=>0.9821                  patio seating<=>0.9928               draft_beer selection<=>0.9843
music choice<=>0.9817          design<=>0.9

In [52]:
tabulate_results_knn_sim(dataset, query, 500, 1)

query: atmosphere, cluster size: 500
mask_vec                              avg_context_vec                         avg_concat_mask_vec
------------------------------------  --------------------------------------  --------------------------------------
ambiance<=>0.9980                     laid back atmosphere<=>0.9951           d cor<=>0.9933
interior design<=>0.9890              ambient<=>0.9945                        interior decor<=>0.9920
scenery<=>0.9887                      interior decor<=>0.9944                 craft_beer selection<=>0.9873
vibe<=>0.9878                         cool decor<=>0.9944                     patio seating<=>0.9868
decore<=>0.9870                       d cor<=>0.9943                          food<=>0.9859
wine list<=>0.9837                    fun vibe<=>0.9929                       interior decoration<=>0.9850
deco<=>0.9824                         fun atmosphere<=>0.9928                 draft selection<=>0.9846
food<=>0.9821                         pati

In [53]:
tabulate_results_knn_sim(dataset, query, 1000, 1)

query: atmosphere, cluster size: 1000
mask_vec                              avg_context_vec                           avg_concat_mask_vec
------------------------------------  ----------------------------------------  --------------------------------------
ambiance<=>0.9980                     laid back atmosphere<=>0.9951             d cor<=>0.9933
interior design<=>0.9890              ambient<=>0.9945                          interior decor<=>0.9920
scenery<=>0.9887                      interior decor<=>0.9944                   craft_beer selection<=>0.9873
vibe<=>0.9878                         cool decor<=>0.9944                       patio seating<=>0.9868
decore<=>0.9870                       d cor<=>0.9943                            food<=>0.9859
wine list<=>0.9837                    fun vibe<=>0.9929                         interior decoration<=>0.9850
deco<=>0.9824                         fun atmosphere<=>0.9928                   draft selection<=>0.9846
food<=>0.9821          

### Service (C)

In [392]:
query = 'customer service' #service, experience

In [117]:
tabulate_results_knn(dataset, query, 15)

query: customer service, cluster size: 15
mask_vec             avg_context_vec    avg_concat_mask_vec
-------------------  -----------------  ---------------------
costumer service
hospitality
pricing
food quality
customer experience
happy hour pricing
value
atmosphere
ambiance
music choice
quality control
hygiene
vibes
organization
teamwork


In [118]:
tabulate_results_knn(dataset, query, 20)

query: customer service, cluster size: 20
mask_vec             avg_context_vec    avg_concat_mask_vec
-------------------  -----------------  ---------------------
costumer service
hospitality
pricing
food quality
customer experience
happy hour pricing
eye candy
value
atmosphere
ambiance
music choice
wine list
quality control
hygiene
vibes
organization
teamwork
service recovery
energy
product


#### Great Service

In [422]:
query = 'impeccable service' #great/good/poor

In [423]:
tabulate_results_knn(dataset, query, 15)

query: impeccable service, cluster size: 15
mask_vec    avg_context_vec             avg_concat_mask_vec
----------  --------------------------  ----------------------------
            excellent customer_service  excellent customer_service
            fun atmosphere              extremely_friendly staff
            personalized service        cool decor
            authentic_italian food      big portions
            ice_cold beer               ice_cold beer
            quick and friendly service  exceptional customer_service
            fun environment             free wifi
            huge portions               fast friendly service
            fast and friendly service   large portion_sizes
            food                        stiff drinks
            stellar                     low prices
            first class                 cute decor
            atmosphere                  extremely_slow service
            cocktails                   personalized service
            low p

In [424]:
tabulate_results_knn(dataset, query, 20)

query: impeccable service, cluster size: 20
mask_vec    avg_context_vec             avg_concat_mask_vec
----------  --------------------------  ----------------------------
            excellent customer_service  excellent customer_service
            fun atmosphere              extremely_friendly staff
            personalized service        cool decor
            authentic_italian food      big portions
            ice_cold beer               ice_cold beer
            quick and friendly service  exceptional customer_service
            fun environment             free wifi
            huge portions               fast friendly service
            fast and friendly service   large portion_sizes
            food                        stiff drinks
            stellar                     low prices
            first class                 cute decor
            atmosphere                  extremely_slow service
            cocktails                   personalized service
            low p

In [425]:
query = 'excellent service' #great/good/poor

In [426]:
tabulate_results_knn(dataset, query, 15)

query: excellent service, cluster size: 15
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


#### Very Large K

In [57]:
query = 'impeccable service'

In [58]:
tabulate_results_knn_sim(dataset, query, 100, 1)

query: impeccable service, cluster size: 100
mask_vec    avg_context_vec                        avg_concat_mask_vec
----------  -------------------------------------  --------------------------------------
            excellent customer_service<=>0.9958    excellent customer_service<=>0.9946
            fun atmosphere<=>0.9938                extremely_friendly staff<=>0.9878
            personalized service<=>0.9925          huge portions<=>0.9871
            authentic_italian food<=>0.9922        cool decor<=>0.9868
            ice_cold beer<=>0.9920                 quick and friendly service<=>0.9865
            quick and friendly service<=>0.9916    big portions<=>0.9863
            fun environment<=>0.9912               ice_cold beer<=>0.9859
            huge portions<=>0.9911                 exceptional customer_service<=>0.9849
            fast and friendly service<=>0.9911     free wifi<=>0.9847
            food<=>0.9910                          fast friendly service<=>0.9844
  

## IMDB

In [77]:
dataset = 'imdb'

### Movie (C)

In [78]:
query = 'movie' #movie film picture

In [81]:
tabulate_results_knn_sim2(dataset, query, 20,1)

query: movie, cluster size: 20
mask_vec                    avg_concat_mask_vec
--------------------------  --------------------------
film<=>0.9979               film<=>0.9979
documentary<=>0.9877        turkey<=>0.9882
turkey<=>0.9852             documentary<=>0.9867
game<=>0.9828               miniseries<=>0.9864
motion picture<=>0.9820     game<=>0.9838
cartoon<=>0.9816            short film<=>0.9817
short film<=>0.9813         play<=>0.9776
miniseries<=>0.9789         entire series<=>0.9745
play<=>0.9775               ghost story<=>0.9740
slasher film<=>0.9746       romantic comedy<=>0.9737
horror film<=>0.9736        television series<=>0.9735
music video<=>0.9733        final sequence<=>0.9733
sitcom<=>0.9733             crime drama<=>0.9728
television series<=>0.9731  movie 's plot<=>0.9716
sequence<=>0.9728           sequence<=>0.9714
final episode<=>0.9719      thriller<=>0.9714
thriller<=>0.9718           sequel<=>0.9712
romantic comedy<=>0.9718    road movie<=>0.9711
entire 

#### titanic

In [82]:
query = 'titanic'

In [83]:
tabulate_results_knn_sim2(dataset, query, 20,1)

query: titanic, cluster size: 20
mask_vec                       avg_concat_mask_vec
-----------------------------  ----------------------
jane eyre<=>0.9818             cube<=>0.9874
grand canyon<=>0.9809          pet sematary<=>0.9862
bonanza<=>0.9805               hamlet<=>0.9851
deliverance<=>0.9802           jane eyre<=>0.9846
hamlet<=>0.9795                stargate<=>0.9833
star wars<=>0.9772             grand canyon<=>0.9828
midnight cowboy<=>0.9767       power rangers<=>0.9826
pet sematary<=>0.9765          planet earth<=>0.9826
halloween<=>0.9763             deliverance<=>0.9821
planet earth<=>0.9757          star wars<=>0.9812
natural born killers<=>0.9755  hostel<=>0.9812
moonstruck<=>0.9752            bonanza<=>0.9811
sky captain<=>0.9751           purple rain<=>0.9806
edison<=>0.9750                boogie nights<=>0.9806
boogie nights<=>0.9750         halloween<=>0.9805
south park<=>0.9750            edison<=>0.9803
pulp fiction<=>0.9746          cliffhanger<=>0.9802
soylen

#### pulp fiction

In [91]:
query = 'pulp fiction'

In [92]:
tabulate_results_knn_sim2(dataset, query, 20,1)

query: pulp fiction, cluster size: 20
mask_vec                avg_concat_mask_vec
----------------------  -----------------------------
sin city<=>0.9837       blade runner<=>0.9887
star wars<=>0.9829      jurassic park<=>0.9886
jurassic park<=>0.9813  casablanca<=>0.9883
citizen kane<=>0.9813   citizen kane<=>0.9877
south park<=>0.9804     sin city<=>0.9876
anchorman<=>0.9796      boogie nights<=>0.9876
casablanca<=>0.9786     fight club<=>0.9873
jane eyre<=>0.9782      transformers<=>0.9873
boogie nights<=>0.9781  total recall<=>0.9860
blade runner<=>0.9780   office space<=>0.9859
chinatown<=>0.9774      natural born killers<=>0.9858
office space<=>0.9772   starship troopers<=>0.9851
halloween<=>0.9771      goodfellas<=>0.9851
fight club<=>0.9764     anchorman<=>0.9850
dracula<=>0.9758        showgirls<=>0.9847
braveheart<=>0.9758     twin peaks<=>0.9847
animal house<=>0.9758   animal house<=>0.9846
hostel<=>0.9755         star wars<=>0.9846
bonanza<=>0.9753        blazing saddles<=>

### Genre

In [93]:
query = 'genre'

In [94]:
tabulate_results_knn_sim2(dataset, query, 20,1)

query: genre, cluster size: 20
mask_vec                 avg_concat_mask_vec
-----------------------  -----------------------
horror genre<=>0.9816    sub genre<=>0.9857
era<=>0.9797             era<=>0.9831
sub genre<=>0.9769       universe<=>0.9819
industry<=>0.9759        horror genre<=>0.9818
trilogy<=>0.9756         trilogy<=>0.9780
period<=>0.9731          period<=>0.9768
saga<=>0.9718            industry<=>0.9767
subject matter<=>0.9703  sport<=>0.9767
cartoon<=>0.9696         game<=>0.9749
game<=>0.9694            slasher genre<=>0.9742
narrative<=>0.9685       saga<=>0.9731
film genre<=>0.9684      play<=>0.9728
film<=>0.9684            film genre<=>0.9727
film noir<=>0.9675       cartoon<=>0.9721
anime<=>0.9674           film<=>0.9714
play<=>0.9674            narrative<=>0.9704
comics<=>0.9667          entire series<=>0.9704
comedy<=>0.9666          landscape<=>0.9703
production<=>0.9657      country<=>0.9698
sport<=>0.9656           subject matter<=>0.9695


#### comedy

In [95]:
query = 'comedy'

In [96]:
tabulate_results_knn_sim2(dataset, query, 20,1)

query: comedy, cluster size: 20
mask_vec                   avg_concat_mask_vec
-------------------------  -------------------------
drama<=>0.9939             drama<=>0.9954
thriller<=>0.9919          black comedy<=>0.9951
romantic comedy<=>0.9908   romantic comedy<=>0.9918
farce<=>0.9904             melodrama<=>0.9918
black comedy<=>0.9903      thriller<=>0.9916
fantasy<=>0.9898           adventure<=>0.9914
soap opera<=>0.9886        farce<=>0.9912
satire<=>0.9883            satire<=>0.9907
dark comedy<=>0.9875       ghost story<=>0.9906
horror film<=>0.9870       fantasy<=>0.9901
melodrama<=>0.9870         fairy tale<=>0.9897
ghost story<=>0.9864       screwball comedy<=>0.9896
cartoon<=>0.9856           murder mystery<=>0.9888
screwball comedy<=>0.9853  soap opera<=>0.9888
soap<=>0.9850              romance<=>0.9885
horror movie<=>0.9848      sitcom<=>0.9881
fairy tale<=>0.9844        cartoon<=>0.9880
sitcom<=>0.9844            family drama<=>0.9879
mystery<=>0.9843           horror

### Character / role

In [117]:
query = 'role'

In [118]:
tabulate_results_knn_sim2(dataset, query, 20,1)

query: role, cluster size: 20
mask_vec    avg_concat_mask_vec
----------  ---------------------


In [119]:
query = 'character'

In [120]:
tabulate_results_knn_sim2(dataset, query, 20,1)

query: character, cluster size: 20
mask_vec    avg_concat_mask_vec
----------  ---------------------


### Hero

In [121]:
query = 'hero'

In [122]:
tabulate_results_knn_sim2(dataset, query, 20,1)

query: hero, cluster size: 20
mask_vec                 avg_concat_mask_vec
-----------------------  --------------------------
villain<=>0.9918         villain<=>0.9933
protagonist<=>0.9901     bad guy<=>0.9924
bad guy<=>0.9891         protagonist<=>0.9908
soldier<=>0.9852         leading man<=>0.9880
leading man<=>0.9848     antagonist<=>0.9877
mad scientist<=>0.9833   soldier<=>0.9864
antagonist<=>0.9831      mad scientist<=>0.9851
detective<=>0.9825       priest<=>0.9847
judge<=>0.9819           detective<=>0.9843
priest<=>0.9815          judge<=>0.9842
main character<=>0.9815  rapist<=>0.9842
sheriff<=>0.9802         sheriff<=>0.9841
tough guy<=>0.9787       male lead<=>0.9840
lawyer<=>0.9786          central character<=>0.9833
master<=>0.9782          main character<=>0.9829
dentist<=>0.9774         psychopath<=>0.9818
psychopath<=>0.9774      pilot<=>0.9817
king<=>0.9773            lawyer<=>0.9816
rapist<=>0.9773          guru<=>0.9816
male lead<=>0.9772       clown<=>0.9815


### Actor (C)

In [97]:
query = 'actor'

In [98]:
tabulate_results_knn_sim2(dataset, query, 20,1)

query: actor, cluster size: 20
mask_vec                  avg_concat_mask_vec
------------------------  ------------------------
artist<=>0.9841           actress<=>0.9897
actress<=>0.9839          comedian<=>0.9871
comedian<=>0.9788         artist<=>0.9858
character actor<=>0.9735  singer<=>0.9800
leading man<=>0.9728      leading man<=>0.9794
singer<=>0.9722           character actor<=>0.9790
auteur<=>0.9694           filmmaker<=>0.9778
villain<=>0.9692          child actor<=>0.9778
filmmaker<=>0.9676        musician<=>0.9773
anti hero<=>0.9675        anti hero<=>0.9771
child actor<=>0.9674      villain<=>0.9769
playwright<=>0.9653       athlete<=>0.9754
archaeologist<=>0.9646    film director<=>0.9749
director<=>0.9639         rapper<=>0.9749
opera singer<=>0.9638     playwright<=>0.9735
boxer<=>0.9637            boxer<=>0.9734
musician<=>0.9635         martial artist<=>0.9730
dancer<=>0.9630           dancer<=>0.9728
icon<=>0.9625             auteur<=>0.9713
editor<=>0.9624         

#### Tom Hanks

In [101]:
query = 'tom hanks'

In [102]:
tabulate_results_knn_sim2(dataset, query, 20, 1)

query: tom hanks, cluster size: 20
mask_vec                    avg_concat_mask_vec
--------------------------  ---------------------------
robin williams<=>0.9918     robin williams<=>0.9936
brad pitt<=>0.9906          peter sellers<=>0.9933
burt reynolds<=>0.9901      burt reynolds<=>0.9926
morgan freeman<=>0.9898     brad pitt<=>0.9920
denzel washington<=>0.9895  steve mcqueen<=>0.9915
steve mcqueen<=>0.9891      bruce willis<=>0.9914
al pacino<=>0.9889          charlton heston<=>0.9914
adam sandler<=>0.9887       steve martin<=>0.9913
peter sellers<=>0.9885      john wayne<=>0.9913
nicolas cage<=>0.9881       christopher walken<=>0.9912
charlton heston<=>0.9871    morgan freeman<=>0.9912
marlon brando<=>0.9870      jimmy stewart<=>0.9912
john cusack<=>0.9870        christopher lee<=>0.9911
bill paxton<=>0.9870        adam sandler<=>0.9910
james cagney<=>0.9866       denzel washington<=>0.9908
robert de niro<=>0.9864     al pacino<=>0.9907
michael caine<=>0.9864      john cusack<=>0.

### Director

In [103]:
query = 'director'

In [106]:
tabulate_results_knn_sim2(dataset, query, 20, 1)

query: director, cluster size: 20
mask_vec                  avg_concat_mask_vec
------------------------  ----------------------------
screenwriter<=>0.9933     screenwriter<=>0.9955
producer<=>0.9929         producer<=>0.9911
writer<=>0.9868           author<=>0.9883
author<=>0.9867           editor<=>0.9876
cinematographer<=>0.9855  writer<=>0.9872
filmmaker<=>0.9854        filmmaker<=>0.9866
editor<=>0.9818           lead actor<=>0.9840
lead actor<=>0.9818       cinematographer<=>0.9839
leading man<=>0.9734      composer<=>0.9814
hero<=>0.9730             choreographer<=>0.9809
choreographer<=>0.9725    production designer<=>0.9790
protagonist<=>0.9715      protagonist<=>0.9769
photographer<=>0.9712     judge<=>0.9760
star<=>0.9708             auteur<=>0.9755
playwright<=>0.9700       playwright<=>0.9752
detective<=>0.9697        hero<=>0.9752
judge<=>0.9695            male lead<=>0.9749
villain<=>0.9694          leading man<=>0.9744
singer<=>0.9689           professor<=>0.9743
male

#### martin scorsese

In [107]:
query = 'scorsese'

In [108]:
tabulate_results_knn_sim2(dataset, query, 20, 1)

query: scorsese, cluster size: 20
mask_vec                    avg_concat_mask_vec
--------------------------  -----------------------
kubrick<=>0.9791            kubrick<=>0.9888
james cameron<=>0.9770      spielberg<=>0.9872
antonioni<=>0.9753          antonioni<=>0.9860
spielberg<=>0.9753          eastwood<=>0.9845
eastwood<=>0.9723           john carpenter<=>0.9844
morgan freeman<=>0.9723     de palma<=>0.9837
wes craven<=>0.9723         fassbinder<=>0.9834
steve carell<=>0.9721       altman<=>0.9832
denzel washington<=>0.9721  kurosawa<=>0.9831
peter sellers<=>0.9719      john ford<=>0.9830
von trier<=>0.9713          wes craven<=>0.9828
kazan<=>0.9712              kazan<=>0.9822
chaplin<=>0.9708            godard<=>0.9821
john ford<=>0.9708          ford<=>0.9817
steven spielberg<=>0.9705   james cameron<=>0.9812
tony scott<=>0.9700         van damme<=>0.9809
sean connery<=>0.9697       uwe boll<=>0.9808
de palma<=>0.9696           robert altman<=>0.9802
anthony quinn<=>0.9689    

### experience

In [125]:
query = 'performance'

In [126]:
tabulate_results_knn_sim2(dataset, query, 20, 1)

query: performance, cluster size: 20
mask_vec    avg_concat_mask_vec
----------  ---------------------


#### flat

In [127]:
query = 'flat'

In [128]:
tabulate_results_knn_sim2(dataset, query, 20, 1)

query: flat, cluster size: 20
mask_vec    avg_concat_mask_vec
----------  ---------------------


#### funny

In [133]:
query = 'funny'

In [134]:
tabulate_results_knn_sim2(dataset, query, 20, 1)

query: funny, cluster size: 20
mask_vec    avg_concat_mask_vec
----------  ---------------------


#### thrilling

In [135]:
query = 'thrilling'

In [136]:
tabulate_results_knn_sim2(dataset, query, 20, 1)

query: thrilling, cluster size: 20
mask_vec    avg_concat_mask_vec
----------  ---------------------


### Rating

In [137]:
query = 'rating'

In [138]:
tabulate_results_knn_sim2(dataset, query, 20, 1)

query: rating, cluster size: 20
mask_vec    avg_concat_mask_vec
----------  ---------------------


#### average

In [141]:
query = 'average'

In [142]:
tabulate_results_knn_sim2(dataset, query, 20, 1)

query: average, cluster size: 20
mask_vec                avg_concat_mask_vec
----------------------  --------------------------
amateur<=>0.9741        amateur<=>0.9726
independent<=>0.9711    typical hollywood<=>0.9696
american<=>0.9706       independent<=>0.9683
aussie<=>0.9701         american<=>0.9671
asian<=>0.9681          low rent<=>0.9659
australian<=>0.9661     silent<=>0.9657
italian<=>0.9659        hardcore<=>0.9649
silent<=>0.9630         aussie<=>0.9642
old fashioned<=>0.9628  hard core<=>0.9641
anti<=>0.9627           asian<=>0.9641
indie<=>0.9612          gay<=>0.9633
british<=>0.9610        professional<=>0.9628
indian<=>0.9593         japanese<=>0.9625
japanese<=>0.9592       italian<=>0.9624
stock<=>0.9589          second rate<=>0.9621
b grade<=>0.9587        superior<=>0.9614
mtv<=>0.9585            classical<=>0.9612
western<=>0.9581        chinese<=>0.9609
low budget<=>0.9577     b grade<=>0.9606
european<=>0.9576       indian<=>0.9606


## Glasdoor

In [143]:
dataset = 'glassdoor'

### Company

In [146]:
query = 'company'

In [147]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: company, cluster size: 20
mask_vec                 avg_context_vec          avg_concat_mask_vec
-----------------------  -----------------------  ---------------------
warehouse<=>0.9287       amazon<=>0.9906          warehouse<=>0.9558
amazon<=>0.9243          management<=>0.9862      manager<=>0.9507
area<=>0.9241            area<=>0.9833            amazon<=>0.9479
management<=>0.9205      free<=>0.9831            stock<=>0.9446
stock<=>0.9155           stock<=>0.9829           management<=>0.9382
school<=>0.9107          warehouse<=>0.9820       area<=>0.9329
the company<=>0.9051     the company<=>0.9817     industry<=>0.9326
healthcare<=>0.8989      high<=>0.9811            college<=>0.9289
department<=>0.8979      manager<=>0.9786         kindle<=>0.9279
manager<=>0.8930         school<=>0.9780          culture<=>0.9275
money<=>0.8895           great benefits<=>0.9768  the company<=>0.9273
3 months<=>0.8883        life<=>0.9767            production<=>0.9198
dogs<=>0.8871  

#### Google

In [148]:
query = 'google'

In [149]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: google, cluster size: 20
mask_vec                      avg_context_vec               avg_concat_mask_vec
----------------------------  ----------------------------  ----------------------------
expedia<=>0.9087              seattle<=>0.9594              microsoft<=>0.8970
zillow<=>0.9082               microsoft<=>0.9518            zillow<=>0.8948
microsoft<=>0.9068            bay area<=>0.9505             facebook<=>0.8934
amazon<=>0.8767               expedia<=>0.9496              redfin<=>0.8934
facebook<=>0.8757             redfin<=>0.9467               expedia<=>0.8901
amazon web services<=>0.8708  area<=>0.9417                 amazon web services<=>0.8856
company<=>0.8478              amazon<=>0.9408               cruises<=>0.8682
software<=>0.8426             zillow<=>0.9397               mri<=>0.8611
stock<=>0.8405                amazon web services<=>0.9389  walmart<=>0.8597
management<=>0.8387           industry<=>0.9389             amazon<=>0.8553
school<=>0.8340      

#### Management

In [152]:
query = 'management'

In [153]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: management, cluster size: 20
mask_vec                 avg_context_vec          avg_concat_mask_vec
-----------------------  -----------------------  -----------------------
warehouse<=>0.9310       company<=>0.9862         warehouse<=>0.9454
amazon<=>0.9244          amazon<=>0.9861          stock<=>0.9394
company<=>0.9205         warehouse<=>0.9834       company<=>0.9382
stock<=>0.9172           free<=>0.9814            amazon<=>0.9341
school<=>0.9114          area<=>0.9804            manager<=>0.9297
food<=>0.9066            manager<=>0.9783         college<=>0.9282
area<=>0.9058            the company<=>0.9764     culture<=>0.9278
dogs<=>0.9035            school<=>0.9757          safety<=>0.9243
free<=>0.8983            culture<=>0.9734         school<=>0.9160
the company<=>0.8981     life<=>0.9734            production<=>0.9154
healthcare<=>0.8929      high<=>0.9722            industry<=>0.9142
great benefits<=>0.8925  dogs<=>0.9720            healthcare<=>0.9136
3 months<=>0

### Benefits

In [158]:
query = 'benefits'

In [157]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: benefit, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


In [159]:
query = 'great benefits'

In [160]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: great benefits, cluster size: 20
mask_vec              avg_context_vec       avg_concat_mask_vec
--------------------  --------------------  -------------------------
stock<=>0.9406        stock<=>0.9854        job security<=>0.9365
food<=>0.9213         medical<=>0.9814      stock<=>0.9349
healthcare<=>0.9051   amazon<=>0.9771       gift cards<=>0.9315
gift cards<=>0.9030   company<=>0.9768      culture<=>0.9303
amazon<=>0.9002       health<=>0.9761       game room<=>0.9296
games<=>0.8964        healthcare<=>0.9732   food<=>0.9237
school<=>0.8943       free<=>0.9722         healthcare<=>0.9178
management<=>0.8925   a year<=>0.9721       parental leave<=>0.9171
dress code<=>0.8913   up to<=>0.9721        daily variety<=>0.9152
game room<=>0.8907    school<=>0.9700       medical<=>0.9151
warehouse<=>0.8894    the company<=>0.9700  management<=>0.9109
free<=>0.8894         management<=>0.9699   games<=>0.9106
company<=>0.8868      food<=>0.9686         company<=>0.9074
up to<=>0.8

#### stock

In [161]:
query = 'stock'

In [162]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: stock, cluster size: 20
mask_vec                 avg_context_vec          avg_concat_mask_vec
-----------------------  -----------------------  -----------------------
great benefits<=>0.9406  great benefits<=>0.9854  medical<=>0.9508
healthcare<=>0.9357      healthcare<=>0.9843      healthcare<=>0.9490
food<=>0.9350            company<=>0.9829         company<=>0.9446
warehouse<=>0.9236       medical<=>0.9826         college<=>0.9404
medical<=>0.9220         amazon<=>0.9821          kindle<=>0.9403
management<=>0.9172      health<=>0.9802          management<=>0.9394
company<=>0.9155         employment<=>0.9790      health<=>0.9387
amazon<=>0.9142          a year<=>0.9786          warehouse<=>0.9374
school<=>0.9112          up to<=>0.9773           food<=>0.9373
health<=>0.9103          high<=>0.9760            great benefits<=>0.9349
free<=>0.9102            the company<=>0.9751     safety<=>0.9292
kindle<=>0.9085          insurance<=>0.9721       free<=>0.9225
high<=>0.9056  

#### healthcare

In [163]:
query = 'healthcare'

In [164]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: healthcare, cluster size: 20
mask_vec                 avg_context_vec            avg_concat_mask_vec
-----------------------  -------------------------  -------------------------
stock<=>0.9357           stock<=>0.9843             stock<=>0.9490
food<=>0.9178            medical<=>0.9791           food<=>0.9397
great benefits<=>0.9051  company<=>0.9737           medical<=>0.9215
warehouse<=>0.9024       great benefits<=>0.9732    insurance<=>0.9202
company<=>0.8989         amazon<=>0.9724            company<=>0.9197
medical<=>0.8931         health<=>0.9706            great benefits<=>0.9178
management<=>0.8929      the company<=>0.9656       job security<=>0.9172
school<=>0.8881          employment<=>0.9641        management<=>0.9136
health care<=>0.8857     management<=>0.9635        culture<=>0.9121
amazon<=>0.8762          a year<=>0.9624            parental leave<=>0.9099
area<=>0.8750            college<=>0.9623           warehouse<=>0.9089
health<=>0.8744          high<=>0.

#### 401k

In [165]:
query = '401k'

In [166]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: 401k, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


#### insurance

In [167]:
query = 'insurance'

In [168]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: insurance, cluster size: 20
mask_vec                 avg_context_vec          avg_concat_mask_vec
-----------------------  -----------------------  -----------------------
stock<=>0.8962           stock<=>0.9721           stock<=>0.9219
food<=>0.8868            company<=>0.9635         healthcare<=>0.9202
healthcare<=>0.8743      medical<=>0.9621         medical<=>0.9036
great benefits<=>0.8685  great benefits<=>0.9612  management<=>0.9010
wineries<=>0.8646        healthcare<=>0.9611      food<=>0.9003
health care<=>0.8591     signing bonus<=>0.9608   company<=>0.8993
warehouse<=>0.8587       amazon<=>0.9591          warehouse<=>0.8920
disney<=>0.8585          the company<=>0.9575     health<=>0.8878
company<=>0.8578         games<=>0.9559           college<=>0.8854
health<=>0.8526          food<=>0.9559            kindle<=>0.8836
medical<=>0.8517         free<=>0.9554            coffee<=>0.8811
school<=>0.8509          health<=>0.9551          health care<=>0.8795
management<=>

### Dress Code

In [193]:
query = 'dress code'

In [194]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: dress code, cluster size: 20
mask_vec                 avg_context_vec          avg_concat_mask_vec
-----------------------  -----------------------  -------------------------
great benefits<=>0.8913  culture<=>0.9746         drama<=>0.8841
food<=>0.8892            free<=>0.9743            gift cards<=>0.8756
dogs<=>0.8870            dogs<=>0.9675            great benefits<=>0.8748
stock<=>0.8804           music<=>0.9673           autonomy<=>0.8724
water<=>0.8683           company<=>0.9673         game room<=>0.8718
free<=>0.8675            food<=>0.9665            food<=>0.8690
sl<=>0.8639              atmosphere<=>0.9648      job security<=>0.8687
management<=>0.8629      amazon<=>0.9634          parental leave<=>0.8660
warehouse<=>0.8578       area<=>0.9621            politics<=>0.8631
healthcare<=>0.8576      great benefits<=>0.9613  playstation<=>0.8623
games<=>0.8544           management<=>0.9607      healthcare<=>0.8579
gift cards<=>0.8543      warehouse<=>0.9603       gam

#### polo shirt

In [195]:
query = 'polo shirt'

In [196]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: polo shirt, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


### Pay

In [176]:
query = 'pay'

In [175]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: salaries, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


In [177]:
query = 'salary'

In [178]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: salary, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


#### good pay

In [183]:
query = 'good pay'

In [184]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: good pay, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


### Job Title

In [181]:
query = 'job title'

In [182]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: job title, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


#### engineer

In [189]:
query = 'engineer'

In [190]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: engineer, cluster size: 20
mask_vec             avg_context_vec      avg_concat_mask_vec
-------------------  -------------------  ---------------------
manager<=>0.6692     business<=>0.9170    emt<=>0.8405
aces<=>0.6578        products<=>0.9097    manager<=>0.7930
industry<=>0.6405    production<=>0.8992  student<=>0.7863
student<=>0.6034     politics<=>0.8925    company<=>0.7824
culture<=>0.5943     variety<=>0.8905     industry<=>0.7618
hair<=>0.5918        autonomy<=>0.8890    business<=>0.7534
department<=>0.5899  culture<=>0.8857     warehouse<=>0.7479
warehouse<=>0.5891   industry<=>0.8851    management<=>0.7445
products<=>0.5878    center<=>0.8823      safety<=>0.7395
management<=>0.5870  dogs<=>0.8794        publishing<=>0.7384
area<=>0.5849        area<=>0.8791        software<=>0.7374
cloud<=>0.5846       knowledge<=>0.8779   aces<=>0.7368
email<=>0.5837       warehouse<=>0.8773   lsa<=>0.7349
company<=>0.5833     manager<=>0.8771     products<=>0.7286
amazon<=>0.580

#### driver

In [191]:
query = 'driver'

In [192]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: driver, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


### Environment

In [205]:
query = 'environment'

In [204]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: atmosphere, cluster size: 20
mask_vec                 avg_context_vec       avg_concat_mask_vec
-----------------------  --------------------  -----------------------
company<=>0.8413         company<=>0.9685      culture<=>0.9281
amazon<=>0.8405          free<=>0.9676         company<=>0.9103
daily variety<=>0.8326   amazon<=>0.9665       stock<=>0.9082
management<=>0.8318      dress code<=>0.9648   great benefits<=>0.9064
money<=>0.8285           area<=>0.9639         management<=>0.9036
school<=>0.8275          the company<=>0.9625  warehouse<=>0.9020
warehouse<=>0.8268       food<=>0.9623         food<=>0.8964
culture<=>0.8235         management<=>0.9619   healthcare<=>0.8934
dogs<=>0.8204            high<=>0.9616         amazon<=>0.8925
area<=>0.8198            warehouse<=>0.9606    area<=>0.8890
healthcare<=>0.8192      culture<=>0.9604      manager<=>0.8886
production<=>0.8126      dogs<=>0.9594         game room<=>0.8853
stock<=>0.8125           games<=>0.9571        job

#### poor environment

In [199]:
query = 'poor environment'

In [200]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: poor environment, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------


In [206]:
query = 'atmosphere'

In [207]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: atmosphere, cluster size: 20
mask_vec                 avg_context_vec       avg_concat_mask_vec
-----------------------  --------------------  -----------------------
company<=>0.8413         company<=>0.9685      culture<=>0.9281
amazon<=>0.8405          free<=>0.9676         company<=>0.9103
daily variety<=>0.8326   amazon<=>0.9665       stock<=>0.9082
management<=>0.8318      dress code<=>0.9648   great benefits<=>0.9064
money<=>0.8285           area<=>0.9639         management<=>0.9036
school<=>0.8275          the company<=>0.9625  warehouse<=>0.9020
warehouse<=>0.8268       food<=>0.9623         food<=>0.8964
culture<=>0.8235         management<=>0.9619   healthcare<=>0.8934
dogs<=>0.8204            high<=>0.9616         amazon<=>0.8925
area<=>0.8198            warehouse<=>0.9606    area<=>0.8890
healthcare<=>0.8192      culture<=>0.9604      manager<=>0.8886
production<=>0.8126      dogs<=>0.9594         game room<=>0.8853
stock<=>0.8125           games<=>0.9571        job

### Culture

In [201]:
query = 'culture'

In [202]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: culture, cluster size: 20
mask_vec                 avg_context_vec      avg_concat_mask_vec
-----------------------  -------------------  -----------------------
area<=>0.8896            free<=>0.9774        great benefits<=>0.9303
warehouse<=>0.8892       dogs<=>0.9764        atmosphere<=>0.9281
management<=>0.8882      company<=>0.9757     management<=>0.9278
great benefits<=>0.8850  dress code<=>0.9746  company<=>0.9275
company<=>0.8844         area<=>0.9745        warehouse<=>0.9222
dogs<=>0.8826            amazon<=>0.9742      stock<=>0.9178
amazon<=>0.8762          management<=>0.9734  job security<=>0.9153
food<=>0.8753            life<=>0.9723        games<=>0.9139
industry<=>0.8724        warehouse<=>0.9711   area<=>0.9137
stock<=>0.8711           manager<=>0.9703     manager<=>0.9133
water<=>0.8706           safety<=>0.9679      healthcare<=>0.9121
games<=>0.8673           food<=>0.9666        food<=>0.9108
free<=>0.8651            autonomy<=>0.9666    water<=>0.9051
s

#### friendly

In [209]:
query = 'friendly'

In [210]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: friendly, cluster size: 20
mask_vec    avg_context_vec        avg_concat_mask_vec
----------  ---------------------  -------------------------
            apple watch<=>0.9658   security<=>0.8844
            culture<=>0.9626       free<=>0.8816
            dress code<=>0.9600    college<=>0.8794
            company<=>0.9570       nice<=>0.8772
            life<=>0.9554          stock<=>0.8762
            employment<=>0.9549    culture<=>0.8758
            free<=>0.9529          management<=>0.8754
            job security<=>0.9506  great benefits<=>0.8748
            high<=>0.9505          warehouse<=>0.8697
            management<=>0.9469    part time<=>0.8696
            pressure<=>0.9463      downtown seattle<=>0.8659
            amazon<=>0.9459        safety<=>0.8649
            atmosphere<=>0.9446    company<=>0.8645
            stock<=>0.9446         industry<=>0.8595
            nice<=>0.9444          medical<=>0.8594
            dogs<=>0.9443          game room<=>0.8583


ValueError: arrays must all be same length

### Work life

In [219]:
query = 'life' # work/worklife/work life

In [218]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: life, cluster size: 20
mask_vec                 avg_context_vec          avg_concat_mask_vec
-----------------------  -----------------------  ---------------------
warehouse<=>0.9054       manager<=>0.9789         stock<=>0.9050
stock<=>0.8989           amazon<=>0.9779          warehouse<=>0.9024
area<=>0.8845            company<=>0.9767         culture<=>0.8997
management<=>0.8810      management<=>0.9734      safety<=>0.8920
food<=>0.8729            culture<=>0.9723         company<=>0.8919
high<=>0.8713            school<=>0.9703          management<=>0.8913
school<=>0.8705          high<=>0.9691            area<=>0.8871
company<=>0.8693         warehouse<=>0.9674       school<=>0.8841
free<=>0.8688            area<=>0.9642            industry<=>0.8796
industry<=>0.8681        free<=>0.9638            manager<=>0.8792
part time<=>0.8657       department<=>0.9635      healthcare<=>0.8767
kindle<=>0.8656          part time<=>0.9631       business<=>0.8762
healthcare<=>0.8654  

### staff

In [220]:
query = 'staff'

In [221]:
tabulate_results_knn_sim(dataset, query, 20, 1)

query: staff, cluster size: 20
mask_vec    avg_context_vec    avg_concat_mask_vec
----------  -----------------  ---------------------
