This notebook is used for examining word2vec similairties

In [1]:
#if jupyternotify is installed, we can add %notify to a cell to get an alert when it ifnished running

%load_ext jupyternotify
import metrics_helpers as indicators
import pickle as pk
import gc
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd


def dt_to_int(dt): #datetime to integer
    return dt.astype('int')/(10**9)
with open('/Users/sma/Documents/INRAE internship/scrape-git/netmums/allposts_rerun.pkl', 'rb') as f:
    netmums = pk.load(f)
with open('/Users/sma/Documents/INRAE internship/scrape-git/netmums/netmums_subset_keys.txt', 'r') as f:
    keys = [url.strip() for url in f.readlines()]
    
    
netmums = {key:netmums[key] for key in keys}

<IPython.core.display.Javascript object>

In [2]:
#netmums
nm_ind = indicators.indicators(netmums, fb=False)
#this one takes long, around 20 seconds I think.

nm_ind.add_available_comments()
nm_ind.add_num_unique_posters()
nm_ind.add_num_urls()
nm_ind.add_post_time()
nm_ind.add_lexical_richness()
nm_ind.add_term_distance_simple()

netmums = nm_ind.results_dict

# GenSim / Word2Vec Implementation on Threads

https://radimrehurek.com/gensim/models/word2vec.html

In [3]:
from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS



we can try running the model on only our subset and see how long it takes.

Then maybe we can run it on the entire dataset.

The end goal of this is to determine which words are related to ngrams like "baby food"

In [4]:
#Process our data to the right format for feeding into the model
keys = list(nm_ind.text_dict.keys())
text_list = list(nm_ind.text_dict.values())


In [5]:
import re

In [6]:
#define functions
def deEmojify(text):
    #remove emoji (FIXME: doesnt remove all of them.)
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def clean(text):
    #remove URLs
    pdf_regex='http[\S]+pdf[\S]*'
    regex = r'http\S+'
    text = re.sub(regex, 'urlpostedtopdf', text)
    text = re.sub(regex, 'urlpostedtosomething', text)
    #TODO: remove emails

    #replace commas and semicolons with spaces.
    text = re.sub('[;,&\+]+', ' ', text)
    #remove hyphens
    text = re.sub('[-]+', ' ', text)
    return text

In [7]:
#CLEAN TEXT
text_list = [deEmojify(i) for i in text_list]

text_list = [clean(item) for item in text_list]


#split sentences, new lines
text_list = [j for i in text_list for j in re.split('[\n?!.]+', i)]

#remove empty items
text_list = [i for i in text_list if i]
#strip reamining elements from text
text_list = [re.sub(r'[^A-Za-z0-9 ]+', '', i) for i in text_list]
#remove extra spaces
text_list = [re.sub(r'\s+', ' ', i) for i in text_list]

In [8]:
from gensim.utils import tokenize

In [9]:
#we need to transform list of sentence to LoL of words
tokens = [list(tokenize(doc, lower=True)) for doc in text_list]

In [10]:
# Train a bigram detector.
#this detects bigrams and converts them to single tokens simply by relplacing space w underscore
#check the google paper for more info on how it's done.
#https://datascience.stackexchange.com/questions/25524/how-does-phrases-in-gensim-work

#Detect phrases based on collocation counts.
bigram_transformer = Phrases(tokens, threshold = 2, connector_words=phrases.ENGLISH_CONNECTOR_WORDS)
#moving the threshhold a bit higher.. about 12, will get rid of a lot
#of non-phrase bigrams (what_brand, an_email) but I'm not sure how this would be beneficial.
#TODO: read how the word similarity is calculated.

# Apply the trained MWE detector to a corpus, using the result to train a Word2vec model.
model = Word2Vec(bigram_transformer[tokens], min_count=1)

%notify

<IPython.core.display.Javascript object>

In [11]:
#bigram_transformer = Phrases(tokens, threshold = 10) 
temp = sorted(bigram_transformer.export_phrases().items(), key= lambda x:x[1] * -1 )

In [12]:
baby_ = [item for item in bigram_transformer.export_phrases().items() if 'baby_' in item[0]]
baby_

[('baby_food', 18.980015343965896),
 ('baby_arrived', 14.238439978954187),
 ('baby_massage', 10.228430352228312),
 ('baby_rice', 15.396495339628856),
 ('baby_led', 36.9429794048541),
 ('baby_arrives', 10.848335222060332),
 ('baby_dermexa', 61.74061109227475),
 ('baby_dermexarange', 59.06315843121736),
 ('ismamiababy_food', 48.40609198169997),
 ('aldimamiababy_food', 48.40609198169997)]

In [13]:
_food = [item for item in bigram_transformer.export_phrases().items() if '_food' in item[0]]
_food

[('finger_food', 23.58245506800768),
 ('baby_food', 18.980015343965896),
 ('processed_food', 16.305209930677886),
 ('junk_food', 26.403322899109078),
 ('processed_foods', 97.29243799153055),
 ('junk_foods', 15.56023839931886),
 ('certain_foods', 57.79517119747005),
 ('solid_food', 21.355628815455873),
 ('sugary_foods', 18.017118146579733),
 ('mamia_food', 25.00981419054499),
 ('branded_food', 73.92930411750542),
 ('finger_foods', 200.78692242197988),
 ('farm_foods', 10.068389552500438),
 ('ismamiababy_food', 48.40609198169997),
 ('aldimamiababy_food', 48.40609198169997)]

In [14]:
formula = [item for item in bigram_transformer.export_phrases().items() if 'formula' in item[0]]
formula

[('soy_formula', 14.778394471807689),
 ('add_formula', 11.188607162166461),
 ('formula_powder', 14.468230637214194),
 ('infant_formula', 27.31763826606876),
 ('powdered_formula', 48.466777568831674),
 ('formula_feeding', 16.382072791957125),
 ('hydrolysed_formula', 45.07410313901345),
 ('formula_fed', 16.00694542466983),
 ('prescription_formula', 10.451965945278483)]

In [15]:
bottle = [item for item in bigram_transformer.export_phrases().items() if 'bottle' in item[0]]
bottle

[('bottle_feeding', 22.856322623189396),
 ('bottle_fed', 30.200230642795283),
 ('making_bottles', 11.023101133946),
 ('bottled_water', 66.63258203513423),
 ('sterile_bottle', 19.409734291121154),
 ('sterilised_bottles', 49.55219445395413),
 ('oz_bottles', 20.945975916526503),
 ('oz_bottle', 22.190940074234543),
 ('insulated_bottle', 45.28938001261602),
 ('bottle_warmer', 104.23981891428343),
 ('sterilised_bottle', 31.899476356712153),
 ('empty_bottle', 31.354186162580323),
 ('mam_bottles', 107.51891249442876),
 ('tippee_bottles', 24.668841394825648),
 ('plastic_bottles', 18.745073559883963),
 ('pump_bottle', 10.633158785570718),
 ('sterilise_bottles', 15.49709267110842),
 ('avent_bottles', 23.893091665428614),
 ('sterilising_bottles', 41.15585039370078),
 ('glass_bottles', 25.326677165354333),
 ('browns_bottles', 36.52886129618413),
 ('brown_bottles', 14.522177273712346)]

In [94]:
jarred = [item for item in bigram_transformer.export_phrases().items() if 'jarred' in item[0]]
jarred

[]

In [16]:
milk = [item for item in bigram_transformer.export_phrases().items() if 'milk' in item[0]]
milk

[('milk_allergy', 15.004515599343184),
 ('breast_milk', 27.512780772052597),
 ('soya_milk', 21.9982425961429),
 ('expressed_milk', 43.129495631143705),
 ('cows_milk', 123.60759149763705),
 ('soy_milk', 10.926995912510327),
 ('milk_protein', 16.90377926753759),
 ('milk_powder', 11.355981677964436),
 ('goats_milk', 77.35233896584471),
 ('almond_milk', 65.8885293758956),
 ('skimmed_milk', 81.7057307264482),
 ('oat_milk', 29.81919674019266),
 ('coconut_milk', 15.899280291047136),
 ('semiskimmed_milk', 13.330935013262598),
 ('reflux_milk', 16.568083668490804),
 ('milk_supply', 21.764791858387916),
 ('milk_intolerance', 13.112395095012394),
 ('comfort_milk', 14.70323714698081),
 ('powdered_milk', 12.900904851544452)]

In [17]:
bacteria = [item for item in bigram_transformer.export_phrases().items() if 'bacteria' in item[0]]
bacteria

[('kill_bacteria', 80.807835160948),
 ('harmful_bacteria', 103.14986958566726),
 ('anti_bacterial', 175.31729651162792),
 ('friendly_bacteria', 29.333244163424123),
 ('bacterial_growth', 104.70338541666666),
 ('kills_bacteria', 15.042689314576474),
 ('bacteria_growth', 16.296246757457848),
 ('bacterial_vaginosis', 1435.9321428571427)]

In [18]:
cereal = [item for item in bigram_transformer.export_phrases().items() if 'cereal' in item[0]]
cereal

[('cereal_bars', 200.44532895900676), ('snacker_cereal', 1418.2045855379188)]

In [19]:
porridge = [item for item in bigram_transformer.export_phrases().items() if 'porridge' in item[0]]
porridge

[('porridge_pouches', 229.9516295379538),
 ('recommend_porridge', 58.926345101615425),
 ('fruity_porridge', 314.2738405419489),
 ('creamy_porridge', 34.31725844998293),
 ('porridge_oats', 154.01912423385195)]

In [20]:
oats = [item for item in bigram_transformer.export_phrases().items() if 'oats' in item[0]]
oats

[('goats_milk', 77.35233896584471),
 ('goats_cheese', 88.75970927369369),
 ('porridge_oats', 154.01912423385195)]

In [21]:
fruit = [item for item in bigram_transformer.export_phrases().items() if 'fruit' in item[0]]
fruit

[('fruit_veg', 13.74195588088694),
 ('fresh_fruit', 38.762435629129726),
 ('dried_fruit', 27.844716539468397),
 ('fruit_juice', 14.294404411963384),
 ('citrus_fruits', 150.75403074615673),
 ('fruit_pouches', 29.25804684866869),
 ('fruit_pots', 62.2913255487785),
 ('fruity_porridge', 314.2738405419489),
 ('fruit_puree', 41.84897103431868),
 ('fruit_purees', 16.593178019438312),
 ('fruit_shoot', 22.585158970902146),
 ('dried_fruits', 43.367597885880706),
 ('salad_grapefruit', 30.76918956149078),
 ('breakfast_grapefruit', 17.969608259402445),
 ('grapefruit_boiled', 10.03221298999426),
 ('grapefruit_juice', 69.7175307785677)]

In [22]:
veg = [item for item in bigram_transformer.export_phrases().items() if 'veg' in item[0]]
veg

[('fruit_veg', 13.74195588088694),
 ('vegan_diet', 12.614027608607389),
 ('veg_purees', 11.556797930439782),
 ('vegetable_oil', 51.48028169014084),
 ('green_veg', 25.561389866979656),
 ('pureed_veg', 53.93172367538565),
 ('pured_veg', 73.86301285976731),
 ('green_vegetables', 19.525106837606838),
 ('carb_veg', 12.221937379673726),
 ('veganuary_challenge', 558.4180555555556),
 ('vegan_butter', 23.09699842022117),
 ('quorn_veganuary', 26.28536872384937),
 ('quorn_vegan', 38.23326359832636),
 ('vegan_pieces', 31.24017094017094),
 ('vegetarian_option', 119.92870991797166),
 ('veggie_option', 82.72860082304527)]

In [23]:
pure = [item for item in bigram_transformer.export_phrases().items() if 'pure' in item[0]]
#search pure rather than puree because the cleaning may have removed accented e.
pure

[('veg_purees', 11.556797930439782),
 ('purely_because', 10.130032753842277),
 ('fruit_puree', 41.84897103431868),
 ('fruit_purees', 16.593178019438312),
 ('pureed_veg', 53.93172367538565),
 ('pured_veg', 73.86301285976731)]

In [24]:
for_ = [item for item in bigram_transformer.export_phrases().items() if 'for_' in item[0]]
for_

[('for_ages', 10.572040860416791),
 ('for_example', 16.256043523787596),
 ('for_replying', 11.572098779645406),
 ('for_choosing', 11.068964050095605),
 ('for_instance', 12.252810472565724),
 ('for_brekkie', 12.18115661015306)]

In [25]:
_for = [item for item in bigram_transformer.export_phrases().items() if item[0].endswith('_for')]
_for

[('suitable_for', 14.136726076756009),
 ('opt_for', 13.018611127101083),
 ('responsible_for', 10.160867221152065),
 ('catered_for', 14.7281257195487),
 ('cater_for', 12.85788753293934)]

In [26]:
fish = [item for item in bigram_transformer.export_phrases().items() if 'fish' in item[0]]
fish

[('fish_oil', 19.47105210056903),
 ('fish_fingers', 18.681685123518935),
 ('fish_pie', 25.88847751199253)]

In [60]:
model.wv.most_similar(positive=['baby_food'], negative=['what_brand'], topn=70)

[('civil', 0.484002947807312),
 ('behalf', 0.4821856915950775),
 ('veganso', 0.4752374291419983),
 ('baby', 0.4699704945087433),
 ('tootsies', 0.46447181701660156),
 ('mealday', 0.46278780698776245),
 ('leavingclaire', 0.4603457450866699),
 ('child', 0.46033722162246704),
 ('pander', 0.4473734498023987),
 ('lber', 0.44080352783203125),
 ('climber', 0.4336532652378082),
 ('consumedx', 0.43335485458374023),
 ('choice', 0.42924654483795166),
 ('visitorgp', 0.42922401428222656),
 ('dog', 0.4282580614089966),
 ('dunk', 0.4268827438354492),
 ('do', 0.42445674538612366),
 ('newhv', 0.42384371161460876),
 ('juiceplusers', 0.4192289113998413),
 ('rountine', 0.4146220088005066),
 ('designs', 0.4119214415550232),
 ('tortured', 0.4094381034374237),
 ('go', 0.40819084644317627),
 ('dispersible', 0.4074210822582245),
 ('unsocial', 0.40705984830856323),
 ('whomis', 0.40534287691116333),
 ('pleasantto', 0.40413957834243774),
 ('ovalating', 0.40285322070121765),
 ('ask', 0.3986044228076935),
 ('maddnes

In [27]:
model.wv.most_similar('baby_food', topn=70)

[('wrongwhat', 0.7251508831977844),
 ('what_brand', 0.711052417755127),
 ('aveeno_baby', 0.7100450992584229),
 ('brands', 0.7007764577865601),
 ('when_selecting', 0.6926984190940857),
 ('price', 0.6922866106033325),
 ('dermexa_range', 0.6908367872238159),
 ('decide', 0.6845329403877258),
 ('key_factor', 0.6760783791542053),
 ('normally_purchase', 0.6741558909416199),
 ('choose', 0.6740629076957703),
 ('google', 0.6653648018836975),
 ('other_retailers', 0.6646943688392639),
 ('link', 0.6637616753578186),
 ('copy', 0.6614897847175598),
 ('switch', 0.6613793969154358),
 ('practitioner', 0.6609077453613281),
 ('possible', 0.6600980162620544),
 ('solution', 0.6596598625183105),
 ('optician', 0.6588265895843506),
 ('suggest', 0.6545343995094299),
 ('buy_other', 0.6544627547264099),
 ('feel_comfortable', 0.6531235575675964),
 ('question', 0.6505260467529297),
 ('planlike', 0.6497359275817871),
 ('follow', 0.6489999294281006),
 ('best_way', 0.6484217047691345),
 ('answer', 0.6434546113014221),

In [28]:
model.wv.most_similar('pureed', topn=70)
#there are specific fruits and veggies in here.
#TODO: make a list of all english fruits and veggies
#sweet_corn
#steamed (x)
#peach


[('fruitveg', 0.8857777118682861),
 ('jacket_potatoes', 0.8788002729415894),
 ('pure', 0.8766114711761475),
 ('lunch_poached', 0.8760175704956055),
 ('dinner_grilled', 0.8757866024971008),
 ('roast_chicken', 0.8725905418395996),
 ('crumpets', 0.8716704845428467),
 ('mediterranean', 0.870742917060852),
 ('sesame', 0.869845986366272),
 ('undercooked', 0.8686826825141907),
 ('leftover', 0.8685581684112549),
 ('special_fried', 0.867859959602356),
 ('blueberries', 0.8649089336395264),
 ('quorn_vegan', 0.8640193343162537),
 ('battered', 0.8638033270835876),
 ('puddings', 0.8636474013328552),
 ('sweetened', 0.8630293011665344),
 ('various', 0.8617982864379883),
 ('fruit_veg', 0.8612055778503418),
 ('tortillas', 0.8594556450843811),
 ('kebabs', 0.859329879283905),
 ('macaroni', 0.8592596650123596),
 ('smoothies', 0.858727216720581),
 ('grapes', 0.8587201833724976),
 ('packed', 0.8586691617965698),
 ('eg', 0.8583951592445374),
 ('birds_eye', 0.8580885529518127),
 ('containing', 0.85804218053817

In [29]:
model.wv.most_similar('puree', topn=70)

[('butternut_squash', 0.9555217027664185),
 ('tomato', 0.9467798471450806),
 ('flour', 0.9451279044151306),
 ('sweetcorn', 0.9449006915092468),
 ('hummus', 0.9441614747047424),
 ('avocado', 0.9436941742897034),
 ('potato', 0.9415420889854431),
 ('ham', 0.9406529664993286),
 ('apple', 0.9398086667060852),
 ('sliced', 0.9396859407424927),
 ('beetroot', 0.9391085505485535),
 ('green_beans', 0.9390202164649963),
 ('sweet_potato', 0.9383134245872498),
 ('red_wine', 0.9367283582687378),
 ('roasted', 0.9355555772781372),
 ('baked', 0.9354897141456604),
 ('stirfry', 0.9345808625221252),
 ('scrambled_egg', 0.933868408203125),
 ('crackers', 0.932793378829956),
 ('steamed', 0.9323605895042419),
 ('mushrooms', 0.9317981600761414),
 ('rice_cakes', 0.9313735365867615),
 ('herbs', 0.9300234317779541),
 ('sweet_corn', 0.929861843585968),
 ('tomatos', 0.9297149777412415),
 ('lettuce', 0.9291954040527344),
 ('meatballs', 0.92864590883255),
 ('tomato_sauce', 0.9286206960678101),
 ('lamb', 0.9283980727195

In [30]:
model.wv.most_similar('pure', topn=100)

[('blended', 0.9270418286323547),
 ('flour', 0.9254921078681946),
 ('red_wine', 0.9250652194023132),
 ('nut', 0.9217947721481323),
 ('sweet_corn', 0.9198572039604187),
 ('fruit_veg', 0.9197940826416016),
 ('kebabs', 0.9166755080223083),
 ('sweetened', 0.9158961176872253),
 ('fruit_juice', 0.9154179692268372),
 ('chicken_balls', 0.9147155284881592),
 ('puree', 0.9141282439231873),
 ('crumpets', 0.9119455814361572),
 ('squash', 0.9106736779212952),
 ('raisins', 0.9106623530387878),
 ('orange', 0.9106507897377014),
 ('pesto', 0.9096901416778564),
 ('cherry_tomatoes', 0.9087192416191101),
 ('roasted', 0.9078137874603271),
 ('teas', 0.9071935415267944),
 ('drizzle', 0.9064682722091675),
 ('chunks', 0.9054614305496216),
 ('greek_yoghurt', 0.9049233198165894),
 ('oat', 0.9046118855476379),
 ('grapes', 0.9043022394180298),
 ('salads', 0.9040697813034058),
 ('salt_pepper', 0.9040598273277283),
 ('apricot', 0.9037516713142395),
 ('pudding', 0.9037010669708252),
 ('served', 0.9036158919334412),
 

In [31]:
model.wv.most_similar('porridge', topn=70)

[('cereal', 0.9096716046333313),
 ('weetabix', 0.872613787651062),
 ('yoghurt', 0.8670910596847534),
 ('mixed', 0.8652952909469604),
 ('tea', 0.8603147864341736),
 ('baby_rice', 0.8596265912055969),
 ('banana', 0.8580417633056641),
 ('salad', 0.8570737838745117),
 ('toast', 0.8526354432106018),
 ('cheese', 0.8458250761032104),
 ('egg', 0.8378180861473083),
 ('yogurt', 0.836993396282196),
 ('bacon', 0.836723804473877),
 ('mince', 0.8357442617416382),
 ('spoon', 0.8348868489265442),
 ('pasta', 0.8290421366691589),
 ('rice', 0.828012228012085),
 ('apple', 0.8262617588043213),
 ('cow', 0.8201343417167664),
 ('pure', 0.8133713603019714),
 ('greek_yoghurt', 0.8127124309539795),
 ('chicken', 0.8112149834632874),
 ('berries', 0.8105810880661011),
 ('blueberries', 0.8062143921852112),
 ('chips', 0.8044428825378418),
 ('chocolate', 0.8037696480751038),
 ('veg', 0.8026527762413025),
 ('omelette', 0.8002580404281616),
 ('pieces', 0.7988482713699341),
 ('puree', 0.7972151637077332),
 ('breakfast', 

In [32]:
model.wv.most_similar('cereal', topn=70)

[('yoghurt', 0.9225842952728271),
 ('porridge', 0.9096714854240417),
 ('yogurt', 0.9023225903511047),
 ('egg', 0.900385856628418),
 ('berries', 0.8927662372589111),
 ('bacon', 0.889761745929718),
 ('toast', 0.8895549774169922),
 ('apple', 0.8893502950668335),
 ('weetabix', 0.8882739543914795),
 ('pasta', 0.8869725465774536),
 ('squash', 0.882042646408081),
 ('mixed', 0.8796114325523376),
 ('pure', 0.8771076202392578),
 ('banana', 0.8758437633514404),
 ('blueberries', 0.8723114132881165),
 ('mince', 0.8713884949684143),
 ('chips', 0.8703944087028503),
 ('cake', 0.8692929744720459),
 ('salad', 0.8663114309310913),
 ('mashed', 0.86626797914505),
 ('crackers', 0.8661465048789978),
 ('oats', 0.8660684823989868),
 ('rice', 0.8655409216880798),
 ('omelette', 0.864896833896637),
 ('broccoli', 0.8638949394226074),
 ('puree', 0.8628878593444824),
 ('greek_yoghurt', 0.861759603023529),
 ('butter', 0.8609000444412231),
 ('loaf', 0.8607364892959595),
 ('brown_rice', 0.8604463338851929),
 ('jar', 0.

In [33]:
model.wv.most_similar('sterilised', topn=70)

[('sterile', 0.8604623675346375),
 ('sterilising', 0.8569244742393494),
 ('stored', 0.8375067710876465),
 ('thoroughly', 0.8341736793518066),
 ('sealed', 0.8286930322647095),
 ('cooled_down', 0.8198946118354797),
 ('pre_boiled', 0.8178366422653198),
 ('right_temp', 0.816007673740387),
 ('cooled', 0.8113749027252197),
 ('defrost', 0.8105139136314392),
 ('boiled', 0.8101609349250793),
 ('freshly_boiled', 0.8049793839454651),
 ('boiling_hot', 0.803260862827301),
 ('preparing', 0.8025395274162292),
 ('teats', 0.8014493584632874),
 ('seal', 0.8007925748825073),
 ('add_oz', 0.800717294216156),
 ('measure_out', 0.8002644777297974),
 ('lids', 0.7989292144775391),
 ('cooling', 0.7986111640930176),
 ('add_formula', 0.79707932472229),
 ('containers', 0.7939073443412781),
 ('tray', 0.7925753593444824),
 ('destination', 0.7921411991119385),
 ('lactofree_organic', 0.7911903858184814),
 ('cooled_boiled', 0.7910804152488708),
 ('piping_hot', 0.7906176447868347),
 ('hot_enough', 0.7898123264312744),
 (

In [34]:
model.wv.most_similar('isnt_sterile', topn=70)

[('amino_acid', 0.8787685632705688),
 ('undigested', 0.8708703517913818),
 ('replaced', 0.8681244254112244),
 ('chemically', 0.8647598624229431),
 ('kills_bacteria', 0.8576038479804993),
 ('hot_enough', 0.8486086130142212),
 ('kill_off', 0.8482731580734253),
 ('enfamil', 0.8471897840499878),
 ('thicker', 0.8456480503082275),
 ('hotter', 0.843415379524231),
 ('semi_skimmed', 0.8415979146957397),
 ('sterile', 0.8404847383499146),
 ('breast_milk', 0.837358295917511),
 ('dissolves', 0.8371875286102295),
 ('encourages', 0.8358727097511292),
 ('after_brushing', 0.8289915919303894),
 ('mimics', 0.82891446352005),
 ('uts', 0.8245932459831238),
 ('pre_boiled', 0.8239516019821167),
 ('ebm', 0.8237350583076477),
 ('tonic', 0.8236982822418213),
 ('cooler_water', 0.8233959078788757),
 ('casein', 0.8223336935043335),
 ('membrane', 0.8222814202308655),
 ('lactofree_organic', 0.8215860724449158),
 ('most_likely', 0.8213858604431152),
 ('gels', 0.8213514089584351),
 ('sterilized', 0.820012092590332),
 

In [35]:
model.wv.most_similar('bacteria', topn=70)

[('harmful_bacteria', 0.755519688129425),
 ('grow', 0.7553578019142151),
 ('kill_any', 0.7417506575584412),
 ('kill', 0.7399042248725891),
 ('tap_water', 0.7360357046127319),
 ('multiply', 0.7234299778938293),
 ('nutrients', 0.7074774503707886),
 ('increase', 0.7036154866218567),
 ('animal', 0.7004827857017517),
 ('clumping', 0.6982685327529907),
 ('temperature', 0.6959445476531982),
 ('chemicals', 0.6955853700637817),
 ('above_degrees', 0.6727638840675354),
 ('degrees', 0.6647493839263916),
 ('formula_powder', 0.6603744626045227),
 ('sugar', 0.6603702306747437),
 ('higher', 0.6558517813682556),
 ('killing', 0.6518559455871582),
 ('milk', 0.6497973203659058),
 ('filter', 0.6480246186256409),
 ('sterile', 0.6456055045127869),
 ('kill_off', 0.645601749420166),
 ('your_body', 0.6436924934387207),
 ('hot_shot', 0.6427913308143616),
 ('product', 0.6422889828681946),
 ('water', 0.6408628821372986),
 ('formula', 0.6405421495437622),
 ('amount_of', 0.6372060179710388),
 ('powder', 0.6370040774

In [36]:
model.wv.most_similar('contaminated', topn=70)

[('carotid', 0.9036102890968323),
 ('bull', 0.8879947662353516),
 ('andrex', 0.8850195407867432),
 ('haram', 0.8833096623420715),
 ('shop_bought', 0.8780310153961182),
 ('physically', 0.8767325282096863),
 ('bn', 0.8766710758209229),
 ('burned', 0.8750360012054443),
 ('maitah', 0.8729749321937561),
 ('derivatives', 0.8725080490112305),
 ('compromised', 0.8719992637634277),
 ('haraam', 0.8710669279098511),
 ('marijuana', 0.8708672523498535),
 ('versa', 0.8705627918243408),
 ('naturally', 0.8688403964042664),
 ('delayed', 0.8683784008026123),
 ('whey', 0.8680708408355713),
 ('equal', 0.8670787811279297),
 ('watercress', 0.8670054078102112),
 ('reactive', 0.8659793138504028),
 ('sticky', 0.8658289313316345),
 ('okhrejat', 0.8656802773475647),
 ('lightly', 0.8656524419784546),
 ('values', 0.8654999136924744),
 ('cartilage', 0.8650272488594055),
 ('mondays', 0.864654004573822),
 ('dense', 0.864473819732666),
 ('esp', 0.8642118573188782),
 ('tooti', 0.8640950322151184),
 ('minced', 0.8640220

In [37]:
model.wv.most_similar('contaminated', topn=70)

[('carotid', 0.9036102890968323),
 ('bull', 0.8879947662353516),
 ('andrex', 0.8850195407867432),
 ('haram', 0.8833096623420715),
 ('shop_bought', 0.8780310153961182),
 ('physically', 0.8767325282096863),
 ('bn', 0.8766710758209229),
 ('burned', 0.8750360012054443),
 ('maitah', 0.8729749321937561),
 ('derivatives', 0.8725080490112305),
 ('compromised', 0.8719992637634277),
 ('haraam', 0.8710669279098511),
 ('marijuana', 0.8708672523498535),
 ('versa', 0.8705627918243408),
 ('naturally', 0.8688403964042664),
 ('delayed', 0.8683784008026123),
 ('whey', 0.8680708408355713),
 ('equal', 0.8670787811279297),
 ('watercress', 0.8670054078102112),
 ('reactive', 0.8659793138504028),
 ('sticky', 0.8658289313316345),
 ('okhrejat', 0.8656802773475647),
 ('lightly', 0.8656524419784546),
 ('values', 0.8654999136924744),
 ('cartilage', 0.8650272488594055),
 ('mondays', 0.864654004573822),
 ('dense', 0.864473819732666),
 ('esp', 0.8642118573188782),
 ('tooti', 0.8640950322151184),
 ('minced', 0.8640220

## We can find words that are close to a set of words as well.

In [38]:
model.wv.most_similar(['veg','puree'], topn=10)

[('yoghurt', 0.9599322080612183),
 ('rice', 0.9475286602973938),
 ('pasta', 0.9446308612823486),
 ('berries', 0.943392276763916),
 ('yogurt', 0.9397631883621216),
 ('apple', 0.9385702013969421),
 ('chips', 0.9367427825927734),
 ('vegetables', 0.9332002997398376),
 ('potatoes', 0.931500256061554),
 ('nuts', 0.9311074018478394)]

In [39]:
model.wv.most_similar(['veg','sterilised'], topn=20)

[('berries', 0.8825225234031677),
 ('frozen', 0.8815217018127441),
 ('juice', 0.8767626881599426),
 ('fruit_juice', 0.8671960830688477),
 ('homemade', 0.8662259578704834),
 ('cooked', 0.8638995289802551),
 ('fresh_fruit', 0.8608234524726868),
 ('swede', 0.8593836426734924),
 ('pure', 0.8575620055198669),
 ('yoghurt', 0.856556236743927),
 ('salt', 0.8560191988945007),
 ('cereal', 0.8532764911651611),
 ('mixed', 0.847929060459137),
 ('rice', 0.8442897200584412),
 ('blended', 0.84423828125),
 ('squash', 0.8434914350509644),
 ('pasta', 0.8407372832298279),
 ('mince', 0.8394392728805542),
 ('boiled', 0.8384739756584167),
 ('blend', 0.8375955820083618)]

In [40]:
model.wv.most_similar(['veg','baby_food', 'veggie','vegetable','vegetables'], topn=20)

[('fresh_fruit', 0.9279460310935974),
 ('potatoes', 0.9189800024032593),
 ('berries', 0.9184004068374634),
 ('veggies', 0.9114029407501221),
 ('fruits', 0.9086521863937378),
 ('biscuits', 0.907930314540863),
 ('green_veg', 0.9065358638763428),
 ('homemade', 0.9049237966537476),
 ('gluten', 0.9043185710906982),
 ('squash', 0.9036874771118164),
 ('cake', 0.9022833108901978),
 ('low_fat', 0.901229977607727),
 ('wheat', 0.9008134007453918),
 ('vegtables', 0.8981442451477051),
 ('gluten_free', 0.8949791789054871),
 ('plain', 0.8946621417999268),
 ('fruit_juice', 0.893598735332489),
 ('deli_meats', 0.8930169939994812),
 ('sweet_potatoes', 0.8917573690414429),
 ('greek_yoghurt', 0.8893910050392151)]

In [41]:
model.wv.most_similar(['home_made','homemade','puree'], topn=20)

[('potato', 0.9632910490036011),
 ('sweet_potato', 0.9618076086044312),
 ('hummus', 0.9585307836532593),
 ('tomato', 0.9575111269950867),
 ('yogurt', 0.9571416974067688),
 ('flour', 0.9567456841468811),
 ('herbs', 0.9565636515617371),
 ('butternut_squash', 0.956193208694458),
 ('greek_yoghurt', 0.9556903839111328),
 ('beans', 0.9537001848220825),
 ('carrots', 0.9530507922172546),
 ('rice_cakes', 0.950581967830658),
 ('vegetable', 0.9504945874214172),
 ('ham', 0.9498361349105835),
 ('sliced', 0.9497528076171875),
 ('fruit_juice', 0.9492877721786499),
 ('pancakes', 0.9491723775863647),
 ('chilli', 0.9490878582000732),
 ('lentils', 0.9485553503036499),
 ('sweetcorn', 0.948227047920227)]

In [42]:
model.wv.most_similar(['home_made','homemade'], topn=20)

[('yogurt', 0.9504057765007019),
 ('carrots', 0.9476836323738098),
 ('chilli', 0.9470958709716797),
 ('potato', 0.945925772190094),
 ('sweet_potato', 0.9453811645507812),
 ('beans', 0.9449406266212463),
 ('sauces', 0.9444630742073059),
 ('greek_yoghurt', 0.9432798624038696),
 ('vegetable', 0.9419602155685425),
 ('lentils', 0.9418543577194214),
 ('herbs', 0.9418541789054871),
 ('fruit_juice', 0.9402953386306763),
 ('berries', 0.9401469230651855),
 ('pancakes', 0.9384273290634155),
 ('potatoes', 0.9379295110702515),
 ('squash', 0.9377852082252502),
 ('biscuits', 0.9376203417778015),
 ('hummus', 0.9375230073928833),
 ('tomato', 0.9346681833267212),
 ('flour', 0.9343798756599426)]

In [43]:
model.wv.most_similar(['pure','puree','baby','food','baby_food'], topn=20)

[('breastmilk', 0.8380216956138611),
 ('tap_water', 0.8195614218711853),
 ('squash', 0.8030567169189453),
 ('breast_milk', 0.8019712567329407),
 ('allowance', 0.7960947155952454),
 ('nutrition', 0.7902892231941223),
 ('gluten', 0.7862802743911743),
 ('calories', 0.7795446515083313),
 ('animals_killed', 0.7774341106414795),
 ('espresso', 0.7719815373420715),
 ('unhealthy', 0.7719286680221558),
 ('space', 0.7717061042785645),
 ('protein', 0.7711865901947021),
 ('clothing', 0.7703192830085754),
 ('spirits', 0.7693000435829163),
 ('stunned', 0.7671787738800049),
 ('decaf_tea', 0.7661231756210327),
 ('coating', 0.7660806179046631),
 ('nutritional', 0.7660027742385864),
 ('vegsalad', 0.7659919857978821)]

In [44]:
model.wv.most_similar(['pure','puree','baby','food'], topn=20)

[('squash', 0.8281688094139099),
 ('cereal', 0.8257655501365662),
 ('breast_milk', 0.8113999962806702),
 ('breastmilk', 0.8111817240715027),
 ('gluten', 0.8069900274276733),
 ('yoghurt', 0.8068694472312927),
 ('decaf_tea', 0.8058289885520935),
 ('tap_water', 0.7950810194015503),
 ('whereas', 0.792306125164032),
 ('berries', 0.7917108535766602),
 ('protein', 0.7910234928131104),
 ('unlimited', 0.7903717160224915),
 ('fruitveg', 0.7895777225494385),
 ('intake', 0.7843006253242493),
 ('dish', 0.7801832556724548),
 ('spirits', 0.7800820469856262),
 ('juice', 0.7800807356834412),
 ('saturated', 0.7791770696640015),
 ('soya_milk', 0.7789856791496277),
 ('oats', 0.7785203456878662)]

In [58]:
model.wv.most_similar(positive = ['pure','puree','baby','food','weaning'], negative=['spirits', 'tea'], topn=20)

[('solids', 0.7143822908401489),
 ('child', 0.711353063583374),
 ('cat', 0.7087991833686829),
 ('eatingdrinking', 0.7014023065567017),
 ('digestive_system', 0.6856018900871277),
 ('suitable', 0.6828093528747559),
 ('dog', 0.6707337498664856),
 ('growing', 0.6684055924415588),
 ('babys', 0.6638669967651367),
 ('especially', 0.6559197306632996),
 ('son', 0.6554298400878906),
 ('heart_burn', 0.6536500453948975),
 ('my_lg', 0.652366578578949),
 ('babies', 0.6520152688026428),
 ('graph', 0.6512534022331238),
 ('kid', 0.6499610543251038),
 ('behaviour', 0.64906907081604),
 ('little_one', 0.6482453346252441),
 ('daddy', 0.6475297808647156),
 ('poo', 0.646476686000824)]

In [59]:
model.wv.most_similar(positive = ['baby_food','food','weaning'], negative=['spirits', 'tea'], topn=20)

[('immence', 0.6761588454246521),
 ('right', 0.6039339900016785),
 ('lashes', 0.5697875618934631),
 ('morethan', 0.5484611988067627),
 ('worrynerves', 0.5394238829612732),
 ('infectious', 0.5344986915588379),
 ('everyone', 0.5343134999275208),
 ('dont_know', 0.5280071496963501),
 ('why', 0.524859607219696),
 ('people', 0.5246093273162842),
 ('livebut', 0.5197388529777527),
 ('know', 0.5173550844192505),
 ('wrong', 0.5148544907569885),
 ('clarehope', 0.5036625862121582),
 ('best', 0.5028817057609558),
 ('parents', 0.4999096095561981),
 ('tag', 0.4980241358280182),
 ('what', 0.4946640133857727),
 ('advice', 0.4920639991760254),
 ('optician', 0.4883507788181305)]

In [45]:
model.wv.most_similar(['preservatives'], topn=20)

[('soups', 0.9558199048042297),
 ('oils', 0.9550164937973022),
 ('sweet_corn', 0.948668360710144),
 ('sweet_potatoes', 0.9461427330970764),
 ('parsnips', 0.946140706539154),
 ('salads', 0.9448376297950745),
 ('raisins', 0.9444329738616943),
 ('meats', 0.9412325024604797),
 ('nut', 0.9409106373786926),
 ('rye', 0.9403590559959412),
 ('corn', 0.9401417970657349),
 ('jam', 0.937225878238678),
 ('baked_beans', 0.936769962310791),
 ('vegetable', 0.9366881847381592),
 ('lentils', 0.9359491467475891),
 ('barley', 0.9353379011154175),
 ('honey', 0.9337289333343506),
 ('herbs', 0.9336785674095154),
 ('carbohydrates', 0.9336708784103394),
 ('kale', 0.9330786466598511)]

In [46]:
model.wv.most_similar(['what_brand'], topn=50)

[('currently_use', 0.9230327010154724),
 ('most_often', 0.9132694602012634),
 ('tommee', 0.8120124340057373),
 ('brands', 0.8014540076255798),
 ('aspects', 0.7974790930747986),
 ('ratings', 0.7944238185882568),
 ('articles', 0.7941250205039978),
 ('gadgets', 0.7898600697517395),
 ('penis', 0.7896952629089355),
 ('some_kind', 0.7843093276023865),
 ('slaughter', 0.7731927633285522),
 ('awareness', 0.7708163261413574),
 ('ou', 0.7699427008628845),
 ('lesser', 0.7673109173774719),
 ('wrongwhat', 0.7654685974121094),
 ('currently_buy', 0.7619627118110657),
 ('buy_other', 0.7618476748466492),
 ('quantity', 0.7614060044288635),
 ('bunch', 0.7581350207328796),
 ('bisphenola', 0.7554386854171753),
 ('phytic', 0.753519594669342),
 ('spiral', 0.7495668530464172),
 ('jungle', 0.7483108639717102),
 ('citric_acid', 0.7482093572616577),
 ('breakfast_products', 0.7475633025169373),
 ('your_current', 0.7471222877502441),
 ('about_theaveeno', 0.7448669075965881),
 ('scullcap', 0.7438331246376038),
 ('da

In [47]:
model.wv.most_similar(['posioning'], topn=50)

[('extremes', 0.7006442546844482),
 ('fraction', 0.6556109189987183),
 ('spirit', 0.6544153690338135),
 ('fair_bit', 0.6454771757125854),
 ('multitude', 0.6427399516105652),
 ('never_heard', 0.6383461356163025),
 ('bongo', 0.6374942064285278),
 ('disconnects', 0.6334449052810669),
 ('caravans', 0.6330868601799011),
 ('fair_amount', 0.6302483081817627),
 ('measuresperhaps', 0.6256277561187744),
 ('combination', 0.6232194900512695),
 ('infront', 0.620303750038147),
 ('bug', 0.6200685501098633),
 ('teaspoons', 0.6199474334716797),
 ('small_piece', 0.6184102296829224),
 ('huge_bag', 0.6183731555938721),
 ('wide_variety', 0.6178988814353943),
 ('photo', 0.616966724395752),
 ('in_front', 0.6164197325706482),
 ('aftermath', 0.6162249445915222),
 ('in_terms', 0.6135082840919495),
 ('vast_majority', 0.6130695939064026),
 ('matchbox', 0.6128855347633362),
 ('bootcamp', 0.6100156903266907),
 ('phobia', 0.6093139052391052),
 ('wide_range', 0.6069177985191345),
 ('outsider', 0.6066805124282837),
 (

In [48]:
model.wv.most_similar(['address'], topn=50)

[('mail', 0.890352189540863),
 ('manufacturer', 0.8798248171806335),
 ('any_further', 0.8770080208778381),
 ('hahahaha', 0.8742119669914246),
 ('yahoocouk', 0.8723781108856201),
 ('sponsored', 0.8715806603431702),
 ('your_bodys', 0.870268702507019),
 ('conspiracy', 0.8686214089393616),
 ('someone_please', 0.8668290972709656),
 ('marston', 0.8667101860046387),
 ('nux_vomica', 0.8634548783302307),
 ('requirements', 0.8621602654457092),
 ('safest', 0.8621189594268799),
 ('x_hiya', 0.861880898475647),
 ('click', 0.8609533905982971),
 ('investigate', 0.8609200119972229),
 ('tobacco', 0.8599651455879211),
 ('jesus', 0.859541118144989),
 ('attitudes', 0.8592957258224487),
 ('pts', 0.8592854142189026),
 ('greatly', 0.8590477108955383),
 ('gibson', 0.8590438961982727),
 ('knitting', 0.8584678769111633),
 ('repair', 0.8580302596092224),
 ('vera', 0.8573964834213257),
 ('nyreen', 0.8573287725448608),
 ('anyhoo', 0.857321560382843),
 ('tasksthey', 0.8572642803192139),
 ('magazine', 0.8570613861083

In [56]:
#this one is strange, it returns a bunch of 
#names and email addresses and stuff
model.wv.most_similar(['some_info'], topn=50)

[('an_email', 0.9250348210334778),
 ('x_hiya', 0.9094042778015137),
 ('laura', 0.9080759286880493),
 ('melanie', 0.9058457016944885),
 ('kelly', 0.9058345556259155),
 ('any_info', 0.9046316742897034),
 ('hun_xx', 0.9034294486045837),
 ('jess', 0.9030999541282654),
 ('ive_pmd', 0.9027352333068848),
 ('xx_hey', 0.9001736044883728),
 ('karen', 0.8999698758125305),
 ('could_someone', 0.8972940444946289),
 ('diane', 0.8957549929618835),
 ('jodie', 0.8956050872802734),
 ('amanda', 0.8905532956123352),
 ('stacey', 0.8903585076332092),
 ('rosie', 0.8888223171234131),
 ('kerry', 0.8887797594070435),
 ('ya', 0.8873005509376526),
 ('any_further', 0.88695228099823),
 ('yahoocouk', 0.8860926032066345),
 ('sian', 0.8856438398361206),
 ('firstly', 0.8850079774856567),
 ('hayley', 0.8830459713935852),
 ('hotmailcouk', 0.8819030523300171),
 ('ill_send', 0.8811845779418945),
 ('natalie', 0.8808896541595459),
 ('more_information', 0.8807965517044067),
 ('lm', 0.8805987238883972),
 ('thanks_sarah', 0.8786

In [62]:
model.wv.most_similar('urlpostedtosomething', topn=20)

KeyError: "Key 'urlpostedtosomething' not present"

In [67]:
model.wv.most_similar(['baby_rice'], topn=70)

[('porridge', 0.8596265316009521),
 ('finger_foods', 0.8506186008453369),
 ('roll', 0.8383837938308716),
 ('spoon', 0.8347414135932922),
 ('cereal', 0.8274294137954712),
 ('baths', 0.8164702653884888),
 ('leftovers', 0.8130574226379395),
 ('weetabix', 0.8125248551368713),
 ('brekky', 0.8098934888839722),
 ('onz', 0.8058479428291321),
 ('purees', 0.8053804636001587),
 ('pureed_veg', 0.8026633858680725),
 ('toast', 0.8020657896995544),
 ('cow', 0.8008295893669128),
 ('gaviscon', 0.7969177961349487),
 ('sachets', 0.7941731214523315),
 ('wiping', 0.79375159740448),
 ('poached_egg', 0.792059600353241),
 ('spoons', 0.7917876839637756),
 ('film', 0.7901484370231628),
 ('beaker', 0.78279048204422),
 ('rusk', 0.7819617390632629),
 ('tail', 0.7818548679351807),
 ('mince', 0.7814681529998779),
 ('piriton', 0.781239926815033),
 ('nibble', 0.78065425157547),
 ('ice_lollies', 0.7772060036659241),
 ('school_run', 0.777184009552002),
 ('chippy', 0.7764370441436768),
 ('dish', 0.7738562822341919),
 ('o

In [75]:
model.wv.most_similar(['fruit','puree', 'infant', 'contamination'], negative=['recipe', 'meat'], topn=70)

[('green_beans', 0.8987311124801636),
 ('tomato_sauce', 0.8979843258857727),
 ('pesto', 0.895997166633606),
 ('fried', 0.8957321643829346),
 ('blended', 0.8954463005065918),
 ('grilled', 0.8952873349189758),
 ('avocado', 0.8914393782615662),
 ('apricot', 0.8884199857711792),
 ('salt_pepper', 0.8882951140403748),
 ('red_wine', 0.8859628438949585),
 ('stirfry', 0.8858291506767273),
 ('covered_in', 0.8852866291999817),
 ('filled_with', 0.8844092488288879),
 ('pure', 0.8831825256347656),
 ('mushrooms', 0.882717490196228),
 ('pumped', 0.8824294805526733),
 ('cabbage', 0.8792598843574524),
 ('stilton', 0.8786580562591553),
 ('roast_chicken', 0.8784634470939636),
 ('cod', 0.8774142861366272),
 ('green', 0.8773579001426697),
 ('onion', 0.8769024610519409),
 ('yellow', 0.8765391111373901),
 ('tomato', 0.8751411437988281),
 ('decaff', 0.8749994039535522),
 ('beetroot', 0.8741223216056824),
 ('for_brekkie', 0.8740686774253845),
 ('mustard', 0.8738999962806702),
 ('flour', 0.8738195300102234),
 ('

In [77]:
model.wv.most_similar(['baby_food'], negative=['recipe', 'diy'], topn=70)

[('propley', 0.5634233355522156),
 ('roundworst', 0.5558487176895142),
 ('cmht', 0.5219064354896545),
 ('newhv', 0.5163677930831909),
 ('fromhow', 0.4897284209728241),
 ('mebut', 0.4865596890449524),
 ('hindz', 0.4550108015537262),
 ('teatstops', 0.4431160092353821),
 ('reasonbly', 0.440399169921875),
 ('doit', 0.4339812099933624),
 ('habing', 0.4273872375488281),
 ('whomis', 0.4264635443687439),
 ('thunders', 0.4228471517562866),
 ('dlapip', 0.42256826162338257),
 ('askdo', 0.4183881878852844),
 ('turmoili', 0.4167611300945282),
 ('rountine', 0.4052369296550751),
 ('monstersome', 0.404615193605423),
 ('proprietor', 0.40206775069236755),
 ('starti', 0.3983546495437622),
 ('daymeal', 0.3969991207122803),
 ('posession', 0.3916724920272827),
 ('aboutbut', 0.38531622290611267),
 ('hihiya', 0.37732937932014465),
 ('glitcing', 0.37648606300354004),
 ('organicnatural', 0.3728504180908203),
 ('tortured', 0.36474308371543884),
 ('strived', 0.3642115294933319),
 ('consumehow', 0.3632415533065796

In [86]:
model.wv.most_similar(['baby_food'], negative=['homemade'], topn=70)

[('post', 0.6090183854103088),
 ('havethanks', 0.5585094690322876),
 ('dlapip', 0.5539392232894897),
 ('askdo', 0.5503668189048767),
 ('turmoili', 0.5423431992530823),
 ('why', 0.5393729209899902),
 ('optician', 0.5355606079101562),
 ('question', 0.5337581038475037),
 ('how', 0.5182285904884338),
 ('livebut', 0.5163639187812805),
 ('immence', 0.51068115234375),
 ('representatives', 0.5103675723075867),
 ('temporary_housing', 0.5055130124092102),
 ('right', 0.5045444369316101),
 ('anticipation', 0.49955061078071594),
 ('ask', 0.4971388578414917),
 ('infectious', 0.4937967360019684),
 ('noroviris', 0.49344688653945923),
 ('understand', 0.4931699335575104),
 ('thread', 0.4930281341075897),
 ('please', 0.49186018109321594),
 ('morethan', 0.4916702210903168),
 ('whats', 0.4912470579147339),
 ('asains', 0.4900318682193756),
 ('leavingclaire', 0.4892365634441376),
 ('muntaha', 0.4888818562030792),
 ('passlasted', 0.48478153347969055),
 ('dont_know', 0.48454126715660095),
 ('read', 0.483993321

In [88]:
model.wv.most_similar(['yoghurt'], topn=70)

[('soup', 0.9427034258842468),
 ('apple', 0.9331918954849243),
 ('cheese', 0.927249550819397),
 ('tomato', 0.9262970685958862),
 ('rice', 0.9251180291175842),
 ('cereal', 0.9225842952728271),
 ('yogurt', 0.9215800762176514),
 ('egg', 0.9213607907295227),
 ('pasta', 0.9198905229568481),
 ('berries', 0.9196426868438721),
 ('chicken', 0.9180183410644531),
 ('banana', 0.9177352786064148),
 ('salad', 0.9130582809448242),
 ('slice_of', 0.9124873280525208),
 ('nuts', 0.9104272127151489),
 ('tomatoes', 0.9099122285842896),
 ('puree', 0.9072428345680237),
 ('fish', 0.9029405117034912),
 ('butter', 0.9024571776390076),
 ('potato', 0.9016027450561523),
 ('bacon', 0.9004813432693481),
 ('ham', 0.9001263976097107),
 ('cake', 0.8992815017700195),
 ('homemade', 0.8990651369094849),
 ('squash', 0.8972511291503906),
 ('sandwich', 0.897059440612793),
 ('vegetables', 0.8957829475402832),
 ('carrots', 0.8950196504592896),
 ('choc', 0.894852876663208),
 ('chips', 0.8942900896072388),
 ('blueberries', 0.893

In [90]:
model.wv.most_similar(['baby_food', 'premade'], topn=70)

[('sterilising', 0.8328830599784851),
 ('guide', 0.8258848190307617),
 ('company', 0.7817451357841492),
 ('slaughter', 0.7766173481941223),
 ('buy_other', 0.7759057283401489),
 ('brands', 0.7754129767417908),
 ('serve_halal', 0.7750930786132812),
 ('dermexa_range', 0.7738021016120911),
 ('aveeno_baby', 0.7690919041633606),
 ('copy', 0.7684815526008606),
 ('other_retailers', 0.7679007053375244),
 ('label', 0.7678582072257996),
 ('pump', 0.7678112387657166),
 ('remedy', 0.7665497064590454),
 ('win', 0.7653217315673828),
 ('medicines', 0.764909029006958),
 ('meal_plan', 0.7648950815200806),
 ('migrate', 0.7637519240379333),
 ('jewellery', 0.763024091720581),
 ('roam', 0.7607174515724182),
 ('google', 0.7606288194656372),
 ('mail', 0.7593492865562439),
 ('hipp_organic', 0.7590063214302063),
 ('reuse', 0.7585139870643616),
 ('halt', 0.7580937743186951),
 ('potable', 0.7578948140144348),
 ('offending', 0.7574496269226074),
 ('offers', 0.7569690346717834),
 ('selecting_baby', 0.75689560174942

In [92]:
model.wv.most_similar(['baby_food', 'premade'], negative=['homemade'], topn=70)

[('follow', 0.7423295378684998),
 ('web_site', 0.7219550609588623),
 ('decide', 0.6875738501548767),
 ('question', 0.6831241846084595),
 ('understand', 0.6791288256645203),
 ('aveeno_baby', 0.6725314855575562),
 ('write', 0.6683441996574402),
 ('livebut', 0.6682726144790649),
 ('whats', 0.660494327545166),
 ('advise', 0.6599637866020203),
 ('look_at', 0.6588088870048523),
 ('solution', 0.6587648391723633),
 ('phases', 0.6502673029899597),
 ('link', 0.6482031941413879),
 ('temporary_housing', 0.6481582522392273),
 ('answer', 0.6439734697341919),
 ('feel_comfortable', 0.6417751312255859),
 ('manage', 0.6407522559165955),
 ('solve', 0.6406257152557373),
 ('lashes', 0.6395214796066284),
 ('childyou', 0.6388802528381348),
 ('suggest', 0.6388155817985535),
 ('enzimes', 0.6376416087150574),
 ('important_thing', 0.6357167363166809),
 ('wrongwhat', 0.6342867016792297),
 ('probotics', 0.6326111555099487),
 ('practitioner', 0.6297556757926941),
 ('threadwhat', 0.6295520067214966),
 ('dermexa_rang

In [93]:
model.wv.most_similar(['aldi_organic'], topn=70)

[('offending', 0.8767739534378052),
 ('ordinary', 0.8618350028991699),
 ('bottled_water', 0.8613597750663757),
 ('supermarket_own', 0.8609319925308228),
 ('buying_branded', 0.8584685325622559),
 ('companies', 0.8569406867027283),
 ('currently_buy', 0.8518701791763306),
 ('mamia_food', 0.84846431016922),
 ('appledore', 0.847379744052887),
 ('fish_oil', 0.8466570973396301),
 ('gelatin', 0.8426364064216614),
 ('gal', 0.836393415927887),
 ('object', 0.8332284092903137),
 ('implement', 0.8325846195220947),
 ('hipp_organic', 0.8317365646362305),
 ('sourced', 0.8313223123550415),
 ('plastics', 0.8299975395202637),
 ('buy_other', 0.8286651968955994),
 ('organix', 0.8284889459609985),
 ('specifically', 0.828406572341919),
 ('aldi_mamia', 0.8266980051994324),
 ('fruit_pots', 0.8266264200210571),
 ('another_mum', 0.8259619474411011),
 ('cif_anti', 0.8257240653038025),
 ('avaliable', 0.825161874294281),
 ('activity', 0.8250676393508911),
 ('graze_super', 0.8244447708129883),
 ('shain', 0.824162185

In [95]:
bigram_transformer.scoring(
            worda_count=bigram_transformer.vocab.get('jarred'),
            wordb_count=bigram_transformer.vocab.get('food'),
            bigram_count=bigram_transformer.vocab.get('jarred_food'),
            len_vocab=len(bigram_transformer.vocab),
            min_count=bigram_transformer.min_count,
            corpus_word_count=bigram_transformer.corpus_word_count,
        )

-17.60221526607272

In [97]:
bigram_transformer.vocab.get('jarred')

11