This notebook uses the same cleaning routine as NETMUMS_topicmining_POSTS.
It is used to see which words co-occur with hazard words.

In [1]:
#if jupyternotify is installed, we can add %notify to a cell to get an alert when it has finished running
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [2]:
import metrics_helpers as indicators
import pickle as pk
import gc
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import traceback #needed to store full error tracebacks

In [3]:
def dt_to_int(dt):
    """
    Convert datetime values to a number by casting to integer and dividing by 10**9.

    Parameters
    ----------
    dt : array-like exposing ``astype`` (e.g. a numpy datetime64 array)
        Presumably nanosecond-resolution timestamps (datetime64[ns]), in which
        case the result is seconds since the epoch -- TODO confirm with callers.

    Returns
    -------
    Same container type as ``dt`` with float values (integer value / 10**9).
    """
    # 'int64' instead of 'int': the plain 'int' alias can be 32 bits wide on
    # some platforms, which cannot hold nanosecond timestamps.
    return dt.astype('int64') / (10**9)

In [4]:
# Load the pickled scrape into `netmums`.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# NOTE(review): the path points at the facebook folder while the variable is
# named `netmums` -- confirm this is the intended input file.
untypod_path = '/Users/sma/Documents/INRAE internship/scrape-git/facebook/untypod_dict.pkl'
with open(untypod_path, mode='rb') as pickle_file:
    netmums = pk.load(pickle_file)

#with open('/Users/sma/Documents/INRAE internship/scrape-git/netmums/allposts_rerun.pkl', 'rb') as f:
#    netmums = pk.load(f)
    
#with open('/Users/sma/Documents/INRAE internship/scrape-git/netmums/netmums_subset_keys.txt', 'r') as f:
#    keys = [url.strip() for url in f.readlines()]
        

In [5]:
# Build the project-specific indicators helper from the loaded data.
nm_ind = indicators.indicators(netmums, fb=False)
# Note from the author: the call above is slow, roughly 20 seconds.

# Dict of posts; later cells read value['body'] for every entry.
posts_dict = nm_ind.get_posts_dict()

In [110]:
# Hazard categories -> extra search terms for that category.
# The category name itself is appended as a search term further down.
hazards = {
    'Chemical contaminants': [],
    'Endocrine disruptor': ['endocrine', 'estrogen'],
    'FOOD PRESERVATIVES, SWEETENERS AND ADDITIVES': [
        'preservatives', 'sweeteners', 'additives',
    ],
    'Pesticides': [],
    'Veterinary drugs': ['animal drugs', 'vet drugs'],
    'GMO': ['GM', 'genetically modified'],
    'Metals': [],
    'Mycotoxin': [],
    'Bisphenol A': ['BPA', 'Bisphenol', 'BisphenolA'],
    'Furan': [],
    # Nobody uses the acronym "DON"; all of its raw matches come from words like "don't".
    'DON': ['deoxynivalenol', 'vomitoxin'],
    'DIOXIN AND PCB': ['Dioxin', 'PCB', 'biphenyls'],
    'MOSH and MOAH': [
        'hydrocarbons', 'saturated hydrocarbons', 'MOAH', 'MOH',
        'aromatic hydrocarbons',
    ],
    'Nitrates': [],
    'Acrylamid': ['Acrylamide'],
    'phthalates': [],
    'Microbiologic contaminants': [
        'spores',
        'mold',
        'mould',
        # 'virus' is deliberately left out here; it has its own category below.
        'microbes',
        'contaminated',
    ],
    'Salmonella': [],
    'Campylobacter': [],
    'Listeria': [],
    'EColi': ['E-coli'],
    'Cronobacter': [],
    'Histamine': [],
    'other bacteria': ['bacteria'],
    'Virus': [],
    'Parasites': [],
    'Related Terms': [
        'carcinogen', 'chemicals', 'toxic', 'toxin', 'poisonous', 'fungus',
        'food poisoning', 'hazard', 'EFSA', 'European Food Safety Authority',
    ],
}

# Product categories -> extra search terms for that category.
products = {
    'infant formula': ['formula', 'baby formula', 'bottle-fed', 'bottle'],
    'sterilized vegetable mixed with fish': [
        'veggie baby food', 'vegetable baby food', 'veg puree', 'veg purée',
    ],
    'fresh fruit puree mildly processed': [
        'fruit puree', 'fruit baby food', 'fruit purée', 'applesauce',
        'apple sauce', 'fruit sauce',
    ],
    'infant cereals': ['cereal for baby', 'cereal', 'porridge', 'oats', 'oatmeal'],
    'other': [
        'jar food', 'baby food', 'jarred', 'premade food', 'puree', 'purée',
        'jarred food', 'yoghurt', 'pudding',
    ],
}

# IMPORTANT: terms fed to the count vectorizer must be lower-case, otherwise
# they get 0 matches. Lower-case every key and term, and append the
# (lower-cased) category name to its own term list.
hazards = {
    name.lower(): [*map(str.lower, terms), name.lower()]
    for name, terms in hazards.items()
}
products = {
    name.lower(): [*map(str.lower, terms), name.lower()]
    for name, terms in products.items()
}

In [7]:

# Additional signal words, grouped by the kind of signal they carry.
extras = {
    # ---- brand names ----
    'baby_food_brands': [
        'ellas', 'organix', 'heinz baby', 'plum baby', 'little angels', 'farleys',
    ],
    'formula_brands': [
        'sma', 'aptamil comfort', 'infasoy', 'nutramigen', 'neocate',
        'powdered milk', 'comfort milk',
    ],
    'food_or_formula_brands': [
        'aptamil',       # formula and cereals
        'hipp organic',  # formula and baby food
        'cow gate', 'cow and gate', 'c g',
        'mamia',
    ],
    # ---- non-brand signals ----
    'cereal': ['baby_cereal', 'baby riceporridge', 'baby rice', 'baby porridge'],
    'baby_food': [
        'mashed', 'tinned', 'premade', 'canned', 'jarred', 'pouches', 'pouch',
        'ready made', 'readymade', 'cartons',
    ],
    # Indicators meant to be combined with the 'baby food' label: they let us
    # see whether both terms occur in a document without being adjacent.
    'fruit': ['fruit'],
    'vegetable': ['vegetable'],
    'baby': ['infant', 'baby', 'for littles'],
}


In [8]:
import re

def make_phrases(list_of_phrases, text):
    """
    Join known multi-word phrases inside ``text`` into single underscore tokens.

    example: "I love collard greens for breakfast" -> "I love collard_greens for breakfast"
    example: "I love collard-greens for breakfast" -> "I love collard_greens for breakfast"

    Parameters
    ----------
    list_of_phrases : iterable of str
        Phrases to look for. Words inside a phrase are separated by single
        spaces; phrases without a space are ignored (nothing to join).
    text : str
        The text to rewrite.

    Returns
    -------
    str
        ``text`` with every occurrence of a phrase (written with spaces or
        hyphens between its words) replaced by the underscore-joined form.
    """
    # Longest phrases first (ties broken alphabetically) so that overlapping
    # phrases give the same result on every run, even when a set is passed in.
    ordered = sorted(set(list_of_phrases), key=lambda phrase: (-len(phrase), phrase))
    for phrase in ordered:
        if ' ' not in phrase:
            continue
        # Phrases are literal text, not regular expressions: escape each word
        # and allow either a space or a hyphen between consecutive words.
        pattern = '[ -]'.join(re.escape(word) for word in phrase.split(' '))
        joined = phrase.replace(' ', '_')
        # A callable replacement keeps backslashes in the phrase literal.
        text = re.sub(pattern, lambda _match, joined=joined: joined, text)
    return text

def make_underscores(item):
    """
    Recursively replace spaces and hyphens with underscores.

    Parameters
    ----------
    item : str or iterable
        A string, or a (possibly nested) iterable of strings such as a list,
        set, tuple or dict.keys() view.

    Returns
    -------
    str, set or list
        A string for string input, a set for set input, and a list for every
        other iterable.

    Raises
    ------
    TypeError
        If ``item`` is neither a string nor an iterable.
    """
    if isinstance(item, str):
        return re.sub(' |-', '_', item)
    try:
        members = iter(item)
    except TypeError:
        raise TypeError(
            'Object must be a string or an iterable of strings, got '
            + type(item).__name__
        ) from None
    converted = [make_underscores(member) for member in members]
    # Sets keep their type; everything else comes back as a list.
    if isinstance(item, set):
        return set(converted)
    return converted

In [9]:
# Flatten each category dict into one list holding every term of every
# subcategory: h = hazards, p = products, e = extras.
h, p, e = [], [], []
for flat, groups in ((h, hazards), (p, products), (e, extras)):
    for terms in groups.values():
        flat.extend(terms)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Collect every multi-word term (anything containing a space) as a phrase.
phrases = {'baby formula', 'baby cereal'}.union({item for item in p + h + e if ' ' in item})

# step 1: make a dict of just the text
text_dict = {key: value['body'] for key, value in posts_dict.items()}

# step 2: convert the relevant phrases to single underscore-joined tokens
text_dict = {key: make_phrases(phrases, text) for key, text in text_dict.items()}

# step 3: replace "don't" with "do not" so it is not counted as the hazard "don".
# The word boundaries matter: without them "London to" was rewritten to
# "Londo noto", because "don t" matched in the middle of the sentence.
# TODO: there are cases of "don' " (apostrophe but no t) that still need catching.
dont_pattern = re.compile(r"\bdon\W+t\b", flags=re.I)
text_dict = {key: dont_pattern.sub('do not', text) for key, text in text_dict.items()}


In [116]:
 # //// END OF THE CLEANING PART FROM POSTS NOTEBOOK

# GenSim / Word2Vec Implementation on Threads

https://radimrehurek.com/gensim/models/word2vec.html

In [29]:
from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

We can try running the model on only our subset to see how long it takes.

Then maybe we can run it on the entire dataset.

The end goal of this is to determine which words are related to ngrams like "baby food"

In [12]:
# Put the corpus in the shape the model pipeline needs: parallel lists of
# document keys and document texts.
# NOTE(review): this reads nm_ind.text_dict, not the `text_dict` cleaned in the
# cells above -- confirm that is intended.
keys = [*nm_ind.text_dict.keys()]
text_list = [*nm_ind.text_dict.values()]


In [13]:
import re

In [14]:
#define functions
# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "\u200D"                 # zero-width joiner used in emoji sequences
    "]+",
    flags=re.UNICODE,
)


def deEmojify(text):
    """
    Remove emoji characters from ``text``.

    Covers the most common emoji blocks plus the variation selector and
    zero-width joiner that glue emoji sequences together. Rarer symbols may
    still slip through.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` with every matched emoji run deleted (no replacement character).
    """
    return _EMOJI_PATTERN.sub('', text)

def clean(text):
    """
    Replace URLs with placeholder tokens and turn separator characters into spaces.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` where
        * URLs containing "pdf" become ``urlpostedtopdf``,
        * every other URL becomes ``urlpostedtosomething``,
        * runs of ``;`` ``,`` ``&`` ``+`` and runs of hyphens become one space.
    """
    pdf_url_regex = r'http\S+pdf\S*'
    any_url_regex = r'http\S+'
    # PDF links must be replaced first: the generic pattern would otherwise
    # swallow them and the two placeholders could never be told apart.
    text = re.sub(pdf_url_regex, 'urlpostedtopdf', text)
    text = re.sub(any_url_regex, 'urlpostedtosomething', text)
    #TODO: remove emails

    # replace commas, semicolons, ampersands and plus signs with spaces
    text = re.sub(r'[;,&+]+', ' ', text)
    # replace hyphens with spaces
    text = re.sub(r'-+', ' ', text)
    return text

In [15]:
#CLEAN TEXT
# strip emoji, then normalise URLs and separators, one document at a time
text_list = [clean(deEmojify(doc)) for doc in text_list]

# split documents into sentences on new lines and sentence-ending punctuation
text_list = [sentence for doc in text_list for sentence in re.split('[\n?!.]+', doc)]

# strip every remaining character that is not a letter, digit or space
text_list = [re.sub(r'[^A-Za-z0-9 ]+', '', sentence) for sentence in text_list]
# collapse runs of whitespace into a single space
text_list = [re.sub(r'\s+', ' ', sentence) for sentence in text_list]

# remove empty items -- done last, because stripping characters above can
# leave sentences that are empty or whitespace-only
text_list = [sentence for sentence in text_list if sentence.strip()]

In [16]:
from gensim.utils import tokenize

In [17]:
# The model expects a list of lists: one list of lower-cased word tokens per sentence.
tokens = [[*tokenize(sentence, lower=True)] for sentence in text_list]

In [30]:
# Train a bigram (multi-word expression) detector based on collocation counts.
# Detected bigrams become single tokens: the space is replaced by an underscore.
# See the Google paper for details on how it is done; background:
# https://datascience.stackexchange.com/questions/25524/how-does-phrases-in-gensim-work
#
# Raising the threshold (to about 12) would get rid of a lot of non-phrase
# bigrams (what_brand, an_email), but it is not clear how that would help.
# TODO: read how the word similarity is calculated.
bigram_transformer = Phrases(
    tokens,
    threshold=2,
    connector_words=ENGLISH_CONNECTOR_WORDS,
)

# Run the corpus through the trained detector and train Word2Vec on the result.
phrased_sentences = bigram_transformer[tokens]
model = Word2Vec(phrased_sentences, min_count=1)

%notify

<IPython.core.display.Javascript object>

In [None]:
#bigram_transformer = Phrases(tokens, threshold = 10) 
# Detected phrases as (phrase, score) pairs, highest score first.
phrase_scores = bigram_transformer.export_phrases().items()
temp = sorted(phrase_scores, key=lambda pair: pair[1], reverse=True)

In [114]:
most_similar_by_hazard = {}

def try_most_sim(pos:list, top=5, wv=None):
    """
    Return the words most similar to the terms in ``pos``, skipping unknown terms.

    Terms missing from the vocabulary are dropped up front with a membership
    test, instead of parsing the missing key out of a KeyError message and
    retrying once per missing term.

    Parameters
    ----------
    pos : list of str
        Candidate terms. The list itself is not modified.
    top : int
        Number of similar words to return.
    wv : optional
        Object supporting ``term in wv`` and
        ``wv.most_similar(positive=..., topn=...)``. Defaults to the word
        vectors of the notebook-level ``model``.

    Returns
    -------
    The result of ``most_similar`` for the known terms, or None when none of
    the terms is in the vocabulary (including when ``pos`` is empty).
    """
    if wv is None:
        wv = model.wv
    known_terms = [term for term in pos if term in wv]
    if not known_terms:
        # Nothing to query with: signal "no result" explicitly.
        return None
    return wv.most_similar(positive=known_terms, topn=top)
        
        
for key in hazards.keys():
    # Tokens in the model are single words or underscore-joined phrases, so a
    # term written with spaces or hyphens can never be in its vocabulary.
    # Normalise the terms to the underscore form before querying.
    most_similar_by_hazard[key] = try_most_sim(pos=make_underscores(hazards[key]), top=15)

In [115]:
#the most similar words based on the mean of the words in each hazard category

most_similar_by_hazard

{'chemical contaminants': None,
 'endocrine disruptor': [('cellular', 0.9653998613357544),
  ('practices', 0.9601715803146362),
  ('red_meat', 0.9586284160614014),
  ('trust_your', 0.9585038423538208),
  ('calves', 0.9578831195831299),
  ('hormone', 0.9572047591209412),
  ('christians', 0.9553148746490479),
  ('boutique', 0.9551282525062561),
  ('horse', 0.9550603032112122),
  ('reactions', 0.9545850157737732),
  ('wild', 0.9537221193313599),
  ('booze', 0.9520724415779114),
  ('about_the_aveeno', 0.9518207907676697),
  ('their_kids', 0.9504043459892273),
  ('cefalexin', 0.9501385688781738)],
 'food preservatives, sweeteners and additives': [('fats', 0.9651685953140259),
  ('artificial', 0.9626595973968506),
  ('no_added', 0.9599071741104126),
  ('soft_cheeses', 0.9597725868225098),
  ('non', 0.9580036401748657),
  ('flavourings', 0.9575098752975464),
  ('gluten', 0.9572489857673645),
  ('sauces', 0.9572428464889526),
  ('grains', 0.9570308327674866),
  ('chicken_balls', 0.956840932369

In [None]:
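#TODO: investigate why some categories give None. try_most_sim returns None when every term of a category is missing from the model vocabulary (e.g. 'chemical contaminants' only has a multi-word term, and a term containing a space never matches a single token).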