In this notebook, we:
* select a subset based on indicators derived from the text and related data
    * we do not process the text as thoroughly at this step because we are working only with entire threads at this point, so the vocabulary we are searching for should be expected to appear at least once in each thread.
* process text so that it can be better evaluated in further steps.

In [1]:
#if jupyternotify is installed, we can add %notify to a cell to get an alert when it has finished running
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [2]:
import metrics_helpers as indicators
import pickle as pk
import gc
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

# Create Indicators

In [3]:
def dt_to_int(dt): #datetime to integer
    """Convert datetime value(s) to seconds since the epoch.

    Parameters
    ----------
    dt : object exposing ``astype`` (e.g. a numpy ``datetime64`` scalar or array)
        Assumed to hold nanosecond-resolution datetimes, since the result is
        divided by 10**9 -- TODO confirm against callers.

    Returns
    -------
    The integer representation of ``dt`` divided by 10**9 (float seconds when
    the input is in nanoseconds).
    """
    # Use an explicit 64-bit dtype: the bare 'int' alias is platform-dependent
    # (a 32-bit C long on some platforms with older NumPy), which cannot hold
    # nanosecond timestamps.
    return dt.astype('int64')/(10**9)

In [4]:
#netmums
# Location of the pickled Netmums scrape, kept in one named constant so it is easy to repoint.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
NETMUMS_PICKLE_PATH = '/Users/sma/Documents/INRAE internship/scrape-git/netmums/allposts_rerun.pkl'

# pickle can execute arbitrary code while loading -- only open files we produced ourselves.
with open(NETMUMS_PICKLE_PATH, 'rb') as f:
    netmums = pk.load(f)


### Export the Keys for our Desired Subset.

# Clean Text
We now construct our subset from the desired keys and then process the text.
* TODO: clean the text before we run it through the next steps, e.g. by removing hyphens, uppercase letters, etc.
    * but not lemmatization, unless we also lemmatize our lists of words to search for!!!

* remove typos of relevant words using Levenshtein Distances
* replace tokens for specific foods and brands with their category, after compiling lists of these terms using word2vec
    * replace tokens for all types of fruits with fruit
    * replace tokens for all types of vegetable with vegetable
    * replace tokens for all types of grains with "cereal" (???) should I??

### Setup: Define Cleaning Function

In [5]:
#note if we replace hyphens with spaces at this step we may have
#issues fully removing URLs. Let's remove hyphens later in the pipeline.

#lowercasing is implemented as an option within the package.
import re #TODO: is there a better way of doing this? my pakage already imports re.
def clean(text):
    """Lowercase ``text`` and strip URL-like tokens from it.

    A whitespace-delimited token is treated as a URL when it contains a
    literal dot immediately followed by one of the fragments ``co``, ``net``,
    ``tv``, ``org``, ``edu`` or ``gov`` (``co`` also covers ``.com`` and
    ``.co.uk``). The whole token is removed; surrounding whitespace is kept.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The lowercased text with URL-like tokens removed.
    """
    #lowercase
    text = text.lower()
    #remove URLs.
    # The dot has to be escaped. Left bare it matches ANY character, so plain
    # words that merely contain "co"/"net"/"org"/... ("broccoli", "bacon",
    # "welcome", even "we cook") were being deleted along with real URLs.
    reg = r'\S+\.(?:co|net|tv|org|edu|gov)\S*'
    text = re.sub(reg, '', text)

    return text

### Setup: Create Lists for Relevant Terms

In [6]:
# Brand and product names that stand for infant formula.
baby_formula = [
    'nutramigen',
    'neocate',
    'powdered milk',
    'infasoy',
    'comfort milk',  # brand name which people don't write "formula" alongside
    'sma',
]

# Cereal-based foods aimed specifically at babies.
# A comma was missing between the two entries, so Python's implicit string
# concatenation collapsed them into the single item 'baby rice rusks'. The
# trailing space on 'baby rice ' is dropped too, matching the other
# multi-word entries in these lists.
baby_cereal = [
    'baby rice',  # this one is really useful / important; not yet sure how exactly to handle it
    'rusks',  # a cereal food for babies to teethe with
]

# Breakfast cereals and rice-based dishes; collapsed to the 'cereal' category.
cereal = [
    'cornflakes',
    'muesli',
    'bran flakes',
    'cheerios',
    'shreddies',
    'weetabix',
    'ready brek',
    'rice pudding',
    'rice',
]

# Fruit names; collapsed to the 'fruit' category.
fruit = [
    'banana',
    'berries',
    'blueberries',
    'raisins',
    'apples',
    'pear',
    'strawberries',
    'pineapple',
    'raspberries',
    'mango',
    'prunes',
    'grapefruit',
]

# Vegetable names; collapsed to the 'vegetable' category.
# 'broccoli' used to appear twice; the duplicate entry has been removed.
# NOTE(review): some multi-word entries use underscores ('red_pepper',
# 'green_beans', 'sweet_potato') while 'butternut squash' uses a space --
# confirm which form actually occurs in the text being searched.
veg = [
    'mushroom',
    'red_pepper',
    'green_beans',
    'courgette',
    'broccoli',
    'tomato',
    'parsnips',
    'greens',
    'potato',
    'carrots',
    'cucumber',
    'peas',
    'tomatoes',
    'sweet_potato',
    'sweetcorn',
    'corn',
    'spinach',
    'cauliflower',
    'butternut squash',
    'beetroot',
    'squash',
]

In [7]:
# Search vocabulary for baby-food products, grouped by product family.
foodwords = [
    # infant formula
    "formula",
    "baby formula",
    "bottle-fed",
    "bottle",
    # sterilized vegetable mixed with fish
    "veggie baby food",
    "vegetable baby food",
    "veg puree",
    "veg purée",
    # fresh fruit puree, mildly processed
    "fruit puree",
    "fruit baby food",
    "fruit purée",
    "applesauce",
    # infant cereals
    "cereal for baby",
    "cereal",
    "porridge",
    "oats",
    "oatmeal",
    # other
    "jar food",
    "baby food",
    "jarred",
    "premade food",
    "puree",
    "purée",
    "jarred food",
    "yoghurt",
    "pudding",
]

# Search vocabulary for food-safety hazards, grouped by hazard family.
hazardwords = [
    "Chemical contaminants",
    # endocrine disruptors
    "Endocrine disruptor", "endocrine", "estrogen",
    # food preservatives, sweeteners and additives
    "preservatives", "sweeteners", "additives",
    "Pesticides",
    # veterinary drugs
    "Veterinary drugs", "animal drugs", "vet drugs",
    # GMO
    "GMO", "genetically modified",
    "Metals", "Mycotoxin",
    # bisphenol A
    "Bisphenol", "BPA",
    # furan: removed because nothing related to this returns results
    # DON: the acronym itself is left out -- nobody uses it and every hit
    # comes from words like "don't"
    "deoxynivalenol", "vomitoxin",
    # dioxin and PCB
    "Dioxin", "PCB", "biphenyls",
    # MOH
    "MOH", "hydrocarbons", "saturated hydrocarbons", "MOAH", "aromatic hydrocarbons",
    "Nitrates",
    # acrylamide
    "Acrylamide",
    "phthalates",
    # microbiologic contaminants
    "Microbiologic contaminants", "spores", "mold", "mould", "virus", "microbes", "contaminated",
    "Salmonella", "Campylobacter", "Listeria",
    # E. coli
    "EColi",
    "Cronobacter",
    "Histamine",
    # other bacteria
    "bacteria",
    "Virus",
    "Parasites",
    # unrelated but maybe useful?
    "carcinogen", "chemicals", "toxic", "toxin", "poisonous", "fungus",
    "food poisoning", "hazard", "EFSA", "European Food Safety Authority",
]

### Setup: Define Functions

In [8]:
import fuzzy_typos

In [9]:
# Category label -> list of terms that get collapsed into that label.
replacements_dictionary = {
    'vegetable': veg,
    'fruit': fruit,
    'cereal': cereal,
    'baby cereal': baby_cereal,
    'baby formula': baby_formula,
}

# Every individual (whitespace-separated) token across all term lists:
# typos to fix and single tokens to replace.
typos_to_fix_or_replace = {
    token
    for term_list in (veg, fruit, cereal, baby_formula, baby_cereal)
    for phrase in term_list
    for token in phrase.split()
}

# Only the multi-word terms (those containing a space), per category.
remaining_words_to_replace = {
    category: [term for term in terms if ' ' in term]
    for category, terms in replacements_dictionary.items()
}

In [10]:
# Token-level helper: built from the single-token vocabulary, the category
# mapping, and our `clean` function as its text cleaner.
fix_and_replace_tokens = fuzzy_typos.fuzzy_typos(
    typos_to_fix_or_replace,
    replacements_dictionary,
    cleaner=clean,
)
# Phrase-level helper: built from the multi-word terms only.
replace_phrases = fuzzy_typos.replacements(
    remaining_words_to_replace
)

### Setup: Parallel Processing

In [11]:
from joblib import Parallel, delayed
import time

In [12]:
num_lists = 20 #how many instances will be split for parallel processing
keys = [*netmums.keys()]
num_keys = len(keys)
# Deal the keys out round-robin: chunk k receives keys k, k+num_lists, k+2*num_lists, ...
list_of_list_of_keys = [keys[offset::num_lists] for offset in range(num_lists)]

def get_small_dict(list_of_keys):
    """Return a dict holding only the ``netmums`` entries for ``list_of_keys``.

    We hand `process` these small dicts because otherwise the whole dict
    (a global) would get duplicated in each parallel instance.
    """
    subset = {}
    for thread_key in list_of_keys:
        subset[thread_key] = netmums[thread_key]
    return subset

def process(typofixer,replacer,small_dict): #now process takes two objects
    """Fix typos and substitute phrases in every text field of ``small_dict``, in place.

    Fields touched for each thread:
      * thread['title']                        -> typofixer.fix_typos only
      * thread['posts'][n]['body']             -> fix_typos, then replacer.replace_all
      * thread['posts'][n]['quotes_w'][k]      -> fix_typos, then replace_all
      * thread['posts'][n]['quotes_y'][k]['text'] -> fix_typos, then replace_all

    Falsy bodies and quote containers are skipped. Returns the same
    ``small_dict`` object that was passed in.

    NOTE(review): the title is the only field that does not go through
    ``replacer.replace_all`` -- confirm whether that is intentional.
    (approx 1.5x slower than the earlier text_dict approach.)
    """
    def _fix_then_replace(text):
        # typo correction first, phrase substitution second
        return replacer.replace_all(typofixer.fix_typos(text))

    for thread in small_dict.values():
        thread['title'] = typofixer.fix_typos(thread['title'])
        for post in thread['posts']: #a list of dicts
            if post['body']:
                post['body'] = _fix_then_replace(post['body'])

            plain_quotes = post['quotes_w']
            if plain_quotes:
                for position, quoted_text in enumerate(plain_quotes):
                    plain_quotes[position] = _fix_then_replace(quoted_text)

            structured_quotes = post['quotes_y']
            if structured_quotes:
                for quote_record in structured_quotes:
                    quote_record['text'] = _fix_then_replace(quote_record['text'])
    return small_dict

In [13]:
# Build the per-task dicts up front and feed those into the parallel step.
# If we passed the entire dict in and sliced it from within each task, the
# whole dict would get duplicated many times, wasting memory.

list_of_small_dict = list(map(get_small_dict, list_of_list_of_keys))

### Finally Running It

In [14]:
start = time.time()
results = Parallel(n_jobs=-1)(delayed(process)(fix_and_replace_tokens,replace_phrases,i) for i in list_of_small_dict)
end = time.time()
print('default food words time: ' + str(end - start),)
%notify

default food words time: 3105.523297071457


<IPython.core.display.Javascript object>

In [15]:
# Merge the per-task result dicts back into one dict (later chunks win on duplicate keys).
untypod_dict = dict(pair for chunk in results for pair in chunk.items())

In [16]:
# Persist the merged, typo-fixed threads to a pickle in the working directory.
with open('FULL_untypod_dict.pkl', mode='wb') as f:
    pk.dump(untypod_dict, f)