In [1]:
#requires gensim 3.8.3 , NOT version 4: no wrapper for sklearn
#python -m pip install gensim==3.8.3

### Term Counting Approach

In [2]:
#if jupyternotify is installed, we can add %notify to a cell to get an alert when it ifnished running
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [3]:
import metrics_helpers as indicators
import pickle as pk
import gc
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import traceback #needed to store full error tracebacks

In [4]:
def dt_to_int(dt): #datetime to integer
    return dt.astype('int')/(10**9)

In [5]:
with open('/Users/sma/Documents/INRAE internship/scrape-git/facebook/untypod_dict.pkl', 'rb') as f:
    netmums = pk.load(f)
      

In [6]:
nm_ind = indicators.indicators(netmums, fb=False)
#this one takes long, around 20 seconds I think.

posts_dict = nm_ind.get_posts_dict()

In [7]:
hazards = {
'Chemical contaminants': [],
'Endocrine disruptor': ["endocrine","estrogen"],
'FOOD PRESERVATIVES, SWEETENERS AND ADDITIVES':["preservatives","sweeteners","additives"],
"Pesticides":[],
"Veterinary drugs":["animal drugs","vet drugs"],
'GMO':['GM',"genetically modified"],
"Metals":[],
"Mycotoxin":[],
"Bisphenol A":['BPA','Bisphenol','BisphenolA'],
'Furan':[],
'DON': #(note that this acronym nobody uses and all results are from words like "don't")
["deoxynivalenol",
"vomitoxin"],
'DIOXIN AND PCB':["Dioxin","PCB","biphenyls"],
'MOSH and MOAH':["hydrocarbons","saturated hydrocarbons","MOAH", 'MOH',"aromatic hydrocarbons"],
'Nitrates':[],
"Acrylamid":["Acrylamide"],
"phthalates":[],
"Microbiologic contaminants":
["spores",
"mold",
"mould",
#"virus",
"microbes",
"contaminated"],
"Salmonella":[],
"Campylobacter":[],
"Listeria":[],
"EColi":["E-coli"],
"Cronobacter":[],
"Histamine":[],
'other bacteria':["bacteria"],
"Virus":[],
"Parasites":[],
'Related Terms':["carcinogen","chemicals", "toxic", "toxin", "poisonous", "fungus", "food poisoning", "hazard","EFSA","European Food Safety Authority"]
}

products = {
'infant formula':
["formula","baby formula", "bottle-fed", "bottle"]
,'sterilized vegetable mixed with fish':
["veggie baby food","vegetable baby food",
"veg puree", "veg purée"]
,'fresh fruit puree mildly processed':
["fruit puree","fruit baby food", "fruit purée", "applesauce", "apple sauce", "fruit sauce"]
,'infant cereals':
["cereal for baby", "cereal", "porridge", "oats", "oatmeal"]
,'other':
["jar food", "baby food", "jarred", "premade food", "puree", "purée", "jarred food"
,"yoghurt", "pudding"]
}


#IMPORTANT!: terms used for count vectorizer must be lower-case o.w. get 0 matches
hazards = {key.lower():[v.lower() for v in value] + [key.lower()] for key,value in hazards.items()}
products = {key.lower():[v.lower() for v in value]+[key.lower()] for key,value in products.items()}

In [8]:

extras = {\
'baby_food_brands':
['ellas',
'organix',
'heinz baby',
"plum baby",
'little angels',
'farleys'],
'formula_brands':['sma','aptamil comfort','infasoy','nutramigen','neocate','powdered milk','comfort milk'],
 'food_or_formula_brands':
['aptamil', # formula and cereals.
'hipp organic',# - formula and baby food
'cow gate','cow and gate','c g',
'mamia'],
##NON BRAND SIGNALS##
'cereal':['baby_cereal','baby riceporridge','baby rice','baby porridge'],
'baby_food':['mashed','tinned','premade','canned','jarred','pouches','pouch','ready made','readymade','cartons'],  
#INDICATORS TO BE USED IN CONJUNCTION WITH 'baby food' label: this way we 
#can observe if both terms are used in a document (but are not used right next to each other.)
'fruit':['fruit'],
'vegetable':['vegetable'],
'baby':['infant', 'baby' ,'for littles']
         }


In [9]:
import re

def make_phrases(list_of_phrases, text):
    """
    convert phrases to bigrams within a larger text corpus.
    example: "I love collard greens for breakfast" -> "I love collard_greens for breakfast"
    example: "I love collard-greens for breakfast" -> "I love collard_greens for breakfast"
    """
    for phrase in list_of_phrases:
        #spaces
        text = re.sub(phrase, re.sub(' ', '_',phrase), text)
        #hyphens
        text = re.sub(re.sub(' ', '-', phrase), re.sub(' ', '_',phrase), text)
    return text

def make_underscores(item):
    """
    recursively replace spaces and hyphens in strings, lists, sets, or other iterables.
    Return the same type if string, list, set. If other type, returns list.
    """
    if type(item) is str:
        return re.sub(' |-', '_', item)
    else:
        temp = []
        for thing in item:
            temp.append(make_underscores(thing))
    if type(item) is set:
        return set(temp)
    elif type(item) is list:
        return temp
    elif isinstance(item, type({}.keys())):
        #if the object is a dict.key() view
        return temp
    else:
        print('Object must be string, list, set, or dict.keys()')
    #TODO this would be cleaner if i just check that it's iterable, and then check that it's a string.

In [10]:
#from the dict which representes our subcategories, create lists of all words in the subcategories.
h = [item for val in hazards.values() for item in val]
p = [item for val in products.values() for item in val]
e = [item for val in extras.values() for item in val]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#concatenate list of all phrases (bigrams, anything with a space in it)
phrases = {'baby formula', 'baby cereal'}.union({item for item in p + h + e if ' ' in item})

#step 1: make a dict of just the text
text_dict = {key:value['body'] for key,value in posts_dict.items()}

#step 2 : convert the relevant phrases to bigrams with re.sub
text_dict = {key: make_phrases(phrases, text) for key, text in text_dict.items()}

#replace "don't" with "do not" (so that we don't get false positives for don count.)
for key in text_dict:
    text_dict[key] = re.sub('don[\W]+t', 'do not', text_dict[key], flags=re.I) #TODO. there are cases of "don' " need to catch.

#step 3: count occurences using countvectorizer
vocab = p + h + e
vocab = [re.sub(' |-','_',item) for item in vocab] #should I use make_underscores instead??
vocab = set(vocab)
term_counter = CountVectorizer(vocabulary = vocab, stop_words = 'english')
counts = term_counter.fit_transform(text_dict.values())

#note that hyphens will be treated as spaces by countvectorizer

In [12]:
counts = counts.toarray() #run once
count_dict = {}
for num, key in enumerate(text_dict.keys()): #TODO just use netmums, not text_dict?? its confusing. (they have the same keys)
    count_dict[key] = {term: counts[num][value] for term, value in term_counter.vocabulary_.items()}

In [13]:
countdf = pd.DataFrame.from_dict(count_dict).transpose()

In [14]:
summed_df = pd.DataFrame()

for key in products.keys():
    summed_df[key] = countdf[make_underscores(products[key])].sum(axis=1)
for key in hazards.keys():
    summed_df[key] = countdf[make_underscores(hazards[key])].sum(axis=1)
for key in extras.keys():
    summed_df[key] = countdf[make_underscores(extras[key])].sum(axis=1)

In [15]:
#count mentions of fruit or vegetable
#return 0 if there is no words indicating a context of BABY foods (not adult foods)
#note that baby food brand names occur much more than fruit or veg. Am not sure if they co-occur.
#TODO: maybe it is better to add the brands in with the fruit / veg. But since it is highly corr with them alreayd, at least looking by post it isnt a problem
summed_df['fruit_in_baby_context'] = summed_df['fruit']  * (summed_df[['baby_food_brands', 'food_or_formula_brands', 'baby']].sum(axis=1) > 0)
summed_df['veg_in_baby_context'] = summed_df['vegetable']  * (summed_df[['baby_food_brands', 'food_or_formula_brands', 'baby']].sum(axis=1) > 0)

#if there is mention of fruit or vegetable it's not uncategorized. return 0
# if no mentions, sum the counts of mentions of baby food brands
# possible improvement: check for words indicating a food, or in weaning forum etc. THEN we can also add food_or_formula_brands to the COUNT.
summed_df['baby_food_uncategorized'] = (summed_df[['fruit','vegetable']].sum(axis=1) > 0) * summed_df['baby_food_brands']

In [16]:
class_df = summed_df.copy()

In [17]:
product_cols = list(products.keys()) + ['veg_in_baby_context', 'fruit_in_baby_context', 'baby_food_uncategorized']

In [18]:
#classify
class_df['product_type'] = class_df[product_cols].idxmax(axis=1)
# idxmax has a strange behavior where it will set all-zero sets to an arbitrary category (the first one available?)
# so we must manually change them to an NA category.
class_df.loc[class_df[product_cols].max(axis=1) == 0,'product_type'] = 'NA'
# convert to categorical (factors)
class_df['product_type'] = class_df['product_type'].astype('category')

In [19]:
#make classification for hazards and check it as well.
class_df['hazard_type'] = class_df[hazards.keys()].idxmax(axis=1)
class_df.loc[class_df[hazards.keys()].max(axis=1) == 0,'hazard_type'] = 'NA'
class_df['hazard_type'] = class_df['hazard_type'].astype('category')

# Running KNN on the processed numbers. (maybe) TODO 

# Guided LDA Approach...

 https://github.com/scign/GuidedLDA/blob/master/Guided%20LDA%20using%20gensim.ipynb
 
 how many possible cateorgires do we have?
 4 Products
 18 Hazards

 so we do
     * k = 4
     * k = 18
     * k = $18*4$ = 72


In [20]:
import time

import gensim
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

In [21]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sma/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/sma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
# simplify Penn tags to n (NOUN), v (VERB), a (ADJECTIVE) or r (ADVERB)
def simplify(penn_tag):
    pre = penn_tag[0]
    if (pre == 'J'):
        return 'a'
    elif (pre == 'R'):
        return 'r'
    elif (pre == 'V'):
        return 'v'
    else:
        return 'n'
def preprocess(text):
    stop_words = stopwords.words('english')
    toks = gensim.utils.simple_preprocess(str(text), deacc=True)
    wn = WordNetLemmatizer()
    return [wn.lemmatize(tok, simplify(pos)) for tok, pos in nltk.pos_tag(toks) if tok not in stop_words]


In [23]:
corp = [preprocess(line) for line in text_dict.values()]

In [24]:
dictionary = gensim.corpora.Dictionary(corp)
len(corp)

19671

In [None]:
# lda model:
# update_every - 0 is single-batch, 1 is online. any other number is the batch size
# distributed - distributed computing (not relevant to us)
# ns_conf - only used with distributed
# 
# lda multicore:
# batch - batch true or false
# workers - 
# per_word_topics - proba of topic is assigned to each word. 
#

In [None]:
#   model = gensim.models.ldamodel.LdaModel(
#               corpus=bow, id2word=dictionary, num_topics=ntopics,
#               random_state=42, chunksize=100, eta=eta,
#               eval_every=-1, update_every=0, #0 is batch, 1 is online.
#               passes=10, alpha='auto', per_word_topics=False) #I LOWERED THE PASSES BY A LOT TO BE FASTER, hEh..

In [46]:
# https://radimrehurek.com/gensim/models/ldamodel.html
# https://radimrehurek.com/gensim/models/ldamulticore.html

def test_eta(eta, dictionary, ntopics, print_topics=True, print_dist=True, the_corpus = corp ):
    """
    Run a LDA model using the eta function.
    """
    np.random.seed(42) # set the random seed for repeatability

    bow = [dictionary.doc2bow(line) for line in the_corpus] # get the bow-format lines with the set dictionary
    
    with (np.errstate(divide='ignore')):  # ignore divide-by-zero warnings
        model = gensim.models.ldamulticore.LdaMulticore(
            corpus=bow, id2word=dictionary, num_topics=ntopics,
            random_state=42, chunksize=100, eta=eta,
            eval_every=-1, batch=True, #update_every=0, #0 is batch, 1 is online.
            passes=10, alpha='symmetric', per_word_topics=False) #I LOWERED THE PASSES BY A LOT TO BE FASTER, hEh..
        
    # visuzlize the model term topics
    print('Perplexity: {:.2f}'.format(model.log_perplexity(bow)))
    if print_topics:
        # display the top terms for each topic
        for topic in range(ntopics):
            print('Topic {}: {}'.format(topic, [dictionary[w] for w,p in model.get_topic_terms(topic, topn=18)]))
    return model

In [26]:
def create_eta(priors, etadict, ntopics):
    """
    Generates a matrix of coefficients corresponding to apriori-beliefs about word occurence in each class.
    """
    eta = np.full(shape=(ntopics, len(etadict)), fill_value=1) # create a (ntopics, nterms) matrix and fill with 1
    for word, topic in priors.items(): # for each word in the list of priors
        keyindex = [index for index,term in etadict.items() if term==word] # look up the word in the dictionary
        if (len(keyindex)>0): # if it's in the dictionary
            eta[topic,keyindex[0]] = 1e7  # put a large number in there
    eta = np.divide(eta, eta.sum(axis=0)) # normalize so that the probabilities sum to 1 over all topics
    return eta

In [27]:
#create eta (to me this is the topic which each word goes to... idk lol)

apriori_hazards = {}
apriori_products = {}
for ind, key in enumerate(hazards.keys()):
    for item in hazards[key]:
        apriori_hazards[item] = ind
        
for ind, key in enumerate(products.keys()):
    for item in products[key]:
        apriori_products[item] = ind

In [28]:
apriori_hazards

{'chemical contaminants': 0,
 'endocrine': 1,
 'estrogen': 1,
 'endocrine disruptor': 1,
 'preservatives': 2,
 'sweeteners': 2,
 'additives': 2,
 'food preservatives, sweeteners and additives': 2,
 'pesticides': 3,
 'animal drugs': 4,
 'vet drugs': 4,
 'veterinary drugs': 4,
 'gm': 5,
 'genetically modified': 5,
 'gmo': 5,
 'metals': 6,
 'mycotoxin': 7,
 'bpa': 8,
 'bisphenol': 8,
 'bisphenola': 8,
 'bisphenol a': 8,
 'furan': 9,
 'deoxynivalenol': 10,
 'vomitoxin': 10,
 'don': 10,
 'dioxin': 11,
 'pcb': 11,
 'biphenyls': 11,
 'dioxin and pcb': 11,
 'hydrocarbons': 12,
 'saturated hydrocarbons': 12,
 'moah': 12,
 'moh': 12,
 'aromatic hydrocarbons': 12,
 'mosh and moah': 12,
 'nitrates': 13,
 'acrylamide': 14,
 'acrylamid': 14,
 'phthalates': 15,
 'spores': 16,
 'mold': 16,
 'mould': 16,
 'microbes': 16,
 'contaminated': 16,
 'microbiologic contaminants': 16,
 'salmonella': 17,
 'campylobacter': 18,
 'listeria': 19,
 'e-coli': 20,
 'ecoli': 20,
 'cronobacter': 21,
 'histamine': 22,
 'b

In [29]:
eta_haz = create_eta(apriori_hazards, dictionary, 27)
eta_prod = create_eta(apriori_products, dictionary, 5)

In [30]:
bow  = [dictionary.doc2bow(line) for line in corp]

In [31]:
#TODO: GridSearchCV for each ! 
#TODO: I can add the multi-category terms into my seeded LDA model. maybe even give them lower coeffs than the more confident ones.

In [49]:
seconds_auto

197.32287216186523

In [47]:
#model without Informed Priors
start = time.time()
test_eta('auto', dictionary, 27)
end = time.time()
seconds_auto = end - start

Perplexity: -7.58
Topic 0: ['month', 'go', 'think', 'week', 'baby', 'use', 'know', 'like', 'get', 'good', 'try', 'help', 'one', 'time', 'take', 'say', 'start', 'need']
Topic 1: ['get', 'go', 'take', 'hi', 'like', 'know', 'love', 'would', 'day', 'food', 'work', 'give', 'make', 'say', 'thing', 'time', 'eat', 'good']
Topic 2: ['juice', 'shake', 'get', 'plus', 'plan', 'would', 'diet', 'eat', 'help', 'meal', 'hi', 'day', 'use', 'start', 'also', 'people', 'send', 'thanks']
Topic 3: ['food', 'go', 'think', 'know', 'make', 'eat', 'old', 'want', 'give', 'good', 'year', 'help', 'would', 'son', 'get', 'really', 'day', 'month']
Topic 4: ['baby', 'people', 'bottle', 'make', 'get', 'say', 'know', 'would', 'food', 'like', 'milk', 'take', 'old', 'cat', 'feed', 'use', 'give', 'thing']
Topic 5: ['use', 'make', 'bottle', 'think', 'sent', 'netmums', 'really', 'need', 'milk', 'app', 'like', 'water', 'baby', 'mobile', 'day', 'feed', 'iphone', 'get']
Topic 6: ['use', 'get', 'good', 'lol', 'day', 'would', 'li

In [None]:
start = time.time()
test_eta('auto', dictionary, 5)
end = time.time()

In [None]:
#model with Informed Priors

start = time.time()
test_eta(eta_prod, dictionary, 5)
end = time.time()
seconds_eta_p = end - start

In [None]:
seconds_eta_p

In [50]:
#model with Informed Priors

start = time.time()
test_eta(eta_haz, dictionary, 27)
end = time.time()
seconds_eta_h = end - start
#it runs faster after changing passes to a low number (3) and update to 0 (batch instead of online.)
#I can tweak it later on i guess.
#But now I know it doent have to be inhumanly slow! :)

Perplexity: -1.13
Topic 0: ['month', 'go', 'think', 'week', 'baby', 'use', 'like', 'know', 'get', 'good', 'try', 'help', 'one', 'time', 'say', 'want', 'take', 'need']
Topic 1: ['get', 'go', 'take', 'hi', 'like', 'food', 'know', 'would', 'day', 'eat', 'give', 'say', 'love', 'time', 'work', 'way', 'make', 'think']
Topic 2: ['shake', 'juice', 'get', 'plus', 'plan', 'would', 'help', 'diet', 'eat', 'meal', 'day', 'hi', 'start', 'use', 'weight', 'thanks', 'also', 'rep']
Topic 3: ['food', 'go', 'think', 'eat', 'know', 'make', 'want', 'old', 'give', 'get', 'day', 'year', 'good', 'really', 'would', 'help', 'son', 'like']
Topic 4: ['baby', 'people', 'bottle', 'make', 'say', 'get', 'know', 'cat', 'take', 'would', 'like', 'feed', 'old', 'food', 'go', 'need', 'use', 'milk']
Topic 5: ['use', 'make', 'think', 'bottle', 'sent', 'really', 'need', 'netmums', 'milk', 'water', 'app', 'like', 'day', 'get', 'mobile', 'baby', 'try', 'much']
Topic 6: ['use', 'get', 'good', 'lol', 'like', 'day', 'would', 'say'

In [None]:
print(seconds_auto, seconds_eta)

In [None]:
#TODO: simplify the creation of eta according to the documentation so its just based on dictionary
#TODO: check for convergence to see if pass # is correctly specified.
#TODO: set up grid searchCV


#eta ({float, np.array, str}, optional) –
#
#A-priori belief on word probability, this can be:
#
#        scalar for a symmetric prior over topic/word probability,
#
#        vector of length num_words to denote an asymmetric user defined probability for each word,
#
#        matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
#         ^ this is the one the guy chose.
#        the string ‘auto’ to learn the asymmetric prior from the data.



In [None]:
# https://stats.stackexchange.com/questions/74487/topic-models-evaluation-in-gensim


# GridSearchCV

In [None]:
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

In [None]:
#https://stackoverflow.com/questions/60602768/scikit-learn-gridsearchcv-failing-on-on-a-gensim-lda-model
# https://github.com/RaRe-Technologies/gensim/blob/a83e61b768d53ab3bab72abe4aa7db9aab66593c/docs/notebooks/sklearn_wrapper.ipynb

1) build grid search
2) add in cross validation


GRidSearcg:
- dictionary of all parameters.
- generate and evaluate each model one by one
- save model and results into a dict i guess
- save results into dict as well.
- show the best perfoming model from each.
https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/topic_coherence_model_selection.ipynb

In [None]:
np.random.seed(42) # set the random seed for repeatability

bow = [dictionary.doc2bow(line) for line in corp] # get the bow-format lines with the set dictionary

with (np.errstate(divide='ignore')):  # ignore divide-by-zero warnings
    informed_model = gensim.models.ldamodel.LdaModel(
        corpus=bow, id2word=dictionary, num_topics=ntopics,
        random_state=42, chunksize=100, eta=eta,
        eval_every=-1, update_every=0, #0 is batch, 1 is online.
        passes=10, alpha='auto', per_word_topics=False) #I LOWERED THE PASSES BY A LOT TO BE FASTER, hEh..
    normal_model

In [None]:
# Init the Model
#lda = LatentDirichletAllocation()

In [None]:
# Init Grid Search Class
#model = GridSearchCV(lda, param_grid=search_params)

In [None]:


## Do the Grid Search
#model.fit(data_vectorized)
#
#
#GridSearchCV(cv=None, error_score='raise',
#       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
#             evaluate_every=-1, learning_decay=0.7, learning_method=None,
#             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
#             mean_change_tol=0.001, n_components=10, n_jobs=1,
#             n_topics=None, perp_tol=0.1, random_state=None,
#             topic_word_prior=None, total_samples=1000000.0, verbose=0),
#       fit_params=None, iid=True, n_jobs=1,
#       param_grid={'n_topics': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
#       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
#       scoring=None, verbose=0)
#	   
## Best Model
#best_lda_model = model.best_estimator_
#
## Model Parameters
#print("Best Model's Params: ", model.best_params_)
#
## Log Likelihood Score
#print("Best Log Likelihood Score: ", model.best_score_)
#
## Perplexity
#print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
#	   
## Get Log Likelyhoods from Grid Search Output
#n_topics = [10, 15, 20, 25, 30]
#log_likelyhoods_5 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.5]
#log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.7]
#log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.9]
#
## Show graph
#plt.figure(figsize=(12, 8))
#plt.plot(n_topics, log_likelyhoods_5, label='0.5')
#plt.plot(n_topics, log_likelyhoods_7, label='0.7')
#plt.plot(n_topics, log_likelyhoods_9, label='0.9')
#plt.title("Choosing Optimal LDA Model")
#plt.xlabel("Num Topics")
#plt.ylabel("Log Likelyhood Scores")
#plt.legend(title='Learning decay', loc='best')
#plt.show()