# Topic Modeling

### To perform topic modeling on the 'steps' column of the RAW_recipes.csv

In [3]:
import numpy as np
import pandas as pd
import pickle 

recipes = pd.read_csv('../../data/RAW_recipes.csv', usecols=['steps'])
print (recipes)

                                                    steps
0       ['make a choice and proceed with recipe', 'dep...
1       ['preheat oven to 425 degrees f', 'press dough...
2       ['brown ground beef in large pot', 'add choppe...
3       ['place potatoes in a large pot of lightly sal...
4       ['mix all ingredients& boil for 2 1 / 2 hours ...
...                                                   ...
231632  ['heat oil in a 4-quart dutch oven', 'add cele...
231633        ['mix all ingredients together thoroughly']
231634  ['in a bowl , combine the mashed yolks and may...
231635  ['place melted butter in a large mixing bowl a...
231636  ['whip sugar and shortening in a large bowl , ...

[231637 rows x 1 columns]


### Refer to ../data/topic-model/experiments.txt for experiment designs.

In [4]:
# Preprocessing
import nltk
import string

from nltk.corpus import stopwords

'''
- text = text.replace("'","") is used before the tokenisation to remove "'" because the tokenisation itself cannot separate
  that properly and if not it appears at the front of every sentence.
- Warning: Cell takes very long to process.
'''
# mystopwords = open('../../data/topic-model/stopwords/exp1.txt', 'r')
# mystopwords = mystopwords.read().splitlines()
mystopwords = stopwords.words("english")
WNlemma = nltk.WordNetLemmatizer()

def pre_process(text):
    text = text.replace("'","")
    tokens = nltk.word_tokenize(text)
    tokens = [ t for t in tokens if t not in string.punctuation+"’“”'" ]
    tokens = [ WNlemma.lemmatize(t.lower()) for t in tokens ]
    tokens = [ t for t in tokens if t not in mystopwords ]
    tokens = [ t for t in tokens if len(t) >= 3 ]
    return(tokens)

text = recipes['steps']

# For testing on individual recipes (uncomment when required)
# tokens = pre_process(text[1])
# print (tokens)

tokens = text.apply(pre_process)

In [5]:
# Save tokens
tokens.to_pickle('../../data/topic-model/tokens/exp2.pkl')

In [51]:
# Load tokens
# tokens = pd.read_pickle('../../data/topic-model/tokens/exp1.pkl')

In [40]:
# Use dictionary (built from corpus) to prepare a DTM (using frequency)
import logging
import gensim 
from gensim import corpora

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
dictionary = corpora.Dictionary(tokens)
print(dictionary)

# Filter off any words with document frequency less than #, or appearing in more than #% documents.
dictionary.filter_extremes(no_below=100, no_above=0.7)
"""
        no_below : int, optional
            Keep tokens which are contained in at least `no_below` documents.
        no_above : float, optional
            Keep tokens which are contained in no more than `no_above` documents
            (fraction of total corpus size, not an absolute number).
"""
print(dictionary)

2021-09-19 13:43:46,535 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-09-19 13:43:47,052 : INFO : adding document #10000 to Dictionary(10159 unique tokens: ['350', 'aluminum', 'bake', 'baking', 'burn']...)
2021-09-19 13:43:47,571 : INFO : adding document #20000 to Dictionary(14308 unique tokens: ['350', 'aluminum', 'bake', 'baking', 'burn']...)
2021-09-19 13:43:48,090 : INFO : adding document #30000 to Dictionary(17597 unique tokens: ['350', 'aluminum', 'bake', 'baking', 'burn']...)
2021-09-19 13:43:48,617 : INFO : adding document #40000 to Dictionary(20509 unique tokens: ['350', 'aluminum', 'bake', 'baking', 'burn']...)
2021-09-19 13:43:49,161 : INFO : adding document #50000 to Dictionary(22868 unique tokens: ['350', 'aluminum', 'bake', 'baking', 'burn']...)
2021-09-19 13:43:49,704 : INFO : adding document #60000 to Dictionary(25287 unique tokens: ['350', 'aluminum', 'bake', 'baking', 'burn']...)
2021-09-19 13:43:50,189 : INFO : adding document #70000 to Dictiona

Dictionary(53776 unique tokens: ['350', 'aluminum', 'bake', 'baking', 'burn']...)
Dictionary(3072 unique tokens: ['350', 'aluminum', 'bake', 'baking', 'burn']...)


In [41]:
# Save dictionary
dictionary.save('../../data/topic-model/dicts/exp2.dict')

2021-09-19 13:44:45,894 : INFO : Dictionary lifecycle event {'fname_or_handle': '../Data/topic-model/dicts/exp2.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-09-19T13:44:45.894430', 'gensim': '4.0.1', 'python': '3.9.6 | packaged by conda-forge | (default, Jul  6 2021, 08:46:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18363-SP0', 'event': 'saving'}
2021-09-19 13:44:45,900 : INFO : saved ../Data/topic-model/dicts/exp2.dict


In [52]:
# Load dictionary
# dictionary = corpora.Dictionary.load('../../data/topic-model/dicts/exp1.dict')

2021-09-19 14:22:47,290 : INFO : loading Dictionary object from ../Data/topic-model/dicts/exp1.dict
2021-09-19 14:22:47,701 : INFO : Dictionary lifecycle event {'fname': '../Data/topic-model/dicts/exp1.dict', 'datetime': '2021-09-19T14:22:47.701248', 'gensim': '4.0.1', 'python': '3.9.6 | packaged by conda-forge | (default, Jul  6 2021, 08:46:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18363-SP0', 'event': 'loaded'}


In [53]:
# dtm here is a list of lists, which is exactly a matrix
'''
 Warning: Cell takes very VERY long to process.
'''
dtm = [dictionary.doc2bow(d) for d in tokens]
lda = gensim.models.ldamodel.LdaModel(dtm, num_topics=15, id2word=dictionary, passes=10, chunksize=128, random_state=10)

In [43]:
# Save LDA model
lda.save('../../data/topic-model/models/exp2.model')

2021-09-19 14:04:27,589 : INFO : LdaState lifecycle event {'fname_or_handle': '../Data/topic-model/models/exp2.model.state', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-09-19T14:04:27.589154', 'gensim': '4.0.1', 'python': '3.9.6 | packaged by conda-forge | (default, Jul  6 2021, 08:46:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18363-SP0', 'event': 'saving'}
2021-09-19 14:04:28,912 : INFO : saved ../Data/topic-model/models/exp2.model.state
2021-09-19 14:04:28,924 : INFO : LdaModel lifecycle event {'fname_or_handle': '../Data/topic-model/models/exp2.model', 'separately': "['expElogbeta', 'sstats']", 'sep_limit': 10485760, 'ignore': ['state', 'dispatcher', 'id2word'], 'datetime': '2021-09-19T14:04:28.924394', 'gensim': '4.0.1', 'python': '3.9.6 | packaged by conda-forge | (default, Jul  6 2021, 08:46:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18363-SP0', 'event': 'saving'}
2021-09-19 14:04:28,925 : INFO : st

In [54]:
# Load LDA model
# lda = gensim.models.ldamodel.LdaModel.load('../../data/topic-model/models/exp1.model')

2021-09-19 14:23:11,454 : INFO : loading LdaModel object from ../Data/topic-model/models/exp1.model
2021-09-19 14:23:11,456 : INFO : loading expElogbeta from ../Data/topic-model/models/exp1.model.expElogbeta.npy with mmap=None
2021-09-19 14:23:11,459 : INFO : setting ignored attribute state to None
2021-09-19 14:23:11,459 : INFO : setting ignored attribute dispatcher to None
2021-09-19 14:23:11,460 : INFO : setting ignored attribute id2word to None
2021-09-19 14:23:11,460 : INFO : LdaModel lifecycle event {'fname': '../Data/topic-model/models/exp1.model', 'datetime': '2021-09-19T14:23:11.460833', 'gensim': '4.0.1', 'python': '3.9.6 | packaged by conda-forge | (default, Jul  6 2021, 08:46:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18363-SP0', 'event': 'loaded'}
2021-09-19 14:23:11,462 : INFO : loading LdaState object from ../Data/topic-model/models/exp1.model.state
2021-09-19 14:23:11,464 : INFO : LdaState lifecycle event {'fname': '../Data/topic-model/models/exp1.mo

In [55]:
# Save dtm
with open('../../data/topic-model/dtms/exp1.pkl', 'wb') as f:
    pickle.dump(dtm, f)

In [None]:
# Load dtm
# with open('../../data/topic-model/dtms/exp2.pkl', 'rb') as f:
#     dtm = pickle.load(f)

In [44]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda, dtm, dictionary)
pyLDAvis.save_html(LDAvis_prepared, '../../data/topic-model/pyLDAvis/exp2.html')

  default_term_info = default_term_info.sort_values(


In [56]:
## Evaluate the coherence score of LDA models using the metrics for LDA
'''
u_mass:prefer the model close to 0. 
c_v: [0,1], prefer bigger value.   
Do not fully rely on the coherence score.
'''
from gensim.models.coherencemodel import CoherenceModel
cm_umass = CoherenceModel(lda, dictionary=dictionary, corpus=dtm, coherence='u_mass')
cm_cv = CoherenceModel(lda, dictionary=dictionary, texts=tokens, coherence='c_v')
lda_umass = cm_umass.get_coherence()
lda_cv = cm_cv.get_coherence()

2021-09-19 14:23:29,244 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2021-09-19 14:23:29,268 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2021-09-19 14:23:29,292 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2021-09-19 14:23:29,317 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2021-09-19 14:23:29,341 : INFO : CorpusAccumulator accumulated stats from 5000 documents
2021-09-19 14:23:29,365 : INFO : CorpusAccumulator accumulated stats from 6000 documents
2021-09-19 14:23:29,388 : INFO : CorpusAccumulator accumulated stats from 7000 documents
2021-09-19 14:23:29,413 : INFO : CorpusAccumulator accumulated stats from 8000 documents
2021-09-19 14:23:29,438 : INFO : CorpusAccumulator accumulated stats from 9000 documents
2021-09-19 14:23:29,460 : INFO : CorpusAccumulator accumulated stats from 10000 documents
2021-09-19 14:23:29,485 : INFO : CorpusAccumulator accumulated stats from 11000 documents
2021-09-19 14:23:29

2021-09-19 14:23:31,535 : INFO : CorpusAccumulator accumulated stats from 93000 documents
2021-09-19 14:23:31,559 : INFO : CorpusAccumulator accumulated stats from 94000 documents
2021-09-19 14:23:31,585 : INFO : CorpusAccumulator accumulated stats from 95000 documents
2021-09-19 14:23:31,607 : INFO : CorpusAccumulator accumulated stats from 96000 documents
2021-09-19 14:23:31,631 : INFO : CorpusAccumulator accumulated stats from 97000 documents
2021-09-19 14:23:31,652 : INFO : CorpusAccumulator accumulated stats from 98000 documents
2021-09-19 14:23:31,675 : INFO : CorpusAccumulator accumulated stats from 99000 documents
2021-09-19 14:23:31,700 : INFO : CorpusAccumulator accumulated stats from 100000 documents
2021-09-19 14:23:31,726 : INFO : CorpusAccumulator accumulated stats from 101000 documents
2021-09-19 14:23:31,749 : INFO : CorpusAccumulator accumulated stats from 102000 documents
2021-09-19 14:23:31,772 : INFO : CorpusAccumulator accumulated stats from 103000 documents
2021-0

2021-09-19 14:23:33,812 : INFO : CorpusAccumulator accumulated stats from 184000 documents
2021-09-19 14:23:33,837 : INFO : CorpusAccumulator accumulated stats from 185000 documents
2021-09-19 14:23:33,864 : INFO : CorpusAccumulator accumulated stats from 186000 documents
2021-09-19 14:23:33,890 : INFO : CorpusAccumulator accumulated stats from 187000 documents
2021-09-19 14:23:33,912 : INFO : CorpusAccumulator accumulated stats from 188000 documents
2021-09-19 14:23:33,935 : INFO : CorpusAccumulator accumulated stats from 189000 documents
2021-09-19 14:23:33,959 : INFO : CorpusAccumulator accumulated stats from 190000 documents
2021-09-19 14:23:33,983 : INFO : CorpusAccumulator accumulated stats from 191000 documents
2021-09-19 14:23:34,009 : INFO : CorpusAccumulator accumulated stats from 192000 documents
2021-09-19 14:23:34,034 : INFO : CorpusAccumulator accumulated stats from 193000 documents
2021-09-19 14:23:34,059 : INFO : CorpusAccumulator accumulated stats from 194000 documents

In [57]:
print(lda_umass)
print(lda_cv)

-2.2833287487837315
0.43888768907805975
