# Topic Assignment

### After performing topic modeling, assign the derived topics to the recipes.

In [1]:
import gensim 
import numpy as np
import pandas as pd
import pickle 



### Get the topic distribution for the recipes

In [2]:
# Locations of the artefacts saved for experiment 4
model_path = '../../data/topic-model/lda/exp4.model'
dtm_path = '../../data/topic-model/dtm/exp4.pkl'

# Restore the trained LDA model from disk
lda = gensim.models.ldamodel.LdaModel.load(model_path)

# Restore the pickled document-term matrix (dtm).
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by this project.
with open(dtm_path, 'rb') as dtm_file:
    dtm = pickle.load(dtm_file)

In [3]:
# Ask the model for the topic distribution of every document in the dtm.
# Each entry is later read as a sequence of (topic id, probability) pairs.
doc_topics = lda.get_document_topics(bow=dtm)

### Assign the topic distribution to each recipe

In [4]:
# Preprocess RAW_recipes.csv (same as in topic_model.ipynb).
# The resulting positional index is used further down to look up each
# recipe's entry in doc_topics, so the filtering must stay identical to the
# one used when the document-term matrix was built.
recipes = pd.read_csv('../../data/RAW_recipes.csv')

# Drop rows whose 'steps' are blank.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that chained in-place form operates on a temporary
# object and leaves the DataFrame untouched when pandas Copy-on-Write is active.
recipes['steps'] = recipes['steps'].replace('', np.nan)
recipes = recipes.dropna(subset=['steps'])

# Keep rows whose 'steps' contain at least 10 whitespace-separated tokens
step_token_counts = recipes['steps'].str.split().str.len()
recipes = recipes[step_token_counts.ge(10)]

# Reset the index since rows have been dropped
recipes = recipes.reset_index(drop=True)

# We only need the 'id' column; take an explicit copy so that columns added
# later are written to an independent DataFrame rather than to a selection.
recipes = recipes[['id']].copy()

In [5]:
# Add one float column per topic and initialise each recipe's distribution
# with 0.0. The number of columns follows the loaded model (lda.num_topics)
# rather than a hard-coded 15, so the frame always matches the model that
# produced doc_topics. Column names are 1-based: 'Topic 1', 'Topic 2', ...
for topic_no in range(1, lda.num_topics + 1):
    # Assigning 0.0 directly creates a float column; no int -> float cast needed
    recipes['Topic {}'.format(topic_no)] = 0.0
print(recipes)

            id  Topic 1  Topic 2  Topic 3  Topic 4  Topic 5  Topic 6  Topic 7  \
0       137739      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1        31490      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
2       112140      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
3        59389      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
4        44061      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
...        ...      ...      ...      ...      ...      ...      ...      ...   
229732  188810      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
229733  486161      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
229734  308080      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
229735  298512      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
229736  298509      0.0      0.0      0.0      0.0      0.0      0.0      0.0   

        Topic 8  Topic 9  T

In [6]:
# Assignment done here.
# Each entry of doc_topics is a sequence of (topic id, probability) pairs for
# one document; topics that are absent from a document's sequence keep 0.0.
# Model topic ids are 0-based while the column names are 1-based.
n_recipes = len(recipes)
n_topics = lda.num_topics
topic_matrix = np.zeros((n_recipes, n_topics), dtype=float)

# Stream the documents in order and fill a NumPy matrix, instead of indexing
# doc_topics[index] inside DataFrame.iterrows() and writing cell by cell with
# .at. Row i of `recipes` corresponds to document i because the index was
# reset to 0..n-1 after preprocessing.
rows_filled = 0
for row_no, topics in zip(range(n_recipes), doc_topics):
    for topic_id, probability in topics:
        # Probability values are rounded to one decimal place to create larger
        # groups of similarly ranked recipes. A real float is stored (not the
        # formatted string) so the topic columns keep their float dtype.
        topic_matrix[row_no, topic_id] = round(float(probability), 1)
    rows_filled += 1

# Fewer documents than recipes means the preprocessing here no longer matches
# the one used to build the document-term matrix; fail loudly in that case.
if rows_filled != n_recipes:
    raise ValueError(
        'Only {} topic distributions available for {} recipes'.format(
            rows_filled, n_recipes))

for topic_id in range(n_topics):
    recipes['Topic {}'.format(topic_id + 1)] = topic_matrix[:, topic_id]
print(recipes)

            id  Topic 1  Topic 2  Topic 3  Topic 4  Topic 5  Topic 6  Topic 7  \
0       137739      0.0      0.0      0.0      0.0      0.2      0.1      0.0   
1        31490      0.0      0.1      0.0      0.2      0.0      0.0      0.0   
2       112140      0.0      0.0      0.0      0.0      0.0      0.0      0.2   
3        59389      0.0      0.0      0.0      0.0      0.2      0.0      0.0   
4        44061      0.0      0.0      0.0      0.0      0.0      0.2      0.0   
...        ...      ...      ...      ...      ...      ...      ...      ...   
229732  188810      0.0      0.0      0.0      0.0      0.1      0.0      0.2   
229733  486161      0.0      0.0      0.0      0.1      0.0      0.0      0.0   
229734  308080      0.0      0.0      0.2      0.0      0.0      0.0      0.0   
229735  298512      0.0      0.3      0.0      0.4      0.0      0.1      0.0   
229736  298509      0.0      0.4      0.0      0.6      0.0      0.0      0.0   

        Topic 8  Topic 9  T

In [7]:
# Persist the recipe-to-topic assignments for experiment 4
assignments_path = '../../data/topic-model/assignments/exp4.csv'
recipes.to_csv(assignments_path)