# Document Similarity
### Build a Word Mover's Distance (WMD) similarity model on the `steps` column of `RAW_recipes.csv`

In [1]:
import gensim
import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from gensim.similarities import WmdSimilarity
from nltk.corpus import stopwords



In [2]:
# Load only the 'steps' column of the raw recipes table.
recipes = pd.read_csv(
    '../../data/RAW_recipes.csv',
    usecols=['steps'],
)
print(recipes)

                                                    steps
0       ['make a choice and proceed with recipe', 'dep...
1       ['preheat oven to 425 degrees f', 'press dough...
2       ['brown ground beef in large pot', 'add choppe...
3       ['place potatoes in a large pot of lightly sal...
4       ['mix all ingredients& boil for 2 1 / 2 hours ...
...                                                   ...
231632  ['heat oil in a 4-quart dutch oven', 'add cele...
231633        ['mix all ingredients together thoroughly']
231634  ['in a bowl , combine the mashed yolks and may...
231635  ['place melted butter in a large mixing bowl a...
231636  ['whip sugar and shortening in a large bowl , ...

[231637 rows x 1 columns]


### Preprocessing

In [3]:
# Drop rows whose 'steps' value is missing (NaN).
# Reassign instead of using inplace=True so the statement reads as a plain
# "old frame -> new frame" step.
recipes = recipes.dropna(subset=['steps'])

# Keep rows whose 'steps' text contains at least 10 whitespace-separated
# tokens. Note this is a word count (str.split().str.len()), not a
# character count.
recipes = recipes[recipes['steps'].str.split().str.len().ge(10)]
print(recipes)

                                                    steps
0       ['make a choice and proceed with recipe', 'dep...
1       ['preheat oven to 425 degrees f', 'press dough...
2       ['brown ground beef in large pot', 'add choppe...
3       ['place potatoes in a large pot of lightly sal...
4       ['mix all ingredients& boil for 2 1 / 2 hours ...
...                                                   ...
231631  ['bring 3 quarts salted to water to a boil', '...
231632  ['heat oil in a 4-quart dutch oven', 'add cele...
231634  ['in a bowl , combine the mashed yolks and may...
231635  ['place melted butter in a large mixing bowl a...
231636  ['whip sugar and shortening in a large bowl , ...

[229737 rows x 1 columns]


In [11]:
mystopwords = stopwords.words("english")
# Set copy of the stop-word list so membership tests inside preprocess()
# are O(1) instead of a linear scan per token. Built once here; it is not
# refreshed if `mystopwords` is modified afterwards.
mystopwords_set = frozenset(mystopwords)
WNlemma = nltk.WordNetLemmatizer()


def preprocess(text):
    """Convert one raw 'steps' string into a list of cleaned tokens.

    Pipeline:
      1. Remove every apostrophe from the text. Per the original author's
         note, this is done before tokenisation because the tokeniser does
         not separate the "'" characters properly and they would otherwise
         show up at the front of every sentence.
      2. Tokenise with nltk.word_tokenize.
      3. Keep alphabetic tokens only (drops numbers and punctuation) and
         lowercase them.
      4. Drop stop words.
      5. Lemmatise with the WordNet lemmatiser.
      6. Drop stop words again and drop tokens shorter than 3 characters.

    Parameters
    ----------
    text : str
        Raw text of one recipe's steps.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    text = text.replace("'", "")
    tokens = nltk.word_tokenize(text)
    tokens = [t.lower() for t in tokens if t.isalpha()]
    # Filter stop words BEFORE lemmatising as well: the lemmatiser can turn a
    # stop word into a different string that is no longer in the stop-word
    # list (e.g. "does" -> "doe"), which would otherwise slip through.
    tokens = [t for t in tokens if t not in mystopwords_set]
    tokens = [WNlemma.lemmatize(t) for t in tokens]
    # Second pass catches lemmas that are themselves stop words.
    tokens = [t for t in tokens if t not in mystopwords_set and len(t) >= 3]

    return tokens

### Perform Document Similarity

In [13]:
# Use all recipes to create the word2vec corpus and the WMD corpus.
# Each recipe is preprocessed once and the resulting tokens are stored in
# both corpora; the WMD corpus gets its own copy of each token list so the
# two corpora remain independent objects.
# The 'steps' column is read directly rather than rebinding `recipes`, so
# re-running this cell does not fail on an already-converted Series.
w2v_corpus = []
wmd_corpus = []
for text in recipes['steps']:
    tokens = preprocess(text)
    w2v_corpus.append(tokens)
    wmd_corpus.append(list(tokens))

In [None]:
# Fit 100-dimensional Word2Vec embeddings on the tokenised recipe steps.
# min_count=1 keeps every word in the vocabulary, including words that
# occur only once.
# NOTE(review): `size` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4.0) - confirm the pinned gensim version before upgrading.
w2v_model = Word2Vec(
    sentences=w2v_corpus,
    size=100,
    min_count=1,
    workers=4,
)



In [None]:
# Build the WmdSimilarity index over the tokenised recipes, using the
# Word2Vec model for word embeddings.
# `num_best` is the number of most-similar documents returned per query.
num_best = 5
wmd_model = WmdSimilarity(wmd_corpus, w2v_model, num_best=num_best)

In [None]:
# Persist the WMD similarity index to disk.
wmd_model_path = '../../data/getSimilar/wmd.model'
wmd_model.save(wmd_model_path)