# Connect to Google Drive

In [None]:
from google.colab import drive 
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Switch the working directory to the analysis folder; the relative
# "../datasets/..." paths used by the later cells are resolved from here.
%cd /content/drive/MyDrive/BachelorThesis/domain_analysis

Mounted at /content/drive
/content/drive/MyDrive/BachelorThesis/domain_analysis


In [None]:
import sys 
# Scratch check of how much memory a small list of strings takes.
# Note: sys.getsizeof only reports the size of the list object itself, not of
# the strings it refers to, so the printed number ignores the text content.
# NOTE(review): nothing else in the visible notebook uses `l`; this cell looks
# like a leftover experiment - confirm whether it can be removed.
l = ["Hallo du kek", "kalsdjfkoasdjf asodf iasdo f", " lasdfsa "]
print(sys.getsizeof(l))

96


# Create vocabulary for datasets
This notebook creates the vocabularies of the following four datasets:

*   RecipeNLG (instructions)
*   Recipe1M+ (instructions)
*   Allrecipes.com
*   Wikipedia and book corpus

The generated vocabulary JSON files will later be used to measure the vocabulary overlap between the datasets.

Each created vocabulary is a dictionary with
>key = token <br>
>value = frequency of the token

Example:
```
{
  "token1": 2021,
  "token2": 1019,
  "token3": 988,
  ...
}
```

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import json
import os.path

# Number of (most frequent) tokens kept in the vocabulary of each dataset.
vocab_size = 10000

# Tokens excluded from every vocabulary: NLTK's English stopword list followed
# by each character of string.punctuation as its own entry.
stops = stopwords.words('english')
stops.extend(string.punctuation)
print(len(stops), "excluded stopwords: ", stops)

def create_vocab_for_most_n_frequent_words(data, n, stopwords = None, tokenizer = None):
  """Build a token -> frequency vocabulary of the n most frequent tokens.

  Every chunk has its punctuation characters (string.punctuation) replaced by
  spaces, is lowercased and is then tokenized. Token frequencies are summed
  over all chunks, the given stopwords are dropped, and only the n most
  frequent tokens are kept.

  Args:
    data: iterable of text chunks (strings). A list gives a
      "Progress: i / total" display; an iterable without len() (e.g. a
      generator) is accepted too and only the running count is shown.
    n: maximum number of tokens in the returned vocabulary (int). Values
      below 1 yield an empty vocabulary.
    stopwords: optional iterable of tokens to exclude. They are compared
      against the lowercased tokens as given. Note that inside this function
      the name shadows the nltk `stopwords` module imported by the notebook.
    tokenizer: optional callable mapping a string to an iterable of tokens.
      Defaults to nltk's `word_tokenize`.

  Returns:
    dict mapping token -> frequency, ordered from most to least frequent.
    Tokens with equal frequency keep the order in which they first appeared.
  """
  from collections import Counter

  if tokenizer is None:
    tokenizer = word_tokenize

  # One translation table replaces every punctuation character by a space in a
  # single pass per chunk (instead of one str.replace call per character).
  punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

  # The total is only needed for the progress display, so look it up once and
  # tolerate iterables that do not support len().
  total = len(data) if hasattr(data, '__len__') else None

  counts = Counter()
  for progress, chunk in enumerate(data, start=1):
    # Progress is 1-based so the final line reads "total / total";
    # "\r" makes each update overwrite the previous one.
    if total is None:
      print(f"Progress: {progress}", end="\r")
    else:
      print(f"Progress: {progress} / {total}", end="\r")
    # strip punctuation, lowercase, tokenize and count
    counts.update(tokenizer(chunk.translate(punct_to_space).lower()))

  # remove stopwords from vocab
  for word in stopwords or ():
    counts.pop(word, None)

  # sort by frequency (stable, so ties keep first-seen order) and truncate
  ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
  return dict(ranked[:max(n, 0)])

def save_vocab_to_file(vocab, filename):
  """Write `vocab` as JSON to `filename` unless that file already exists.

  An existing file is never overwritten; in that case only a notice is
  printed and the file is left untouched.

  Args:
    vocab: JSON-serializable object (here: the token -> frequency dict).
    filename: path of the JSON file to create.
  """
  if not os.path.isfile(filename):
    with open(filename, 'w') as out:
      out.write(json.dumps(vocab))
    return
  print(f"Vocab file '{filename}' already exists!")

def print_vocab(vocab):
  """Print every entry of a vocabulary as "token: frequency", one per line.

  Args:
    vocab: dict (or dict subclass such as OrderedDict / Counter) mapping
      token -> frequency. Anything else is rejected with a printed message.
  """
  # isinstance (rather than an exact type comparison) also accepts dict subclasses
  if not isinstance(vocab, dict):
    print("'vocab' is not a dictionary")
    return
  for key, value in vocab.items():
    print(f"{key}: {value}")

## Create vocab for Recipe1M+ (instructions)

In [None]:
# Load the Recipe1M+ instructions: one list entry per line of the file,
# with trailing whitespace (including the newline) stripped.
with open("../datasets/recipe1M/recipe1M_instructions.txt") as file:
    data = [line.rstrip() for line in file]

# Counting tokens over the whole corpus can take quite a while.
recipe1m_vocab = create_vocab_for_most_n_frequent_words(data, vocab_size, stopwords = stops)
# print_vocab(recipe1m_vocab)
# An already existing JSON file is left untouched by save_vocab_to_file.
save_vocab_to_file(recipe1m_vocab, "recipe1M_vocabulary.json")



## Create vocab for RecipeNLG (instructions)


In [None]:
# Load the RecipeNLG instructions: one list entry per line of the file,
# with trailing whitespace (including the newline) stripped.
with open("../datasets/recipeNLG/recipeNLG_instructions.txt") as file:
    data = [line.rstrip() for line in file]

recipeNLG_vocab = create_vocab_for_most_n_frequent_words(data, vocab_size, stopwords = stops)
# print_vocab(recipeNLG_vocab)
# An already existing JSON file is left untouched by save_vocab_to_file.
save_vocab_to_file(recipeNLG_vocab, "recipeNLG_vocabulary.json")



## Create vocab for allrecipes.com

In [None]:
# Load the allrecipes.com instructions: one list entry per line of the file,
# with trailing whitespace (including the newline) stripped.
# NOTE(review): this file is read from the recipeNLG dataset folder - confirm
# that the path is intended and not a copy-paste leftover from the cell above.
with open("../datasets/recipeNLG/allrecipe_instructions.txt") as file:
    data = [line.rstrip() for line in file]

allrecipes_vocab = create_vocab_for_most_n_frequent_words(data, vocab_size, stopwords = stops)
#print_vocab(allrecipes_vocab)
# An already existing JSON file is left untouched by save_vocab_to_file.
save_vocab_to_file(allrecipes_vocab, "allrecipes_vocabulary.json")



## Create vocab for Wikipedia/BookCorpus 

In [None]:
import os

# The Wikipedia/BookCorpus data is made up of several individual files that
# all live in this directory.
corpus_dir = "../datasets/wikipedia_book_corpus/data"

# os.listdir returns the entries in arbitrary order. Sorting makes the read
# order reproducible, which matters because tokens with equal frequency are
# ranked by first occurrence when the vocabulary is truncated.
filenames = sorted(os.listdir(corpus_dir))

data = []
for filename in filenames:
  filepath = os.path.join(corpus_dir, filename)
  # Skip anything that is not a regular file (e.g. sub-directories), since
  # open() would fail on it.
  if not os.path.isfile(filepath):
    continue
  with open(filepath) as file:
    for line in file:
      data.append(line.rstrip())

wiki_book_vocab = create_vocab_for_most_n_frequent_words(data, vocab_size, stopwords = stops)
#print_vocab(wiki_book_vocab)
# An already existing JSON file is left untouched by save_vocab_to_file.
save_vocab_to_file(wiki_book_vocab, "wiki_book_vocabulary.json")