# Connect to Google Drive

In [None]:
# Mount Google Drive into the Colab runtime so that the dataset files stored there become reachable.
from google.colab import drive 
drive.mount('/content/drive')
# Make the FoodBase dataset folder the working directory; the relative file paths used in the cells below resolve against it.
%cd /content/drive/MyDrive/BachelorThesis/datasets/foodBase

Mounted at /content/drive
/content/drive/MyDrive/BachelorThesis/datasets/foodBase


# Installations

The GitHub repository [ds4food/FoodNer](https://github.com/ds4food/FoodNer), which accompanies the corresponding paper (2021), provides the FoodBase dataset annotated with 5 different tagging schemes:
- food-classification
- foodon
- hansard-closest
- hansard-parent
- snomedct
(See the paper for information on the individual tagging schemes)

Let's load this data:

In [None]:
# !git clone https://github.com/ds4food/FoodNer.git # download the prepared foodcorpus data

The downloaded data includes a train and a test file for each of the 5 different schemes. Within those files, the individual recipes are all concatenated, so the beginning and end of a recipe can no longer be clearly determined.


In the following sections, the data is preprocessed so that the information from all 10 files is gathered in one single JSON file. Each line of this file represents one recipe together with its tags for each of the 5 schemes, which makes the file compatible with the run_ner.py script from Hugging Face that is used for fine-tuning on NER.

**Example of the desired format**:

{"id": "recipe1", "category": "Drinks", "words": ["Add", "apples"], "ner-food-classification": ["O", "FOOD"], "ner-foodon": ...}

{"id": "recipe2", "category": "Breakfast and Lunch", "words": ["Oats", "stink"], "ner-food-classification": ["FOOD", "O"], "ner-foodon": ...}

...


# Data Processing

In [None]:
# Method for extracting the corresponding named entity tags from the processed foodbase data:
# it takes a tokenized recipe, looks up where it occurs in whole_data and returns the tags at that position.
def extract_tags(whole_data, corresponding_tags, tokenized_recipe):
  """Return the tags that belong to the first occurrence of tokenized_recipe in whole_data.

  Params:
    whole_data: list of tokens that contains the recipe somewhere as a contiguous run
    corresponding_tags: list of tags, parallel to whole_data (tag i belongs to token i)
    tokenized_recipe: list of tokens to look for

  Returns:
    The slice of corresponding_tags aligned with the first occurrence of
    tokenized_recipe, an empty list for an empty recipe, or None (after
    printing a message) if the recipe does not occur in whole_data.

  Example:
    whole_data         = ["hi", "my", "name", "is", "kevin", "and", "i", "like", "it", "!"]
    corresponding_tags = ['0',  '0',  '0',    '0',  'PERSON', '0',  '0', '0',    '0',  '0']
    tokenized_recipe   = ["name", "is", "kevin"]
    result             = ['0', '0', 'PERSON']
  """
  recipe = list(tokenized_recipe)
  recipe_len = len(recipe)
  if recipe_len == 0: # nothing to look up --> no tags
    return []

  first_token = recipe[0]
  last_start = len(whole_data) - recipe_len # later start positions cannot hold the full recipe
  for start in range(last_start + 1):
    # cheap pre-check so that the window is only compared where the first token already fits
    if whole_data[start] != first_token:
      continue
    end = start + recipe_len # endindex is not inclusive
    # every start position is tested on its own, so a failed partial match never hides
    # an occurrence that begins inside of it (e.g. "a", "b" within "a", "a", "b")
    if list(whole_data[start:end]) == recipe:
      return corresponding_tags[start:end]

  print("No corresponding tags found for: ", tokenized_recipe)
  return None

Read in the 10 train and test files: 

In [None]:
# read in the preprocessed data and put it in dict
def _read_token_tag_pairs(file_path):
  """Read one FoodNer file and return (tokens, tags) as two parallel lists.

  The first line is a header and is skipped. From every following line the
  first tab-separated field is taken as the token and the second as its tag.
  """
  tokens = []
  tags = []
  # explicit encoding, so that reading does not depend on the default locale of the machine
  with open(file_path, "r", encoding="utf-8") as handle:
    next(handle) # skip first header-line
    for line in handle: # stream the file line by line instead of loading it completely with readlines()
      token_ner_split = line.rstrip().split('\t')
      tokens.append(token_ner_split[0])
      tags.append(token_ner_split[1])
  return tokens, tags


corpora = {
         "food-classification": {},
         "foodon": {},
         "hansard-closest": {},
         "hansard-parent": {},
         "snomedct": {}
        }

for task in corpora:
  train_tokens, train_tags = _read_token_tag_pairs(f"train-{task}.txt")
  test_tokens, test_tags = _read_token_tag_pairs(f"test-{task}.txt")
  # the test data is appended directly behind the train data, for tokens and tags alike
  corpora[task]["tokens"] = train_tokens + test_tokens
  corpora[task]["ner"] = train_tags + test_tags

# corpora dict now has for each of the 5 tasks two lists of same length: list with tokens and list with corresponding named-entity tag
# here you can see that for a given text, 5 different tag-sets are provided
print("text")
print(corpora["foodon"]["tokens"][:10])
print("\ntags")
for key in corpora:
  print(corpora[key]["ner"][:10])

text
['Mix', 'the', 'cream', 'cheese', ',', 'beef', ',', 'olives', ',', 'onion']

tags
['O', 'O', 'B-FOOD', 'I-FOOD', 'O', 'B-FOOD', 'O', 'B-FOOD', 'O', 'B-FOOD']
['O', 'O', 'B-FOODON_03301889', 'I-FOODON_03301889', 'O', 'O', 'O', 'O', 'O', 'B-FOODON_03316347']
['O', 'O', 'B-AG.01.e.02', 'I-AG.01.e.02', 'O', 'B-AG.01.d.03', 'O', 'B-AG.01.h.01.e', 'O', 'B-AG.01.h.02.e']
['O', 'O', 'B-AG.01.e', 'I-AG.01.e', 'O', 'B-AG.01', 'O', 'B-AG.01.h', 'O', 'B-AG.01.h']
['O', 'O', 'B-226849005', 'I-226849005', 'O', 'B-226916002', 'O', 'B-227436000', 'O', 'B-735047000']


Now let's read in the official FoodBase data. For each recipe in it, the recipe is tokenized in the same way as it was done to create the 10 train and test files above. The position of the tokenized recipe is then looked up in the corpora-dict that was created before, to find the corresponding tags for each of the 5 tagging schemes.


In [None]:
# word_tokenize does not split every token the way the preprocessed FoodNer recipes are split.
# The deviating tokens are listed here together with the split they have to be replaced by.

# tokens containing a slash: they are cut into [part before the first "/", "/", part after it]
_slash_words = (
    "butter/margarine", "tequila/lime", "oil/juice", "Fruit/Nut", "and/or", "mushroom/sausage",
    "1/2", "1/3", "1/4", "1/8", "2/3", "3/4",
    "1/2-inch", "1/4-inch", "1/2-inch-thick", "1/4-inch-thick",
    "3/4-full", "1/2x11-inch", "1/2-cupful", "1/2-ounce",
)
special_tokenize_cases = {word: list(word.partition("/")) for word in _slash_words}

# remaining irregular tokens: a punctuation mark has to be detached from the word
special_tokenize_cases.update({
    "'em": ["'", "em"],
    ".Grease": [".", "Grease"],
    "C.": ["C", "."],
    "F.": ["F", "."],
    "ok.": ["ok", "."],
})

In [None]:
# load the official foodbase data
from xml.dom import minidom
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def _expand_special_tokens(tokens, special_cases):
  """Return a new token list in which every token that is a key of special_cases is replaced by its manual split."""
  expanded = []
  for token in tokens:
    expanded.extend(special_cases.get(token, [token]))
  return expanded


def _merge_token_pair(tokens, first, second, merged):
  """Return a new token list in which every `first` directly followed by `second` is fused into `merged`."""
  result = []
  idx = 0
  while idx < len(tokens):
    # the bounds check makes sure that a `first` at the very end of the recipe is simply kept as it is
    if tokens[idx] == first and idx + 1 < len(tokens) and tokens[idx + 1] == second:
      result.append(merged)
      idx += 2 # both tokens of the pair are consumed
    else:
      result.append(tokens[idx])
      idx += 1
  return result


xmldoc = minidom.parse('original_foodbase/FoodBase_curated.xml')
recipes = xmldoc.getElementsByTagName('document')

recipe_corpus = [] # the adapted/ resulting recipe corpus --> contains a dict for each recipe. the dict contains id, category, words/ tokens of the recipe and the corresponding tags for all 5 tasks

for recipe in recipes: # iterate over each recipe in the dataset
  recipe_id = recipe.getElementsByTagName('id')[0].firstChild.nodeValue # not called `id`, to keep the builtin usable
  infons = recipe.getElementsByTagName('infon')
  category = infons[0].firstChild.nodeValue # first infon element: category of the recipe
  full_text = infons[1].firstChild.nodeValue.strip() # second infon element: text of the recipe

  # The token list is rebuilt in every step instead of being edited while it is iterated over.
  tokenized_text = word_tokenize(full_text)
  tokenized_text = _expand_special_tokens(tokenized_text, special_tokenize_cases) # check for special tokenized words
  # the token "confectioners" followed by "'" should be tokenized into "confectioners'", since word_tokenize splits those up
  tokenized_text = _merge_token_pair(tokenized_text, "confectioners", "'", "confectioners'")
  # the token "!" followed by ")" should be tokenized into "!)", since word_tokenize splits those up
  tokenized_text = _merge_token_pair(tokenized_text, "!", ")", "!)")

  recipe_entry = {
                  "id": recipe_id,
                  "category": category,
                  "words": tokenized_text,
                }
  for task in corpora: # iterate over each of the 5 tagging schemes
    # extract tags of current tagging scheme for current recipe
    # (extract_tags reports recipes it cannot find; their tags are then stored as None)
    tags_for_recipe = extract_tags(corpora[task]['tokens'], corpora[task]['ner'], tokenized_text)
    recipe_entry[f"ner-{task}"] = tags_for_recipe
  recipe_corpus.append(recipe_entry)

print("Number of final recipes: ", len(recipe_corpus))
print("Example entry: ", recipe_corpus[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Number of final recipes:  1000
Example entry:  {'id': '0recipe1006', 'category': 'Appetizers and snacks', 'words': ['Mix', 'the', 'cream', 'cheese', ',', 'beef', ',', 'olives', ',', 'onion', ',', 'and', 'Worcestershire', 'sauce', 'together', 'in', 'a', 'bowl', 'until', 'evenly', 'blended', '.', 'Keeping', 'the', 'mixture', 'in', 'the', 'bowl', ',', 'scrape', 'it', 'into', 'a', 'semi-ball', 'shape', '.', 'Cover', ',', 'and', 'refrigerate', 'until', 'firm', ',', 'at', 'least', '2', 'hours', '.', 'Place', 'a', 'large', 'sheet', 'of', 'waxed', 'paper', 'on', 'a', 'flat', 'surface', '.', 'Sprinkle', 'with', 'walnuts', '.', 'Roll', 'the', 'cheese', 'ball', 'in', 'the', 'walnuts', 'until', 'completely', 'covered', '.', 'Transfer', 'the', 'cheese', 'ball', 'to', 'a', 'serving', 'plate', ',', 'or', 'rewrap', 'with', 'waxed', 'paper', 'and', 'refrigerate', 'until', 'needed', '.'], 'ner-food-

As a final step, the data is shuffled in a way that enables stratified cross-validation. Every recipe belongs to one of 5 categories, and the dataset contains 200 recipes for each of them. For stratified shuffling, the data is split into its 5 category sets. Each set is shuffled randomly, and the sets are then appended to the dataset again in rotation (one recipe from each category in turn). For 10-fold cross-validation, the data can then be loaded and simply be split into 10 consecutive parts (based on position). This ensures that each fold contains the same number of recipes from each category.

In [None]:
# save shuffled data to json file
import json
import random
import os.path

path = 'foodbase_with_tags_for_5_tasks.json'

# the categories in the order in which their recipes are interleaved in the output file
category_order = [
    "Appetizers and snacks",
    "Breakfast and Lunch",
    "Desserts",
    "Dinners",
    "Drinks",
]

if not os.path.exists(path): # an existing file is never overwritten, so a split that was created once stays stable
  # extract recipes for each category (the corpus order is kept inside of each category)
  recipes_by_category = {category: [] for category in category_order}
  for recipe in recipe_corpus:
    if recipe['category'] in recipes_by_category:
      recipes_by_category[recipe['category']].append(recipe)

  # shuffle each category randomly
  for category in category_order:
    random.shuffle(recipes_by_category[category])

  stratified_shuffled_recipes = []
  # rotatory append the shuffled data: one recipe of every category per round.
  # The rounds continue until all categories are used up, so categories of different size
  # neither raise an error nor lose recipes; for equally sized categories the result is a
  # strict rotation through category_order.
  category_lists = [recipes_by_category[category] for category in category_order]
  while any(category_lists):
    for category_list in category_lists:
      if category_list:
        stratified_shuffled_recipes.append(category_list.pop())

  # save final data: one json object (= one recipe) per line
  with open(path, 'w') as file:
    for recipe in stratified_shuffled_recipes:
      json.dump(recipe, file)
      file.write("\n")
else:
  print("file already exists")