# Connect to Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset folders that the later
# cells change into live under /content/drive/MyDrive.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


# Necessary Imports and Methods

In [None]:
import pandas as pd
import re
import json
import os.path

In [None]:
# Method for extracting the website from a given URL
# Compiled once because the function is called for every row of a large dataset.
# Raw string so that `\.` is a regex escape and not an (invalid) string escape.
_WEBSITE_PATTERN = re.compile(r'(https?://)?(www\.)?(.+?)\.com')


def extract_website_from_url(url):
  """Return the site name that precedes the first ".com" in *url*.

  An optional "http://" / "https://" scheme and an optional "www." prefix are
  skipped, so "http://www.allrecipes.com/recipe/1" yields "allrecipes".
  Sub-domains are kept (e.g. "cooking.example.com" yields "cooking.example").

  Only ".com" addresses are recognised. For any other URL, and for values
  that are not strings (e.g. the NaN pandas produces for an empty cell), an
  empty string is returned.
  """
  if not isinstance(url, str):
    return ""
  match = _WEBSITE_PATTERN.search(url)
  return match.group(3) if match else ""

# Extract Recipe Instructions from Datasets
The extracted instruction data is used to compare the domain vocabulary of the datasets (as a measure of domain similarity). In addition, the instructions from the RecipeNLG dataset are used for domain-adaptive pre-training of BERT on the cooking domain.

## Extract Recipe1M+ instructions

In [None]:
# Navigate to folder
%cd /content/drive/MyDrive/BachelorThesis/datasets/recipe1M

In [None]:
# Write the instructions of every Recipe1M+ recipe to a text file,
# one recipe per line. Skipped when the output file already exists.
filepath = "recipe1M_instructions.txt"

if os.path.isfile(filepath):
  print(f"file '{filepath}' already exists")
else:
  # Read the input BEFORE creating the output file: if the JSON is missing or
  # malformed, no empty output file is left behind that would make the
  # existence check above skip the extraction on the next run.
  with open('recipe1M.json') as file:
    data = json.load(file)

  # `with` guarantees the output file is flushed and closed even on errors.
  with open(filepath, "w") as file_recipe1M_instructions:
    for recipe in data:
      # join the text of all instruction steps of a recipe into one string
      instructions = " ".join(instr['text'] for instr in recipe['instructions'])
      file_recipe1M_instructions.write(instructions.strip() + "\n") # one recipe per line

/content/drive/MyDrive/BachelorThesis/datasets/recipe1M


## Extract RecipeNLG and Allrecipes.com instructions



In [None]:
# Navigate to folder
%cd /content/drive/MyDrive/BachelorThesis/datasets/recipeNLG

In [None]:
# Write the instructions of every RecipeNLG recipe to a text file (one recipe
# per line) and, in addition, the instructions of recipes whose link points to
# allrecipes into a second file. Skipped when the RecipeNLG output file
# already exists.
filepath_recipeNLG_instructions = "recipeNLG_instructions.txt"

if os.path.isfile(filepath_recipeNLG_instructions):
  print(f"file '{filepath_recipeNLG_instructions}' already exists")
else:
  # load recipeNLG data in chunks since it is too big to process at once
  # columns of recipeNLG data: title, ingredients, directions, link, source, NER
  # Only the two columns that are actually used are parsed. The reader is
  # created BEFORE the output files so that a missing CSV does not leave empty
  # output files behind (which would make the existence check above skip the
  # extraction on the next run).
  chunks = pd.read_csv("RecipeNLG_dataset.csv",
                       usecols=["link", "directions"],
                       chunksize=10**4)

  # `with` guarantees both output files are flushed and closed even on errors.
  with open(filepath_recipeNLG_instructions, "w") as file_recipeNLG_instructions, \
       open("allrecipe_instructions.txt", "w") as file_allrecipes_instructions:
    for chunk in chunks:
      # iterate the two columns directly; avoids building a Series per row
      for link, directions in zip(chunk['link'], chunk['directions']):
        # 1. extract website from where the recipe comes from
        website = extract_website_from_url(link)
        # 2. extract the instructions and join them to one string
        #    (assumes `directions` is a string of the form '["step", "step"]':
        #    the separators between steps become spaces and the leading '["'
        #    and trailing '"]' are cut off -- TODO confirm against the CSV)
        cleaned_instruction = directions.replace('", "', " ")[2:-2]
        file_recipeNLG_instructions.write(cleaned_instruction + "\n")
        # 3. if recipe comes from allrecipes website --> append to extra file for allrecipe recipes
        if website.lower() == "allrecipes":
          file_allrecipes_instructions.write(cleaned_instruction + "\n")

/content/drive/MyDrive/BachelorThesis/datasets/recipeNLG
