In [1]:
import json

In [2]:
# Path of the JSON file the recipes are loaded from.
# NOTE(review): '.json' has no base name (it would refer to a hidden file called
# ".json" in the working directory) -- looks like a truncated path; confirm.
RECIPES_FILE = '.json'
# Path the filtered recipes are written to as JSON at the end of the notebook.
RESULT_FILE = 'out.json'

In [3]:
# Load the whole recipe collection into memory. Later cells call ``data.items()``,
# so the top-level JSON value is expected to be an object (dict).
# JSON text is UTF-8 by specification; decode it explicitly instead of relying on
# the platform's locale encoding, which breaks on non-ASCII text on some systems.
with open(RECIPES_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
from transformers import AutoTokenizer

# Module-level tokenizer for the "openai/clip-vit-base-patch32" checkpoint; it is
# read as a global by fits_tokenizer below.
# NOTE(review): presumably resolved from the Hugging Face Hub or a local cache on
# first use -- confirm the checkpoint is available in the execution environment.
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

def fits_tokenizer(input_text: str, max_tokens: int = 77) -> bool:
    """Tell whether ``input_text`` tokenizes to at most ``max_tokens`` tokens.

    The length is measured on the ids produced by the module-level ``tokenizer``,
    i.e. it includes whatever special tokens that tokenizer adds.

    Args:
        input_text: A single text string (not a batch).
        max_tokens: Largest sequence length that still fits. The default of 77 is
            the maximum sequence length the tokenizer reports for this model.

    Returns:
        True when the tokenized sequence has ``max_tokens`` ids or fewer.
    """
    # For a single string the tokenizer returns a flat list of ids, so neither
    # padding nor tensor conversion is needed just to count them.
    token_ids = tokenizer(input_text)["input_ids"]
    # A sequence of exactly ``max_tokens`` ids still fits, hence ``<=`` (a strict
    # ``<`` would wrongly reject it).
    return len(token_ids) <= max_tokens

In [5]:
print(f"Data before filtering for tokenizer: {len(data)}")

# A recipe is kept only when every one of its instruction steps passes
# fits_tokenizer (a recipe with no instructions is therefore kept as well).
recipes_filtered_tokenizer = {
    rid: entry
    for rid, entry in data.items()
    if all(
        fits_tokenizer(instruction["stepText"])
        for instruction in entry["recipe"]["instructions"]
    )
}

print(f"Data after filtering for tokenizer: {len(recipes_filtered_tokenizer)}")

Data before filtering for tokenizer: 2884


Token indices sequence length is longer than the specified maximum sequence length for this model (94 > 77). Running this sequence through the model will result in indexing errors


Data after filtering for tokenizer: 2883


In [6]:
def average_step_length(recipes, words=False):
    """Return the mean length of all instruction steps found in ``recipes``.

    Args:
        recipes: Mapping whose values are read as
            ``value["recipe"]["instructions"]``, a sequence of dicts that each
            carry a ``"stepText"`` string.
        words: When True, measure every step in whitespace-separated words;
            otherwise measure it in characters.

    Returns:
        The mean over all steps of all recipes as a float, or ``0.0`` when there
        are no steps at all (instead of raising ``ZeroDivisionError``).
    """
    def num_words(string):
        # split() without an argument collapses runs of whitespace and ignores
        # leading/trailing whitespace, so no empty "words" are counted.
        return len(string.split())

    steps = [
        step["stepText"]
        for _recipe in recipes.values()
        for step in _recipe["recipe"]["instructions"]
    ]
    if not steps:
        return 0.0

    measure = num_words if words else len
    return sum(map(measure, steps)) / len(steps)


def average_number_steps(recipes):
    """Return the mean number of instruction steps per recipe.

    Args:
        recipes: Mapping whose values are read as
            ``value["recipe"]["instructions"]`` (a sized sequence of steps).

    Returns:
        The mean step count as a float, or ``0.0`` when ``recipes`` is empty
        (instead of raising ``ZeroDivisionError``).
    """
    step_counts = [len(_recipe["recipe"]["instructions"]) for _recipe in recipes.values()]
    if not step_counts:
        return 0.0

    return sum(step_counts) / len(step_counts)

In [7]:
def filter_recipe_num_steps(recipes, target_steps=5, window=1):
    """Select the recipes whose step count lies within ``window`` of ``target_steps``.

    Args:
        recipes: Mapping whose values are read as
            ``value["recipe"]["instructions"]`` (a sized sequence of steps).
        target_steps: Centre of the accepted step-count range.
        window: Allowed deviation from ``target_steps``; both ends are inclusive.

    Returns:
        A new dict with the same keys and value objects as ``recipes``, limited
        to the entries whose number of instructions is in
        ``[target_steps - window, target_steps + window]``.
    """
    fewest, most = target_steps - window, target_steps + window

    selected = {}
    for recipe_id, _recipe in recipes.items():
        n_steps = len(_recipe["recipe"]["instructions"])
        if fewest <= n_steps <= most:
            selected[recipe_id] = _recipe
    return selected

In [8]:
# Step-count filter applied to the unfiltered data: with the default window of 1
# this keeps recipes that have 4 to 6 instruction steps.
recipes_filtered_num_steps = filter_recipe_num_steps(data, target_steps=5)

In [9]:
# Same step-count filter, applied on top of the tokenizer-length filter; this is
# the set that is written to RESULT_FILE at the end of the notebook.
filtered_recipes = filter_recipe_num_steps(recipes_filtered_tokenizer, target_steps=5)

In [10]:
def print_info(recipes, recipe_type):
    """Print a short statistics report for one recipe collection.

    The report shows the number of recipes, the average step length in
    characters and in words, and the average number of steps per recipe,
    followed by a line of ten dashes as a separator.

    Args:
        recipes: Recipe mapping accepted by ``average_step_length`` and
            ``average_number_steps``.
        recipe_type: Label printed in front of the recipe count.
    """
    count = len(recipes)
    mean_chars = average_step_length(recipes)
    mean_words = average_step_length(recipes, words=True)
    mean_steps = average_number_steps(recipes)

    report_lines = [
        f"{recipe_type}: {count}.",
        f"Average Step Length: {mean_chars}.",
        f"Average Step #Words: {mean_words}.",
        f"Average #Steps: {mean_steps}.",
    ]
    print("\n".join(report_lines))
    print("-" * 10)

# Print the same statistics for every stage of the filtering pipeline so the
# effect of each filter can be compared side by side.
for r, t in (
    (data, "Original Recipes"),
    (recipes_filtered_tokenizer, "Recipes Filtered for Step Length"),
    (recipes_filtered_num_steps, "Recipes Filtered for #Steps"),
    (filtered_recipes, "Recipes Filtered for Step Length and #Steps"),
):
    print_info(r, t)

Original Recipes: 2884.
Average Step Length: 90.81410020072919.
Average Step #Words: 16.29822620949572.
Average #Steps: 8.464285714285714.
----------
Recipes Filtered for Step Length: 2883.
Average Step Length: 90.81120399967216.
Average Step #Words: 16.297926399475454.
Average #Steps: 8.464099895941727.
----------
Recipes Filtered for #Steps: 822.
Average Step Length: 91.64038231780167.
Average Step #Words: 16.194982078853048.
Average #Steps: 5.091240875912408.
----------
Recipes Filtered for Step Length and #Steps: 822.
Average Step Length: 91.64038231780167.
Average Step #Words: 16.194982078853048.
Average #Steps: 5.091240875912408.
----------


In [12]:
# Persist the recipes that passed both filters as JSON indented by four spaces.
with open(RESULT_FILE, mode="w") as f:
    json.dump(obj=filtered_recipes, fp=f, indent=4)