In [73]:
# Import libraries
import pandas as pd
from ast import literal_eval
from datasets import load_dataset
import re
from sklearn.model_selection import train_test_split
import json

In [17]:
# Import data from HuggingFace
dataset = load_dataset("recipe_nlg", split='train', data_dir="../data/recipe_nlg/", trust_remote_code=True)

In [55]:
# Load data as dataframe and reformat lists
df = pd.read_csv("../data/recipe_nlg/full_dataset.csv", index_col=0, converters={"ingredients": literal_eval,
                                                                                 "directions": literal_eval,
                                                                                 "NER": literal_eval})
df.shape

(2231142, 6)

In [56]:
df.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evapor...","[In a heavy 2-quart saucepan, mix brown sugar,...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[brown sugar, milk, vanilla, nuts, butter, bit..."
1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chi...","[Place chipped beef on bottom of baking dish.,...",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[beef, chicken breasts, cream of mushroom soup..."
2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...","[In a slow cooker, combine all ingredients. Co...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[frozen corn, cream cheese, butter, garlic pow..."
3,Chicken Funny,"[1 large whole chicken, 2 (10 1/2 oz.) cans ch...","[Boil and debone chicken., Put bite size piece...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[chicken, chicken gravy, cream of mushroom sou..."
4,Reeses Cups(Candy),"[1 c. peanut butter, 3/4 c. graham cracker cru...",[Combine first four ingredients and press in 1...,www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[peanut butter, graham cracker crumbs, butter,..."


In [57]:
df[df['title'].str.len() < 4].head()

Unnamed: 0,title,ingredients,directions,link,source,NER
4071,Mud,"[2 c. cold milk, 1 pkg. Jell-O chocolate flavo...","[You will also need a large flowerpot, trowel,...",www.cookbooks.com/Recipe-Details.aspx?id=323986,Gathered,"[cold milk, chocolate sandwich cookies]"
15581,Pie,"[2 eggs, 2 c. milk (1 1/2 c. to heat, 1/2 c. t...","[Mix flour, sugar, salt together. Then add to ...",www.cookbooks.com/Recipe-Details.aspx?id=88188,Gathered,"[eggs, milk, sugar, vanilla, flour, salt, butt..."
46648,Pie,"[Premade graham cracker crust, 1 small pkg. in...","[Combine cream cheese, instant jello and sugar...",www.cookbooks.com/Recipe-Details.aspx?id=60141,Gathered,"[graham cracker crust, instant jello mix, inst..."
51268,Tea,"[2 c. water, boiled, 3-inch cinnamon stick, 6 ...","[In heated tea pot, pour boiled water over tea...",www.cookbooks.com/Recipe-Details.aspx?id=772911,Gathered,"[water, cinnamon, cloves, orange juice, brown ..."
55233,Dip,"[1 lb. ground round, browned and drained, 1 pk...","[Mix beef, taco seasoning, tomatoes and cheese...",www.cookbooks.com/Recipe-Details.aspx?id=258779,Gathered,"[ground round, taco seasoning, Ro-Tel, Velveet..."


In [59]:
df[df['ingredients'].str.len() < 3].head()

Unnamed: 0,title,ingredients,directions,link,source,NER
62,Phylis' Pineapple-Banana Salad,"[1 large can chunk pineapple, 4 to 5 bananas]","[Drain pineapple and reserve juice., Cut banan...",www.cookbooks.com/Recipe-Details.aspx?id=682439,Gathered,"[pineapple, bananas]"
72,Dream Pie,"[vanilla wafers, butter]",[Mix vanilla wafer crumbs with butter.],www.cookbooks.com/Recipe-Details.aspx?id=621453,Gathered,"[vanilla wafers, butter]"
388,Herb Butter,"[2 Tbsp. dried herbs: equal parts of parsley, ...",[Blend all together and chill overnight.],www.cookbooks.com/Recipe-Details.aspx?id=503383,Gathered,"[herbs, margarine]"
508,Raw Gluten,"[2 lb. whole wheat flour, 1 qt. water]",[Mix whole wheat flour with water into a firm ...,www.cookbooks.com/Recipe-Details.aspx?id=1075892,Gathered,"[whole wheat flour, water]"
530,Pigs In Blanket,"[1 pkg. cocktail hot dogs, 5 cans crescent rolls]","[Wrap wiener, rolls around hot dogs and cook a...",www.cookbooks.com/Recipe-Details.aspx?id=589806,Gathered,[crescent rolls]


In [61]:
df[df['directions'].str.len() < 2].head()

Unnamed: 0,title,ingredients,directions,link,source,NER
2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...","[In a slow cooker, combine all ingredients. Co...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[frozen corn, cream cheese, butter, garlic pow..."
17,Broccoli Salad,"[1 large head broccoli (about 1 1/2 lb.), 10 s...",[Trim off large leaves of broccoli and remove ...,www.cookbooks.com/Recipe-Details.aspx?id=50992,Gathered,"[broccoli, bacon, green onions, raisins, mayon..."
22,Cuddy Farms Marinated Turkey,"[2 c. 7-Up or Sprite, 1 c. vegetable oil, 1 c....",[Buy whole turkey breast; remove all skin and ...,www.cookbooks.com/Recipe-Details.aspx?id=9449,Gathered,"[vegetable oil, soy sauce, garlic]"
28,Taco-Filled Green Pepper,"[1 lb. Ground Beef, 1 pkg. taco seasoning, 1 s...","[In a large skillet, brown the ground beef. Dr...",www.cookbooks.com/Recipe-Details.aspx?id=39004,Gathered,"[Ground Beef, taco seasoning, beans, onion, sa..."
31,Summer Chicken,"[1 pkg. chicken cutlets, 1/2 c. oil, 1/3 c. re...",[Double recipe for more chicken.],www.cookbooks.com/Recipe-Details.aspx?id=969444,Gathered,"[chicken cutlets, oil, red vinegar, oregano, g..."


In [63]:
df[df['directions'].map(lambda x: re.search('(step|mix all)', ''.join(str(x)), re.IGNORECASE)!=None)]

Unnamed: 0,title,ingredients,directions,link,source,NER
14,Pink Stuff(Frozen Dessert),"[1 can pie filling (cherry or strawberry), 1 c...","[Mix all ingredients together., Pour into a 9 ...",www.cookbooks.com/Recipe-Details.aspx?id=982483,Gathered,"[pie filling, pineapple, condensed milk, lemon..."
30,Punch Bowl Fruit Salad,"[2 large cans sliced peaches, 2 large cans fru...",[In a 6-quart punch bowl mix all of the cans o...,www.cookbooks.com/Recipe-Details.aspx?id=1059877,Gathered,"[peaches, fruit cocktail, pineapple, fresh str..."
50,Chicken Ole,"[4 chicken breasts, cooked, 1 can cream of chi...","[Dice chicken., Mix all ingredients together.,...",www.cookbooks.com/Recipe-Details.aspx?id=445786,Gathered,"[chicken breasts, cream of chicken soup, cream..."
63,Chicken Casserole,"[1 can cream of mushroom soup, 1 can cream of ...","[Mix all ingredients together in baking dish.,...",www.cookbooks.com/Recipe-Details.aspx?id=665397,Gathered,"[cream of mushroom soup, cream of chicken soup..."
65,Annie'S Diabetic Candy,[1 (8 oz.) Jell-O sugar-free vanilla or French...,"[Mix all ingredients, form in small balls and ...",www.cookbooks.com/Recipe-Details.aspx?id=942266,Gathered,"[vanilla, cream cheese, butter, peanut butter]"
...,...,...,...,...,...,...
2231107,Strawberry Mousse,[1 (10 ounce) package frozen unsweetened straw...,"[Mix all ingredients in blender and freeze., S...",www.food.com/recipe/strawberry-mousse-168896,Recipes1M,"[nonfat yogurt, sugar, vanilla, lemon juice]"
2231119,Ham Florentine Mini Cups,"[1 (9 ounce) box frozen chopped spinach, thawe...","[Heat oven to 375F In medium bowl, mix all ing...",www.food.com/recipe/ham-florentine-mini-cups-3...,Recipes1M,"[garlic, mozzarella cheese, five-cheese, roman..."
2231126,Cucumber Sandwiches,"[1 cucumber, lightly peeled, 1/2 tsp salt, 2 t...","[Prepare ahead, Step 1 can be completed 1 hour...",www.cookstr.com/recipes/cucumber-sandwiches,Recipes1M,"[cucumber, salt, white wine vinegar, bread, bu..."
2231127,Simple Corn Bread (with variations),[2 cups (475 ml) biscuit mix (like BISQUICK or...,[Mix all the dry ingredients in a large mixing...,online-cookbook.com/goto/cook/rpage/000EFE,Recipes1M,"[biscuit mix, butter, cream, yellow cornmeal, ..."


In [70]:
def preprocess(data):
    # Drop recipes that have titles that are too short (less than 4 characters)
    data.drop(data[data['title'].str.len() < 4].index, inplace=True)

    # Drop recipes that have fewer than 2 ingredients
    data.drop(data[data['ingredients'].str.len() < 2].index, inplace=True)

    # Drop recipes that have fewer than 2 steps or are shorter than 30 characters
    data.drop(data[data.directions.map(lambda x: len(x) < 2 or len(''.join(x)) < 30)].index, inplace=True)
    
    # Drop recipes that contain "step" or "mix all" since this token is too frequent?
    data.drop(data[data.directions.map(lambda x: re.search('(step|mix all)', ''.join(str(x)), re.IGNORECASE)!=None)].index, inplace=True)

    return data

In [71]:
preprocessed = preprocess(df)
preprocessed.shape

(1939938, 6)

In [72]:
# Split into train and test sets, keeping test size to 5%
train, test = train_test_split(df, test_size=0.05)

In [74]:
def df_to_plaintext_file(input_df, output_file):
    print("Writing to", output_file)
    with open(output_file, 'w') as f:
        for index, row in input_df.iterrows():
            if index%100000==0:
                print(index)
            if type(row.NER)!=str:
                continue
            title = row['title']
            directions = json.loads(row['directions'])
            ingredients = json.loads(row['ingredients'])
            ner = json.loads(row['NER'])
            # Tokenize as text
            res = "<RECIPE_START> <INPUT_START> " + " <NEXT_INPUT> ".join(ner) + " <INPUT_END> <INGR_START> " + \
              " <NEXT_INGR> ".join(ingredients) + " <INGR_END> <INSTR_START> " + \
              " <NEXT_INSTR> ".join(directions) + " <INSTR_END> <TITLE_START> " + title + " <TITLE_END> <RECIPE_END>"
            f.write("{}\n".format(res))

In [75]:
df_to_plaintext_file(train, '../data/processed/train.txt')
df_to_plaintext_file(train, '../data/processed/test.txt')

Writing to ../data/processed/train.txt
1200000
100000
900000
400000
500000
1900000
1300000
800000
700000
1800000
1600000
200000
0
2000000
1700000
1100000
600000
1000000
2100000
2200000
300000
Writing to ../data/processed/test.txt
1200000
100000
900000
400000
500000
1900000
1300000
800000
700000
1800000
1600000
200000
0
2000000
1700000
1100000
600000
1000000
2100000
2200000
300000
