In [110]:
import pandas as pd
import numpy as np


### Helper functions

In [111]:
def clean_ingredients(ingredients):
    # necessary due to the way .csv stores data. Lists are converted to a long string. Functions returns and actual list of strings
    # and takes opportunity to remove whitespace and unnecessary characters
    full = []
    ingredients = ingredients.replace('\\n', ',').replace('\n',',')
    for string in ingredients.split(','):
        string = ' '.join(string.strip().split())
        string = string.replace("'", '').replace('\\xa0',' ').replace('[','').replace(']','').replace('"','')
        full += string.split(',')
    return full

In [119]:
def generate_variants(list_of_ingredients, substitutions):
    # take a list of ingredients
    new_lists_of_ingredients = []
    #for each entry in the list of ingredients
    for entry_string in list_of_ingredients:
        #print(f"iterating over {entry_string} from original list")
        #see if any of the things in sub.keys is in the string
        for sub_candidate in substitutions.keys():
            if sub_candidate in entry_string:
                # the substition key was in the recipe item, go into replacement mode 
                for substitute_item in substitutions[sub_candidate]:
                    # for every available substitution item, make that item replacing the word that the match was on
                    replacement_string = entry_string.replace(sub_candidate, substitute_item) # may need to stringify sub_candidate
                    #print(f"replaced {sub_candidate} with {substitute_item}")
                    # make a copy of the list of ingredients
                    replacement_ingredients = list_of_ingredients.copy()
                    
                    # remove the entry that we're currently on because it had a match in it
                    # print(f"checking entry string again {entry_string}")
                    replacement_ingredients.remove(entry_string)
                    # add the new variant to the copy of the list
                    replacement_ingredients.append(replacement_string)
                    #print(f"replacement ingredients: {replacement_ingredients}")
                    # add the mutated list of ingredients to the total running list
                    new_lists_of_ingredients.append(replacement_ingredients)
    return new_lists_of_ingredients

In [120]:
def extract_alternates(alternates):
    for alternate in alternates:
        new_recipes.append(alternate)

In [112]:
# a map of possible recipe ingredient subsitutions
substitutions = {
    # sweeteners
    "white sugar": ["brown sugar", "honey", "maple syrup", "agave nectar"],
    "granulated sugar": ["white sugar","brown sugar", "honey", "maple syrup", "agave nectar"],
    "brown sugar": ["white sugar", "honey", "maple syrup", "agave nectar"],
    "honey": ["white sugar", "brown sugar", "maple syrup", "agave nectar"],
    # flour
    "all-purpose flour": ["whole wheat flour", "gluten-free flour blend", "oat flour"],
    "whole wheat flour": ["all-purpose flour", "gluten-free flour blend", "oat flour"],
    "all purpose flour": ["whole wheat flour", "gluten-free flour blend", "oat flour"],
    "bread flour": ["all-purpose flour", "whole wheat bread flour", "cake flour"],
    # Eggs
    "eggs": ["applesauce", "tofu", "chia seeds", "cornstarch", "chickpea flour", "ground flaxseed"],
    "egg": ["applesauce", "tofu", "chia seeds", "cornstarch", "chickpea flour", "ground flaxseed"],
    # Butter
    "butter": ["olive oil", "applesauce", "vegetable broth"],
    # Oil
    "vegetable oil": ["canola oil", "olive oil", "avocado oil", "apple sauce"],
    "olive oil": ["canola oil", "vegetable oil", "avocado oil", "apple sauce"],
    "canola oil": ["olive oil", "vegetable oil", "avocado oil", "apple sauce"],
    # Others
    "baking powder": ["baking soda", "double-acting baking powder"],
    "baking soda": ["baking powder", "double-acting baking powder"],
    "breadcrumbs": ["crushed low-sodium crackers", "crushed low-sodium tortilla chips"],
    "mayo": ["Greek yogurt", "hummus", "avocado"],
    "mayonnaise": ["Greek yogurt", "hummus", "avocado"],
    "cornstarch": ["arrowroot flour", "potato starch"],
    # Greens
    "kale": ["chard", "collard greens", "mustard greens"],
    "chard": ["kale", "collard greens", "mustard greens"],
    "collard greens": ["kale", "chard", "mustard greens"],
    "arugula": ["watercress", "baby lettuce"],
    "watercress": ["arugula", "baby lettuce"],
    "lemon": ["lime"],
    "lime": ["lemon"],
    # Cruciferous vegetables
    "cauliflower": ["kohlrabi"],
    "kohlrabi": ["cauliflower", "turnips", "rutabaga"],
    # Alliums
    "onion": ["shallots", "leeks", "garlic"],
    "shallots": ["onions", "leeks", "garlic"],
    "leeks": ["onions", "shallots", "garlic"],
    "garlic": ["onions", "shallots", "ginger"],
    # Root vegetables
    "potato": ["sweet potato", "turnips", "carrots"],
    "sweet potato": ["potato", "parsnips", "butternut squash"],
    "turnips": ["potato", "kohlrabi", "rutabaga"],
    "parsnips": ["sweet potato", "carrots", "rutabaga"],
    # Other vegetables
    "celery": ["bok choy", "bell peppers", "cucumbers"],
    "bok choy": ["celery", "swiss chard"],
    "bell peppers": ["celery", "zucchini", "squash"],
    "cucumbers": ["celery", "radishes", "cherry tomatoes"],
    "zucchini": ["bell peppers", "yellow squash", "eggplant"],
    "yellow squash": ["zucchini", "butternut squash", "acorn squash"],
    "eggplant": ["zucchini"]
    
}

In [113]:
df = pd.read_csv('../data/generated/kidney-friendly-standardized.csv')

In [114]:
df.head()

Unnamed: 0.1,Unnamed: 0,serving_size,ingredients_raw,label,standardized_ingredients
0,0,5,"['½ cup canned chickpeas rinsed and drained', ...",1,['23.66 grams canned chickpeas rinsed and drai...
1,1,8,['1.5 lbs – Sea Bass or Tilapia Fish of choice...,1,['85.05 grams – Sea Bass or Tilapia Fish of ch...
2,2,4,['1 1/2 cups rhubarb ginger simple syrup <<< G...,1,['59.15 0.5 grams rhubarb ginger simple syrup ...
3,3,1,"['1/3 cup (75 ml) beef broth, no salt added\n1...",1,"['78.07 grams (75 ml) beef broth', 'no salt ad..."
4,4,8,['5 egg yolks\n2 Tbsp fresh lemon zest\n½ cup ...,1,"['0.62 egg yolks', '0.25 Tbsp fresh lemon zest..."


In [115]:
df.shape

(831, 5)

In [116]:
df['standardized_ingredients'][0]

"['23.66 grams canned chickpeas rinsed and drained', '94.64 grams red pepper cut in 2-inch wedges', '0.4 garlic cloves', '6.0 grams extra-virgin olive oil  ', '6.0 grams lemon juice', '3.0 grams of tahini', 'Black pepper to taste', 'You may also add cumin or paprika or chili pepper if you like']"

In [117]:
df['standardized_ingredients'] = df['standardized_ingredients'].apply(clean_ingredients)

In [118]:
df['standardized_ingredients'][0]

['23.66 grams canned chickpeas rinsed and drained',
 '94.64 grams red pepper cut in 2-inch wedges',
 '0.4 garlic cloves',
 '6.0 grams extra-virgin olive oil ',
 '6.0 grams lemon juice',
 '3.0 grams of tahini',
 'Black pepper to taste',
 'You may also add cumin or paprika or chili pepper if you like']

### Generate variants of recipes for each subsitutable ingredient

In [121]:
df['alternates'] = df['standardized_ingredients'].apply(lambda x : generate_variants(x, substitutions))

In [122]:
df.head()

Unnamed: 0.1,Unnamed: 0,serving_size,ingredients_raw,label,standardized_ingredients,alternates
0,0,5,"['½ cup canned chickpeas rinsed and drained', ...",1,[23.66 grams canned chickpeas rinsed and drain...,[[23.66 grams canned chickpeas rinsed and drai...
1,1,8,['1.5 lbs – Sea Bass or Tilapia Fish of choice...,1,[85.05 grams – Sea Bass or Tilapia Fish of cho...,[]
2,2,4,['1 1/2 cups rhubarb ginger simple syrup <<< G...,1,[59.15 0.5 grams rhubarb ginger simple syrup <...,[[59.15 0.5 grams rhubarb ginger simple syrup ...
3,3,1,"['1/3 cup (75 ml) beef broth, no salt added\n1...",1,"[78.07 grams (75 ml) beef broth, no salt added...","[[78.07 grams (75 ml) beef broth, no salt adde..."
4,4,8,['5 egg yolks\n2 Tbsp fresh lemon zest\n½ cup ...,1,"[0.62 egg yolks, 0.25 Tbsp fresh lemon zest, 1...","[[0.25 Tbsp fresh lemon zest, 14.79 grams lemo..."


In [123]:
# create a master list of variants per recipe
new_recipes = []
df['alternates'].apply(extract_alternates)

0      None
1      None
2      None
3      None
4      None
       ... 
826    None
827    None
828    None
829    None
830    None
Name: alternates, Length: 831, dtype: object

### Create a dataframe to contain alternates

In [124]:
augmented_kidney_data = pd.DataFrame({'ingredients' : new_recipes})

In [126]:
augmented_kidney_data['label'] = 1

### Bring in non-kidney-friendly recipe sample and combine

In [127]:
nkf_df = pd.read_csv('../data/generated/nkf_sample_10000.csv')

In [128]:
nkf_df['ingredients'] = nkf_df['standardized_ingredients'].apply(clean_ingredients)

In [129]:
nkf_df.ingredients[1]

['113.4 grams lamb chop',
 'trimmed of fat and bones ',
 '118.29 0.5 grams low-sodium beef bouillon',
 'mixed ',
 '1.0 medium potatoes',
 'peeled and cubed ',
 '0.25 carrot',
 'peeled and sliced ',
 '0.25 celery rib',
 'chopped ',
 '0.12 white onion',
 'rough chopped ',
 '28.35 grams frozen corn',
 '14.79 grams fresh mushrooms',
 ' half salt and pepper',
 'to taste ',
 ' parsley',
 'to taste ']

In [130]:
nkf_df.head()

Unnamed: 0.1,Unnamed: 0,standardized_ingredients,label,ingredients
0,0,"['19.72 grams sugar', '13.01 grams cocoa', '9....",0,"[19.72 grams sugar, 13.01 grams cocoa, 9.86 gr..."
1,1,"['113.4 grams lamb chop', 'trimmed of fat and ...",0,"[113.4 grams lamb chop, trimmed of fat and bon..."
2,2,"['1.0 beef loin steaks', 'cut 0.25-inch thick ...",0,"[1.0 beef loin steaks, cut 0.25-inch thick , 3..."
3,3,['1.0 corn tortillas (like Mission Super Size)...,0,"[1.0 corn tortillas (like Mission Super Size),..."
4,4,"['39.43 grams butter', 'softened ', '7.5 grams...",0,"[39.43 grams butter, softened , 7.5 grams gree..."


In [131]:
nkf_df.shape

(10000, 4)

In [132]:
augmented_kidney_data.head()

Unnamed: 0,ingredients,label
0,[23.66 grams canned chickpeas rinsed and drain...,1
1,[23.66 grams canned chickpeas rinsed and drain...,1
2,[23.66 grams canned chickpeas rinsed and drain...,1
3,[23.66 grams canned chickpeas rinsed and drain...,1
4,[23.66 grams canned chickpeas rinsed and drain...,1


In [133]:
nkf_df = nkf_df[['ingredients', 'label']]

In [134]:
nkf_df.head()

Unnamed: 0,ingredients,label
0,"[19.72 grams sugar, 13.01 grams cocoa, 9.86 gr...",0
1,"[113.4 grams lamb chop, trimmed of fat and bon...",0
2,"[1.0 beef loin steaks, cut 0.25-inch thick , 3...",0
3,"[1.0 corn tortillas (like Mission Super Size),...",0
4,"[39.43 grams butter, softened , 7.5 grams gree...",0


In [135]:
augmented_kidney_data.shape

(8696, 2)

In [136]:
recipe_data = pd.concat([nkf_df, augmented_kidney_data], axis = 0)

In [137]:
recipe_data.shape

(18696, 2)

In [138]:
recipe_data.to_csv('../data/generated/tidy_recipe_data_all_2.csv', index=False)

In [94]:
test_ingrediets_1 = ["1 tbsp butter",
       "2 steaks",
       "0.5 cup chopped apples"
       ]

test_ingredients_2 = ["2 eggs",
        "1 cup flour"]

test_substitutions = {
    "butter": ["olive oil", "vegetable oil", "pam"],
    "apples": ["oranges", "bananas"],
    "eggs": ["flax"],
    "flour": ["whole wheat flour"]
}

#generate_variants(test, substitutions)

In [95]:
pd_test = pd.DataFrame({'recipe': ['yummy thing', 'yucky thing'], "ingredients" : [test, test2]})
pd_test

NameError: name 'test' is not defined

In [None]:
pd_test['alternates'] = pd_test['ingredients'].apply(lambda x : generate_variants(x, substitutions))

In [None]:
new_recipes = []
pd_test['alternates'].apply(extract_alternates)

In [None]:
new_recipes

In [None]:
pd_test

In [None]:
pd.DataFrame(new_recipes)