In [459]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
from fractions import Fraction
import re

In [460]:
# kidney friendly recipes
kf1 = pd.read_csv('../data/generated/kidney-ca-recipes-with-ingredients-and-nutrition.csv')
kf2 = pd.read_csv('../data/generated/kidney-kitchen-recipes-with-ingredients-and-nutrition.csv')

In [461]:
# non kidney friendly recipes
nkf1 = pd.read_csv('../data/generated/non_kidney_friendly_recipes.csv')

In [462]:
fraction_map = {'¼':'.25',
                '½':'.5',
                '⅓':'.33',                
                '¾':'.75',
                '⅒':'.1',
                '⅔':'.66',
                '⅕':'.2',
                '⅖':'.4',
                '⅗':'.6',
                '⅘':'.8',
                '⅙':'.166',
                '⅛':'.125'} 

In [463]:
# bard
conversion_table = {
    "teaspoon": 5.0,
    "teaspoons": 5.0,
    "tsp": 5.0,
    "tsps": 5.0,
    "tablespoon": 15.0,
    "tablespoons": 15.0,
    "tbsp": 15.0,
    "tbsps": 15.0,
    "fl oz": 29.5735,
    "fluid ounce": 29.5735,
    "fl ozs": 29.5735,
    "fluid ounces": 29.5735,
    "cup": 236.588,
    "cups": 236.588,
    "pint": 473.176,
    "pints": 473.176,
    "quart": 946.352,
    "quarts": 946.352,
    "gallon": 3785.41,
    "pound": 453.592,
    "pounds": 453.592,
    "ounce": 28.3495,
    "ounces": 28.3495,
    "lb": 453.592,
    "lbs": 453.592,
    "oz": 28.3495,
    "ozs": 28.3495
}

In [464]:
def find_numerics(string):
    #regex = r"\d+\.?\d*"
    #regex = r"\d+(?:\.\d+)?"
    regex = r"\d*\.?\d+"
    matches = re.findall(regex, string)
    return matches

In [465]:
def find_units(string):
    units = []
    [units.append(x) for x in string.split() if x in conversion_table.keys()]
    return units

In [466]:
def convert_vulgar_fractions_in_place(value):
    new_string = value
    for c in new_string:
        if c in fraction_map.keys():
            new_string = new_string.replace(c,str(fraction_map[c]))
    return new_string

In [467]:
def convert_vulgar_fractions_concat(value):
    new_string = value
    for c in new_string:
        if c in fraction_map.keys():
            new_string = new_string.replace(c,str(fraction_map[c]))
            
    return new_string.replace(' ','')

In [468]:
def fraction_replacer(string):
    if '/' in string:
        for c in string:
            if c in '/':
                pos_min_2 = ''
                pos_min_1 = ''
                pos = string.index(c)
                pos_plus_1 = ''
                pos_plus_2 = ''
                if pos >= 2:
                    pos_min_2 = string[pos - 2]
                pos_min_1 = string[pos - 1]
                pos_plus_1 = string[pos + 1]
                if len(string) > pos+2:
                    pos_plus_2 = string[pos + 2]
                eval_char = [pos_min_2, pos_min_1, c, pos_plus_1, pos_plus_2]
                final = [i if i.isnumeric() or i in '/' else '' for i in eval_char]
        fraction = ''.join(final)
        
        if fraction != '/':
            fraction_replacement = str(round(float(sum(Fraction(c) for c in fraction.split())),2))
            string = string.replace(fraction, fraction_replacement)
            #return string.replace(fraction, fraction_replacement)
        else:
            pass
    return string
    

In [469]:
def ingredient_standardizer(string, serving_size):
    #grab a list of numerics, if present
    quantities = find_numerics(string)
    # print(quantities)
    #grab a list of convertible units, if present
    units = find_units(string.lower())
    # if the string has at least one quantity and a convertible unit (examples 1 cup water, 1 14.5 oz can of beans)
    if (len(quantities) and len(units) > 0) and units[0] in conversion_table.keys():
        # take the first instance of a quantity and convert it to grams via conversion table
        # this assumes that the first unit is associated with the first quantity, which is often the case (i.e. 1 cup water) 
        grams = float(quantities[0]) * conversion_table[units[0]]
        
        # create a new string that will replace the convertible unit with grams and the value used for conversion, also dividing by serving size
        new_string = string.lower().replace(units[0], 'grams', 1).replace(quantities[0], str(round(grams/serving_size, 2)), 1)
    # if the string has at leasy one quantity but no units (example: 1 egg) 
    elif len(quantities) > 0:
        # just replace the quantity divided by serving size
        new_string = string.replace(quantities[0], str(round(float(quantities[0]) / serving_size, 2)), 1)
    else:
        new_string = string
    return new_string

In [470]:
def clean_ingredients(ingredients):
    full = []
    ingredients = ingredients.replace('\\n', ',').replace('\n',',')
    for string in ingredients.split(','):
        string = ' '.join(string.strip().split())
        string = string.replace("'", '').replace('\\xa0',' ').replace('[','').replace(']','').replace('"','')
        full += string.split(',')
    return full

In [471]:
# def clean_ingredients(ingredients):
#     full = []
#     ingredients = ingredients.replace('\\n', ',').replace('\n',',')
#     for string in ingredients.split(','):
#         string = ' '.join(string.strip().split())
#         string = string.replace("'", '').replace('\\xa0',' ').replace('[','').replace(']','')
#         full += string.split(',')
#     return full

In [472]:
test_unseen = []
test = '''[16 ounce package Super Firm Tofu' '1/4 cup red onion
1 stalk celery
1 Tablespoon fresh dill (or 1 teaspoon dried dill)
1 Tablespoon mustard of choice
5 Tablespoons mayonnaise
1/2 teaspoon turmeric for color
Juice from 1/2 lemon']'''

test = clean_ingredients(test)
for x in test:
    string = convert_vulgar_fractions_in_place(x)
    try:    
        string = fraction_replacer(string)
    except Exception as e:
        print(f"Error converting [{string}] on item {count}")
        continue
    else:
        pass
    string = ingredient_standardizer(string, 4)
    print(string)

113.4 grams package super firm tofu 0.25 cup red onion
0.25 stalk celery
3.75 grams fresh dill (or 1 teaspoon dried dill)
3.75 grams mustard of choice
18.75 grams mayonnaise
0.62 grams turmeric for color
Juice from 0.12 lemon


In [473]:
kf1.head()

Unnamed: 0.1,Unnamed: 0,url,serving_size,ingredients_raw,calories,saturated fat,protein,sodium,potassium,phosphorus
0,0,https://www.kidneycommunitykitchen.ca/kkcookbo...,Servings per recipe: 5,"['½ cup canned chickpeas rinsed and drained', ...",102 KCal,6.9 g,1.4 g,36.4 mg,122.4 mg,47.4 mg
1,1,https://www.kidneycommunitykitchen.ca/kkcookbo...,Servings per recipe: 8,['1.5 lbs – Sea Bass or Tilapia Fish of choice...,140 KCal,5 g,20 g,128 mg,353 mg,169 mg
2,2,https://www.kidneycommunitykitchen.ca/kkcookbo...,Servings per recipe: 4,['1 1/2 cups rhubarb ginger simple syrup <<< G...,363 KCal,,,100 mg,285 mg,13.54 mg
3,3,https://www.kidneycommunitykitchen.ca/kkcookbo...,Servings per recipe: Makes 1 1/2 cups (375 ml)...,"['1/3 cup (75 ml) beef broth, no salt added\n1...",6.1 KCal,,0.1 g,4 mg,26 mg,2 mg
4,4,https://www.kidneycommunitykitchen.ca/kkcookbo...,Servings per recipe: 8,['5 egg yolks\n2 Tbsp fresh lemon zest\n½ cup ...,150.7 KCal,10.7 g,4.3 g,34.1 mg,57.93 mg,75.21 mg


In [474]:
kf2.head()

Unnamed: 0.1,Unnamed: 0,url,serving_size,ingredients_raw,calories,saturated fat,protein,sodium,potassium,phosphorus
0,0,https://kitchen.kidneyfund.org/recipe/sweet-ch...,12 servings,"['9-inch-deep dish pie crust', '1 ½ cups (abou...",213,\n4 g\n,\n3 g\n,\n91 mg\n,\n197 mg\n,\n49 mg\n
1,1,https://kitchen.kidneyfund.org/recipe/sweet-ca...,4 servings,"['3 cups carrots, sliced', '1 tablespoon sugar...",111,\n4 g\n,\n1 g\n,\n81 mg\n,\n376 mg\n,\n43 mg\n
2,2,https://kitchen.kidneyfund.org/recipe/sugar-fr...,8 servings,"['1 ¼ cups graham cracker crumbs', '⅓ cup unsa...",229,\n7 g\n,\n4 g\n,\n172 mg\n,\n136 mg\n,\n82 mg\n
3,3,https://kitchen.kidneyfund.org/recipe/non-alco...,6 servings,['1 ½ cups liquid non-dairy coffee creamer\xa0...,110,\n3 g\n,\n2 g\n,\n81 mg\n,\n45 mg\n,\n15 mg\n
4,4,https://kitchen.kidneyfund.org/recipe/non-alco...,4 servings,"['1 cup apple juice, chilled\xa0', '1 cup ging...",49,\n0 g\n,\n0 g\n,\n7 mg\n,\n63 mg\n,\n4 mg\n


In [475]:
kf_df = pd.concat([kf1, kf2], axis = 0)

In [476]:
kf_df['label'] = 1

In [477]:
kf_df = kf_df[['ingredients_raw','serving_size','label']]

In [478]:
kf_df['serving_size'] = kf_df['serving_size'].apply(find_numerics)


In [479]:
test = []
test

[]

In [480]:
serving_size_values = []
for i in kf_df['serving_size']:
    if len(i) > 0:
        serving_size_values.append(i[0])
    else:
        serving_size_values.append(1)
# kf_df['serving_size_nums'] = serving_size_values

In [481]:
kf_df['serving_size_nums'] = serving_size_values

In [482]:
sn = kf_df['serving_size_nums']

In [483]:
odd_servings = kf_df[sn.str.len() > 3].index

In [484]:
kf_df = kf_df.drop(odd_servings)

In [485]:
kf_df.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            657, 658, 659, 660, 661, 662, 663, 664, 665, 666],
           dtype='int64', length=831)

In [486]:
kf_df.reset_index(inplace=True)

In [487]:
kf_df.index

RangeIndex(start=0, stop=831, step=1)

In [488]:
nkf_df.index

RangeIndex(start=0, stop=185848, step=1)

In [489]:
kf_df.drop('index', axis = 1, inplace=True)

In [490]:
kf_df.index

RangeIndex(start=0, stop=831, step=1)

In [491]:
# kf_df.reset_index(inplace=True)
# nkf_df.reset_index(inplace=True)

In [492]:
kf_df['serving_size_nums'].unique()

array(['5', '8', '4', '1', '30', '64', '2', '40', '16', '6', '12', '10',
       '18', '9', '22', '17', '20', '3', '24', '19', '15', '32', '14',
       '36', '23', '7', 1, '48', '72', '52'], dtype=object)

### Replace instances of vulgar fractions (unicode fractions) with decimals

In [493]:
#kf_df['serving_size_nums'] = kf_df['serving_size_nums'].apply(convert_vulgar_fractions_concat)
#kf_df['serving_size_nums'] = kf_df['serving_size_nums'].apply(lambda x: convert_vulgar_fractions(x))

### Replace instances of fractions in strings represented as '1 1/4' with decimals

In [494]:
#kf_df['serving_size_nums'] = kf_df['serving_size_nums'].apply(fraction_replacer)

In [495]:
#kf_df['serving_size_nums'].unique()

In [496]:
# kf_df['serving_size_nums'] = kf_df['serving_size_nums'].apply(lambda x : float(sum(Fraction(c) for c in x.split())))

### Cast converted serving sizes to numeric

In [497]:
kf_df['serving_size_nums'] = pd.to_numeric(kf_df['serving_size_nums'])

In [498]:
kf_df.dtypes

ingredients_raw      object
serving_size         object
label                 int64
serving_size_nums     int64
dtype: object

In [499]:
kf_df = kf_df[['serving_size_nums', 'ingredients_raw', 'label']]

In [500]:
kf_df.head()

Unnamed: 0,serving_size_nums,ingredients_raw,label
0,5,"['½ cup canned chickpeas rinsed and drained', ...",1
1,8,['1.5 lbs – Sea Bass or Tilapia Fish of choice...,1
2,4,['1 1/2 cups rhubarb ginger simple syrup <<< G...,1
3,1,"['1/3 cup (75 ml) beef broth, no salt added\n1...",1
4,8,['5 egg yolks\n2 Tbsp fresh lemon zest\n½ cup ...,1


In [501]:
nkf1['label'] = 0

In [502]:
nkf_df = nkf1[['RecipeServings', 'ingredients_raw_str', 'label']]

In [503]:
nkf_df.dtypes

RecipeServings         float64
ingredients_raw_str     object
label                    int64
dtype: object

In [504]:
kf_df.dtypes

serving_size_nums     int64
ingredients_raw      object
label                 int64
dtype: object

In [505]:
nkf_df.rename({'RecipeServings' : 'serving_size'}, axis =1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nkf_df.rename({'RecipeServings' : 'serving_size'}, axis =1, inplace = True)


In [506]:
kf_df.rename({'serving_size_nums': 'serving_size'}, axis = 1, inplace = True)

In [507]:
nkf_df.shape

(185848, 3)

In [508]:
kf_df.shape

(831, 3)

In [509]:
nkf_df.index

RangeIndex(start=0, stop=185848, step=1)

### Convert the list-like raw string in kidney friendly recipes to a true list, then feed each item into a "standardizer" workflow

In [510]:
parsed_ingredients = []
for x in kf_df['ingredients_raw'].index:
    parsed_ingredients.append(clean_ingredients(kf_df['ingredients_raw'][x]))


In [511]:
all_standardized_ingredients = []
count = 0
for item_list in parsed_ingredients:
    item_standardized_ingredients = []
    for i in item_list:
        string = convert_vulgar_fractions_in_place(i)
        try:    
            string = fraction_replacer(string)
        except Exception as e:
            print(f"Error converting [{string}] on item {count}")
            continue
        else:
            pass
        string = ingredient_standardizer(string, kf_df['serving_size'][count])
        item_standardized_ingredients.append(string)
    all_standardized_ingredients.append(item_standardized_ingredients)
    count+=1

Error converting [200 g dry rice noodles (.25 inch/.5 cm wide)] on item 124


In [512]:
kf_df['standardized_ingredients'] = all_standardized_ingredients

In [513]:
kf_df.head()

Unnamed: 0,serving_size,ingredients_raw,label,standardized_ingredients
0,5,"['½ cup canned chickpeas rinsed and drained', ...",1,[23.66 grams canned chickpeas rinsed and drain...
1,8,['1.5 lbs – Sea Bass or Tilapia Fish of choice...,1,[85.05 grams – sea bass or tilapia fish of cho...
2,4,['1 1/2 cups rhubarb ginger simple syrup <<< G...,1,[59.15 0.5 grams rhubarb ginger simple syrup <...
3,1,"['1/3 cup (75 ml) beef broth, no salt added\n1...",1,"[78.07 grams (75 ml) beef broth, no salt added..."
4,8,['5 egg yolks\n2 Tbsp fresh lemon zest\n½ cup ...,1,"[0.62 egg yolks, 3.75 grams fresh lemon zest, ..."


In [525]:
kf_df.to_csv('../data/generated/kidney-friendly-standardized_2.csv', index=True)

### Take a sample of the much larger non-kidney friendly database and repeat the standardization steps

In [515]:
nkf_df_sample = nkf_df.sample(10000, ignore_index=True)

In [516]:
parsed_ingredients = []
for x in nkf_df_sample['ingredients_raw_str'].index:
    parsed_ingredients.append(clean_ingredients(nkf_df_sample['ingredients_raw_str'][x]))

In [517]:
all_standardized_ingredients = []
count = 0
for item_list in parsed_ingredients:
    item_standardized_ingredients = []
    for i in item_list:
        string = convert_vulgar_fractions_in_place(i)
        try:    
            string = fraction_replacer(string)
        except Exception as e:
            print(f"Error converting [{string}] on item {count}")
            continue
        else:
            pass
        string = ingredient_standardizer(string, nkf_df_sample['serving_size'][count])
        item_standardized_ingredients.append(string)
    all_standardized_ingredients.append(item_standardized_ingredients)
    count+=1

Error converting [14 cups \u003ca href\u003d\https://www.geniuskitchen.com/recipe/milkmoon-meringue-buttercream-536136\\u003eMilkmoon Meringue Buttercream\u003c/a\u003e for frosting] on item 11
Error converting [1 (1 ounce) package taco seasoning mix (or use my own \u003ca href\u003d\https://www.geniuskitchen.com/recipe/kittencals-taco-seasoning-mix-76616\\u003eKittencal\u0027s Taco Seasoning Mix\u003c/a\u003e)] on item 737
Error converting [ caramelized shallot (\u003ca href\u003d\https://www.geniuskitchen.com/recipe/caramelized-shallots-408147\\u003eCaramelized Shallots\u003c/a\u003e)] on item 1208
Error converting [ piquillo pepper ketchup (\u003ca href\u003d\https://www.geniuskitchen.com/recipe/piquillo-pepper-ketchup-408089\\u003ePiquillo Pepper Ketchup\u003c/a\u003e)] on item 1208
Error converting [ \u003ca href\u003d\https://www.geniuskitchen.com/recipe/basic-massoni-pizza-base-537146\\u003eBasic Massoni Pizza Base\u003c/a\u003e for pizza dough] on item 1389
Error converting [1 

In [518]:
nkf_df_sample['standardized_ingredients'] = all_standardized_ingredients

In [519]:
nkf_df_sample.standardized_ingredients[1]

['78.07 grams rice',
 'raw',
 'uncooked (NOT instant)',
 '158.51 grams chicken broth (canned is fine)',
 '30.0 grams butter',
 'divided ',
 '1.0 (6 ounce) can tuna',
 'drained ',
 '15.0 grams lemon juice',
 '78.07 grams onion',
 'diced ',
 '78.07 grams red bell pepper',
 '78.07 grams green bell pepper',
 ' salt and pepper']

In [520]:
nkf_df_sample = nkf_df_sample[['standardized_ingredients', 'label']]
nkf_df_sample.head()

Unnamed: 0,standardized_ingredients,label
0,"[0.25 (28 ounce) can whole tomatoes, drained ,...",0
1,"[78.07 grams rice, raw, uncooked (NOT instant)...",0
2,"[1.0 whole wheat tortillas, 56.7 grams shredde...",0
3,"[, 1.0 large potatoes (peeled and cooked), 3.3...",0
4,"[11.25 grams butter, 11.25 grams all-purpose f...",0


In [524]:
nkf_df_sample.to_csv('../data/generated/nkf_sample_10000_2.csv')

In [522]:
nkf_df_sample['standardized_ingredients'].head()

0    [0.25 (28 ounce) can whole tomatoes, drained ,...
1    [78.07 grams rice, raw, uncooked (NOT instant)...
2    [1.0 whole wheat tortillas, 56.7 grams shredde...
3    [, 1.0 large potatoes (peeled and cooked), 3.3...
4    [11.25 grams butter, 11.25 grams all-purpose f...
Name: standardized_ingredients, dtype: object

In [523]:
kf_df['standardized_ingredients'].head()

0    [23.66 grams canned chickpeas rinsed and drain...
1    [85.05 grams – sea bass or tilapia fish of cho...
2    [59.15 0.5 grams rhubarb ginger simple syrup <...
3    [78.07 grams (75 ml) beef broth, no salt added...
4    [0.62 egg yolks, 3.75 grams fresh lemon zest, ...
Name: standardized_ingredients, dtype: object