In [459]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
from fractions import Fraction
import re

### This notebook aims to replace fractions, unnecessary characters, and standardize ingredients as much as possible

In [460]:
# kidney friendly recipes
kf1 = pd.read_csv('../data/generated/kidney-ca-recipes-with-ingredients-and-nutrition.csv')
kf2 = pd.read_csv('../data/generated/kidney-kitchen-recipes-with-ingredients-and-nutrition.csv')

In [461]:
# non kidney friendly recipes
nkf1 = pd.read_csv('../data/generated/non_kidney_friendly_recipes.csv')

In [462]:
# most common unicode fractions appearing in recipes - conversion map
fraction_map = {'¼':'.25',
                '½':'.5',
                '⅓':'.33',                
                '¾':'.75',
                '⅒':'.1',
                '⅔':'.66',
                '⅕':'.2',
                '⅖':'.4',
                '⅗':'.6',
                '⅘':'.8',
                '⅙':'.166',
                '⅛':'.125'} 

In [463]:
# credit: Google Bard for the quick aid
# most common measurement units in recipes - conversion map
conversion_table = {
    "teaspoon": 5.0,
    "teaspoons": 5.0,
    "tsp": 5.0,
    "tsps": 5.0,
    "tablespoon": 15.0,
    "tablespoons": 15.0,
    "tbsp": 15.0,
    "tbsps": 15.0,
    "fl oz": 29.5735,
    "fluid ounce": 29.5735,
    "fl ozs": 29.5735,
    "fluid ounces": 29.5735,
    "cup": 236.588,
    "cups": 236.588,
    "pint": 473.176,
    "pints": 473.176,
    "quart": 946.352,
    "quarts": 946.352,
    "gallon": 3785.41,
    "pound": 453.592,
    "pounds": 453.592,
    "ounce": 28.3495,
    "ounces": 28.3495,
    "lb": 453.592,
    "lbs": 453.592,
    "oz": 28.3495,
    "ozs": 28.3495
}

### Helper functions

In [464]:
def find_numerics(string):
    # return a list of all numeric quantities appearing in a given phrase
    regex = r"\d*\.?\d+"
    matches = re.findall(regex, string)
    return matches

In [465]:
def find_units(string):
    # return a list of any measurement units detected in a given phrase
    units = []
    [units.append(x) for x in string.split() if x in conversion_table.keys()]
    return units

In [466]:
def convert_unicode_fractions_in_place(value):
    # convert given unicode fraction without concatenating nearby numbers (5 ½ oz. jars)
    new_string = value
    for c in new_string:
        if c in fraction_map.keys():
            new_string = new_string.replace(c,str(fraction_map[c]))
    return new_string

In [467]:
def convert_unicode_fractions_concat(value):
    # convert given unicode fraction and concatenate nearby numbers (5½ oz. jar)
    new_string = value
    for c in new_string:
        if c in fraction_map.keys():
            new_string = new_string.replace(c,str(fraction_map[c]))
            
    return new_string.replace(' ','')

In [468]:
def convert_simple_fraction(string):
    # convert fractions represented as 1/2, 1/4, etc. No current support for numerators/denominators beyond 2 digits
    if '/' in string:
        for c in string:
            if c in '/':
                # examine two characters before and after divider, and position of the divider
                pos_min_2 = ''
                pos_min_1 = ''
                pos = string.index(c)
                pos_plus_1 = ''
                pos_plus_2 = ''
                # prevent out of bounds if string has early /
                if pos >= 2:
                    pos_min_2 = string[pos - 2]
                pos_min_1 = string[pos - 1]
                pos_plus_1 = string[pos + 1]
                # prevent out of bounds if string has late /
                if len(string) > pos+2:
                    pos_plus_2 = string[pos + 2]
                eval_char = [pos_min_2, pos_min_1, c, pos_plus_1, pos_plus_2]
                final = [i if i.isnumeric() or i in '/' else '' for i in eval_char]
        fraction = ''.join(final)
        
        # only attempt conversion if the divider / was actually surrounded by numbers
        if fraction != '/':
            fraction_replacement = str(round(float(sum(Fraction(c) for c in fraction.split())),2))
            string = string.replace(fraction, fraction_replacement)
            #return string.replace(fraction, fraction_replacement)
        else:
            pass
    return string
    

In [469]:
def ingredient_standardizer(string, serving_size):
    #grab a list of numerics, if present
    quantities = find_numerics(string)
    # print(quantities)
    #grab a list of convertible units, if present
    units = find_units(string.lower())
    # if the string has at least one quantity and a convertible unit (examples 1 cup water, 1 14.5 oz can of beans)
    if (len(quantities) and len(units) > 0) and units[0] in conversion_table.keys():
        # take the first instance of a quantity and convert it to grams via conversion table
        # this assumes that the first unit is associated with the first quantity, which is often the case (i.e. 1 cup water) 
        grams = float(quantities[0]) * conversion_table[units[0]]
        
        # create a new string that will replace the convertible unit with grams and the value used for conversion, also dividing by serving size
        new_string = string.lower().replace(units[0], 'grams', 1).replace(quantities[0], str(round(grams/serving_size, 2)), 1)
    # if the string has at leasy one quantity but no units (example: 1 egg) 
    elif len(quantities) > 0:
        # just replace the quantity divided by serving size
        new_string = string.replace(quantities[0], str(round(float(quantities[0]) / serving_size, 2)), 1)
    else:
        new_string = string
    return new_string

In [470]:
def clean_ingredients(ingredients):
    # necessary due to the way .csv stores data. Lists are converted to a long string. Functions returns and actual list of strings
    # and takes opportunity to remove whitespace and unnecessary characters
    full = []
    ingredients = ingredients.replace('\\n', ',').replace('\n',',')
    for string in ingredients.split(','):
        string = ' '.join(string.strip().split())
        string = string.replace("'", '').replace('\\xa0',' ').replace('[','').replace(']','').replace('"','')
        full += string.split(',')
    return full

In [472]:
# standardizer example code

# test_unseen = []
# test = '''[16 ounce package Super Firm Tofu' '1/4 cup red onion
# 1 stalk celery
# 1 Tablespoon fresh dill (or 1 teaspoon dried dill)
# 1 Tablespoon mustard of choice
# 5 Tablespoons mayonnaise
# 1/2 teaspoon turmeric for color
# Juice from 1/2 lemon']'''

# test = clean_ingredients(test)
# for x in test:
#     string = convert_unicode_fractions_in_place(x)
#     try:    
#         string = convert_simple_fraction(string)
#     except Exception as e:
#         print(f"Error converting [{string}] on item {count}")
#         continue
#     else:
#         pass
#     string = ingredient_standardizer(string, 4)
#     print(string)

113.4 grams package super firm tofu 0.25 cup red onion
0.25 stalk celery
3.75 grams fresh dill (or 1 teaspoon dried dill)
3.75 grams mustard of choice
18.75 grams mayonnaise
0.62 grams turmeric for color
Juice from 0.12 lemon


### Apply labels and create tidy dataframe 

In [475]:
# kidney-friendly dataframe
kf_df = pd.concat([kf1, kf2], axis = 0)

In [476]:
kf_df['label'] = 1

In [477]:
kf_df = kf_df[['ingredients_raw','serving_size','label']]

### Extract serving sizes from scraped data, as additional text also came along for the ride

In [None]:
kf_df['serving_size'] = kf_df['serving_size'].apply(find_numerics)

In [480]:
# replace null servings sizes wit h1
serving_size_values = []
for i in kf_df['serving_size']:
    if len(i) > 0:
        serving_size_values.append(i[0])
    else:
        serving_size_values.append(1)

In [481]:
kf_df['serving_size_nums'] = serving_size_values

In [482]:
sn = kf_df['serving_size_nums']

In [483]:
# drop any serving sizes still longer than 3 characters
odd_servings = kf_df[sn.str.len() > 3].index

In [484]:
kf_df = kf_df.drop(odd_servings)

In [None]:
kf_df['serving_size_nums'] = pd.to_numeric(kf_df['serving_size_nums'])

### Reset index

In [485]:
kf_df.index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            657, 658, 659, 660, 661, 662, 663, 664, 665, 666],
           dtype='int64', length=831)

In [486]:
kf_df.reset_index(inplace=True)

In [487]:
kf_df.index

RangeIndex(start=0, stop=831, step=1)

In [489]:
# recommended: check index and drop as a column if extraneous
# kf_df.drop('index', axis = 1, inplace=True)
# kf_df.index

RangeIndex(start=0, stop=831, step=1)

### Replace instances of unicode fractions with decimals

In [493]:
kf_df['serving_size_nums'] = kf_df['serving_size_nums'].apply(convert_unicode_fractions_concat)
kf_df['serving_size_nums'] = kf_df['serving_size_nums'].apply(lambda x: convert_unicode_fractions(x))

### Replace instances of fractions in strings represented as '1 1/4' with decimals

In [494]:
kf_df['serving_size_nums'] = kf_df['serving_size_nums'].apply(convert_simple_fraction)

In [496]:
kf_df['serving_size_nums'] = kf_df['serving_size_nums'].apply(lambda x : float(sum(Fraction(c) for c in x.split())))

In [499]:
kf_df = kf_df[['serving_size_nums', 'ingredients_raw', 'label']]
kf_df.head()

### Repeat steps as necessary for Non-kidney Friendly Recipes

In [501]:
nkf1['label'] = 0

In [502]:
nkf_df = nkf1[['RecipeServings', 'ingredients_raw_str', 'label']]

In [503]:
nkf_df.dtypes

RecipeServings         float64
ingredients_raw_str     object
label                    int64
dtype: object

In [505]:
nkf_df.rename({'RecipeServings' : 'serving_size'}, axis =1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nkf_df.rename({'RecipeServings' : 'serving_size'}, axis =1, inplace = True)


In [506]:
kf_df.rename({'serving_size_nums': 'serving_size'}, axis = 1, inplace = True)

### Standardization
Convert the list-like raw string in kidney friendly recipes to a true list, then feed each item into a "standardizer" workflow

Start with kidney friendly recipes

In [510]:
# create a master list, where each entry is a list of string corresponding to that recipe's ingredients
parsed_ingredients = []
for x in kf_df['ingredients_raw'].index:
    parsed_ingredients.append(clean_ingredients(kf_df['ingredients_raw'][x]))


In [511]:
# create a counterpart list that will hold lists of strings corresponding to those same items, but standardized
all_standardized_ingredients = []
count = 0
for item_list in parsed_ingredients:
    item_standardized_ingredients = []
    for i in item_list:
        # find and replace unicode fractions first
        string = convert_unicode_fractions_in_place(i)
        
        #attempt conversion of simple fractions (sometimes fickle due to items like URLs)
        try:    
            string = convert_simple_fraction(string)
        except Exception as e:
            print(f"Error converting [{string}] on item {count}")
            continue
        else:
            pass

        # apply the standardization
        string = ingredient_standardizer(string, kf_df['serving_size'][count])

        #update the list for this specific recipe (item_list)
        item_standardized_ingredients.append(string)

    # add the result to the master list
    all_standardized_ingredients.append(item_standardized_ingredients)
    count+=1

Error converting [200 g dry rice noodles (.25 inch/.5 cm wide)] on item 124


In [512]:
# add the resulting master list to the dataframe
kf_df['standardized_ingredients'] = all_standardized_ingredients

In [525]:
# Uncomment to resave, if necessary
# kf_df.to_csv('../data/generated/kidney-friendly-standardized_2.csv', index=True)

### Take a sample of the much larger non-kidney friendly database and repeat the standardization steps

In [515]:
nkf_df_sample = nkf_df.sample(10000, ignore_index=True)

In [516]:
# create a master list, where each entry is a list of string corresponding to that recipe's ingredients

parsed_ingredients = []
for x in nkf_df_sample['ingredients_raw_str'].index:
    parsed_ingredients.append(clean_ingredients(nkf_df_sample['ingredients_raw_str'][x]))

In [517]:
# create a counterpart list that will hold lists of strings corresponding to those same items, but standardized
all_standardized_ingredients = []
count = 0
for item_list in parsed_ingredients:
    item_standardized_ingredients = []
    for i in item_list:
        # find and replace unicode fractions first
        string = convert_unicode_fractions_in_place(i)

        #attempt conversion of simple fractions (sometimes fickle due to items like URLs)
        try:    
            string = convert_simple_fraction(string)
        except Exception as e:
            print(f"Error converting [{string}] on item {count}")
            continue
        else:
            pass

        # apply the standardization
        string = ingredient_standardizer(string, nkf_df_sample['serving_size'][count])

        #update the list for this specific recipe (item_list)
        item_standardized_ingredients.append(string)

    # add the result to the master list
    all_standardized_ingredients.append(item_standardized_ingredients)
    count+=1

Error converting [14 cups \u003ca href\u003d\https://www.geniuskitchen.com/recipe/milkmoon-meringue-buttercream-536136\\u003eMilkmoon Meringue Buttercream\u003c/a\u003e for frosting] on item 11
Error converting [1 (1 ounce) package taco seasoning mix (or use my own \u003ca href\u003d\https://www.geniuskitchen.com/recipe/kittencals-taco-seasoning-mix-76616\\u003eKittencal\u0027s Taco Seasoning Mix\u003c/a\u003e)] on item 737
Error converting [ caramelized shallot (\u003ca href\u003d\https://www.geniuskitchen.com/recipe/caramelized-shallots-408147\\u003eCaramelized Shallots\u003c/a\u003e)] on item 1208
Error converting [ piquillo pepper ketchup (\u003ca href\u003d\https://www.geniuskitchen.com/recipe/piquillo-pepper-ketchup-408089\\u003ePiquillo Pepper Ketchup\u003c/a\u003e)] on item 1208
Error converting [ \u003ca href\u003d\https://www.geniuskitchen.com/recipe/basic-massoni-pizza-base-537146\\u003eBasic Massoni Pizza Base\u003c/a\u003e for pizza dough] on item 1389
Error converting [1 

### Take a sample of the NKF result to balance classes

In [518]:
nkf_df_sample['standardized_ingredients'] = all_standardized_ingredients

In [520]:
nkf_df_sample = nkf_df_sample[['standardized_ingredients', 'label']]
nkf_df_sample.head()

Unnamed: 0,standardized_ingredients,label
0,"[0.25 (28 ounce) can whole tomatoes, drained ,...",0
1,"[78.07 grams rice, raw, uncooked (NOT instant)...",0
2,"[1.0 whole wheat tortillas, 56.7 grams shredde...",0
3,"[, 1.0 large potatoes (peeled and cooked), 3.3...",0
4,"[11.25 grams butter, 11.25 grams all-purpose f...",0


In [524]:
# uncomment to re-save if necessary
# nkf_df_sample.to_csv('../data/generated/nkf_sample_10000_2.csv')