In [1]:
import pandas as pd
import re
import json
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [2]:
# To extract the names of the ingredients, I updated the word corpus designed by Zack Scholl:
# https://github.com/schollz/ingredients/blob/main/corpus.go

# Read the ingredient corpus, one entry per line. The context manager closes
# the file handle, which the bare open() call previously left open.
with open("assets/corpus.txt", "r", encoding="utf-8") as file:
    content = file.read()
# Split on '\n' only; a trailing newline in the file yields a final '' entry.
corpus = content.split('\n')

In [3]:
# Load the training recipes; the context manager closes the file handle that
# json.load(open(...)) used to leave dangling.
with open("assets/train.json", encoding='utf-8') as train_file:
    train = json.load(train_file)
df = pd.DataFrame.from_dict(train).drop('id', axis=1)

# Count rows per cuisine before exploding, while there is still one row per recipe.
df['recipes_in_cuisine'] = df.groupby('cuisine')['ingredients'].transform('count')
# One row per (recipe, ingredient); the pre-explode row label becomes 'recipe_id'.
df = df.explode('ingredients').reset_index()
df = df.rename(columns = {'index': 'recipe_id'})

df.head()

Unnamed: 0,recipe_id,cuisine,ingredients,recipes_in_cuisine
0,0,greek,romaine lettuce,1175
1,0,greek,black olives,1175
2,0,greek,grape tomatoes,1175
3,0,greek,garlic,1175
4,0,greek,pepper,1175


In [4]:
print('Initial df shape: ', df.shape)
# One row per cuisine with its recipe count, largest first.
cuisine_sizes = df[['cuisine', 'recipes_in_cuisine']].drop_duplicates()
largest_cuisines = (
    cuisine_sizes
    .sort_values(by='recipes_in_cuisine', ascending=False)
    .head(6)['cuisine']
    .tolist()
)
# Keep only the rows belonging to the six largest cuisines.
df = df[df['cuisine'].isin(largest_cuisines)]
print('Filtered df shape: ', df.shape)

Initial df shape:  (428275, 4)
Filtered df shape:  (285483, 4)


In [5]:
# Recipe counts of the cuisines that remain, largest first.
(
    df[['cuisine', 'recipes_in_cuisine']]
    .drop_duplicates()
    .sort_values(by='recipes_in_cuisine', ascending=False)
    .head(6)
)

Unnamed: 0,cuisine,recipes_in_cuisine
81,italian,7838
91,mexican,6438
9,southern_us,4320
32,indian,3003
123,chinese,2673
469,french,2646


#### Creating ingredients dictionary

In [6]:
# One row per distinct raw ingredient string, re-indexed from zero.
ingr_map = (
    df['ingredients']
    .drop_duplicates()
    .to_frame()
    .reset_index(drop=True)
)

In [7]:
# ingr_map holds one row per distinct ingredient, so its length is the unique count.
print('Unique ingredients number: ', len(ingr_map['ingredients']))

Unique ingredients number:  5666


#### Getting rid of unnecessary brand names

In [8]:
# Collect brand prefixes from ingredient names that contain a word ending in a
# trademark marker or a possessive "’s".
brand_markers = ('®', '™', '’s')
brands_to_remove = []

for ingredient_name in df['ingredients'].unique():
    has_marker = any(
        token.endswith(brand_markers) for token in ingredient_name.split(' ')
    )
    if not has_marker:
        continue
    # Cut after the right-most marker; the +2 covers the two-character "’s"
    # (for one-character markers the extra character is stripped if it is a space).
    cut = max(ingredient_name.rfind(marker) for marker in brand_markers) + 2
    candidate = ingredient_name[:cut].strip().lower()
    if candidate not in brands_to_remove:
        brands_to_remove.append(candidate)

# "sheep’s" is matched by the possessive rule but is excluded from removal.
brands_to_remove = [brand for brand in brands_to_remove if brand != 'sheep’s']

# Hand-maintained brand names and product-line phrases to strip in addition to
# the ones detected through trademark markers. The merge loop that follows
# lower-cases each entry and skips ones already present, so the casing and the
# repeated entries here ('Herdez', 'Kikkoman') have no effect on the result.
# NOTE(review): several lower-case entries look like truncated fragments
# ('homestyl', 'recip secret', ' - alfredo') — presumably leftovers of product
# names after other brand text is removed; confirm before editing them.
brands_to_remove_add = [
    'A Taste of Thai', 'Accent', 'All Natural Regular', 'Applewood', 'Argo',
    'Azteca', "BREAKSTONE'S", 'Barilla', 'Bertolli', 'Best Food',
    'Better Than Bouillon', 'Bisquick', 'Bisquick Original', 'Bob Evans',
    'Bragg', 'CURRY GUY', 'Camellia', "Campbell's Condensed", 'Cholula',
    'Cinnamon Toast Crunch', 'Country Crock', 'Crisco', 'Crystal Farms',
    'Crystal Hot Sauce', 'Daisy', 'Daiya', 'DeLallo', 'Dole', 'Domino',
    'Doritos', 'Earth Balance', 'Estancia', 'Everglades',
    'Farmhouse Originals', 'Fiber One', 'Fiesta Sides', 'Fisher',
    'Flora Cuisine', 'Franks', 'Gebhardt', 'Gold Medal',
    'Golden VeggieGood Seasons', 'Gourmet Garden', 'Goya', 'Green Giant',
    'Hatch', 'Heinz', 'Hellmann', "Hellmann'", "Hellmann''s", "Hellmann's",
    'Herdez', 'Herdez', 'Hogue', 'Hurst Family Harvest', 'Ibarra',
    'Imperial Sugar', 'JOHNSONVILLE', 'Jell-O', 'Jif', 'Jiffy', 'Jimmy Dean',
    'Johnsonville', 'KNUDSEN', 'KRAFT', 'Kewpie', 'Kikkoman', 'Kikkoman',
    'Kim Crawford', 'Klondike Rose', 'Knorr', 'Knudsen', 'Kraft', 'Kroger',
    'La Victoria', 'Lea & Perrins', 'Lipton', 'Mae Ploy', 'Maggi',
    'Makers Mark', 'Manischewitz', 'Margherita', 'Martha White', 'Mazola',
    'McCormick', 'Mizkan', 'Mizkan Oigatsuo', 'Morton Salt', 'Mrs. Dash',
    'Nakano', 'Nido', 'Nielsen-Massey', 'Nilla', "O'Brien", 'Old Bay',
    'Old El Paso', 'Old World Style', 'Ortega', 'Oscar Mayer', 'Pace', 'Pam',
    'Pepperidge Farm', 'Pillsbury', 'Pompeian', 'Prego', 'Progresso',
    'Pure Wesson', 'Quorn', 'Ritz', 'Ro-Tel', 'Robert Mondavi', 'Rotel',
    'Royal Baking Powder', 'SYD', 'San Marzano', 'Skippy', 'Smart Balance',
    'Smithfield', 'Spice Islands', 'Spike', 'Splenda', 'Stonefire', 'Success',
    'Swanson', 'Swerve', 'Tabasco', 'Taco Bell', 'Tapatio', 'Texas Pete',
    'Thick & Chunky', "Thick 'n Chunky", "Tony Chachere's",
    'Traditional Cut Shredded', 'Tuttorosso', 'Tyson', "Uncle Ben's",
    'Uncle Bens', 'V8', 'White Lily', 'Wholesome Sweeteners', 'Wish Bone',
    'Wish-Bone', 'Wolf Brand', 'Zatarains', 'bertolli® classico',
    "best food's", 'calcium plus vitamin d', ' - mexican rice',
    'golden onion', 'home originals', 'homestyl', 'hot & spicy',
    'italian side', 'or best food', 'or best food real',
    'pasta side   cheesi cheddar', ' - butter & herb',
    ' - alfredo', ' - chicken flavor', 'recip secret',
    'rice side   cheddar broccoli', 'vegetable recipe mix',
    'vineyard premium collect'
]

# Fold the manual list into the detected brands, lower-cased and without duplicates.
for extra_brand in (name.lower() for name in brands_to_remove_add):
    if extra_brand not in brands_to_remove:
        brands_to_remove.append(extra_brand)

# Longest names first (ties broken by reverse string order) so that compound
# brand names are removed before the shorter names they may contain.
brands_to_remove_ordered = sorted(
    brands_to_remove,
    key=lambda name: (len(name), name),
    reverse=True,
)

# Names containing a space are removed as phrases; the rest as whole tokens.
brands_to_remove_multi = [
    name for name in brands_to_remove_ordered if ' ' in name
]
brands_to_remove_single = [
    name for name in brands_to_remove_ordered if ' ' not in name
]

def remove_brands(data, multi_word_brands=None, single_word_brands=None):
    """Strip brand names from one ingredient string.

    Parameters
    ----------
    data : mapping or pandas.Series
        Row exposing the raw ingredient text under the ``'ingredients'`` key.
    multi_word_brands : sequence of str, optional
        Lower-case brand phrases removed as literal substrings. They are tried
        in the given order, so longer phrases should come first. Defaults to
        the notebook-level ``brands_to_remove_multi``.
    single_word_brands : iterable of str, optional
        Lower-case one-word brands removed when they equal a whole
        space-separated token. Defaults to the notebook-level
        ``brands_to_remove_single``.

    Returns
    -------
    str
        The lower-cased ingredient text without the brand names, with runs of
        spaces collapsed to one and surrounding whitespace stripped.
    """
    if multi_word_brands is None:
        multi_word_brands = brands_to_remove_multi
    if single_word_brands is None:
        single_word_brands = brands_to_remove_single

    row = data['ingredients'].lower()
    # Escape each phrase so regex metacharacters in brand names (e.g. the '.'
    # in "mrs. dash") are matched literally instead of as wildcards.
    pattern = '|'.join(re.escape(brand) for brand in multi_word_brands)
    if pattern:
        row = re.sub(pattern, "", row)
    row = row.strip()

    single_lookup = set(single_word_brands)
    kept_tokens = [
        token for token in row.split(' ') if token not in single_lookup
    ]
    # ' {2,}' collapses a run of spaces of any length; the previous fixed
    # alternation of 4/3/2 spaces left a double space behind for longer runs.
    return re.sub(r' {2,}', ' ', ' '.join(kept_tokens)).strip()


# Apply the brand stripper row by row to the ingredient mapping table.
ingr_map['brands_removed'] = ingr_map.apply(remove_brands, axis='columns')

In [9]:
# Distinct names before and after brand removal.
raw_names = ingr_map['ingredients'].unique()
debranded_names = ingr_map['brands_removed'].unique()
print('Initial unique ingredients number: ', len(raw_names))
print('Filtered unique ingredients number: ', len(debranded_names))

Initial unique ingredients number:  5666
Filtered unique ingredients number:  5489


#### Extracting corpus words

In [10]:
def tokenize(text):
    """Turn a phrase into a list of lemmatised word tokens.

    The text is lower-cased, runs of digits are removed, every character that
    is neither a word character nor whitespace is removed, and the remainder
    is split with ``nltk.word_tokenize``. Each token is then passed through
    the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Phrase to normalise.

    Returns
    -------
    list of str
        Lemmatised tokens in their original order (empty when nothing is left
        after cleaning).
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenise once; the tokenizer used to be invoked twice with the first
    # result thrown away.
    return [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]


# Token list for every corpus line, in corpus order.
corpus_tokenized = list(map(tokenize, corpus))

In [11]:
# Map each corpus line to its token list. When several lines share the same
# token list, only the most recent line is kept. Indexing by the token tuple
# replaces the per-row scan of all stored values, which was quadratic.
latest_by_tokens = dict()

for row, tokens in zip(corpus, corpus_tokenized):
    token_key = tuple(tokens)
    # Drop any earlier line with identical tokens before inserting, so the
    # surviving entry sits at the position of its latest occurrence.
    latest_by_tokens.pop(token_key, None)
    latest_by_tokens[token_key] = (row, tokens)

corpus_token_dict = {
    corpus_row: corpus_tokens
    for corpus_row, corpus_tokens in latest_by_tokens.values()
}

In [12]:
def return_corpus_words(data):
    """Return the first corpus entry contained in a brand-free ingredient name.

    The text under ``data['brands_removed']`` is tokenised with ``tokenize``.
    Entries of ``corpus_tokenized`` are then tried in order; an entry matches
    when all of its tokens occur in the row and the row tokens that belong to
    the entry, taken in row order, equal the entry exactly. The result is the
    ``corpus_token_dict`` key whose token list equals the matching entry.

    Parameters
    ----------
    data : mapping or pandas.Series
        Row exposing the cleaned ingredient text under ``'brands_removed'``.

    Returns
    -------
    str
        The matching corpus line, or ``''`` when no entry matches (including
        when the corpus is empty, which used to raise ``UnboundLocalError``).
    """
    row_tokenized = tokenize(data['brands_removed'])
    row_token_set = set(row_tokenized)

    for ingredient in corpus_tokenized:
        # An empty token list is a subset of every row and would end the
        # search at that corpus line, so it is never treated as a match.
        if not ingredient:
            continue
        if not row_token_set.issuperset(ingredient):
            continue
        tokens_in_row_order = [
            token for token in row_tokenized if token in ingredient
        ]
        if tokens_in_row_order != ingredient:
            continue
        # Translate the token list back to the corpus line it stands for.
        for corpus_line, corpus_tokens in corpus_token_dict.items():
            if corpus_tokens == ingredient:
                return corpus_line

    return ''


# Look up the corpus entry for every brand-free ingredient name, row by row.
ingr_map['corpus_extracted'] = ingr_map.apply(return_corpus_words, axis='columns')

In [13]:
# Distinct names before and after corpus extraction.
debranded_unique = ingr_map['brands_removed'].unique()
corpus_unique = ingr_map['corpus_extracted'].unique()
print('Initial unique ingredients number: ', len(debranded_unique))
print('Filtered unique ingredients number: ', len(corpus_unique))

Initial unique ingredients number:  5489
Filtered unique ingredients number:  3065


#### Ingredient classification

In [14]:
# Read both JSON assets as UTF-8 explicitly, like the other files loaded in
# this notebook; without it the decoding depends on the platform locale.
with open('assets/corpus_convertion.json', encoding='utf-8') as json_file:
    corpus_convert_dict = json.load(json_file)
with open('assets/ingredient_classification.json', encoding='utf-8') as json_file:
    classification_dict = json.load(json_file)

In [15]:
# For every extracted corpus name, record the conversion key whose list
# contains it; when several keys qualify, the last one in dict order wins.
convert_ingr_dict = {}

for extracted_name in ingr_map['corpus_extracted'].unique():
    for canonical_name, variants in corpus_convert_dict.items():
        if extracted_name in variants:
            convert_ingr_dict[extracted_name] = canonical_name

# Names without a conversion entry are kept unchanged.
ingr_map['ingredients_extracted'] = [
    convert_ingr_dict.get(name, name)
    for name in ingr_map['corpus_extracted']
]

In [16]:
# Distinct names before and after applying the conversion dictionary.
corpus_names = ingr_map['corpus_extracted'].unique()
converted_names = ingr_map['ingredients_extracted'].unique()
print('Initial unique ingredients number: ', len(corpus_names))
print('Filtered unique ingredients number: ', len(converted_names))

Initial unique ingredients number:  3065
Filtered unique ingredients number:  1444


In [17]:
# Two lookups per ingredient: top-level group (lvl1) and subgroup (lvl2).
# If an ingredient is listed more than once, the last listing wins.
class_lvl1 = {}
class_lvl2 = {}

for name in ingr_map['ingredients_extracted'].unique():
    for group_name, subgroups in classification_dict.items():
        for subgroup_name, members in subgroups.items():
            if name in members:
                class_lvl1[name] = group_name
                class_lvl2[name] = subgroup_name

# Ingredients absent from the classification map to NaN.
extracted_names = ingr_map['ingredients_extracted']
ingr_map['subgroup'] = extracted_names.map(class_lvl2)
ingr_map['group'] = extracted_names.map(class_lvl1)

In [18]:
# Preview the first rows of the ingredient mapping table.
ingr_map.head()

Unnamed: 0,ingredients,brands_removed,corpus_extracted,ingredients_extracted,subgroup,group
0,plain flour,plain flour,flour,flour,bakery,grains_seeds_beans_and_nuts
1,ground pepper,ground pepper,ground pepper,pepper,spices,spices_and_herbs
2,salt,salt,salt,salt,salt,salt
3,tomatoes,tomatoes,tomato,tomato,vegetables,vegetables_mushrooms_and_fruit
4,ground black pepper,ground black pepper,ground black pepper,pepper,spices,spices_and_herbs


#### Mapping the ingredients in the main dataframe

In [19]:
# Raw ingredient -> extracted ingredient, then extracted -> subgroup / group.
raw_to_extracted = ingr_map.set_index('ingredients')['ingredients_extracted'].to_dict()
by_extracted = ingr_map.set_index('ingredients_extracted')
extracted_to_subgroup = by_extracted['subgroup'].to_dict()
extracted_to_group = by_extracted['group'].to_dict()

df['ingredients_extracted'] = df['ingredients'].map(raw_to_extracted)
df['subgroup'] = df['ingredients_extracted'].map(extracted_to_subgroup)
df['group'] = df['ingredients_extracted'].map(extracted_to_group)

In [20]:
# Preview the main frame with the mapped ingredient, subgroup and group columns.
df.head()

Unnamed: 0,recipe_id,cuisine,ingredients,recipes_in_cuisine,ingredients_extracted,subgroup,group
9,1,southern_us,plain flour,4320,flour,bakery,grains_seeds_beans_and_nuts
10,1,southern_us,ground pepper,4320,pepper,spices,spices_and_herbs
11,1,southern_us,salt,4320,salt,salt,salt
12,1,southern_us,tomatoes,4320,tomato,vegetables,vegetables_mushrooms_and_fruit
13,1,southern_us,ground black pepper,4320,pepper,spices,spices_and_herbs


In [21]:
# Rename by column name rather than by position: assigning a positional list to
# df.columns silently mislabels the data if this cell is run again after the
# reorder below has changed the column order.
df = df.rename(columns={
    'ingredients': 'ingredient',
    'ingredients_extracted': 'ingredient_extracted',
})
# Final column order for the exported table.
df = df[['recipe_id', 'cuisine', 'recipes_in_cuisine', 'ingredient', 'group', 'subgroup', 'ingredient_extracted']]

In [23]:
# Write the cleaned table; the row index is written as the first, unnamed column.
df.to_csv('yummly_six_cuisines_cleaned.csv', index=True)