In [21]:
# Loading packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [22]:
# Loading data 
recipes = pd.read_parquet("../data/recipes.parquet")
reviews = pd.read_parquet("../data/reviews.parquet")

recipes['Description'] = recipes['Description'].fillna('')

Retrieving keywords from different recipe attributes and adding new keywords

In [23]:
## This is based on previous Nadir's code from `Keywords_cleaning_old.ipynb`

def lower_case(value):
    if type(value)==str:
        return value.lower()
    else:
        return value

# From `Keywords` attribute
original_keywords = set()
for keyword_list in recipes['Keywords'].values:
    keyword_list = [s.lower() for s in keyword_list if type(s) == str]
    original_keywords = original_keywords | set(keyword_list)

# From `RecipeCategory` attribute
categories_keywords = {w.lower() for w in recipes.RecipeCategory if type(w) == str}

# Additional keywords
extra_keywords = {'pasta','chicken','rice','cheap','simple','burger','veggie','fried','sauce','soup','chocolate','pudding','taco','hummus','healthy',
                  'cake','egg','italian','mushroom','casserole','french toast','pancake','waffle','salad','pie','noodle','ramen','pizza','greek', 'butter',
                  'cocktail', 'drink', 'tortilla', 'tea', 'coffee', 'cafe', 'cappuccino', 'bacon'}

# Taking the union of all the keywords
all_keywords = original_keywords | categories_keywords | extra_keywords

# For simplicity I will be getting rid off the plural words. Instead I will consider their singular version
s_exceptions = {'spreads', 'homeopathy/remedies', 'octopus', 'christmas', 'potatoes', 'beef sandwiches', 
              'for large groups', 'hummus', 'citrus', 'bass', 'swiss', 'veggies'}

clean_keywords = set()
for keyword in all_keywords:
    if keyword.endswith('s') and keyword not in s_exceptions:
        clean_keywords.add(keyword[:-1])
    elif keyword.endswith('ies') and keyword not in s_exceptions:
        clean_keywords.add(keyword[:-3]+'y')
    else:
        clean_keywords.add(keyword)


In [24]:
# Function to extract new keywords from a row in `recipes`
def extract_new_keywords(row):
    new_keywords = set([s.lower() for s in row['Keywords'] if type(s) == str])
    for word in (row['Name'] + " " + row['Description']).split():
        if lower_case(word) in clean_keywords:
            new_keywords.add(word.lower())
    for word in ' '.join(row['RecipeInstructions']).split():
        if lower_case(word) in clean_keywords:
            new_keywords.add(word.lower())
    return list(new_keywords)

# Apply the function to create `KeywordsClean`
recipes['KeywordsClean'] = recipes.apply(extract_new_keywords, axis=1)

In [25]:
# We consider only the recipes with at least one keyword
clean_recipes = recipes[recipes['KeywordsClean'].apply(len) > 0]
print(f"We are keeping {round(len(clean_recipes)/len(recipes), 3)}% of the data")

# Saving recipes with cleaned Keywords into a file
clean_recipes[['RecipeId', 'KeywordsClean']].to_pickle('../data/clean_columns/keywords_clean.pk')
clean_recipes[['RecipeId', 'KeywordsClean']].head()

We are keeping 0.999% of the data


Unnamed: 0,RecipeId,KeywordsClean
0,38.0,"[summer, dessert, healthy, low cholesterol, fr..."
1,39.0,"[sauce, rice, meat, indian, stove top, lemon, ..."
2,40.0,"[summer, healthy, low cholesterol, shake, lemo..."
3,41.0,"[corn, pepper, low cholesterol, beans, oven, b..."
4,42.0,"[soup, cabbage, healthy, low cholesterol, wint..."
