In [1]:
import pandas as pd
import re

In [2]:
# Load the raw recipes, keyed by RecipeId.
df = pd.read_csv('recipes.csv', index_col='RecipeId')

# Drop recipes whose description is the site's boilerplate text.
# na=True marks rows with a missing description as boilerplate too, which
# preserves the earlier `== False` comparison's effect of dropping them.
is_boilerplate = df['Description'].str.startswith('Make and share this', na=True)
df = df[~is_boilerplate]

# Work on a random subset. The fixed seed makes Restart & Run All reproduce
# the same rows, and the min() guard avoids a ValueError when fewer than
# 100000 rows survive the filter.
df = df.sample(n=min(100000, len(df)), random_state=42)

In [4]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0_level_0,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,RecipeCategory,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
260527,Bat Poop - for Halloween,297076,CandyTX,PT30M,PT20M,PT50M,2007-10-21T00:48:00Z,Tom came up with this for our son's Halloween-...,"c(""https://img.sndimg.com/food/image/upload/w_...",Candy,...,8.8,6.3,550.4,72.2,2.5,37.6,6.8,,30 squares,"c(""Mash Pretzels till they are mostly crushed,..."
16512,Lemony Rice Pilaf,20371,Lennie,PT25M,PT10M,PT35M,2001-12-31T10:19:00Z,This takes plain ol' rice to a new level. Goes...,"c(""https://img.sndimg.com/food/image/upload/w_...",White Rice,...,0.5,2.7,138.5,33.7,1.2,2.5,5.4,8.0,,"c(""In a heavy saucepan, heat oil over mediumhe..."
443170,Sugar-Free Blueberry-Corn Muffins - Weight Wat...,57042,internetnut,PT18M,PT15M,PT33M,2010-11-30T20:46:00Z,These fruity muffins get their sweetness from ...,character(0),Beans,...,0.5,29.2,120.1,21.8,2.7,7.5,5.3,15.0,15 muffins,"c(""Preheat the oven to 400. Line 15 muffin cup..."
164010,Easy &amp; Quick Strawberry Wedding Cake,83060,KEHALI,PT35M,PT15M,PT50M,2006-04-12T16:43:00Z,This is a variation of a cake served at my ste...,character(0),Dessert,...,2.3,0.0,351.4,56.2,1.3,43.4,3.1,12.0,,"c(""Preheat oven to 350F and lightly grease/flo..."
215927,Vegetable Minestrone,461472,ladybug810,PT20M,PT40M,PT1H,2007-03-09T18:18:00Z,"This is my sister's recipe, but I have to shar...","""https://img.sndimg.com/food/image/upload/w_55...",Clear Soup,...,2.4,2.4,1223.8,95.0,20.8,14.1,14.7,,3 quarts,"c(""Sauté the onion and garlic in olive oil ove..."


In [5]:
def str_to_time(str_time):
    """Convert a duration string such as ``'PT1H30M'`` to ``'HH:MM:SS'``.

    The number immediately preceding each of the letters ``H``, ``M`` and
    ``S`` is taken as the hours, minutes and seconds respectively; a unit
    that is absent (or has no digits) counts as zero. Every component is
    zero-padded to at least two digits so the output format is uniform.
    Other designators (for example a day count) are ignored.

    Parameters
    ----------
    str_time : str or null
        Duration text, or a missing value (``None``/``NaN``).

    Returns
    -------
    str or None
        ``'HH:MM:SS'``, or ``None`` when the input is missing.
    """
    if pd.isnull(str_time):
        return None

    def _component(unit):
        # Require at least one digit so a bare unit letter is treated as
        # "not present" instead of producing an empty number.
        found = re.search(f'([0-9]+){unit}', str_time)
        return int(found.group(1)) if found else 0

    hours = _component('H')
    minutes = _component('M')
    seconds = _component('S')
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'

In [6]:
# Rewrite each duration column in place using str_to_time.
timeCols = ['CookTime', 'PrepTime', 'TotalTime']
for col in timeCols:
    raw_durations = df[col]
    df[col] = raw_durations.apply(str_to_time)

In [7]:
def parse_instructions(instructions):
    """Turn an R-style character vector of steps into newline-separated text.

    Every double-quoted element of the input is extracted, so the forms
    ``c("step 1", "step 2")``, a bare ``"single step"`` and the empty vector
    ``character(0)`` are all handled without relying on fixed character
    offsets. Backslash-escaped characters inside an element are kept as-is.

    Parameters
    ----------
    instructions : str or null
        Serialized steps, or a missing value.

    Returns
    -------
    str or None
        The steps joined with ``'\\n'`` (``''`` when there are none), or
        ``None`` when the input is missing.
    """
    if pd.isnull(instructions):
        return None
    # A quoted element is a run of non-quote characters or backslash escapes;
    # DOTALL lets an escape sequence span a line break.
    steps = re.findall(r'"((?:[^"\\]|\\.)*)"', instructions, flags=re.DOTALL)
    return '\n'.join(steps)

In [8]:
# Replace the serialized instruction vectors with newline-separated text.
parsed_instructions = df['RecipeInstructions'].apply(parse_instructions)
df['RecipeInstructions'] = parsed_instructions

In [9]:
def _split_r_vector(raw):
    """Return the elements of an R-style character vector literal as a list.

    Double-quoted elements are returned without their quotes and a bare
    ``NA`` element becomes ``None``. ``c("a", "b")``, a bare ``"a"`` and
    ``character(0)`` are all accepted; a missing (null) input yields ``[]``.
    """
    if pd.isnull(raw):
        return []
    tokens = re.findall(r'"((?:[^"\\]|\\.)*)"|\b(NA)\b', raw, flags=re.DOTALL)
    return [None if na else quoted for quoted, na in tokens]


def parse_ingredients(row):
    """Build an ``{ingredient: quantity}`` mapping for one recipe row.

    Parameters
    ----------
    row : object
        Any object exposing ``RecipeIngredientParts`` and
        ``RecipeIngredientQuantities`` attributes (for example a tuple
        produced by ``DataFrame.itertuples``), each holding an R-style
        character vector or a missing value.

    Returns
    -------
    dict
        Title-cased ingredient name mapped to its quantity string. The
        quantity is ``None`` when it is ``NA`` or when the quantity list is
        shorter than the ingredient list. Ingredients whose name is ``NA``
        are skipped. If a name occurs more than once the last quantity wins.
    """
    names = _split_r_vector(row.RecipeIngredientParts)
    quantities = _split_r_vector(row.RecipeIngredientQuantities)
    # Pad so that ingredients without a recorded quantity are kept rather
    # than silently truncated away by zip().
    quantities += [None] * (len(names) - len(quantities))
    return {
        name.title(): quantity
        for name, quantity in zip(names, quantities)
        if name is not None
    }

In [10]:
# One ingredient->quantity dict per recipe, built from the row tuples.
df['Ingredients'] = list(map(parse_ingredients, df.itertuples(index=False)))

In [11]:
def recipe_yield_fix(quantity:str):
    """Return ``quantity`` in title case, or ``None`` if it is missing."""
    return None if pd.isnull(quantity) else quantity.title()

# Title-case the yield text; missing yields become None.
fixed_yield = df['RecipeYield'].apply(recipe_yield_fix)
df['RecipeYield'] = fixed_yield

In [12]:
def keywords_fix(keywords: str):
    """Parse an R-style character vector of keywords into a list of strings.

    Every double-quoted element is extracted, so ``c("a", "b")``, a bare
    ``"a"`` and the empty vector ``character(0)`` are all handled without
    relying on fixed character offsets. Unquoted elements such as ``NA``
    are ignored.

    Parameters
    ----------
    keywords : str or null
        Serialized keywords, or a missing value.

    Returns
    -------
    list of str or None
        The keywords in their original order (``[]`` when there are none),
        or ``None`` when the input is missing.
    """
    if pd.isnull(keywords):
        return None
    # A quoted element is a run of non-quote characters or backslash escapes.
    return re.findall(r'"((?:[^"\\]|\\.)*)"', keywords, flags=re.DOTALL)

# Replace the serialized keyword vectors with Python lists.
parsed_keywords = df['Keywords'].apply(keywords_fix)
df['Keywords'] = parsed_keywords

In [13]:
# Columns removed from the frame, including the two raw ingredient columns.
cols_to_drop = [
    'AuthorId',
    'AuthorName',
    'DatePublished',
    'Images',
    'RecipeIngredientQuantities',
    'RecipeIngredientParts',
    'ReviewCount',
]
df = df.drop(cols_to_drop, axis='columns')

In [21]:
# Share of each recipe's calories attributed to each nutrient, using
# 4 kcal/g for protein, carbohydrate and sugar and 9 kcal/g for fat.
# The values are fractions (content * kcal-per-gram / Calories), not 0-100.
# Calories that are zero, negative or missing are masked to NaN first so the
# division yields NaN instead of inf for those recipes.
valid_calories = df['Calories'].where(df['Calories'] > 0)
df['ProteinPercentage'] = df['ProteinContent'] * 4 / valid_calories
df['CarbohydratePercentage'] = df['CarbohydrateContent'] * 4 / valid_calories
df['SugarPercentage'] = df['SugarContent'] * 4 / valid_calories
df['FatPercentage'] = df['FatContent'] * 9 / valid_calories

In [24]:
# Write the processed frame to disk without the index column.
# NOTE(review): the frame was loaded with RecipeId as its index, so
# index=False appears to leave recipe ids out of the file - confirm intended.
df.to_csv(path_or_buf='recipes_final.csv', index=False)