## Importing necessary Tools

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import normalize

import ast

## Importing the Dataset

In [2]:
# Load the raw recipe table from its CSV file into a dataframe.
raw_recipe_csv = 'dataset/raw-data_recipe.csv'
data = pd.read_csv(raw_recipe_csv)

In [3]:
data.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,image_url,review_nums,ingredients,cooking_directions,nutritions,reviews
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,pork belly^smoked paprika^kosher salt^ground b...,{'directions': u'Prep\n5 m\nCook\n2 h 45 m\nRe...,"{u'niacin': {u'hasCompleteData': False, u'name...","{8542392: {'rating': 5, 'followersCount': 11, ..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.764706,https://images.media-allrecipes.com/userphotos...,29,sauerkraut drained^Granny Smith apples sliced^...,{'directions': u'Prep\n15 m\nCook\n2 h 30 m\nR...,"{u'niacin': {u'hasCompleteData': False, u'name...","{3574785: {'rating': 5, 'followersCount': 0, '..."
2,218939,Foolproof Rosemary Chicken Wings,4.571429,https://images.media-allrecipes.com/userphotos...,12,chicken wings^sprigs rosemary^head garlic^oliv...,"{'directions': u""Prep\n20 m\nCook\n40 m\nReady...","{u'niacin': {u'hasCompleteData': True, u'name'...","{13774946: {'rating': 5, 'followersCount': 0, ..."
3,87211,Chicken Pesto Paninis,4.625,https://images.media-allrecipes.com/userphotos...,163,focaccia bread quartered^prepared basil pesto^...,{'directions': u'Prep\n15 m\nCook\n5 m\nReady ...,"{u'niacin': {u'hasCompleteData': True, u'name'...","{1563136: {'rating': 5, 'followersCount': 0, '..."
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,red potatoes^strips bacon^Sauce:^heavy whippin...,{'directions': u'Prep\n20 m\nCook\n45 m\nReady...,"{u'niacin': {u'hasCompleteData': True, u'name'...","{2945555: {'rating': 5, 'followersCount': 6690..."


In [4]:
data.shape

(49698, 9)

In [5]:
data.isnull().sum()

recipe_id             0
recipe_name           0
aver_rate             0
image_url             0
review_nums           0
ingredients           0
cooking_directions    0
nutritions            0
reviews               0
dtype: int64

In [6]:
data['cooking_directions'][0]

"{'directions': u'Prep\\n5 m\\nCook\\n2 h 45 m\\nReady In\\n11 h 50 m\\nPreheat oven to 200 degrees F (95 degrees C).\\nSeason pork belly with paprika, salt, and pepper. Tightly wrap pork twice in heavy-duty aluminum foil. Place on a baking sheet and bake in the preheated oven for 2 1/2 hours. Turn off the oven; let pork rest in the oven for 1 hour. Remove meat from oven, leaving it wrapped in aluminum foil, and refrigerate at least 8 hours or overnight.\\nRemove pork from foil and slice across the grain in 1/4-inch thick slices. Working in batches, cook pork in a non-stick skillet over medium heat until golden and crisped, 6 to 8 minutes per slice.'}"

## Preprocessing the Data

###### We have no use for the cooking_directions and reviews columns in the dataset, so dropping them is the best option

In [7]:
# Remove the two columns that are not used in the rest of the notebook.
unused_columns = ['cooking_directions', 'reviews']
data = data.drop(unused_columns, axis=1)
data.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,image_url,review_nums,ingredients,nutritions
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,pork belly^smoked paprika^kosher salt^ground b...,"{u'niacin': {u'hasCompleteData': False, u'name..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.764706,https://images.media-allrecipes.com/userphotos...,29,sauerkraut drained^Granny Smith apples sliced^...,"{u'niacin': {u'hasCompleteData': False, u'name..."
2,218939,Foolproof Rosemary Chicken Wings,4.571429,https://images.media-allrecipes.com/userphotos...,12,chicken wings^sprigs rosemary^head garlic^oliv...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
3,87211,Chicken Pesto Paninis,4.625,https://images.media-allrecipes.com/userphotos...,163,focaccia bread quartered^prepared basil pesto^...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,red potatoes^strips bacon^Sauce:^heavy whippin...,"{u'niacin': {u'hasCompleteData': True, u'name'..."


###### Now we want to round the aver_rate column to 2 decimal places.
###### For that we can create a function

In [8]:
def avg_rating(col):
    """Format one numeric rating as text with exactly two decimal places.

    col: a single numeric value (one element of the aver_rate column).
    Returns a string, e.g. 4.764706 -> '4.76'.
    """
    return format(col, '.2f')

###### Apply the function to the aver_rate column in the dataset

In [9]:
# Replace every rating with the two-decimal text produced by avg_rating.
data['aver_rate'] = data['aver_rate'].map(avg_rating)
data.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,image_url,review_nums,ingredients,nutritions
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,pork belly^smoked paprika^kosher salt^ground b...,"{u'niacin': {u'hasCompleteData': False, u'name..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.76,https://images.media-allrecipes.com/userphotos...,29,sauerkraut drained^Granny Smith apples sliced^...,"{u'niacin': {u'hasCompleteData': False, u'name..."
2,218939,Foolproof Rosemary Chicken Wings,4.57,https://images.media-allrecipes.com/userphotos...,12,chicken wings^sprigs rosemary^head garlic^oliv...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
3,87211,Chicken Pesto Paninis,4.62,https://images.media-allrecipes.com/userphotos...,163,focaccia bread quartered^prepared basil pesto^...,"{u'niacin': {u'hasCompleteData': True, u'name'..."
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,red potatoes^strips bacon^Sauce:^heavy whippin...,"{u'niacin': {u'hasCompleteData': True, u'name'..."


In [10]:
data.dtypes

recipe_id       int64
recipe_name    object
aver_rate      object
image_url      object
review_nums     int64
ingredients    object
nutritions     object
dtype: object

###### The function returns strings, so the column dtype becomes object; we need to convert it back to float

In [11]:
# Turn the two-decimal text values back into floating-point numbers.
data['aver_rate'] = data['aver_rate'].astype('float64')
data.dtypes

recipe_id        int64
recipe_name     object
aver_rate      float64
image_url       object
review_nums      int64
ingredients     object
nutritions      object
dtype: object

###### The nutritions data is stored as the string representation of a dictionary

In [12]:
data.nutritions[0]

"{u'niacin': {u'hasCompleteData': False, u'name': u'Niacin Equivalents', u'amount': 9.319291, u'percentDailyValue': u'72', u'displayValue': u'9', u'unit': u'mg'}, u'sugars': {u'hasCompleteData': True, u'name': u'Sugars', u'amount': 0.09355932, u'percentDailyValue': u'0', u'displayValue': u'0.1', u'unit': u'g'}, u'sodium': {u'hasCompleteData': True, u'name': u'Sodium', u'amount': 2017.13, u'percentDailyValue': u'81', u'displayValue': u'2017', u'unit': u'mg'}, u'carbohydrates': {u'hasCompleteData': True, u'name': u'Carbohydrates', u'amount': 1.797819, u'percentDailyValue': u'< 1', u'displayValue': u'1.8', u'unit': u'g'}, u'vitaminB6': {u'hasCompleteData': False, u'name': u'Vitamin B6', u'amount': 0.2329798, u'percentDailyValue': u'15', u'displayValue': u'< 1', u'unit': u'mg'}, u'calories': {u'hasCompleteData': True, u'name': u'Calories', u'amount': 308.1481, u'percentDailyValue': u'15', u'displayValue': u'308', u'unit': u'kcal'}, u'thiamin': {u'hasCompleteData': False, u'name': u'Thiamin

###### So we have to convert it into a Python dictionary

In [13]:
# Parse the text of every nutritions cell into a real Python dict.
# ast.literal_eval only evaluates literals, so it does not run code found in the data.
list_of_dict = [ast.literal_eval(nutrition_text) for nutrition_text in data.nutritions]
list_of_dict[1]

{'niacin': {'hasCompleteData': False,
  'name': 'Niacin Equivalents',
  'amount': 15.6016,
  'percentDailyValue': '120',
  'displayValue': '16',
  'unit': 'mg'},
 'sugars': {'hasCompleteData': False,
  'name': 'Sugars',
  'amount': 19.84146,
  'percentDailyValue': '0',
  'displayValue': '19.8',
  'unit': 'g'},
 'sodium': {'hasCompleteData': False,
  'name': 'Sodium',
  'amount': 2606.764,
  'percentDailyValue': '104',
  'displayValue': '2607',
  'unit': 'mg'},
 'carbohydrates': {'hasCompleteData': True,
  'name': 'Carbohydrates',
  'amount': 32.08176,
  'percentDailyValue': '10',
  'displayValue': '32.1',
  'unit': 'g'},
 'vitaminB6': {'hasCompleteData': False,
  'name': 'Vitamin B6',
  'amount': 1.328631,
  'percentDailyValue': '83',
  'displayValue': '1',
  'unit': 'mg'},
 'calories': {'hasCompleteData': True,
  'name': 'Calories',
  'amount': 371.7219,
  'percentDailyValue': '19',
  'displayValue': '372',
  'unit': 'kcal'},
 'thiamin': {'hasCompleteData': False,
  'name': 'Thiamin',

###### Now we have to extract the percent daily values for some important nutrients

In [14]:
# One empty list per nutrient. Each list is a separate object, so they never share contents.
(calories_list, fat_list, carbohydrates_list, protein_list,
 cholesterol_list, sodium_list, fiber_list) = ([] for _ in range(7))

###### Now iterate through every row and collect the nutrient values

In [15]:
def _percent_daily_values(nutrient):
    """Return the 'percentDailyValue' entry of `nutrient` for every recipe.

    nutrient: key of the per-recipe nutrition dict, e.g. 'calories'.
    Returns a list with one entry per element of list_of_dict, in the same order.
    Raises KeyError if a recipe has no entry for `nutrient`.
    """
    return [recipe[nutrient]['percentDailyValue'] for recipe in list_of_dict]


# Build every list from scratch instead of appending to existing lists, so
# running this cell more than once does not add the same values twice.
calories_list = _percent_daily_values('calories')
fat_list = _percent_daily_values('fat')
carbohydrates_list = _percent_daily_values('carbohydrates')
protein_list = _percent_daily_values('protein')
cholesterol_list = _percent_daily_values('cholesterol')
sodium_list = _percent_daily_values('sodium')
fiber_list = _percent_daily_values('fiber')

###### Now group all the data into a single dataframe

In [16]:
# Map each nutrient name to its list of values; the keys become the dataframe columns.
nutritions_list = dict(
    calories=calories_list,
    fat=fat_list,
    carbohydrates=carbohydrates_list,
    protein=protein_list,
    cholesterol=cholesterol_list,
    sodium=sodium_list,
    fiber=fiber_list,
)

In [17]:
# One column per nutrient, one row per recipe.
df = pd.DataFrame(data=nutritions_list)

###### We can use the recipe_id as the index of this dataframe

In [18]:
# Label each row of df with the recipe_id found at the same position in data.
df.index = pd.Index(data['recipe_id'])
df.head()

Unnamed: 0_level_0,calories,fat,carbohydrates,protein,cholesterol,sodium,fiber
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
222388,15,36,< 1,42,21,81,2
240488,19,18,10,73,33,104,41
218939,17,36,2,48,24,31,4
87211,32,45,20,65,20,43,18
245714,8,12,5,14,7,8,3


In [19]:
df.isnull().sum()

calories         963
fat              963
carbohydrates    963
protein          963
cholesterol      963
sodium           963
fiber            963
dtype: int64

###### We can see that there are some missing values in this data, which may be recipes that do not have these nutrition values, so we want to drop them

In [20]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')

In [21]:
df.isnull().sum()

calories         0
fat              0
carbohydrates    0
protein          0
cholesterol      0
sodium           0
fiber            0
dtype: int64

###### We can see that some values are recorded as the text '< 1', so the columns show up with the object dtype
###### Convert those to 1 and leave the other values unchanged
###### For that we can create a function called text_cleaning

In [22]:
def text_cleaning(cols):
    """Replace the marker text '< 1' with the integer 1.

    cols: a single cell value.
    Returns 1 when the value equals '< 1'; otherwise the value itself, unchanged.
    """
    return 1 if cols == '< 1' else cols

# Run text_cleaning over every value of every column of df.
for column_name in df:
    df[column_name] = df[column_name].map(text_cleaning)

In [23]:
df.head()

Unnamed: 0_level_0,calories,fat,carbohydrates,protein,cholesterol,sodium,fiber
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
222388,15,36,1,42,21,81,2
240488,19,18,10,73,33,104,41
218939,17,36,2,48,24,31,4
87211,32,45,20,65,20,43,18
245714,8,12,5,14,7,8,3


In [24]:
df.dtypes

calories         object
fat              object
carbohydrates    object
protein          object
cholesterol      object
sodium           object
fiber            object
dtype: object

###### We can see that the datatypes of all the columns are object
###### Convert those into numerics

In [25]:
# Convert each column of df to a numeric dtype.
df = df.apply(lambda column: pd.to_numeric(column))

In [26]:
df.dtypes

calories         int64
fat              int64
carbohydrates    int64
protein          int64
cholesterol      int64
sodium           int64
fiber            int64
dtype: object

###### Now we can Normalize all the columns

In [27]:
# normalize(..., axis=0) rescales each column (feature) on its own; the result is a
# plain array, so the row and column labels of df are attached again here.
df_normalized = pd.DataFrame(
    normalize(df, axis=0),
    index=df.index,
    columns=df.columns,
)
df_normalized.head()

Unnamed: 0_level_0,calories,fat,carbohydrates,protein,cholesterol,sodium,fiber
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
222388,0.003457,0.004775,0.000352,0.004739,0.002997,0.006572,0.00056
240488,0.004378,0.002387,0.003517,0.008236,0.00471,0.008438,0.011477
218939,0.003917,0.004775,0.000703,0.005416,0.003425,0.002515,0.00112
87211,0.007374,0.005969,0.007034,0.007334,0.002855,0.003489,0.005039
245714,0.001844,0.001592,0.001759,0.00158,0.000999,0.000649,0.00084


###### Now we define a function that prints the recipe_id, the recipe name and the nutrition values of a given recipe

In [28]:
def selected_recipe(recipe_id):
    """Print the id, the name and the nutrition values of one recipe.

    recipe_id: value looked up in the 'recipe_id' column of the global `data`
        frame and in the index of the global `df` frame.
    Prints to standard output and returns None.
    Raises KeyError when `recipe_id` is not present in `data`.
    """
    # Look the name up first, so an unknown id fails before anything is printed.
    recipes_by_id = data.set_index('recipe_id')
    name = recipes_by_id.loc[recipe_id, 'recipe_name']

    print("Recipe ID:", recipe_id)
    print("Recipe Name:", name)

    # Boolean mask instead of a label look-up: an id missing from df gives an
    # empty frame here rather than an error.
    is_selected = df.index.isin([recipe_id])
    print(df.loc[is_selected].T)
selected_recipe(87211)
    

Recipe ID: 87211
Recipe Name: Chicken Pesto Paninis
recipe_id      87211
calories          32
fat               45
carbohydrates     20
protein           65
cholesterol       20
sodium            43
fiber             18


In [29]:
# Inner join on recipe_id: the result keeps only recipes present in both data and df.
final_recipe_dataset = data.merge(df, how='inner', on='recipe_id')
final_recipe_dataset.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,image_url,review_nums,ingredients,nutritions,calories,fat,carbohydrates,protein,cholesterol,sodium,fiber
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,pork belly^smoked paprika^kosher salt^ground b...,"{u'niacin': {u'hasCompleteData': False, u'name...",15,36,1,42,21,81,2
1,240488,"Pork Loin, Apples, and Sauerkraut",4.76,https://images.media-allrecipes.com/userphotos...,29,sauerkraut drained^Granny Smith apples sliced^...,"{u'niacin': {u'hasCompleteData': False, u'name...",19,18,10,73,33,104,41
2,218939,Foolproof Rosemary Chicken Wings,4.57,https://images.media-allrecipes.com/userphotos...,12,chicken wings^sprigs rosemary^head garlic^oliv...,"{u'niacin': {u'hasCompleteData': True, u'name'...",17,36,2,48,24,31,4
3,87211,Chicken Pesto Paninis,4.62,https://images.media-allrecipes.com/userphotos...,163,focaccia bread quartered^prepared basil pesto^...,"{u'niacin': {u'hasCompleteData': True, u'name'...",32,45,20,65,20,43,18
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,red potatoes^strips bacon^Sauce:^heavy whippin...,"{u'niacin': {u'hasCompleteData': True, u'name'...",8,12,5,14,7,8,3


In [35]:
final_recipe_dataset['ingredients'][2]

'chicken wings^sprigs rosemary^head garlic^olive oil^lemon pepper^seasoned salt'

In [30]:
final_recipe_dataset.dtypes

recipe_id          int64
recipe_name       object
aver_rate        float64
image_url         object
review_nums        int64
ingredients       object
nutritions        object
calories           int64
fat                int64
carbohydrates      int64
protein            int64
cholesterol        int64
sodium             int64
fiber              int64
dtype: object

###### The ingredients are stored as a single '^'-separated string, so we need to convert each one into a list

In [68]:
# Split the '^'-separated ingredient text of every recipe into a list of names.
# Iterating over the column values (rather than looking rows up with series[x]
# for x in range(len(...))) does not depend on the frame having a 0..n-1 index.
ingredients_list = [
    ingredient_text.split('^')
    for ingredient_text in final_recipe_dataset['ingredients']
]

# Single-column frame holding the lists, labelled with the recipe_id at the
# same position so it can be merged on recipe_id.
data2 = {'ingredients_list': ingredients_list}
df1 = pd.DataFrame(data2)
df1.index = final_recipe_dataset['recipe_id']
df1.head()

Unnamed: 0_level_0,ingredients_list
recipe_id,Unnamed: 1_level_1
222388,"[pork belly, smoked paprika, kosher salt, grou..."
240488,"[sauerkraut drained, Granny Smith apples slice..."
218939,"[chicken wings, sprigs rosemary, head garlic, ..."
87211,"[focaccia bread quartered, prepared basil pest..."
245714,"[red potatoes, strips bacon, Sauce:, heavy whi..."


###### Now merge this into the original dataset

In [69]:
# Attach the ingredients_list column by joining on recipe_id (inner join).
final_recipe_dataset = final_recipe_dataset.merge(df1, how='inner', on='recipe_id')
final_recipe_dataset.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,image_url,review_nums,ingredients,nutritions,calories,fat,carbohydrates,protein,cholesterol,sodium,fiber,ingredients_list
0,222388,Homemade Bacon,5.0,https://images.media-allrecipes.com/userphotos...,3,pork belly^smoked paprika^kosher salt^ground b...,"{u'niacin': {u'hasCompleteData': False, u'name...",15,36,1,42,21,81,2,"[pork belly, smoked paprika, kosher salt, grou..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.76,https://images.media-allrecipes.com/userphotos...,29,sauerkraut drained^Granny Smith apples sliced^...,"{u'niacin': {u'hasCompleteData': False, u'name...",19,18,10,73,33,104,41,"[sauerkraut drained, Granny Smith apples slice..."
2,218939,Foolproof Rosemary Chicken Wings,4.57,https://images.media-allrecipes.com/userphotos...,12,chicken wings^sprigs rosemary^head garlic^oliv...,"{u'niacin': {u'hasCompleteData': True, u'name'...",17,36,2,48,24,31,4,"[chicken wings, sprigs rosemary, head garlic, ..."
3,87211,Chicken Pesto Paninis,4.62,https://images.media-allrecipes.com/userphotos...,163,focaccia bread quartered^prepared basil pesto^...,"{u'niacin': {u'hasCompleteData': True, u'name'...",32,45,20,65,20,43,18,"[focaccia bread quartered, prepared basil pest..."
4,245714,Potato Bacon Pizza,4.5,https://images.media-allrecipes.com/userphotos...,2,red potatoes^strips bacon^Sauce:^heavy whippin...,"{u'niacin': {u'hasCompleteData': True, u'name'...",8,12,5,14,7,8,3,"[red potatoes, strips bacon, Sauce:, heavy whi..."


###### Now that we have both the nutrition values and the ingredients_list, we can remove the parent columns that were used to derive them, and also the image_url

In [70]:
# Drop the source columns that have been replaced by derived ones, plus image_url.
final_recipe_dataset = final_recipe_dataset.drop(
    columns=['ingredients', 'nutritions', 'image_url']
)

In [71]:
final_recipe_dataset.head()

Unnamed: 0,recipe_id,recipe_name,aver_rate,review_nums,calories,fat,carbohydrates,protein,cholesterol,sodium,fiber,ingredients_list
0,222388,Homemade Bacon,5.0,3,15,36,1,42,21,81,2,"[pork belly, smoked paprika, kosher salt, grou..."
1,240488,"Pork Loin, Apples, and Sauerkraut",4.76,29,19,18,10,73,33,104,41,"[sauerkraut drained, Granny Smith apples slice..."
2,218939,Foolproof Rosemary Chicken Wings,4.57,12,17,36,2,48,24,31,4,"[chicken wings, sprigs rosemary, head garlic, ..."
3,87211,Chicken Pesto Paninis,4.62,163,32,45,20,65,20,43,18,"[focaccia bread quartered, prepared basil pest..."
4,245714,Potato Bacon Pizza,4.5,2,8,12,5,14,7,8,3,"[red potatoes, strips bacon, Sauce:, heavy whi..."


In [72]:
final_recipe_dataset.shape

(48735, 12)

In [73]:
final_recipe_dataset.dtypes

recipe_id             int64
recipe_name          object
aver_rate           float64
review_nums           int64
calories              int64
fat                   int64
carbohydrates         int64
protein               int64
cholesterol           int64
sodium                int64
fiber                 int64
ingredients_list     object
dtype: object