In [1]:
#Dependencies
import pandas as pd
import json
from pandas.io.json import json_normalize

In [2]:
# Load the healthy-recipe dump and keep only the serving count plus the total
# quantity of each nutrient, under short column names.
# NOTE(review): `json_normalize` is imported from `pandas.io.json` at the top of the
# notebook; pandas >= 1.0 exposes it as `pandas.json_normalize` and deprecates the
# old import path - confirm against the installed pandas version.
file = './data/HealthyAll.json'
# Name the encoding explicitly so the read does not depend on the platform's default
# locale encoding (assumes the dump is UTF-8, the usual encoding for JSON - TODO confirm).
with open(file, encoding='utf-8') as train_file:
    dict_train = json.load(train_file)

#convert from JSON to pandas DataFrame (nested keys become dotted column names)
health_df = json_normalize(dict_train)

# (nutrient code, short column name) pairs. The code is the middle part of the
# flattened column 'recipe.totalNutrients.<CODE>.quantity'. Keeping each code next
# to its new name means the selection and the rename cannot drift out of step.
nutrientNames = [
    ('ENERC_KCAL', 'Calories'),
    ('FAT', 'Fat'),
    ('FASAT', 'Sat_Fat'),
    ('FATRN', 'Trans_Fat'),
    ('FAMS', 'Mono_Fat'),
    ('FAPU', 'Poly_Fat'),
    ('CHOCDF', 'Carbs'),
    ('FIBTG', 'Fiber'),
    ('SUGAR', 'Sugars'),
    ('PROCNT', 'Protein'),
    ('CHOLE', 'Cholesterol'),
    ('NA', 'Sodium'),
    ('CA', 'Calcium'),
    ('MG', 'Magnesium'),
    ('K', 'Potassium'),
    ('FE', 'Iron'),
    ('ZN', 'Zinc'),
    ('P', 'Phosphorus'),
    ('VITA_RAE', 'Vit_A'),
    ('VITC', 'Vit_C'),
    ('THIA', 'B1'),
    ('RIBF', 'B2'),
    ('NIA', 'B3'),
    ('VITB6A', 'B6'),
    ('FOLDFE', 'Folate_eq'),
    ('FOLFD', 'Folate_food'),
    ('VITB12', 'B12'),
    ('VITD', 'Vit_D'),
    ('TOCPHA', 'Vit_E'),
    ('VITK1', 'Vit_K'),
    ('WATER', 'Water'),
]

# Source columns: the serving size ('recipe.yield') followed by one quantity per nutrient
sourceColumns = ['recipe.yield'] + [f'recipe.totalNutrients.{code}.quantity' for code, _ in nutrientNames]

# New column names, in the same order as sourceColumns (reused by later cells)
newNames = ['Servings'] + [name for _, name in nutrientNames]

#Create dataframe with only relevant columns (totalNutrients quantities and the serving size)
reduced = health_df[sourceColumns].copy()

#Rename columns
reduced.columns = newNames

In [3]:
#Build new dataframe (reduced2) with totalNutrients divided by serving size. To standardize the nutrition to 'per serving' values

# Divide every column row-wise by that recipe's 'Servings' in one vectorised step and
# tag each result with '/s'; only the '<name>/s' columns are kept.
# 'Servings/s' is Servings divided by itself; it is retained so the column layout
# of the resulting table is unchanged.
reduced2 = reduced[newNames].div(reduced['Servings'], axis=0).add_suffix('/s')

# A serving count of 0 makes the division yield +/-inf (or NaN for 0/0). fillna()
# only catches NaN, so turn infinities into NaN first and then set them all to 0.
reduced2 = reduced2.replace([float('inf'), float('-inf')], float('nan')).fillna(0)


In [4]:
#Label each recipe with its 'Healthy status' pair: [1,0] = healthy, [0,1] = unhealthy

# Every row of reduced2 is a healthy recipe, so build one separate [1,0] list per row
healthColumn = [[1,0] for _ in range(len(reduced2))]

reduced2['Healthy'] = healthColumn

In [5]:
#Repeat steps for unhealthy recipes then combine(concat) the two

# reading the JSON data using json.load()
file = './data/UnHealthyAll.json'
# Name the encoding explicitly so the read does not depend on the platform's default
# locale encoding (assumes the dump is UTF-8, the usual encoding for JSON - TODO confirm).
with open(file, encoding='utf-8') as train_file:
    dict_train = json.load(train_file)

# Flatten the nested records; nested keys become dotted column names
unhealth_df = json_normalize(dict_train)

# Nutrient codes in the order expected by newNames (after the leading 'Servings').
# Each code selects the flattened column 'recipe.totalNutrients.<CODE>.quantity'.
nutrientCodes = ['ENERC_KCAL', 'FAT', 'FASAT', 'FATRN', 'FAMS', 'FAPU', 'CHOCDF', 'FIBTG',
                 'SUGAR', 'PROCNT', 'CHOLE', 'NA', 'CA', 'MG', 'K', 'FE', 'ZN', 'P',
                 'VITA_RAE', 'VITC', 'THIA', 'RIBF', 'NIA', 'VITB6A', 'FOLDFE', 'FOLFD',
                 'VITB12', 'VITD', 'TOCPHA', 'VITK1', 'WATER']
unhealthyColumns = ['recipe.yield'] + [f'recipe.totalNutrients.{code}.quantity' for code in nutrientCodes]

# Keep only the serving size and nutrient totals, then apply the short names
unreduced = unhealth_df[unhealthyColumns].copy()
unreduced.columns = newNames

# Divide every column row-wise by that recipe's 'Servings' and tag each result with
# '/s'; only the '<name>/s' columns are kept ('Servings/s' included, as for the
# healthy table built from the same newNames).
unreduced2 = unreduced[newNames].div(unreduced['Servings'], axis=0).add_suffix('/s')

# A serving count of 0 makes the division yield +/-inf (or NaN for 0/0). fillna()
# only catches NaN, so turn infinities into NaN first and then set them all to 0.
unreduced2 = unreduced2.replace([float('inf'), float('-inf')], float('nan')).fillna(0)

#------changed healthColumn to [0,1]...unhealthy recipes!-------
# One separate [0,1] list per row of unreduced2
healthColumn = [[0,1] for _ in range(len(unreduced2))]

unreduced2['Healthy'] = healthColumn


In [10]:
#Stack the healthy rows on top of the unhealthy rows
stacked = pd.concat([reduced2, unreduced2])
#Renumber the rows from 0; drop=True discards the old per-table index instead of
#inserting it as an 'index' column that would then have to be removed
combined_df = stacked.reset_index(drop=True)
combined_df.tail()

Unnamed: 0,Servings/s,Calories/s,Fat/s,Sat_Fat/s,Trans_Fat/s,Mono_Fat/s,Poly_Fat/s,Carbs/s,Fiber/s,Sugars/s,...,B3/s,B6/s,Folate_eq/s,Folate_food/s,B12/s,Vit_D/s,Vit_E/s,Vit_K/s,Water/s,Healthy
1844,1.0,885.555968,53.018457,21.638624,0.465476,22.651813,3.703198,41.120828,3.09277,5.914407,...,13.81959,1.090329,151.388358,73.868358,2.002937,8.52,2.797755,43.488101,288.697325,"[0, 1]"
1845,1.0,1199.745198,52.064223,21.695331,0.055055,20.98781,6.19358,132.531893,9.903497,10.157808,...,12.726911,0.435342,698.420402,95.898856,0.8554,20.16,3.487766,9.842727,299.033738,"[0, 1]"
1846,1.0,575.813376,29.716701,17.255565,0.668482,7.652011,1.418145,48.349441,2.459127,2.626872,...,1.151624,0.156702,34.096367,34.096367,0.675327,18.282771,0.563872,2.312221,120.323226,"[0, 1]"
1847,1.0,223.868965,8.640837,3.993991,0.006804,2.284186,1.262668,26.627345,5.083299,11.188909,...,6.88319,0.352999,63.463372,40.945372,0.227013,4.082331,3.523929,25.027532,204.756052,"[0, 1]"
1848,1.0,453.05303,29.118689,15.106494,0.08253,8.686451,1.648327,34.804224,2.571149,5.466125,...,1.213557,0.261409,40.777834,40.777834,0.306823,18.406219,0.9251,12.385577,137.497637,"[0, 1]"


In [11]:
#Save the combined table as CSV, without the row index. Change ModelData file name for new API data
outputPath = "data/ModelData_1.csv"
combined_df.to_csv(outputPath, index=False, encoding="utf-8")