In [155]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
from requests import get
from bs4 import BeautifulSoup
from collections import defaultdict

plt.rcParams["figure.figsize"] = (15,8) #set size of plot

In [23]:
# Download the dietary-guidelines appendix page and parse its HTML.
URL = 'https://health.gov/dietaryguidelines/2015/guidelines/appendix-2/#males'
r = get(URL, timeout=30)  # bound the wait so a stalled server cannot hang the notebook
r.raise_for_status()  # fail loudly on a 4xx/5xx answer instead of parsing an error page
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

In [73]:
# First sheet of the workbook; it has no header row, so column names are supplied here.
male_calory_demand = pd.read_excel(
    "data/calories_demand.xlsx",
    sheet_name=0,
    header=None,
    names=['age', 'sedentary', 'moderate', 'active'],
)

In [70]:
male_calory_demand.head()

Unnamed: 0,age,sedentary,moderate,active
0,2,1000.0,1000.0,1000.0
1,3,1000.0,1400.0,1400.0
2,4,1200.0,1400.0,1600.0
3,5,1200.0,1400.0,1600.0
4,6,1400.0,1600.0,1800.0


In [51]:
# Second sheet of the same workbook, read with the same column layout.
females_calory_demand = pd.read_excel(
    "data/calories_demand.xlsx",
    sheet_name=1,
    header=None,
    names=['age', 'sedentary', 'moderate', 'active'],
)

In [48]:
females_calory_demand.head()

Unnamed: 0,age,sedentary,moderate,active
0,2,1000,1000,1000
1,3,1000,1200,1400
2,4,1200,1400,1400
3,5,1200,1400,1600
4,6,1200,1400,1600


In [50]:
# Food composition table: first sheet of the USDA workbook.
usda_foods = pd.read_excel(
    "data/USDA-Food.xlsx",
    sheet_name=0,
)

In [51]:
usda_foods.head()

Unnamed: 0,Database Number,Food Group,Food Name,Protein (g),Fat (g),Carbohydrates (g),Ash (g),Calories,Starch (g),Sucrose (g),...,Riboflavin (B2) (mg),Niacin (B3) (mg),Vitamin B5 (mg),Vitamin B6 (mg),Folate (B9) (mg),Vitamin B12,Choline (mg),Cholesterol (mg),Saturated Fat (g),Net Carbs
0,1001,Dairy and Egg Products,"Butter, salted",0.85,81.11,0.06,2.11,717,,,...,0.034,0.042,0.11,0.003,3.0,0.17,18.8,215.0,51.368,0.06
1,1002,Dairy and Egg Products,"Butter, whipped, with salt",0.49,78.3,2.87,1.62,718,,,...,0.064,0.022,0.097,0.008,4.0,0.07,18.8,225.0,45.39,2.87
2,1003,Dairy and Egg Products,"Butter oil, anhydrous",0.28,99.48,0.0,0.0,876,,,...,0.005,0.003,0.01,0.001,0.0,0.01,22.3,256.0,61.924,0.0
3,1004,Dairy and Egg Products,"Cheese, blue",21.4,28.74,2.34,5.11,353,,,...,0.382,1.016,1.729,0.166,36.0,1.22,15.4,75.0,18.669,2.34
4,1005,Dairy and Egg Products,"Cheese, brick",23.24,29.68,2.79,3.18,371,,,...,0.351,0.118,0.288,0.065,20.0,1.26,15.4,94.0,18.764,2.79


In [52]:
# The database number is not used below. errors='ignore' keeps this cell safe to
# re-run: the frame is overwritten in place, so on a second run the column is gone.
usda_foods = usda_foods.drop(columns=['Database Number'], errors='ignore')

In [53]:
usda_foods.head()

Unnamed: 0,Food Group,Food Name,Protein (g),Fat (g),Carbohydrates (g),Ash (g),Calories,Starch (g),Sucrose (g),Glucose (g),...,Riboflavin (B2) (mg),Niacin (B3) (mg),Vitamin B5 (mg),Vitamin B6 (mg),Folate (B9) (mg),Vitamin B12,Choline (mg),Cholesterol (mg),Saturated Fat (g),Net Carbs
0,Dairy and Egg Products,"Butter, salted",0.85,81.11,0.06,2.11,717,,,,...,0.034,0.042,0.11,0.003,3.0,0.17,18.8,215.0,51.368,0.06
1,Dairy and Egg Products,"Butter, whipped, with salt",0.49,78.3,2.87,1.62,718,,,,...,0.064,0.022,0.097,0.008,4.0,0.07,18.8,225.0,45.39,2.87
2,Dairy and Egg Products,"Butter oil, anhydrous",0.28,99.48,0.0,0.0,876,,,,...,0.005,0.003,0.01,0.001,0.0,0.01,22.3,256.0,61.924,0.0
3,Dairy and Egg Products,"Cheese, blue",21.4,28.74,2.34,5.11,353,,,,...,0.382,1.016,1.729,0.166,36.0,1.22,15.4,75.0,18.669,2.34
4,Dairy and Egg Products,"Cheese, brick",23.24,29.68,2.79,3.18,371,,,,...,0.351,0.118,0.288,0.065,20.0,1.26,15.4,94.0,18.764,2.79


In [57]:
# Keep only the food group, food name and the protein / carbohydrate / fat columns.
test = usda_foods[['Food Group', 'Food Name', 'Protein (g)', 'Carbohydrates (g)', 'Fat (g)']]

To make the collected data easier to work with, we will simplify it in two ways:
- in the calorie demand tables, average the required intake across the three activity levels (sedentary, moderate, active) for each age
- group the ages into ranges that match the ones used in the world population database

In [52]:
def input_average(data_frame):
    """Collapse the three activity-level columns into one average intake column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns 'sedentary', 'moderate' and 'active'.
        Any other column (e.g. 'age') is carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame without the three activity columns and with an added
        'input (KCal)' column holding their row-wise mean. The input frame
        is not modified.
    """
    activity_levels = ['sedentary', 'moderate', 'active']
    result = data_frame.copy()
    # Average over the activity columns only. Averaging the whole row would
    # fold a numeric 'age' into the result, and raises on recent pandas when
    # 'age' holds strings such as '19-20'.
    result['input (KCal)'] = result[activity_levels].mean(axis=1)
    return result.drop(columns=activity_levels)

In [74]:
# Average intake per age entry for males; preview the first rows.
male_calories_avg = input_average(male_calory_demand)
male_calories_avg.head()

Unnamed: 0,age,input (KCal)
0,2,1000.0
1,3,1266.666667
2,4,1400.0
3,5,1400.0
4,6,1600.0


In [56]:
# Average intake per age entry for females; preview the first rows.
females_calories_avg = input_average(females_calory_demand)
females_calories_avg.head()

Unnamed: 0,age,input (KCal)
0,2,1000.0
1,3,1200.0
2,4,1333.333333
3,5,1400.0
4,6,1400.0


We have now obtained a caloric demand average for simpler calculations in the future.

Now we need a way to match the age groups in this dataframe to the ones in the population database we obtained. To do so, let's first look at how ages are represented in our calorie demand dataframes.

In [75]:
male_calories_avg['age'].unique()

array([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       '19-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50',
       '51-55', '56-60', '61-65', '66-70', '71-75', '76 and up', nan],
      dtype=object)

We can see that the age ranges have different sizes (which makes sense, because different age groups have different caloric needs). We'll define a function that expands each entry into its individual ages, so that we can build one row per year of age.

In [137]:
def single_age(age_range, max_age=101):
    """Translate one entry of the 'age' column into individual ages.

    Parameters
    ----------
    age_range : float, int or str
        A float (treated as a missing value), a single integer age, a
        closed range such as '19-20', or the open-ended label '76 and up'.
    max_age : int, optional
        Last age (inclusive) generated for '76 and up'. Defaults to 101.

    Returns
    -------
    int or list of int or None
        -1 for a float, the age itself as a built-in int for an integer,
        the list of every age in the range for a range label, and None
        for a string that matches none of the known formats.
    """
    if isinstance(age_range, float):  # nans are the only floats in the age column
        return -1
    # numpy integers are accepted too and normalised to a built-in int, so
    # callers that test `type(...) == int` keep working.
    if isinstance(age_range, (int, np.integer)):
        return int(age_range)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    if re.search(r'\d-\d', age_range):
        bounds = age_range.split('-')
        return list(range(int(bounds[0]), int(bounds[1]) + 1))
    if age_range == "76 and up":
        return list(range(76, max_age + 1))
    return None

In [145]:
def explode_age(data_frame):
    """Expand every age entry into one row per individual age.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns 'age' and 'input (KCal)'.

    Returns
    -------
    pandas.DataFrame
        Columns 'age' and 'input (KCal)', with one row per single age.
        Every age produced from a range carries that range's intake value.

    Raises
    ------
    ValueError
        If an age entry cannot be interpreted, so that no row is lost
        silently.
    """
    records = []
    # Iterate over the two columns directly rather than building a row with
    # .loc: that keeps each value's own type and does not depend on the
    # index labels being unique.
    for age_label, kcal in zip(data_frame['age'], data_frame['input (KCal)']):
        ages = single_age(age_label)
        if isinstance(ages, list):
            records.extend((age, kcal) for age in ages)
        elif ages == -1:  # we ignore the nan values, as their rows are empty
            continue
        elif isinstance(ages, int):
            records.append((ages, kcal))
        else:
            raise ValueError("unrecognised age entry: {!r}".format(age_label))
    return pd.DataFrame(records, columns=['age', 'input (KCal)'])

In [146]:
male_explode = explode_age(male_calories_avg)

In [154]:
male_explode

Unnamed: 0,age,input (KCal)
0,2,1000.000000
1,3,1266.666667
2,4,1400.000000
3,5,1400.000000
4,6,1600.000000
...,...,...
95,97,2200.000000
96,98,2200.000000
97,99,2200.000000
98,100,2200.000000


In [149]:
male_explode['age'].unique()

array([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101], dtype=int64)

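Ages are now unique in this dataframe and there's a caloric input value for each of them. Next, we regroup them into 5-year bands (0-4, 5-9, ...), with the ages from 100 upwards collected in a single "100+" band.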

In [183]:
def group(age):
    """Return the label of the 5-year band containing `age`, e.g. 7 -> '5-9'."""
    lower_bound = int((age // 5) * 5)
    upper_bound = lower_bound + 4
    return "%d-%d" % (lower_bound, upper_bound)

In [184]:
def compress_ages(data_frame):
    """Average the per-age intake inside 5-year age bands.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns 'age' (numeric) and 'input (KCal)'.

    Returns
    -------
    pandas.DataFrame
        Indexed by band label ('0-4', '5-9', ..., '100+') in order of first
        appearance, with a single column named 0 holding the mean intake
        of the ages that fall in that band.
    """
    buckets = defaultdict(list)
    for age, kcal in zip(data_frame['age'], data_frame['input (KCal)']):
        # Every age from 100 upwards goes into the open-ended band; testing
        # the age rather than the "100-104" label also catches ages >= 105.
        label = "100+" if age >= 100 else group(age)
        buckets[label].append(kcal)
    averages = {label: sum(values) / len(values) for label, values in buckets.items()}
    return pd.DataFrame.from_dict(averages, orient='index')

In [185]:
new_male_need = compress_ages(male_explode)

In [191]:
# Give the averaged column (created with the name 0) and the index readable names.
new_male_need = (
    new_male_need
    .rename(columns={0: 'input (KCal)'})
    .rename_axis('age_group')
)

In [192]:
new_male_need

Unnamed: 0_level_0,input (KCal)
age_group,Unnamed: 1_level_1
0-4,1222.222222
5-9,1613.333333
10-14,2133.333333
15-19,2760.0
20-24,2746.666667
25-29,2680.0
30-34,2666.666667
35-39,2613.333333
40-44,2546.666667
45-49,2480.0
