In [1]:
%matplotlib inline
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from matplotlib.ticker import MaxNLocator
from requests import get
from scipy.optimize import curve_fit

plt.rcParams["figure.figsize"] = (15,8)  # default size (width, height) in inches for every figure in this notebook

Cell used for scraping of calories:

In [None]:
# Download the page given by URL and parse its HTML.
URL = 'https://health.gov/dietaryguidelines/2015/guidelines/appendix-2/#males'
# A timeout keeps the notebook from hanging forever if the server never answers.
r = get(URL, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
r.raise_for_status()
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

Loading of USDA Food Database

In [None]:
# Read the first sheet of the USDA workbook and discard the 'Database Number' column.
usda_foods = (
    pd.read_excel("data/USDA-Food.xlsx", sheet_name=0)
    .drop(columns=['Database Number'])
)

# Subset with the food identifiers and the three macronutrient columns.
macro_columns = ['Food Group', 'Food Name', 'Protein (g)', 'Carbohydrates (g)', 'Fat (g)']
test = usda_foods[macro_columns]

Statistical analysis

In [None]:
# One-column frame, sharing the index of FAO_food_supply_cleared, that holds
# its "2013" column under the name "latest data".
FAO_food_supply_stats = pd.DataFrame(
    {"latest data": FAO_food_supply_cleared["2013"]},
    index=FAO_food_supply_cleared.index,
)
# Last expression of the cell: display the rows in ascending order of that column.
FAO_food_supply_stats.sort_values(by='latest data')

Linear Regression Dario

In [None]:
# Keep only the rows from position 39 onward; these are the rows the curve fit below is run on.
FAO_recent = FAO_food_supply_cleared.iloc[39:]
def func(x, e, f):
    """Evaluate the straight line ``e * x + f``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which the line is evaluated.
    e : float
        Slope.
    f : float
        Intercept.

    Returns
    -------
    Same shape as ``x``: the value(s) of the line.
    """
    # A 5th-degree polynomial was tried earlier:
    #   a * (x ** 5) + b * (x ** 4) + c * (x ** 3) + d * (x ** 2) + e * x + f
    slope_term = e * x
    return slope_term + f

# Fit `func` to every column of FAO_recent, then use the fitted parameters to
# fill the NaN cells of FAO_food_supply_cleared.

# Initial parameter guess (e, f), just to kick off the optimization
guess = (0.5, 0.5)

# Copy of the data without any row that contains a NaN, for curve fitting
fit_df = FAO_recent.dropna()

# Fitted parameters of `func` for each column
col_params = {}

# The x values come from the index and are identical for every column,
# so they are computed once instead of on every iteration.
x = fit_df.index.astype(float).values

# Curve fit each column
for col in fit_df.columns:
    y = fit_df[col].values
    # curve_fit returns (optimal parameters, covariance); only the parameters are kept
    params = curve_fit(func, x, y, guess)
    col_params[col] = params[0]

# Extrapolate each column
for col in FAO_food_supply_cleared.columns:
    # Boolean mask of the rows that are NaN in this column
    missing = FAO_food_supply_cleared[col].isna()
    # Index values of those rows, as floats, are the points to evaluate
    x = FAO_food_supply_cleared.index[missing].astype(float).values
    # Write through a single .loc call: the previous chained form
    # `df[col][x] = ...` assigned into a temporary Series using float labels,
    # which is not guaranteed to modify the frame.
    FAO_food_supply_cleared.loc[missing, col] = func(x, *col_params[col])

# Display result
#print ('Extrapolated data:')
#print (FAO_food_supply_cleared)


#print ('Data was extrapolated with these column functions:')
#for col in col_params:
#    print ('f_{}(x) = {:0.4f} x + {:0.4f}'.format(col, *col_params[col]))

Window slider on 5 years

In [None]:
## from windowslider import WindowSlider
# NOTE(review): WindowSlider is not imported in any visible cell (the import
# above is commented out) — confirm it is defined elsewhere in the notebook.
# 5 year version
from sklearn.linear_model import LinearRegression

w = 2  # window size handed to WindowSlider

N_TRAIN_ROWS = 53     # the first N_TRAIN_ROWS rows (by position) are used for training
TEST_START_ROW = 29   # first row position of every test slice
TEST_STEP = 5         # test slices take every 5th row
# (exclusive stop row of the test slice, row position that receives the prediction).
# The order matters: the second slice contains row 54, which the first step writes.
PREDICTION_STEPS = ((59, 54), (64, 59))


def _window_input(values, years):
    """Build the frame given to WindowSlider: columns 'years', '∆t' (all ones), 'cal'."""
    frame = pd.DataFrame(values, index=years)
    frame.insert(0, '∆t', np.ones(frame.index.size))
    frame = frame.reset_index()
    return frame.rename(columns={"index": "years", 0: "cal"})


for col_pos, col in enumerate(FAO_food_supply_cleared.columns.values):
    years_train = FAO_food_supply_cleared.index.values[:N_TRAIN_ROWS].copy()
    # Positional slice, so it always matches years_train above. The previous
    # `.loc[:53, col]` was label-based and did not select the same rows.
    cal_train = FAO_food_supply_cleared.iloc[:N_TRAIN_ROWS, col_pos].values.copy()
    train_set = _window_input(cal_train, years_train)

    train_constructor = WindowSlider(window_size=w)
    train_windows = train_constructor.collect_windows(train_set.iloc[:, 1:],
                                                      previous_y=True)
    # The last column of the windows is the target, everything before it the features
    lr_model = LinearRegression()
    lr_model.fit(train_windows.iloc[:, :-1], train_windows.iloc[:, -1])

    # Only these two steps ever wrote a result; the earlier iterations
    # (stop rows 44, 49, 54) computed predictions that were discarded.
    for stop_row, target_row in PREDICTION_STEPS:
        rows = slice(TEST_START_ROW, stop_row, TEST_STEP)
        years_test = FAO_food_supply_cleared.index.values[rows]
        cal_test = FAO_food_supply_cleared.iloc[rows, col_pos].values
        test_set = _window_input(cal_test, years_test)

        test_constructor = WindowSlider(window_size=w)
        test_windows = test_constructor.collect_windows(test_set.iloc[:, 1:],
                                                        previous_y=True)
        pred = lr_model.predict(test_windows.iloc[:, :-1])
        # Single positional write on the frame itself; the chained
        # `df[col].iloc[k] = v` form may assign into a temporary copy.
        FAO_food_supply_cleared.iloc[target_row, col_pos] = pred[-1]

In [1]:
# Joao function on avg
#def input_average(data_frame):
    #result = data_frame.copy()
    #result['input (KCal)'] = result.mean(axis=1) #computing the mean
    #result = result.drop(columns=['sedentary', 'moderate', 'active']) #we keep only the mean
    #return result

In [None]:
#male_calories_avg = input_average(male_calory_demand)
#females_calories_avg = input_average(females_calory_demand)

#### Summing up to now
* We defined a reasonable amount of calories needed for each gender and each age group by taking the average
* We collected these values in `new_male_need` and `new_female_need`
* We loaded the population of the African countries from the United Nations dataset
* We matched the population with the kcal needed by each age group. From this matching we built 4 different datasets, with different levels of granularity:  
`total_cal_male`, `total_cal_female`, `total_cal_ages`, `total_cal`
* We changed the scale of our final dataframes to make the large numbers easier to work with

In [None]:
# Add one row per (country, year) for every year in 1950..2020 that a country
# does not have yet.
# The rows are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.

# Template row(s): the 1950 entry for Algeria, with every column from position 2
# onward set to 0.
# Assumes the first two columns are the identifiers (year / country) — TODO confirm.
# NOTE(review): the original comment spoke of NaN values while the code wrote 0;
# the 0 is kept here — confirm which one is intended.
template = pop_male[(pop_male.year == 1950) & (pop_male.country == "Algeria")].copy()
template.iloc[:, 2:] = 0

all_years = np.arange(1950, 2021)
missing_rows = []
# Iterate over the countries themselves. The previous loop iterated
# `pop_male.index` (row labels) and tested the years of the whole frame, so
# only the first iteration ever added rows.
for cou in pop_male.country.unique():
    years_present = pop_male.loc[pop_male.country == cou, "year"].values
    for i in all_years[~np.isin(all_years, years_present)]:
        missing_rows.append(template.assign(year=i, country=cou))

if missing_rows:
    pop_male = pd.concat([pop_male, *missing_rows], ignore_index=True)

Since the population dataframe from the **World Population Database** contains population measurements for the years 1950 to 2020 at a frequency of **5 years**, we interpolate to obtain values for the intermediate years. Under the assumption that the population grows linearly between two measurements, linear interpolation gives a plausible result. The resulting dataframe keeps the same structure as the previous one; the only difference is its frequency, which is now **1 year**.
In conclusion, we need this approximation to be able to compare our two dataframes, which now share the same structure.

In [None]:
# Years in 1950..2020 that total_cal has no column for yet
absent_years = [yr for yr in np.arange(1950, 2021) if yr not in total_cal.columns.values]

# New frame: the existing columns followed by one all-NaN column per absent year
total_cal_yearly = total_cal.reindex(columns=list(total_cal.columns) + absent_years)

# Sorted list of all column labels, used to put the columns in increasing order
years = list(total_cal_yearly.columns.sort_values())

# Reorder the columns, then fill the NaN cells by linear interpolation along each row
total_cal_yearly = total_cal_yearly[years].interpolate(method='linear', axis=1)

In [9]:
# Embed the pre-rendered HTML animation stored next to the notebook.
# NOTE(review): IFrame is not imported in any visible cell — presumably
# `from IPython.display import IFrame` is run elsewhere; confirm.
IFrame(src='visualization/african_cal_diff_animation.html', width = 1500, height=700)