In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
#get the soup from the website
# Note: `url` holds the Response object returned by requests, not the URL string.
# The timeout keeps this cell from hanging forever if the server never answers,
# and raise_for_status() stops the notebook on an HTTP error status instead of
# letting an error page be parsed into empty lists further down.
url = requests.get("https://nutritionfacts.org/daily-dozen/", timeout=30)
url.raise_for_status()
soup = BeautifulSoup(url.content, "html.parser")

In [3]:
# Pull the needed pieces out of the soup and turn them into plain lists.
# Everything is read from the elements find_all() returns for the
# "daily-dozen-grid mt-3" class: <h3> tags supply the food-group names,
# <p> tags supply the remaining text.
dd_grid = soup.find_all(attrs={"class": "daily-dozen-grid mt-3"})

dd = [
    heading.text.strip()
    for grid in dd_grid
    for heading in grid.find_all("h3")
]
servings_and_examples = [
    paragraph.text.strip()
    for grid in dd_grid
    for paragraph in grid.find_all("p")
]

# The <p> texts are split purely by position: even positions are used as
# servings, odd positions as examples.
servings = servings_and_examples[0::2]
examples = servings_and_examples[1::2]

In [4]:
#make a dataframe out of the individual lists
df_dd = pd.DataFrame({
    "food_group": dd,
    "servings": servings,
    "examples": examples})

# Keep only the first run of digits from each serving text and store it as a float.
# The pattern is a raw string so that "\d" reaches the regex engine untouched
# instead of being read by Python as an (invalid) string escape, and
# expand=False makes extract() return a Series, which is what a single-column
# assignment expects.
df_dd['servings'] = df_dd['servings'].str.extract(r'(\d+)', expand=False).astype(float)

In [5]:
# Convert the examples column to pandas' "string" dtype,
# then list every column's dtype to check the result.
examples_as_string = df_dd["examples"].astype("string")
df_dd["examples"] = examples_as_string
print(df_dd.dtypes)

food_group            object
servings             float64
examples      string[python]
dtype: object


In [6]:
# Overwrite two serving counts by hand:
#   Beverages -> 5 (the 60 oz of drinks counted as five glasses/mugs of water, tea etc)
#   Exercise  -> 1
is_beverages = df_dd["food_group"].eq("Beverages")
is_exercise = df_dd["food_group"].eq("Exercise")
df_dd.loc[is_beverages, "servings"] = 5
df_dd.loc[is_exercise, "servings"] = 1

# Exercise is not a food group; dropping its row is left disabled here,
# so the row remains in the frame.
#df_dd = df_dd.drop(index=11)
print(df_dd)

                food_group  servings  \
0                    Beans       3.0   
1                  Berries       1.0   
2             Other Fruits       3.0   
3   Cruciferous Vegetables       1.0   
4                   Greens       2.0   
5         Other Vegetables       2.0   
6                 Flaxseed       1.0   
7           Nuts and Seeds       1.0   
8         Herbs and Spices       1.0   
9             Whole Grains       3.0   
10               Beverages       5.0   
11                Exercise       1.0   

                                        examples  
0            e.g. ½ c. cooked beans, ¼ c. hummus  
1          e.g. ½ c. fresh or frozen, ¼ c. dried  
2          e.g. 1 medium fruit, ¼ c. dried fruit  
3          e.g. ½ c. chopped, 1 tbsp horseradish  
4                     e.g. 1 c. raw, ½ c. cooked  
5                  e.g. ½ c. nonleafy vegetables  
6                             e.g. 1 tbsp ground  
7              e.g. ¼ c. nuts, 2 tbsp nut butter  
8                   

In [7]:
#change the foodgroups to be in line with the data later on
# Rename by matching the scraped label rather than by row position, so the
# intended rows are renamed even if the order of the scraped entries changes.
# Series.replace with a dict swaps whole cell values that equal a key.
food_group_renames = {
    "Flaxseed": "Flaxseeds",
    "Nuts and Seeds": "Nuts",
    "Herbs and Spices": "Spices",
}
df_dd["food_group"] = df_dd["food_group"].replace(food_group_renames)

In [8]:
# Print the whole frame again to check the renamed food_group labels.
print(df_dd)

                food_group  servings  \
0                    Beans       3.0   
1                  Berries       1.0   
2             Other Fruits       3.0   
3   Cruciferous Vegetables       1.0   
4                   Greens       2.0   
5         Other Vegetables       2.0   
6                Flaxseeds       1.0   
7                     Nuts       1.0   
8                   Spices       1.0   
9             Whole Grains       3.0   
10               Beverages       5.0   
11                Exercise       1.0   

                                        examples  
0            e.g. ½ c. cooked beans, ¼ c. hummus  
1          e.g. ½ c. fresh or frozen, ¼ c. dried  
2          e.g. 1 medium fruit, ¼ c. dried fruit  
3          e.g. ½ c. chopped, 1 tbsp horseradish  
4                     e.g. 1 c. raw, ½ c. cooked  
5                  e.g. ½ c. nonleafy vegetables  
6                             e.g. 1 tbsp ground  
7              e.g. ¼ c. nuts, 2 tbsp nut butter  
8                   

In [9]:
# Write the finished table to CSV; index=False leaves the row index out of the file.
# NOTE(review): this is an absolute path on one user's machine, so the cell will
# presumably fail elsewhere; consider a path relative to the project. Confirm
# what downstream code reads before changing it.
output_csv = "/Users/lottedieleman/Documents/PORTOLIO/df_dd.csv"
df_dd.to_csv(output_csv, index=False)