In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
from tqdm import tqdm
import numpy as np

# Output directories for the scraped recipes, keyed by language code.
# The scraping loop in this cell creates one sub-folder per recipe inside them.
db_dict = {"en": "/home/pgajo/working/food/data/gz_textonly_parallel/gf_parallel/gf_en",
           "it": "/home/pgajo/working/food/data/gz_textonly_parallel/gf_parallel/gf_it"}

# makedirs also creates missing parent folders (os.mkdir raises
# FileNotFoundError when the parent does not exist yet), and exist_ok=True
# makes the cell safe to re-run without a separate existence check.
for output_dir in db_dict.values():
    os.makedirs(output_dir, exist_ok=True)

def extract_recipe(url, timeout=30):
    """Download one recipe page and extract its text sections.

    Args:
        url: Address of the recipe page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a single stalled
            connection would block the whole scraping run.

    Returns:
        A 4-tuple of strings
        ``(title_raw, presentation_raw, ingredients_raw, preparation_raw)``:

        * ``title_raw`` -- text of the title heading, a newline, then ``url``.
        * ``presentation_raw`` -- text of the first ``<p>`` in the first
          content block.
        * ``ingredients_raw`` -- one line per ingredient, formatted as
          ``name<TAB>quantity`` with runs of whitespace collapsed to a
          single space.
        * ``preparation_raw`` -- the paragraphs of the second content block
          joined by newlines, with every step-number span rewritten as
          ``[n]``.

    Raises:
        requests.exceptions.HTTPError: The server answered with an error
            status.
        requests.exceptions.RequestException: The request failed or timed out.
        AttributeError, IndexError: An expected element is missing from the
            page (``find`` returned ``None`` or fewer than two content blocks
            were found).
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of with an opaque AttributeError further
    # down when the error page lacks the recipe markup.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Title, with the source URL kept on a second line.
    title = soup.find('h1', class_="gz-title-recipe gz-mBottom2x")
    title_raw = f"{title.get_text()}\n{url}"

    # The first matching content block is used for the presentation, the
    # second one for the preparation steps.
    main_content = soup.find_all('div', class_="gz-content-recipe gz-mBottom4x")
    presentation_raw = main_content[0].find('p').get_text()

    # Ingredients: name taken from the first link, quantity from the first span.
    ingredients_div = soup.find('div', class_="gz-ingredients gz-mBottom4x gz-outer")
    ingredient_list = []
    for ingredient in ingredients_div.find_all('dd', class_="gz-ingredient"):
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        ingredient_name = re.sub(r"\s+", " ", ingredient.find('a').get_text())
        ingredient_quantity = re.sub(r"\s+", " ", ingredient.find('span').get_text())
        ingredient_list.append(ingredient_name + "\t" + ingredient_quantity)
    ingredients_raw = '\n'.join(ingredient_list)

    # Preparation: replace each step-number span with "[n]" in the tree so the
    # marker stays inline in the paragraph text.
    preparation_list = []
    for paragraph in main_content[1].find_all('p'):
        for span in paragraph.find_all('span', class_="num-step"):
            span.replace_with(f"[{span.get_text()}]")
        preparation_list.append(paragraph.get_text())
    preparation_raw = '\n'.join(preparation_list)

    return title_raw, presentation_raw, ingredients_raw, preparation_raw

# Table of recipe URLs; every column is treated as one language below
# (the "it" and "en" columns are also read directly).
df_recipes = pd.read_csv("/home/pgajo/working/food/data/gz_textonly_parallel/url_list.csv")

# Keep only rows that have a URL in every column, so the same row index refers
# to the same recipe in each language (parallel corpus), and drop repeated rows.
df_recipes = df_recipes.dropna().drop_duplicates()
print(len(df_recipes))
# # limit to 10 for testing purposes
# df_recipes = df_recipes[:10]

url_it_list = df_recipes['it']
url_en_list = df_recipes['en']

# Scrape every URL of every language column and store each recipe in its own
# folder inside that language's output directory.
language_list = df_recipes.columns

for column in language_list:
    # The language code is whatever follows the last "_" in the column name.
    # It selects the output directory, while the URLs are read through the
    # full column name so a prefixed column does not raise a KeyError.
    lang = column.split("_")[-1]
    progressbar = tqdm(enumerate(df_recipes[column]), total=len(df_recipes), desc=lang)
    for i, url in progressbar:
        title, presentation, ingredients, preparation = extract_recipe(url)

        # One folder per recipe, numbered by its position in the table.
        recipe_path = os.path.join(db_dict[lang], f"gf_{lang}_{i}")
        os.makedirs(recipe_path, exist_ok=True)

        # Write a separate text file for each output of extract_recipe.
        sections = {
            "title.txt": title,
            "presentation.txt": presentation,
            "ingredients.txt": ingredients,
            "preparation.txt": preparation,
        }
        for file_name, text in sections.items():
            with open(os.path.join(recipe_path, file_name), "w", encoding="utf8") as f:
                f.write(text)



597


  0%|          | 0/597 [00:00<?, ?it/s]

100%|██████████| 597/597 [01:48<00:00,  5.51it/s]
100%|██████████| 597/597 [02:43<00:00,  3.66it/s]


In [None]:

# Scratch cell: runs the same scraping steps as extract_recipe on a single
# page and prints every section, to check the selectors by eye.
url = "https://ricette.giallozafferano.it/Pan-di-Spagna.html"

# The timeout keeps a stalled connection from hanging the kernel;
# raise_for_status stops on an HTTP error page instead of failing later on
# missing markup.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Print the title
title = soup.find('h1', class_="gz-title-recipe gz-mBottom2x")
title_raw = title.get_text()
print("Title:", title_raw)

# The first content block is used for the presentation, the second for the
# preparation steps.
main_content = soup.find_all('div', class_="gz-content-recipe gz-mBottom4x")
presentation = main_content[0].find('p')
presentation_raw = presentation.get_text()
print("Presentation:", presentation_raw)

ingredients_div = soup.find('div', class_="gz-ingredients gz-mBottom4x gz-outer")
ingredient_list = []
for i, ingredient in enumerate(ingredients_div.find_all('dd', class_="gz-ingredient")):
    # Raw strings: "\s" in a plain literal is an invalid escape sequence.
    ingredient_name = re.sub(r"\s+", " ", ingredient.find('a').get_text())
    quantity_text = ingredient.find('span').get_text()
    ingredient_quantity = re.sub(r"\s+", " ", quantity_text)
    # Flag quantities longer than 12 characters (measured after trimming and
    # collapsing whitespace) so they can be inspected manually.
    if len(' '.join(quantity_text.split())) > 12:
        print(f'Check length of quantity in {i}!')
    line = ingredient_name + "\t" + ingredient_quantity
    ingredient_list.append(line)

ingredients_raw = '\n'.join(ingredient_list)
print("Ingredients:")
print(ingredients_raw.strip())

preparation = main_content[1]

# Replace each step-number span with "[n]" in the tree so the marker stays
# inline in the paragraph text.
preparation_list = []
for paragraph in preparation.find_all('p'):
    for span in paragraph.find_all('span', class_="num-step"):
        step_number = span.get_text()
        span.replace_with(f"[{step_number}]")
    preparation_list.append(paragraph.get_text())
preparation_raw = '\n'.join(preparation_list)
print("Preparation:", preparation_raw)