In [1]:
import re
import requests
from bs4 import BeautifulSoup
import csv

In [3]:
url = "https://www.maangchi.com/recipes/noodles"

response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

dish_list = []

main_div = soup.find('div', id='main')

taxonomy_cards = main_div.find_all('div', class_='taxonomy-card')

for card in taxonomy_cards:
    english_name = card.find('h3').a.get_text(strip=True)
    korean_name = card.find('p').contents[-1].strip()
    link = card.find('h3').a['href']

    dish_list.append({
        "english_name": english_name,
        "korean_name": korean_name,
        "link": link
    })
    
for english_name, korean_name, link in dish_list:
    print("English Name:", english_name)
    print("Korean Name:", korean_name)
    print("Link:", link)
    print()


English Name: Hand-torn noodle soup
Korean Name: 수제비
Link: https://www.maangchi.com/recipe/sujebi

English Name: Cold soba with dipping sauce
Korean Name: 메밀국수
Link: https://www.maangchi.com/recipe/memil-guksu

English Name: Knife-cut noodle soup with perilla seeds
Korean Name: 들깨칼국수
Link: https://www.maangchi.com/recipe/deulkkae-kalguksu

English Name: Noodles & black bean sauce platter
Korean Name: 쟁반짜장면
Link: https://www.maangchi.com/recipe/jaengban-jjajangmyeon

English Name: Cold kimchi noodle soup
Korean Name: 김치말이국수
Link: https://www.maangchi.com/recipe/kimchimari-guksu

English Name: Kimchi sujebi
Korean Name: 김치수제비
Link: https://www.maangchi.com/recipe/kimchi-sujebi

English Name: Jjapaguri with steak
Korean Name: 쇠고기짜파구리
Link: https://www.maangchi.com/recipe/jjapaguri

English Name: Stir-fried noodles and vegetables
Korean Name: 잡채
Link: https://www.maangchi.com/recipe/easy-japchae

English Name: Crispy seaweed noodle rolls
Korean Name: 김말이
Link: https://www.maangchi.com/reci

In [5]:
# 텍스트 전처리 
ellipsis_pattern = re.compile(r'\.{2,}')

def remove_ellipsis(ingredient):
    return re.sub(ellipsis_pattern, '', ingredient)
def preprocess_text(text):
    return remove_ellipsis(text.replace('\xa0', ' '))


In [6]:
for dish in dish_list:
    dish_url = dish["link"]

    response = requests.get(dish_url)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # 양이 포함된 재료 정보 추출
    ul_tags_ingredients_with_quantity = soup.find("ul", attrs={'class': False})
    ingredients_with_quantity = []
    li_tags = ul_tags_ingredients_with_quantity.find_all('li', class_=False)
    for li_tag in li_tags:
        if li_tag.find_parent('ul').find('li', class_='metaheader'):
            break
        ingredients_with_quantity.append(preprocess_text(li_tag.get_text(strip=True)))

    # 양에 대한 정보 없이 재료 종류만 추출
    ul_tags = soup.find_all("ul", attrs={'class': False})
    ingredients_without_quantity = []
    for ul_tag in ul_tags:
        li_tags = ul_tag.find_all('li', class_=False)
        for li_tag in li_tags:
            if "Made with:" in li_tag.get_text():
                made_with_text = li_tag.get_text().replace("Made with:", "").strip()
                ingredients_without_quantity.append(preprocess_text(made_with_text))

    # 레시피 추출
    ol_tags = soup.find_all("ol")
    making_steps = []
    for ol_tag in ol_tags:
        li_tags = ol_tag.find_all('li', class_=False)
        for li_tag in li_tags:
            making_steps.append(preprocess_text(li_tag.get_text(strip=True)))

    dish["ingredients_with_quantity"] = ingredients_with_quantity
    dish["ingredients_without_quantity"] = ingredients_without_quantity
    dish["making_steps"] = making_steps

for dish in dish_list:
    print("English Name:", preprocess_text(dish["english_name"]))
    print("Korean Name:", preprocess_text(dish["korean_name"]))
    print("Link:", preprocess_text(dish["link"]))
    print("Ingredients with Quantity:", dish["ingredients_with_quantity"])
    print("Ingredients without Quantity:", dish["ingredients_without_quantity"])
    print("Making Steps:", dish["making_steps"])
    print()


English Name: Hand-torn noodle soup
Korean Name: 수제비
Link: https://www.maangchi.com/recipe/sujebi
Ingredients with Quantity: ['2 cups all-purpose flour', '¾ cup water', '½ teaspoonkosher salt', '1 tablespoon vegetable oil']
Ingredients without Quantity: ['all-purpose wheat flour, carrot, dried anchovies, fish sauce, garlic, green onion, kosher salt, onion, potato, soup soy sauce, toasted sesame oil, vegetable oil, water, and zucchini']
Making Steps: ['Combine the flour, water, salt, vegetable oil in a large bowl and knead in the bowl until the dough is smooth (like pizza dough), 10 to 15 minutes. Alternatively, combine water, salt, oil, and flour in a food processor fitted with the dough blade and process until the dough comes together in a ball, 1 to 2 minutes.', 'Remove the dough from the food processor and shape it into a smooth ball. Place the dough ball in a plastic bag and set aside for at least 30 minutes. This resting period allows the gluten to relax, making it easier to tear 

In [8]:
# CSV 파일로 저장
csv_file = "noodles.csv"

csv_header = ["English Name", "Korean Name", "Link", "Ingredients with Quantity", "Ingredients without Quantity", "Making Steps"]

with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    writer.writerow(csv_header)

    for dish in dish_list:
        row = [
            preprocess_text(dish["english_name"]),
            preprocess_text(dish["korean_name"]),
            preprocess_text(dish["link"]),
            ", ".join(dish["ingredients_with_quantity"]),
            ", ".join(dish["ingredients_without_quantity"]),
            ", ".join(dish["making_steps"])
        ]
        writer.writerow(row)

print("CSV file created successfully.")


CSV file created successfully.
