In [None]:
import re
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

In [None]:
# Data preprocessing functions
# Matches any run of two or more consecutive periods ("..", "...", "....").
ellipsis_pattern = re.compile(r'\.{2,}')


def remove_ellipsis(ingredient):
    """Return *ingredient* with every run of two or more periods deleted.

    Single periods (e.g. the decimal point in "1.5") are left untouched.
    """
    cleaned = ellipsis_pattern.sub('', ingredient)
    return cleaned

def preprocess_text(text):
    """Normalize a scraped string.

    Non-breaking spaces (U+00A0) are replaced with ordinary spaces first,
    and the result is then passed through ``remove_ellipsis``.
    """
    without_nbsp = text.replace('\xa0', ' ')
    return remove_ellipsis(without_nbsp)

In [None]:
# Helpers for the main crawling function
def _fetch_soup(url, timeout):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    Raises:
        requests.RequestException: on connection problems, on a timeout, or
            (via ``raise_for_status``) when the server answers with a
            4xx/5xx status, so error pages are never parsed as recipes.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _parse_category_card(card):
    """Build the name/link record for one listing card.

    Returns ``None`` when the card has no ``<h3><a href=...>`` link, because
    such a card cannot be followed to a recipe page.
    """
    heading = card.find('h3')
    anchor = heading.a if heading is not None else None
    link = anchor.get('href') if anchor is not None else None
    if not link:
        return None

    korean_name = ""
    paragraph = card.find('p')
    if paragraph is not None and paragraph.contents:
        # The Korean name is taken from the last node inside the <p>.
        last_node = paragraph.contents[-1]
        if isinstance(last_node, str):
            korean_name = last_node.strip()
        else:
            # The last node is an element rather than bare text.
            korean_name = last_node.get_text(strip=True)

    return {
        "english_name": anchor.get_text(strip=True),
        "korean_name": korean_name,
        "link": link,
    }


def _extract_recipe_details(soup):
    """Extract ingredient lists and preparation steps from a recipe page.

    Returns a dict with the keys ``ingredients_with_quantity``,
    ``ingredients_without_quantity`` and ``making_steps``; each value is a
    list of preprocessed strings and is empty when the page lacks the
    corresponding markup.
    """
    # Ingredients with quantities come from the first <ul> without a class.
    ingredients_with_quantity = []
    first_plain_list = soup.find("ul", attrs={'class': False})
    if first_plain_list is not None:
        for li_tag in first_plain_list.find_all('li', class_=False):
            # Collection stops at the first item whose enclosing <ul> holds a
            # 'metaheader' item.
            # NOTE(review): kept exactly as in the original logic; confirm
            # against the live page markup that this is the intended cut-off.
            if li_tag.find_parent('ul').find('li', class_='metaheader'):
                break
            ingredients_with_quantity.append(
                preprocess_text(li_tag.get_text(strip=True))
            )

    # "Made with:" items may appear in any class-less <ul> on the page.
    ingredients_without_quantity = []
    for ul_tag in soup.find_all("ul", attrs={'class': False}):
        for li_tag in ul_tag.find_all('li', class_=False):
            li_text = li_tag.get_text()
            if "Made with:" in li_text:
                made_with_text = li_text.replace("Made with:", "").strip()
                ingredients_without_quantity.append(
                    preprocess_text(made_with_text)
                )

    # Preparation steps are the class-less <li> items of every <ol>.
    making_steps = [
        preprocess_text(li_tag.get_text(strip=True))
        for ol_tag in soup.find_all("ol")
        for li_tag in ol_tag.find_all('li', class_=False)
    ]

    return {
        "ingredients_with_quantity": ingredients_with_quantity,
        "ingredients_without_quantity": ingredients_without_quantity,
        "making_steps": making_steps,
    }


# Main crawling function
def crawl_category_recipes(category, timeout=10):
    """Crawl every recipe listed under one maangchi.com category.

    The category listing page is fetched first to collect each recipe's
    English name, Korean name and link; every linked recipe page is then
    fetched to collect its ingredients and preparation steps.

    Args:
        category: Category slug appended to
            ``https://www.maangchi.com/recipes/``.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of dicts, one per recipe, with the keys ``english_name``,
        ``korean_name``, ``link``, ``ingredients_with_quantity``,
        ``ingredients_without_quantity`` and ``making_steps``. The list is
        empty when the listing page has no ``div#main`` or no recipe cards.

    Raises:
        requests.RequestException: if any page cannot be downloaded or the
            server responds with an error status.
    """
    url = f"https://www.maangchi.com/recipes/{category}"
    soup = _fetch_soup(url, timeout)

    main_div = soup.find('div', id='main')
    if main_div is None:
        taxonomy_cards = []
    else:
        taxonomy_cards = main_div.find_all('div', class_='taxonomy-card')

    dish_list = []
    for card in taxonomy_cards:
        dish = _parse_category_card(card)
        if dish is not None:
            dish_list.append(dish)

    for dish in dish_list:
        recipe_soup = _fetch_soup(dish["link"], timeout)
        dish.update(_extract_recipe_details(recipe_soup))

    return dish_list

In [None]:
# Function that saves each category's recipes to its own CSV file
def save_to_csv(category, dish_list):
    """Write *dish_list* to ``<category>.csv`` in the working directory.

    The file is UTF-8 encoded and starts with a header row. Each dish
    becomes one row; its list-valued fields are flattened by joining the
    items with ", ". A confirmation line is printed once the file is
    written.
    """
    header = ["English Name", "Korean Name", "Link", "Ingredients with Quantity", "Ingredients without Quantity", "Making Steps"]

    # Rows are produced lazily and consumed by writerows() below.
    rows = (
        [
            preprocess_text(entry["english_name"]),
            preprocess_text(entry["korean_name"]),
            preprocess_text(entry["link"]),
            ", ".join(entry["ingredients_with_quantity"]),
            ", ".join(entry["ingredients_without_quantity"]),
            ", ".join(entry["making_steps"]),
        ]
        for entry in dish_list
    )

    # newline='' lets the csv module control line endings itself.
    with open(f"{category}.csv", mode='w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(header)
        csv_writer.writerows(rows)

    print(f"CSV file for {category} created successfully.")

In [None]:
# Categories to crawl from the maangchi site; each entry is the slug used in
# the category URL and as the base name of the output CSV file.
categories = [
    "anju",
    "beef",
    "chicken",
    "cold",
    "drinks",
    "fermented",
    "gimbap",
    "korean-bakery",
    "main",
    "mandu",
    "mitbanchan",
    "noodles",
    "one-bowl-meals",
    "porridge",
    "rice",
    "seafood",
    "snacks",
    "stews",
    "street-food",
]

In [None]:
# Run the crawl: for every category, scrape its recipes and write them to
# that category's CSV file before moving on to the next one.
for category in categories:
    recipes = crawl_category_recipes(category)
    save_to_csv(category, recipes)