In [None]:
import requests
from bs4 import BeautifulSoup
import csv

In [None]:
# Recipe-name extraction
def extract_recipe_names(url):
    """Download a recipe listing page and return the recipe titles on it.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of whitespace-stripped title strings in page order. The list
        is empty when the page has no ``div.item_box_data2`` container.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Without a timeout a single stalled connection would block forever.
    response = requests.get(url, timeout=30)
    # Undecodable bytes are dropped rather than aborting the parse.
    html_content = response.content.decode('utf-8', 'ignore')
    soup = BeautifulSoup(html_content, 'html.parser')

    title_div = soup.find('div', class_='item_box_data2')
    if title_div is None:
        # find() yields None when nothing matches; report "no recipes"
        # instead of failing with AttributeError on the next lookup.
        return []

    recipe_names = []
    for item in title_div.find_all('li', class_='item_box_thum_content'):
        heading = item.find('h2', class_='h2_padd0')
        # Items without a title heading are skipped.
        if heading is not None:
            recipe_names.append(heading.text.strip())

    return recipe_names

In [None]:
# Recipe-step extraction - the step text contains stray characters, so it is
# cleaned up in the same pass.
def extract_desired_content(soup):
    """Return the cleaned recipe-step texts found in a parsed recipe page.

    Every ``div`` with class ``padd20`` is read as stripped text, the
    characters '？', '<' and '>' are removed, and only texts that begin with
    a step number from '1.' to '5.' are kept.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        A list of step strings in document order.
    """
    step_prefixes = ('1.', '2.', '3.', '4.', '5.')

    def _clean(raw_text):
        # Drop the characters that are noise in the step text.
        for unwanted in ('？', '<', '>'):
            raw_text = raw_text.replace(unwanted, '')
        return raw_text

    cleaned_texts = (
        _clean(block.get_text(strip=True))
        for block in soup.find_all('div', class_='padd20')
    )

    # Keep only the blocks that look like numbered steps.
    return [step for step in cleaned_texts if step.startswith(step_prefixes)]

In [None]:
# Crawled recipes: one dict per recipe with the keys
# "name", "url", "ingredients" and "recipe".
food_data = []

# Section headings ("sub-ingredients", "seasoning") that appear among the
# ingredient strings but are not ingredients themselves.
SECTION_MARKERS = ("<부재료>", "<양념>")

# Seconds to wait on each HTTP request so a stalled connection cannot hang
# the whole crawl.
REQUEST_TIMEOUT = 30

for page_num in range(1, 55):
    url = f"http://www.lampcook.com/food/food_fusion_list.php?search_mode=0&alpha_no=0&big_no=0&field_one=&sql_one=&pagenum={page_num}"
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    html_content = response.content.decode('utf-8', 'ignore')
    soup = BeautifulSoup(html_content, 'html.parser')

    link_div = soup.find('div', class_='item_box_data2')
    if link_div is None:
        # find() returns None when the listing container is absent; skip the
        # page explicitly instead of crashing with AttributeError.
        print("No recipe list found, skipping:", url)
        continue

    # NOTE: extract_recipe_names() downloads and parses this same page again.
    recipe_names = extract_recipe_names(url)
    link_li = link_div.find_all('li', class_='item_box_thum')

    # A separate index name keeps the page counter from being shadowed.
    for idx, card in enumerate(link_li):
        anchor = card.find('a')
        if anchor is None or not anchor.get('href'):
            # A card without a link cannot be followed to a detail page.
            continue
        relative_link = anchor['href']
        absolute_link = f"http://www.lampcook.com/{relative_link}"

        detail_response = requests.get(absolute_link, timeout=REQUEST_TIMEOUT)
        detail_html = detail_response.content.decode('utf-8', 'ignore')
        detail_soup = BeautifulSoup(detail_html, 'html.parser')

        # Ingredients are the comma-separated text nodes that directly follow
        # a <br> inside the "padd20" blocks.
        ingredients_list = []
        for div in detail_soup.find_all('div', class_='padd20'):
            for br in div.find_all('br'):
                next_sib = br.next_sibling
                if next_sib and isinstance(next_sib, str):
                    ingredients_list.extend(
                        part.strip() for part in next_sib.split(',') if part.strip()
                    )

        # Build a filtered copy: removing items from a list while iterating
        # over it skips the element after each removal, which could leave an
        # adjacent marker behind.
        ingredients_list = [
            item for item in ingredients_list if item not in SECTION_MARKERS
        ]

        desired_content = extract_desired_content(detail_soup)

        # Titles and cards are matched by position; fall back to an empty
        # name if the title list turns out shorter than the card list.
        recipe_name = recipe_names[idx] if idx < len(recipe_names) else ""

        food_data.append({
            "name": recipe_name,
            "url": absolute_link,
            "ingredients": ingredients_list,
            "recipe": desired_content
        })

# Dump every crawled recipe to stdout, followed by a blank separator line.
for data in food_data:
    summary_rows = (
        ("Name:", data["name"]),
        ("URL:", data["url"]),
        ("Ingredients:", data["ingredients"]),
    )
    for label, value in summary_rows:
        print(label, value)

    # Recipe steps are printed one per line under their own heading.
    print("Recipe:")
    for content in data["recipe"]:
        print(content)
    print()


In [None]:
# Save the crawled recipes to lampcook.csv, one row per recipe.
with open('lampcook.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Header row of the CSV file.
    csv_writer.writerow(['Name', 'URL', 'Ingredients', 'Recipe'])

    # The list-valued fields are flattened into one comma-separated string
    # per cell before being written.
    csv_writer.writerows(
        [
            entry["name"],
            entry["url"],
            ', '.join(entry["ingredients"]),
            ', '.join(entry["recipe"]),
        ]
        for entry in food_data
    )