## 웹 크롤링

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_wine_info(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        wine_info = {}
        tasting_notes = {}

        # Tasting Notes
        tasting_notes_div = soup.find('div', class_='tasting-notes')
        if tasting_notes_div:
            aroma = tasting_notes_div.find(text='Aroma').find_next('span').text
            taste = tasting_notes_div.find(text='Taste').find_next('span').text
            finish = tasting_notes_div.find(text='Finish').find_next('span').text

            tasting_notes['Aroma'] = aroma
            tasting_notes['Taste'] = taste
            tasting_notes['Finish'] = finish

        # Information
        info_div = soup.find('div', class_='info')
        if info_div:
            info_table = info_div.find('table')
            rows = info_table.find_all('tr')
            for row in rows:
                key = row.find('td').text.strip()
                value = row.find('td').find_next('td').text.strip()
                wine_info[key] = value

        wine_info['Tasting Notes'] = tasting_notes
        return wine_info

    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None

def create_dataframe_from_wine_info(wine_info_list):
    # 데이터 프레임 생성을 위해 딕셔너리의 리스트를 사용합니다.
    return pd.DataFrame(wine_info_list)

if __name__ == "__main__":
    base_url = "https://dailyshot.co/m/items/"
    start_index = 1
    end_index = 10

    wine_info_list = []
    for i in range(start_index, end_index + 1):
        url = f"{base_url}{i}"
        wine_info = get_wine_info(url)
        print(wine_info)
        if wine_info:
            wine_info_list.append(wine_info)

    # # 데이터 프레임으로 변환
    # wine_dataframe = create_dataframe_from_wine_info(wine_info_list)
    # print(wine_dataframe)



In [None]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://dailyshot.co/m/items/1"

# 웹페이지 내용 가져오기
response = requests.get(url)
if response.status_code != 200:
    print("Failed to fetch the page.")
    exit()

html_content = response.content

# BeautifulSoup를 사용하여 HTML 파싱
soup = BeautifulSoup(html_content, "html.parser")

# 원하는 부분 선택하기
target_div_class = "dailyshot-Stack-root dailyshot-1nmrv06"
target_div = soup.find_all("div", {"class": target_div_class})

if target_div:
    # 원하는 부분의 내용 출력
    print(target_div)
else:
    print("Target div not found.")

In [None]:
target_div[1:2]

In [None]:
import requests
from bs4 import BeautifulSoup

def fetch_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None

    return response.content

def parse_html(html_content, target_class):
    soup = BeautifulSoup(html_content, "html.parser")
    target_div = soup.find("div", {"class": target_class})
    return target_div

if __name__ == "__main__":
    url = "https://dailyshot.co/m/items/1"
    target_div_class = "dailyshot-Stack-root dailyshot-2ufkj3"

    html_content = fetch_html(url)
    if html_content:
        target_div = parse_html(html_content, target_div_class)
        if target_div:
            "Aroma", "Taste", "Finish" #추출하기
            tasting = target_div.find("div", {"class": "dailyshot-Stack-root dailyshot-1nmrv06"})
            
            # 데이터 저장
            tasting_data = tasting.get_text() if tasting else None

            # 결과 출력
            print("tasting:", tasting_data)
        else:
            print("Target div not found.")
    else:
        print("Failed to fetch the page.")

In [None]:
import requests
from bs4 import BeautifulSoup
url = "https://dailyshot.co/m/items/1"

# 웹페이지 내용 가져오기
response = requests.get(url)
if response.status_code != 200:
    print("Failed to fetch the page.")
    exit()

html_content = response.content

# BeautifulSoup를 사용하여 HTML 파싱
soup = BeautifulSoup(html_content, "html.parser")

# 원하는 부분 선택하기
target_div_class = "dailyshot-Stack-root dailyshot-1nmrv06"
target_divs = soup.find_all("div", {"class": target_div_class})
name_target_div_class = "dailyshot-Stack-root dailyshot-1178y6y"
name_divs = soup.find_all("div", {"class": name_target_div_class})

if target_divs:
    # Aroma, Taste, Finish 정보를 저장할 딕셔너리 생성
    result_dict = {}

    # 각각의 요소에서 Aroma, Taste, Finish 정보 추출하기
    for target_div in target_divs[1:2]:  # 인덱스 1부터 2까지 (두 번째 요소만 선택)
        # Aroma
        aroma_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        aroma_title_element = aroma_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        aroma_text_element = aroma_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if aroma_title_element and aroma_text_element:
            aroma_title = aroma_title_element.text
            aroma_text = aroma_text_element.text
            result_dict["Aroma"] = aroma_text
        
        # Taste
        taste_div = target_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        taste_title_element = taste_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        taste_text_element = taste_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if taste_title_element and taste_text_element:
            taste_title = taste_title_element.text
            taste_text = taste_text_element.text
            result_dict["Taste"] = taste_text

        # Finish
        finish_div = taste_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        finish_title_element = finish_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        finish_text_element = finish_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if finish_title_element and finish_text_element:
            finish_title = finish_title_element.text
            finish_text = finish_text_element.text
            result_dict["Finish"] = finish_text
            # 딕셔너리에 종류, 도수 정보 추가하기
    for target_div in target_divs[2:3]:  # 인덱스 2부터 3까지 (세 번째 요소만 선택)
        # 종류
        kind_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        kind_title_element = kind_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        kind_text_element = kind_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if kind_title_element and kind_text_element:
            kind_title = kind_title_element.text
            kind_text = kind_text_element.text
            result_dict[kind_title] = kind_text
            
        # 용량
        volume_div = kind_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        volume_title_element = volume_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        volume_text_element =volume_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if volume_title_element and volume_text_element:
            volume_title = volume_title_element.text
            volume_text = volume_text_element.text
            result_dict[volume_title] = volume_text
        # 도수 정보 추가하기
        acl_div = volume_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        acl_title_element = acl_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        acl_text_element = acl_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if acl_title_element and acl_text_element:
            acl_title = acl_title_element.text
            acl_text = acl_text_element.text
            result_dict[acl_title] = acl_text
    for name_div in name_divs:
        name_text_element = name_div.find("h1", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-2eov7z"})
        if name_text_element:
            name_text = name_text_element.text
            result_dict["name"] = name_text
    
    # 딕셔너리 출력
    print(result_dict)

else:
    print("Target divs not found.")

In [None]:
import requests
from bs4 import BeautifulSoup

# Function to scrape data from a given URL
def scrape_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch the page: {url}")
        return None

    html_content = response.content
    soup = BeautifulSoup(html_content, "html.parser")

    target_div_class = "dailyshot-Stack-root dailyshot-1nmrv06"
    name_target_div_class = "dailyshot-Stack-root dailyshot-1178y6y"
    target_divs = soup.find_all("div", {"class": target_div_class})
    name_divs = soup.find_all("div", {"class": name_target_div_class})

    if target_divs:
        result_dict = {}
        # 네임
    for name_div in name_divs:
        name_text_element = name_div.find("h1", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-2eov7z"})
        if name_text_element:
            name_text = name_text_element.text
            result_dict["name"] = name_text
    # 각각의 요소에서 Aroma, Taste, Finish 정보 추출하기
    for target_div in target_divs[1:2]:  # 인덱스 1부터 2까지 (두 번째 요소만 선택)
        # Aroma
        aroma_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        aroma_title_element = aroma_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        aroma_text_element = aroma_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if aroma_title_element and aroma_text_element:
            aroma_title = aroma_title_element.text
            aroma_text = aroma_text_element.text
            result_dict["Aroma"] = aroma_text
        
        # Taste
        taste_div = target_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        taste_title_element = taste_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        taste_text_element = taste_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if taste_title_element and taste_text_element:
            taste_title = taste_title_element.text
            taste_text = taste_text_element.text
            result_dict["Taste"] = taste_text

        # Finish
        finish_div = taste_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        finish_title_element = finish_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        finish_text_element = finish_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if finish_title_element and finish_text_element:
            finish_title = finish_title_element.text
            finish_text = finish_text_element.text
            result_dict["Finish"] = finish_text
            # 딕셔너리에 종류, 도수 정보 추가하기
    for target_div in target_divs[2:3]:  # 인덱스 2부터 3까지 (세 번째 요소만 선택)
        # 종류
        kind_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        kind_title_element = kind_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        kind_text_element = kind_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if kind_title_element and kind_text_element:
            kind_title = kind_title_element.text
            kind_text = kind_text_element.text
            result_dict[kind_title] = kind_text
            
        # 용량
        volume_div = kind_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        volume_title_element = volume_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        volume_text_element =volume_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if volume_title_element and volume_text_element:
            volume_title = volume_title_element.text
            volume_text = volume_text_element.text
            result_dict[volume_title] = volume_text
        # 도수 
        acl_div = volume_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
        acl_title_element = acl_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
        acl_text_element = acl_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
        if acl_title_element and acl_text_element:
            acl_title = acl_title_element.text
            acl_text = acl_text_element.text
            result_dict[acl_title] = acl_text

        return result_dict

    else:
        print(f"Target divs not found: {url}")
        return None

# Starting and ending item numbers
start_item = 1
end_item = 30

# Loop through the URLs and scrape data
for item_number in range(start_item, end_item + 1):
    url = f"https://dailyshot.co/m/items/{item_number}"
    data = scrape_data(url)
    if data:
        print(f"Data for {url}:")
        print(data)
        print("--------------------------")
    else:
        print(f"Data not found for {url}, moving to the next page...")

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to scrape data from a given URL
def scrape_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch the page: {url}")
        return None

    html_content = response.content
    soup = BeautifulSoup(html_content, "html.parser")

    target_div_class = "dailyshot-Stack-root dailyshot-1nmrv06"
    name_target_div_class = "dailyshot-Stack-root dailyshot-1178y6y"
    target_divs = soup.find_all("div", {"class": target_div_class})
    name_divs = soup.find_all("div", {"class": name_target_div_class})

    if target_divs:
        result_dict = {}

        # Extract data for Aroma, Taste, Finish
        for target_div in target_divs[1:2]:  # 인덱스 1부터 2까지 (두 번째 요소만 선택)
            # Aroma
            try:
                aroma_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
                aroma_title_element = aroma_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                aroma_text_element = aroma_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                aroma_title = aroma_title_element.text if aroma_title_element else None
                aroma_text = aroma_text_element.text if aroma_text_element else None
                result_dict[aroma_title] = aroma_text
            except AttributeError:
                aroma_title = None
                aroma_text = None
                result_dict[aroma_title] = None

            # Taste
            try:
                taste_div = target_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
                taste_title_element = taste_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                taste_text_element = taste_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                taste_title = taste_title_element.text if taste_title_element else None
                taste_text = taste_text_element.text if taste_text_element else None
                result_dict[taste_title] = taste_text
            except AttributeError:
                taste_title = None
                taste_text = None
                result_dict[taste_title] = None

            # Finish
            try:
                finish_div = taste_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
                finish_title_element = finish_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                finish_text_element = finish_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                finish_title = finish_title_element.text if finish_title_element else None
                finish_text = finish_text_element.text if finish_text_element else None
                result_dict[finish_title] = finish_text
            except AttributeError:
                finish_title = None
                finish_text = None
                result_dict[finish_title] = None
        # Extract data for 종류, 도수, 도
        for target_div in target_divs[2:3]:  # 인덱스 2부터 3까지 (세 번째 요소만 선택)
            # 종류
            try:
                kind_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
                kind_title_element = kind_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                kind_text_element = kind_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                kind_title = kind_title_element.text if kind_title_element else None
                kind_text = kind_text_element.text if kind_text_element else None
                result_dict[kind_title] = kind_text
            except AttributeError:
                kind_title = None
                kind_text = None
                result_dict[kind_title] = None
                
            # 도수
            try:
                acl_div = kind_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
                acl_title_element = acl_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                acl_text_element = acl_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                acl_title = acl_title_element.text if acl_title_element else None
                acl_text = acl_text_element.text if acl_text_element else None
                result_dict[acl_title] = acl_text
            except AttributeError:
                acl_title = None
                acl_text = None
                result_dict[acl_title] = None

        # Extract data for name from name_divs
        for name_div in name_divs:
            name_text_element = name_div.find("h1", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-2eov7z"})
            name_text = name_text_element.text if name_text_element else None
            result_dict['name'] = name_text

        return result_dict

    else:
        print(f"Target divs not found: {url}")
        return None

# Starting and ending item numbers
start_item = 1
end_item = 300

# Create an empty DataFrame to store the data
df_list = []  # 빈 리스트 생성

# Loop through the URLs and scrape data
for item_number in range(start_item, end_item + 1):
    url = f"https://dailyshot.co/m/items/{item_number}"
    data = scrape_data(url)
    if data:
        # Append the scraped data as a new row to the DataFrame
        df_list.append(pd.DataFrame(data, index=[0]))
    else:
        print(f"Data not found for {url}, moving to the next page...")

# Concatenate the DataFrames in the list into a single DataFrame
df = pd.concat(df_list, ignore_index=True)

df.to_json("./data.json", index=False)

# Print the DataFrame containing all the data
print(df)

In [None]:
df[:5]

In [None]:
import requests
from bs4 import BeautifulSoup

# Function to scrape data from a given URL
def scrape_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch the page: {url}")
        return None

    html_content = response.content
    soup = BeautifulSoup(html_content, "html.parser")

    target_div_class = "dailyshot-Stack-root dailyshot-1nmrv06"
    name_target_div_class = "dailyshot-Stack-root dailyshot-1178y6y"
    target_divs = soup.find_all("div", {"class": target_div_class})
    name_divs = soup.find_all("div", {"class": name_target_div_class})

    if target_divs:
        result_dict = {}
        # 네임
    for name_div in name_divs:
        name_text_element = name_div.find("h1", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-2eov7z"})
        if name_text_element:
            name_text = name_text_element.text
            result_dict["name"] = name_text
    # 각각의 요소에서 Aroma, Taste, Finish 정보 추출하기
    for target_div in target_divs[1:2]:  # 인덱스 1부터 2까지 (두 번째 요소만 선택)
        # Aroma
        try:
            aroma_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if aroma_div:
                aroma_title_element = aroma_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                aroma_text_element = aroma_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if aroma_title_element and aroma_text_element:
                    aroma_title = aroma_title_element.text if aroma_title_element else None
                    aroma_text = aroma_text_element.text if aroma_text_element else None
                    result_dict[aroma_title] = aroma_text
        except AttributeError:
            pass
        # Taste
        try:
            taste_div = aroma_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if taste_div:
                taste_title_element = taste_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                taste_text_element = taste_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if taste_title_element and taste_text_element:
                    taste_title = taste_title_element.text if taste_title_element else None
                    taste_text = taste_text_element.text if taste_text_element else None
                    result_dict[taste_title] = taste_text
        except AttributeError:
            pass

        # Finish
        try:
            finish_div = taste_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if finish_div:
                finish_title_element = finish_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                finish_text_element = finish_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if finish_title_element and finish_text_element:
                    finish_title = finish_title_element.text if finish_title_element else None
                    finish_text = finish_text_element.text if finish_text_element else None
                    result_dict[finish_title] = finish_text
        except AttributeError:
            pass
        
    for target_div in target_divs[2:3]:  # 인덱스 2부터 3까지 (세 번째 요소만 선택)
        # 종류
        try:
            kind_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if kind_div:
                kind_title_element = kind_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                kind_text_element = kind_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if kind_title_element and kind_text_element:
                    kind_title = kind_title_element.text if kind_title_element else None
                    kind_text = kind_text_element.text if kind_text_element else None
                    result_dict[kind_title] = kind_text
        except AttributeError:
            pass
        
        # 용량
        try:
            volume_div = kind_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if volume_div:
                volume_title_element = volume_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                volume_text_element =volume_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if volume_title_element and volume_text_element:
                    volume_title = volume_title_element.text if volume_title_element else None
                    volume_text = volume_text_element.text if volume_text_element else None
                    result_dict[volume_title] = volume_text
        except AttributeError:
            pass
        
        # 도수
        try:
            acl_div = volume_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if acl_div:
                acl_title_element = acl_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                acl_text_element = acl_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if acl_title_element and acl_text_element:
                    acl_title = acl_title_element.text if acl_title_element else None
                    acl_text = acl_text_element.text if acl_text_element else None
                    result_dict[acl_title] = acl_text
        except AttributeError:
            pass
        
        return result_dict

    else:
        print(f"Target divs not found: {url}")
        return None

# Starting and ending item numbers
data_list = []

# Starting and ending item numbers
start_item = 1
end_item = 300

# Loop through the URLs and scrape data
for item_number in range(start_item, end_item + 1):
    url = f"https://dailyshot.co/m/items/{item_number}"
    data = scrape_data(url)
    if data:
        data_list.append(data)
        print(f"Data for {url}:")
        print(data)
        print("--------------------------")
    else:
        print(f"Data not found for {url}, moving to the next page...")

# 데이터프레임으로 변환
df = pd.DataFrame(data_list)


# 데이터프레임을 JSON 파일로 저장
df.to_json("./data.json")

In [None]:
df[:30]

In [None]:
print(data_list[:5])

In [2]:
import json

In [2]:
import requests
from bs4 import BeautifulSoup

# Function to scrape data from a given URL
def scrape_data(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch the page: {url}")
        return None

    html_content = response.content
    soup = BeautifulSoup(html_content, "html.parser")

    target_div_class = "dailyshot-Stack-root dailyshot-1nmrv06"
    name_target_div_class = "dailyshot-Stack-root dailyshot-1178y6y"
    target_divs = soup.find_all("div", {"class": target_div_class})
    name_divs = soup.find_all("div", {"class": name_target_div_class})

    if target_divs:
        result_dict = {}
        # 네임
    for name_div in name_divs:
        name_text_element = name_div.find("h1", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-2eov7z"})
        if name_text_element:
            name_text = name_text_element.text
            result_dict["name"] = name_text
    # 각각의 요소에서 Aroma, Taste, Finish 정보 추출하기
    for target_div in target_divs[1:2]:  # 인덱스 1부터 2까지 (두 번째 요소만 선택)
        # Aroma
        try:
            aroma_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if aroma_div:
                aroma_title_element = aroma_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                aroma_text_element = aroma_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if aroma_title_element and aroma_text_element:
                    aroma_title = aroma_title_element.text if aroma_title_element else None
                    aroma_text = aroma_text_element.text if aroma_text_element else None
                    result_dict[aroma_title] = aroma_text
            try:
                taste_div = aroma_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
                if taste_div:
                    taste_title_element = taste_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                    taste_text_element = taste_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                    if taste_title_element and taste_text_element:
                        taste_title = taste_title_element.text if taste_title_element else None
                        taste_text = taste_text_element.text if taste_text_element else None
                        result_dict[taste_title] = taste_text
                try:
                    finish_div = taste_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
                    if finish_div:
                        finish_title_element = finish_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                        finish_text_element = finish_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                        if finish_title_element and finish_text_element:
                            finish_title = finish_title_element.text if finish_title_element else None
                            finish_text = finish_text_element.text if finish_text_element else None
                            result_dict[finish_title] = finish_text
                except AttributeError:
                    pass
            except AttributeError:
                pass
        except AttributeError:
            pass
        # Taste

        # Finish

        
    for target_div in target_divs[2:3]:  # 인덱스 2부터 3까지 (세 번째 요소만 선택)
        # 종류
        try:
            kind_div = target_div.find("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if kind_div:
                kind_title_element = kind_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                kind_text_element = kind_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if kind_title_element and kind_text_element:
                    kind_title = kind_title_element.text if kind_title_element else None
                    kind_text = kind_text_element.text if kind_text_element else None
                    result_dict[kind_title] = kind_text
        except AttributeError:
            pass
        
        # 용량
        try:
            volume_div = kind_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if volume_div:
                volume_title_element = volume_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                volume_text_element =volume_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if volume_title_element and volume_text_element:
                    volume_title = volume_title_element.text if volume_title_element else None
                    volume_text = volume_text_element.text if volume_text_element else None
                    result_dict[volume_title] = volume_text
        except AttributeError:
            pass
        
        # 도수
        try:
            acl_div = volume_div.find_next("div", {"class": "dailyshot-Group-root dailyshot-8k3bl3"})
            if acl_div:
                acl_title_element = acl_div.find("h3", {"class": "dailyshot-Text-root dailyshot-Title-root dailyshot-o22yry"})
                acl_text_element = acl_div.find("div", {"class": "dailyshot-Text-root dailyshot-uc2z2z"})
                if acl_title_element and acl_text_element:
                    acl_title = acl_title_element.text if acl_title_element else None
                    acl_text = acl_text_element.text if acl_text_element else None
                    result_dict[acl_title] = acl_text
        except AttributeError:
            pass
        
        return result_dict

    else:
        print(f"Target divs not found: {url}")
        return None

# Starting and ending item numbers
data_list = []

# Starting and ending item numbers
start_item = 1
end_item = 87000

for item_number in range(start_item, end_item + 1):
    url = f"https://dailyshot.co/m/items/{item_number}"
    data = scrape_data(url)
    if data:
        data_list.append(data)


# 데이터를 JSON 파일로 저장
with open("./data.json", "w", encoding="utf-8") as json_file:
    json.dump(data_list, json_file, ensure_ascii=False, indent=4)

Target divs not found: https://dailyshot.co/m/items/11
Target divs not found: https://dailyshot.co/m/items/13
Target divs not found: https://dailyshot.co/m/items/15
Target divs not found: https://dailyshot.co/m/items/17
Target divs not found: https://dailyshot.co/m/items/19
Target divs not found: https://dailyshot.co/m/items/21
Target divs not found: https://dailyshot.co/m/items/25
Target divs not found: https://dailyshot.co/m/items/117
Target divs not found: https://dailyshot.co/m/items/135
Target divs not found: https://dailyshot.co/m/items/136
Target divs not found: https://dailyshot.co/m/items/144
Target divs not found: https://dailyshot.co/m/items/148
Target divs not found: https://dailyshot.co/m/items/149
Target divs not found: https://dailyshot.co/m/items/150
Target divs not found: https://dailyshot.co/m/items/152
Target divs not found: https://dailyshot.co/m/items/161
Target divs not found: https://dailyshot.co/m/items/162
Target divs not found: https://dailyshot.co/m/items/164

ConnectionError: HTTPSConnectionPool(host='dailyshot.co', port=443): Max retries exceeded with url: /m/items/13515 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001FC0DEA2D60>: Failed to resolve 'dailyshot.co' ([Errno 11002] getaddrinfo failed)"))

In [3]:
import pandas as pd

In [4]:
alcohol_df = pd.read_json('./data.json')
alcohol_df

Unnamed: 0,name,Aroma,Taste,Finish,종류,용량,도수,궁중술 왕주13(375ml),도수솔송주 골드(375ml),이강주 19(375ml),...,국가,호랑이 배꼽 생막걸리(350ml),나루 생막걸리 6도(935ml),나루 생막걸리 11.5도(500ml),도깨비술 9도(750ml),미르25(375ml),영 & 리치 (500ml),위트-코인 (500ml),망고 팡팡 (500ml),아스라이 (500ml)
0,앱솔루트 그레이프,"파파야, 청포도, 용과","알코올감, 과일, 부드러운","산뜻한, 신선한, 과일",보드카,750m,40%,,,,...,,,,,,,,,,
1,배다리도가 주교주,"누룩, 쌀","달콤한, 구수한","부드러운, 은은한",우리술,500ml,16%,,,,...,,,,,,,,,,
2,뽀할라 코코뱅어 330ml,"구운 코코넛, 열대 과일, 로스팅한 커피","달고나, 달콤한, 묵직한","부드러운, 진득한",임페리얼 스타우트,330ml,12.5%,,,,...,,,,,,,,,,
3,"다나 에스테이트, 바소 까베르네 소비뇽","오크, 타바코, 체리, 자두, 레드커런트","체리, 숲, 딸기, 단단한 구조감","크렘 브륄레, 붉은 과일, 딸기, 다크 초콜릿, 은은한",레드 와인,750ml,14~15%,,,,...,,,,,,,,,,
4,오린 스위프트 8년 인 더 데저트,"오크, 라즈베리, 자두, 후추","블랙베리, 라즈베리, 클로브","블루베리, 오크, 바닐라, 체리",레드 와인,750ml,15.7%,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253,가모츠루 다이긴죠 골드,과일,"깨끗한, 순수한","풍부한, 긴 여운",사케,720ml,16.5%,,,,...,,,,,,,,,,
254,카모시비토 쿠헤이지 오두디지,"꽃, 풍부한","오크, 달콤한","긴 여운, 풍부한",사케,720ml,16%,,,,...,,,,,,,,,,
255,쿠보타 센쥬 준마이 긴죠,은은한,"달콤한, 부드러운, 감칠맛, 산뜻한",부드러운,사케,720ml,15%,,,,...,,,,,,,,,,
256,송죽매 클래식 준마이 1.5L,"풋사과, 멜론, 과일, 은은한","감칠맛, 곡물","은은한, 부드러운",사케,"1,500ml",15%,,,,...,,,,,,,,,,


## 필요 없는 컬럼 삭제

In [5]:
alcohol_df.drop(labels=['궁중술 왕주13(375ml)', '도수솔송주 골드(375ml)', '국가', '이강주 19(375ml)', '명인안동소주 35(360ml)', '대잎술(300ml)', '한산소곡주(생)(375ml)', '담솔(500ml)'], axis=1, inplace=True)
alcohol_df

Unnamed: 0,name,Aroma,Taste,Finish,종류,용량,도수,삼해소주(250ml),"서동의 달(화주, 250ml)","소서노의 꿈(추사40, 250ml)",호랑이 배꼽 생막걸리(350ml),나루 생막걸리 6도(935ml),나루 생막걸리 11.5도(500ml),도깨비술 9도(750ml),미르25(375ml),영 & 리치 (500ml),위트-코인 (500ml),망고 팡팡 (500ml),아스라이 (500ml)
0,앱솔루트 그레이프,"파파야, 청포도, 용과","알코올감, 과일, 부드러운","산뜻한, 신선한, 과일",보드카,750m,40%,,,,,,,,,,,,
1,배다리도가 주교주,"누룩, 쌀","달콤한, 구수한","부드러운, 은은한",우리술,500ml,16%,,,,,,,,,,,,
2,뽀할라 코코뱅어 330ml,"구운 코코넛, 열대 과일, 로스팅한 커피","달고나, 달콤한, 묵직한","부드러운, 진득한",임페리얼 스타우트,330ml,12.5%,,,,,,,,,,,,
3,"다나 에스테이트, 바소 까베르네 소비뇽","오크, 타바코, 체리, 자두, 레드커런트","체리, 숲, 딸기, 단단한 구조감","크렘 브륄레, 붉은 과일, 딸기, 다크 초콜릿, 은은한",레드 와인,750ml,14~15%,,,,,,,,,,,,
4,오린 스위프트 8년 인 더 데저트,"오크, 라즈베리, 자두, 후추","블랙베리, 라즈베리, 클로브","블루베리, 오크, 바닐라, 체리",레드 와인,750ml,15.7%,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253,가모츠루 다이긴죠 골드,과일,"깨끗한, 순수한","풍부한, 긴 여운",사케,720ml,16.5%,,,,,,,,,,,,
254,카모시비토 쿠헤이지 오두디지,"꽃, 풍부한","오크, 달콤한","긴 여운, 풍부한",사케,720ml,16%,,,,,,,,,,,,
255,쿠보타 센쥬 준마이 긴죠,은은한,"달콤한, 부드러운, 감칠맛, 산뜻한",부드러운,사케,720ml,15%,,,,,,,,,,,,
256,송죽매 클래식 준마이 1.5L,"풋사과, 멜론, 과일, 은은한","감칠맛, 곡물","은은한, 부드러운",사케,"1,500ml",15%,,,,,,,,,,,,


In [6]:
alcohol_df.drop(labels=['삼해소주(250ml)', '서동의 달(화주, 250ml)', '소서노의 꿈(추사40, 250ml)', '호랑이 배꼽 생막걸리(350ml)', '나루 생막걸리 6도(935ml)', '나루 생막걸리 11.5도(500ml)',
                        '도깨비술 9도(750ml)', '미르25(375ml)', '영 & 리치 (500ml)', '위트-코인 (500ml)', '망고 팡팡 (500ml)', '아스라이 (500ml)'], axis=1, inplace=True)
alcohol_df

Unnamed: 0,name,Aroma,Taste,Finish,종류,용량,도수
0,앱솔루트 그레이프,"파파야, 청포도, 용과","알코올감, 과일, 부드러운","산뜻한, 신선한, 과일",보드카,750m,40%
1,배다리도가 주교주,"누룩, 쌀","달콤한, 구수한","부드러운, 은은한",우리술,500ml,16%
2,뽀할라 코코뱅어 330ml,"구운 코코넛, 열대 과일, 로스팅한 커피","달고나, 달콤한, 묵직한","부드러운, 진득한",임페리얼 스타우트,330ml,12.5%
3,"다나 에스테이트, 바소 까베르네 소비뇽","오크, 타바코, 체리, 자두, 레드커런트","체리, 숲, 딸기, 단단한 구조감","크렘 브륄레, 붉은 과일, 딸기, 다크 초콜릿, 은은한",레드 와인,750ml,14~15%
4,오린 스위프트 8년 인 더 데저트,"오크, 라즈베리, 자두, 후추","블랙베리, 라즈베리, 클로브","블루베리, 오크, 바닐라, 체리",레드 와인,750ml,15.7%
...,...,...,...,...,...,...,...
253,가모츠루 다이긴죠 골드,과일,"깨끗한, 순수한","풍부한, 긴 여운",사케,720ml,16.5%
254,카모시비토 쿠헤이지 오두디지,"꽃, 풍부한","오크, 달콤한","긴 여운, 풍부한",사케,720ml,16%
255,쿠보타 센쥬 준마이 긴죠,은은한,"달콤한, 부드러운, 감칠맛, 산뜻한",부드러운,사케,720ml,15%
256,송죽매 클래식 준마이 1.5L,"풋사과, 멜론, 과일, 은은한","감칠맛, 곡물","은은한, 부드러운",사케,"1,500ml",15%


In [8]:
alcohol_df

Unnamed: 0,name,Aroma,Taste,Finish,종류,용량,도수
0,앱솔루트 그레이프,"파파야, 청포도, 용과","알코올감, 과일, 부드러운","산뜻한, 신선한, 과일",보드카,750m,40%
1,배다리도가 주교주,"누룩, 쌀","달콤한, 구수한","부드러운, 은은한",우리술,500ml,16%
2,뽀할라 코코뱅어 330ml,"구운 코코넛, 열대 과일, 로스팅한 커피","달고나, 달콤한, 묵직한","부드러운, 진득한",임페리얼 스타우트,330ml,12.5%
3,"다나 에스테이트, 바소 까베르네 소비뇽","오크, 타바코, 체리, 자두, 레드커런트","체리, 숲, 딸기, 단단한 구조감","크렘 브륄레, 붉은 과일, 딸기, 다크 초콜릿, 은은한",레드 와인,750ml,14~15%
4,오린 스위프트 8년 인 더 데저트,"오크, 라즈베리, 자두, 후추","블랙베리, 라즈베리, 클로브","블루베리, 오크, 바닐라, 체리",레드 와인,750ml,15.7%
...,...,...,...,...,...,...,...
253,가모츠루 다이긴죠 골드,과일,"깨끗한, 순수한","풍부한, 긴 여운",사케,720ml,16.5%
254,카모시비토 쿠헤이지 오두디지,"꽃, 풍부한","오크, 달콤한","긴 여운, 풍부한",사케,720ml,16%
255,쿠보타 센쥬 준마이 긴죠,은은한,"달콤한, 부드러운, 감칠맛, 산뜻한",부드러운,사케,720ml,15%
256,송죽매 클래식 준마이 1.5L,"풋사과, 멜론, 과일, 은은한","감칠맛, 곡물","은은한, 부드러운",사케,"1,500ml",15%


In [11]:
alcohol_df.columns = ['name', 'Aroma', 'Taste', 'Finish', 'Kind', 'Volume', 'Alcohol']
alcohol_df

Unnamed: 0,name,Aroma,Taste,Finish,Kind,Volume,Alcohol
0,앱솔루트 그레이프,"파파야, 청포도, 용과","알코올감, 과일, 부드러운","산뜻한, 신선한, 과일",보드카,750m,40%
1,배다리도가 주교주,"누룩, 쌀","달콤한, 구수한","부드러운, 은은한",우리술,500ml,16%
2,뽀할라 코코뱅어 330ml,"구운 코코넛, 열대 과일, 로스팅한 커피","달고나, 달콤한, 묵직한","부드러운, 진득한",임페리얼 스타우트,330ml,12.5%
3,"다나 에스테이트, 바소 까베르네 소비뇽","오크, 타바코, 체리, 자두, 레드커런트","체리, 숲, 딸기, 단단한 구조감","크렘 브륄레, 붉은 과일, 딸기, 다크 초콜릿, 은은한",레드 와인,750ml,14~15%
4,오린 스위프트 8년 인 더 데저트,"오크, 라즈베리, 자두, 후추","블랙베리, 라즈베리, 클로브","블루베리, 오크, 바닐라, 체리",레드 와인,750ml,15.7%
...,...,...,...,...,...,...,...
253,가모츠루 다이긴죠 골드,과일,"깨끗한, 순수한","풍부한, 긴 여운",사케,720ml,16.5%
254,카모시비토 쿠헤이지 오두디지,"꽃, 풍부한","오크, 달콤한","긴 여운, 풍부한",사케,720ml,16%
255,쿠보타 센쥬 준마이 긴죠,은은한,"달콤한, 부드러운, 감칠맛, 산뜻한",부드러운,사케,720ml,15%
256,송죽매 클래식 준마이 1.5L,"풋사과, 멜론, 과일, 은은한","감칠맛, 곡물","은은한, 부드러운",사케,"1,500ml",15%


In [15]:
alcohol_df.to_csv('test.csv', index=False)

In [16]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,name,Aroma,Taste,Finish,Kind,Volume,Alcohol
0,앱솔루트 그레이프,"파파야, 청포도, 용과","알코올감, 과일, 부드러운","산뜻한, 신선한, 과일",보드카,750m,40%
1,배다리도가 주교주,"누룩, 쌀","달콤한, 구수한","부드러운, 은은한",우리술,500ml,16%
2,뽀할라 코코뱅어 330ml,"구운 코코넛, 열대 과일, 로스팅한 커피","달고나, 달콤한, 묵직한","부드러운, 진득한",임페리얼 스타우트,330ml,12.5%
3,"다나 에스테이트, 바소 까베르네 소비뇽","오크, 타바코, 체리, 자두, 레드커런트","체리, 숲, 딸기, 단단한 구조감","크렘 브륄레, 붉은 과일, 딸기, 다크 초콜릿, 은은한",레드 와인,750ml,14~15%
4,오린 스위프트 8년 인 더 데저트,"오크, 라즈베리, 자두, 후추","블랙베리, 라즈베리, 클로브","블루베리, 오크, 바닐라, 체리",레드 와인,750ml,15.7%
...,...,...,...,...,...,...,...
253,가모츠루 다이긴죠 골드,과일,"깨끗한, 순수한","풍부한, 긴 여운",사케,720ml,16.5%
254,카모시비토 쿠헤이지 오두디지,"꽃, 풍부한","오크, 달콤한","긴 여운, 풍부한",사케,720ml,16%
255,쿠보타 센쥬 준마이 긴죠,은은한,"달콤한, 부드러운, 감칠맛, 산뜻한",부드러운,사케,720ml,15%
256,송죽매 클래식 준마이 1.5L,"풋사과, 멜론, 과일, 은은한","감칠맛, 곡물","은은한, 부드러운",사케,"1,500ml",15%
