In [None]:
import aiohttp
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import nest_asyncio
from tqdm.asyncio import tqdm_asyncio

# Apply nest_asyncio before any coroutine is run.
# NOTE(review): presumably this is here so asyncio can be re-entered inside the
# notebook kernel's already-running event loop — confirm it is still required,
# since this notebook only uses top-level `await`.
nest_asyncio.apply()

async def fetch(session, url):
    """Download ``url`` through ``session`` and return the response body as text.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async context
            manager yielding the response (an ``aiohttp.ClientSession`` in this
            notebook).
        url: URL to request.

    Returns:
        The response body as ``str``, or ``None`` when the request times out,
        fails with an aiohttp client error, or the body cannot be decoded.
        Returning ``None`` lets callers skip a single bad URL instead of
        aborting every other download that is gathered alongside it.
    """
    try:
        # ClientTimeout(total=...) is expressed in seconds and bounds the whole request.
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=3000)) as response:
            return await response.text()
    except asyncio.TimeoutError:
        print(f"Timeout error for URL: {url}")
        return None
    except (aiohttp.ClientError, UnicodeDecodeError) as exc:
        # Connection resets, DNS failures, broken payloads, undecodable bodies:
        # treat them like a timeout so one failing URL does not kill the crawl.
        print(f"Request error for URL: {url} ({exc!r})")
        return None

async def get_news_links(session, base_url, page):
    """Return the article links found on one page of the news list.

    Args:
        session: HTTP session handed through to ``fetch``.
        base_url: List URL to which ``&page=<page>`` is appended.
        page: Page number to request.

    Returns:
        A list with the ``href`` of every anchor matched under
        ``ul.type06_headline`` followed by every anchor matched under
        ``ul.type06``. An empty list when ``fetch`` returned ``None``.
    """
    html = await fetch(session, f"{base_url}&page={page}")
    if html is None:
        return []

    parsed = BeautifulSoup(html, 'html.parser')
    # Headline list first, then the regular list — same order as two separate loops.
    selectors = (
        "ul.type06_headline li dl dt a",
        "ul.type06 li dl dt a",
    )
    return [anchor["href"] for css in selectors for anchor in parsed.select(css)]

async def get_news_content(session, url):
    """Fetch one article page and extract its title and body text.

    Args:
        session: HTTP session handed through to ``fetch``.
        url: Article URL.

    Returns:
        A ``(title, content)`` tuple of stripped strings taken from
        ``h2.media_end_head_headline`` and ``<article id="dic_area">``.
        ``(None, None)`` when the page could not be fetched or either
        element is missing.
    """
    html = await fetch(session, url)
    if html is None:
        return None, None

    page = BeautifulSoup(html, 'html.parser')
    headline = page.select_one("h2.media_end_head_headline")
    body = page.find('article', attrs={'id': 'dic_area'})

    # Both pieces are required; a page missing either one is skipped.
    if not (headline and body):
        return None, None

    return headline.get_text().strip(), body.get_text().strip()

async def main():
    today = datetime.today().strftime("%Y%m%d")
    base_url = "https://news.naver.com/main/list.naver?mode=LSD&mid=shm&sid1=101&date=" + today
    async with aiohttp.ClientSession() as session:
        news_links = set()
        page = 1
        
        while True:
            links = await get_news_links(session, base_url, page)
            if not links or page == 500:
                break            
            news_links.update(links)
            page += 1
        

        tasks = [get_news_content(session, link) for link in news_links]
        
        # news_contents = await asyncio.gather(*tasks)
        news_contents = await tqdm_asyncio.gather(*tasks, desc="Fetching news content")
        # None 값을 제거
        news_contents = [content for content in news_contents if content[0] is not None]

        # 명사 추출
        nouns_data = []
        for title, content in news_contents:
            nouns_data.append({'Title': title, 'content': content})
        
        df = pd.DataFrame(nouns_data)
        df.to_csv(f"{today}.csv", index=False, encoding='utf-8-sig')
        # df.to_csv("20240701.csv",index=False,encoding='utf-8-sig')
# Jupyter Notebook 환경에서 실행
await main()


## 하둡 (Hadoop): save the scraped DataFrame to HDFS

In [46]:
import os
import subprocess
from pyarrow import fs
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import pandas as pd
import pyarrow.csv as pc
# Date stamp used in the HDFS file name (YYYY-MM-DD).
today = datetime.now().strftime("%Y-%m-%d")

# Local Hadoop client binary and the HDFS directory that receives the export.
HDFS_BIN = "/home/ksk/hadoop/bin/hdfs"
HDFS_DIR = "/test"

# Target file inside HDFS_DIR. The "/" separator matters: without it the file
# would be created as "/test<date>.csv" in the HDFS root instead of inside /test.
file_path = f"{HDFS_DIR}/{today}.csv"

# Export the Hadoop classpath to the environment before connecting.
# check_output raises CalledProcessError if the hdfs command fails, instead of
# silently leaving CLASSPATH empty.
classpath = subprocess.check_output([HDFS_BIN, "classpath", "--glob"])
os.environ["CLASSPATH"] = classpath.decode("utf-8")
hdfs = fs.HadoopFileSystem(host='192.168.0.206', port=8020, user='ksk')

# Make sure the target directory exists.
hdfs.create_dir(HDFS_DIR, recursive=True)

# Convert the pandas DataFrame to a PyArrow Table.
# `df` must already exist in the kernel namespace — presumably the scraped
# Title/content DataFrame from the crawling cell; verify before running.
table = pa.Table.from_pandas(df)

# Write the table to HDFS as a CSV file.
with hdfs.open_output_stream(file_path) as stream:
    pc.write_csv(table, stream)
print(f"DataFrame saved to HDFS at {file_path}")


DataFrame saved to HDFS at /test/2024-07-01.csv
