In [3]:
%pip install requests
%pip install pandas
%pip install beautifulsoup4
%pip install boto3
%pip install logging

Collecting requests
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests)
  Downloading idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.2.3-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2024.8.30-py3-none-any.whl.metadata (2.2 kB)
Downloading requests-2.32.3-py3-none-any.whl (64 kB)
Downloading certifi-2024.8.30-py3-none-any.whl (167 kB)
Downloading charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl (118 kB)
Downloading idna-3.10-py3-none-any.whl (70 kB)
Downloading urllib3-2.2.3-py3-none-any.whl (126 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests
Successfully installed certifi-2024.8.30 charset-normalizer-3.3.2 idna-3.10 reque

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import uuid
import boto3
import io
import logging
from urllib.parse import urlparse

logging.basicConfig(level='DEBUG')

header = {'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'),}

session = boto3.Session(
    aws_access_key_id='deleteKey',
    aws_secret_access_key='deleteAccessKey',
    region_name='ap-northeast-2'
)

s3 = session.resource('s3')
bucket_name = 'newsseug-bucket'

categories = {
    '정치': 'https://www.joongang.co.kr/politics',
    '경제': 'https://www.joongang.co.kr/money',
    '국제': 'https://www.joongang.co.kr/world',
    '사회_사건': 'https://www.joongang.co.kr/society/accident', # 앞 분기 기준
    '경제_과학': 'https://www.joongang.co.kr/money/science', # 앞 분기 기준
    '사회': 'https://www.joongang.co.kr/society',
    '스포츠': 'https://www.joongang.co.kr/sports',
}

def scrap_article(article_url, category):
    response = requests.get(article_url, headers=header)
    soup = BeautifulSoup(response.text, 'html.parser')

    try:
        # category = soup.find('a', {'class': 'title'}).get_text(strip=True)

        page_category = soup.find('a', {'class': 'title'}).get_text(strip=True)
        logging.info(f"page_category: {page_category}")
        
        # expected_category = categories.get(category).lower()
        logging.info(f"category: {category}")

        category = category.split('_')
        
        # 카테고리가 아닌 글은 패스
        # if page_category != category:
        if page_category not in category:
            logging.info(f"{article_url} 는 해당 카테고리가 아니어서 생략합니다. ")
            return None
    except AttributeError:
        logging.error(f"{article_url} 는 찾을 수 없습니다.")
        return None

    # 헤드라인 스크랩
    try:
        headline = soup.find('h1', {'class': 'headline'}).get_text(strip=True)
    except AttributeError:
        headline = 'N/A'

    # 생성일 스크랩
    try:
        date = soup.find('p', {'class': 'date'})
        time = date.find('time').get('datetime')
    except AttributeError:
        time = 'N/A'

    # 본문 스크랩
    try:
        content = soup.find('div', {'class': 'article_body'}).get_text(strip=True)
    except AttributeError:
        content = 'N/A'

    # 기자 이름 스크랩
    try:
        byline = soup.find('div', {'class': 'byline'})
        reporter = byline.find('a').get_text(strip=True).split('\n')[0]
    except AttributeError:
        reporter = 'N/A'

    return {'title': headline, 'time': time, 'content': content, 'reporter': reporter}

def get_last_path_segment(base_url):
    try:
        url_path = urlparse(base_url).path  # '/society'

        path_segments = [segment for segment in url_path.split('/') if segment]

        if len(path_segments) > 0:
            return path_segments[-1]
        else:
            raise ValueError("No valid path segments found in the URL.")
    except Exception as e:
        logging.error(f"Error: {e}")
        return None

def save_article_to_s3(article_data, base_url):
    file_name = str(uuid.uuid4()) + '.json'

    article_df = pd.DataFrame([article_data])

    buffer = io.StringIO()
    article_df.to_json(buffer, orient='records', lines=True, force_ascii=False)

    buffer.seek(0)
    
    category_slug = get_last_path_segment(base_url)
    
    try:
        # s3.put_object(Bucket=bucket_name, Key=f'content/{file_name}', Body=buffer.getvalue())
        s3.Bucket(bucket_name).put_object(Key=f'content/joongang/{category_slug}/{file_name}', Body=buffer.getvalue())
        logging.debug(f"{file_name} 을 {category_slug}/{bucket_name} 에 업로드 하였습니다.")
    except Exception as e:
        logging.error(f"Error uploading {file_name}: {e}")

def check_for_new_articles(base_url, category):
    response = requests.get(base_url, headers=header)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_count = 0
    cards = soup.find_all('li', class_='card')
    cards.reverse()
    for card in cards:
        if article_count >= 3:
            break

        link_tag = card.find('a', href=True)
        if link_tag and 'href' in link_tag.attrs:
            article_link = link_tag['href']
        else:
            logging.warning("article_link is None or missing 'href'.")
            continue
    
        logging.debug(f"Scraping article: {article_link}")

        result = scrap_article(article_link, category)

        if result is None:
            continue

        save_article_to_s3(result, base_url)
        article_count += 1

def scrap_all_categories():
    
    for category, base_url in categories.items():
        logging.info(f"category (base_url): {category} ({base_url})")
        check_for_new_articles(base_url, category)
        time.sleep(2)

scrap_all_categories()

DEBUG:botocore.hooks:Changing event name from creating-client-class.iot-data to creating-client-class.iot-data-plane
DEBUG:botocore.hooks:Changing event name from before-call.apigateway to before-call.api-gateway
DEBUG:botocore.hooks:Changing event name from request-created.machinelearning.Predict to request-created.machine-learning.Predict
DEBUG:botocore.hooks:Changing event name from before-parameter-build.autoscaling.CreateLaunchConfiguration to before-parameter-build.auto-scaling.CreateLaunchConfiguration
DEBUG:botocore.hooks:Changing event name from before-parameter-build.route53 to before-parameter-build.route-53
DEBUG:botocore.hooks:Changing event name from request-created.cloudsearchdomain.Search to request-created.cloudsearch-domain.Search
DEBUG:botocore.hooks:Changing event name from docs.*.autoscaling.CreateLaunchConfiguration.complete-section to docs.*.auto-scaling.CreateLaunchConfiguration.complete-section
DEBUG:botocore.hooks:Changing event name from before-parameter-buil

## ❗ AWS Lambda code

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import uuid
import boto3
import io
import logging
from urllib.parse import urlparse

def lambda_handler(event, context):
    
    return {
        'statusCode': 200,
        'body': 'joongang-news-crawler from Lambda is do!'
    }

logging.basicConfig(level='DEBUG')

header = {'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'),}

session = boto3.Session(
    aws_access_key_id='deleteKey',
    aws_secret_access_key='deleteAccessKey',
    region_name='ap-northeast-2'
)

s3 = session.resource('s3')
bucket_name = 'newsseug-bucket'

categories = {
    '정치': 'https://www.joongang.co.kr/politics',
    '경제': 'https://www.joongang.co.kr/money',
    '국제': 'https://www.joongang.co.kr/world',
    '사회_사건': 'https://www.joongang.co.kr/society/accident', # 앞 분기 기준
    '경제_과학': 'https://www.joongang.co.kr/money/science', # 앞 분기 기준
    '사회': 'https://www.joongang.co.kr/society',
    '스포츠': 'https://www.joongang.co.kr/sports',
}

def scrap_article(article_url, category):
    response = requests.get(article_url, headers=header)
    soup = BeautifulSoup(response.text, 'html.parser')

    try:
        # category = soup.find('a', {'class': 'title'}).get_text(strip=True)

        page_category = soup.find('a', {'class': 'title'}).get_text(strip=True)
        logging.info(f"page_category: {page_category}")
        
        # expected_category = categories.get(category).lower()
        logging.info(f"category: {category}")

        category = category.split('_')
        
        # 카테고리가 아닌 글은 패스
        # if page_category != category:
        if page_category not in category:
            logging.info(f"{article_url} 는 해당 카테고리가 아니어서 생략합니다. ")
            return None
    except AttributeError:
        logging.error(f"{article_url} 는 찾을 수 없습니다.")
        return None

    # 헤드라인 스크랩
    try:
        headline = soup.find('h1', {'class': 'headline'}).get_text(strip=True)
    except AttributeError:
        headline = 'N/A'

    # 생성일 스크랩
    try:
        date = soup.find('p', {'class': 'date'})
        time = date.find('time').get('datetime')
    except AttributeError:
        time = 'N/A'

    # 본문 스크랩
    try:
        content = soup.find('div', {'class': 'article_body'}).get_text(strip=True)
    except AttributeError:
        content = 'N/A'

    # 기자 이름 스크랩
    try:
        byline = soup.find('div', {'class': 'byline'})
        reporter = byline.find('a').get_text(strip=True).split('\n')[0]
    except AttributeError:
        reporter = 'N/A'

    return {'title': headline, 'time': time, 'content': content, 'reporter': reporter}

def get_last_path_segment(base_url):
    try:
        url_path = urlparse(base_url).path  # '/society'

        path_segments = [segment for segment in url_path.split('/') if segment]

        if len(path_segments) > 0:
            return path_segments[-1]
        else:
            raise ValueError("No valid path segments found in the URL.")
    except Exception as e:
        logging.error(f"Error: {e}")
        return None

def save_article_to_s3(article_data, base_url):
    file_name = str(uuid.uuid4()) + '.json'

    article_df = pd.DataFrame([article_data])

    buffer = io.StringIO()
    article_df.to_json(buffer, orient='records', lines=True, force_ascii=False)

    buffer.seek(0)
    
    category_slug = get_last_path_segment(base_url)
    
    try:
        # s3.put_object(Bucket=bucket_name, Key=f'content/{file_name}', Body=buffer.getvalue())
        s3.Bucket(bucket_name).put_object(Key=f'content/joongang/{category_slug}/{file_name}', Body=buffer.getvalue())
        logging.debug(f"{file_name} 을 {category_slug}/{bucket_name} 에 업로드 하였습니다.")
    except Exception as e:
        logging.error(f"Error uploading {file_name}: {e}")

def check_for_new_articles(base_url, category):
    response = requests.get(base_url, headers=header)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_count = 0
    cards = soup.find_all('li', class_='card')
    cards.reverse()
    for card in cards:
        if article_count >= 3:
            break

        link_tag = card.find('a', href=True)
        if link_tag and 'href' in link_tag.attrs:
            article_link = link_tag['href']
        else:
            logging.warning("article_link is None or missing 'href'.")
            continue
    
        logging.debug(f"Scraping article: {article_link}")

        result = scrap_article(article_link, category)

        if result is None:
            continue

        save_article_to_s3(result, base_url)
        article_count += 1

def scrap_all_categories():
    
    for category, base_url in categories.items():
        logging.info(f"category (base_url): {category} ({base_url})")
        check_for_new_articles(base_url, category)
        time.sleep(2)

scrap_all_categories()

# Refactoring

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging
from urllib.parse import urlparse

def lambda_handler(event, context):
    
    return {
        'statusCode': 200,
        'body': 'joongang-news-crawler from Lambda is do!'
    }

logging.basicConfig(level='DEBUG')

header = {'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'),}

categories = {
    '정치': 'https://www.joongang.co.kr/politics',
    '경제': 'https://www.joongang.co.kr/money',
    '국제': 'https://www.joongang.co.kr/world',
    '사회_사건': 'https://www.joongang.co.kr/society/accident', # 앞 분기 기준
    '경제_과학': 'https://www.joongang.co.kr/money/science', # 앞 분기 기준
    '사회': 'https://www.joongang.co.kr/society',
    '스포츠': 'https://www.joongang.co.kr/sports',
}


result_categories = {
    '정치': 'politics',
    '경제': 'economy',
    '국제': 'world',
    '사회_사건': 'accident',
    '경제_과학': 'science',
    '사회': 'society',
    '스포츠': 'sports',
}

def scrap_article(article_url, category):
    response = requests.get(article_url, headers=header)
    soup = BeautifulSoup(response.text, 'html.parser')

    try:

        page_category = soup.find('a', {'class': 'title'}).get_text(strip=True)
        logging.info(f"page_category: {page_category}")
        
        logging.info(f"category: {category}")

        category_parts = category.split('_')
        
        # 카테고리가 아닌 글은 패스
        if page_category not in category_parts:
            logging.info(f"{article_url} 는 해당 카테고리가 아니어서 생략합니다. ")
            return None
    except AttributeError:
        logging.error(f"{article_url} 는 찾을 수 없습니다.")
        return None

    # 헤드라인 스크랩
    try:
        title = soup.find('h1', {'class': 'headline'}).get_text(strip=True)
    except AttributeError:
        title = 'N/A'

    # 생성일 스크랩
    try:
        date = soup.find('p', {'class': 'date'})
        time = date.find('time').get('datetime')
    except AttributeError:
        time = 'N/A'

    # 본문 스크랩
    try:
        content = soup.find('div', {'class': 'article_body'}).get_text(strip=True)
    except AttributeError:
        content = 'N/A'

    result_category = result_categories.get(category, 'unknown')

    return {'title': title, 'time': time, 'content': content, 'source_url': article_url, 'category': result_category}

def post_article_to_server(article_data):
    
    fastapi_url = 'https://your-fastapi-url.com/process_data'
    headers = {'Content-Type': 'application/json'}
    
    try:
        response = requests.post(fastapi_url, json=article_data, headers=headers)
        response.raise_for_status()
        logging.info(f'\'statusCode\': 200 \n\'body\': \'요청 성공\'')
    except requests.exceptions.RequestException as e:
        logging.error(f'\'statusCode\': 500 \n\'Internal Server Error\': {e}')

def check_for_new_articles(base_url, category):
    response = requests.get(base_url, headers=header)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_count = 0
    cards = soup.find_all('li', class_='card')
    cards.reverse()
    for card in cards:
        if article_count >= 3:
            break

        link_tag = card.find('a', href=True)
        if link_tag and 'href' in link_tag.attrs:
            article_link = link_tag['href']
        else:
            logging.warning("article_link is None or missing 'href'.")
            continue
    
        logging.debug(f"Scraping article: {article_link}")

        result = scrap_article(article_link, category)

        if result is None:
            continue

        # post_article_to_server(result) # 서버 완성되면 주석을 풀어라...
        print(result)
        article_count += 1

def scrap_all_categories():
    
    for category, base_url in categories.items():
        logging.info(f"category (base_url): {category} ({base_url})")
        check_for_new_articles(base_url, category)
        time.sleep(2)

scrap_all_categories()