날짜 지정 기능 추가 (Add Date Assignment feature) #26

Merged
merged 9 commits on Mar 27, 2022
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2020 lumyjuwon
Copyright (c) 2022 lumyjuwon

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Binary file added dist/KoreaNewsCrawler-1.51-py3-none-any.whl
Binary file not shown.
94 changes: 70 additions & 24 deletions korea_news_crawler/articlecrawler.py
@@ -18,7 +18,7 @@ def __init__(self):
        self.categories = {'정치': 100, '경제': 101, '사회': 102, '생활문화': 103, '세계': 104, 'IT과학': 105, '오피니언': 110,
                           'politics': 100, 'economy': 101, 'society': 102, 'living_culture': 103, 'world': 104, 'IT_science': 105, 'opinion': 110}
        self.selected_categories = []
        self.date = {'start_year': 0, 'start_month': 0, 'end_year': 0, 'end_month': 0}
        self.date = {'start_year': 0, 'start_month': 0, 'start_day': 0, 'end_year': 0, 'end_month': 0, 'end_day': 0}
        self.user_operating_system = str(platform.system())

    def set_category(self, *args):
@@ -27,47 +27,94 @@ def set_category(self, *args):
                raise InvalidCategory(key)
        self.selected_categories = args

    def set_date_range(self, start_year, start_month, end_year, end_month):
        args = [start_year, start_month, end_year, end_month]
    def set_date_range(self, start_date: str, end_date: str):
        start = list(map(int, start_date.split("-")))
        end = list(map(int, end_date.split("-")))

        # Set the start date
        if len(start) == 1:    # year only
            start_year = start[0]
            start_month = 1
            start_day = 1
        elif len(start) == 2:  # year and month
            start_year, start_month = start
            start_day = 1
        elif len(start) == 3:  # year, month, and day
            start_year, start_month, start_day = start
        else:
            raise ValueError(f'{start_date} must be YYYY, YYYY-MM, or YYYY-MM-DD')

        # Set the end date
        if len(end) == 1:      # year only
            end_year = end[0]
            end_month = 12
            end_day = 31
        elif len(end) == 2:    # year and month
            end_year, end_month = end
            end_day = calendar.monthrange(end_year, end_month)[1]
        elif len(end) == 3:    # year, month, and day
            end_year, end_month, end_day = end
        else:
            raise ValueError(f'{end_date} must be YYYY, YYYY-MM, or YYYY-MM-DD')

        args = [start_year, start_month, start_day, end_year, end_month, end_day]

        if start_year > end_year:
            raise InvalidYear(start_year, end_year)
        if start_month < 1 or start_month > 12:
            raise InvalidMonth(start_month)
        if end_month < 1 or end_month > 12:
            raise InvalidMonth(end_month)
        if start_day < 1 or calendar.monthrange(start_year, start_month)[1] < start_day:
            raise InvalidDay(start_day)
        if end_day < 1 or calendar.monthrange(end_year, end_month)[1] < end_day:
            raise InvalidDay(end_day)
        if start_year == end_year and start_month > end_month:
            raise OverbalanceMonth(start_month, end_month)
        if start_year == end_year and start_month == end_month and start_day > end_day:
            raise OverbalanceDay(start_day, end_day)

        for key, value in zip(self.date, args):
            self.date[key] = value
        print(self.date)
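As a quick sanity check of the parsing rules above, the following standalone sketch mirrors the end-date defaults (written for this review; the helper name expand_end is hypothetical and not part of the package):

import calendar

def expand_end(end_date: str):
    # Mirrors set_date_range's end-date defaults: a missing month falls back
    # to December, a missing day to the last day of that month.
    parts = list(map(int, end_date.split('-')))
    year = parts[0]
    month = parts[1] if len(parts) > 1 else 12
    day = parts[2] if len(parts) > 2 else calendar.monthrange(year, month)[1]
    return year, month, day

print(expand_end('2018'))        # (2018, 12, 31)
print(expand_end('2018-02'))     # (2018, 2, 28)
print(expand_end('2018-02-10'))  # (2018, 2, 10)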

    @staticmethod
    def make_news_page_url(category_url, start_year, end_year, start_month, end_month):
    def make_news_page_url(category_url, date):
        made_urls = []
        for year in range(start_year, end_year + 1):
            target_start_month = start_month
            target_end_month = end_month

            if start_year != end_year:
                if year == start_year:
                    target_start_month = start_month
        for year in range(date['start_year'], date['end_year'] + 1):
            if date['start_year'] == date['end_year']:
                target_start_month = date['start_month']
                target_end_month = date['end_month']
            else:
                if year == date['start_year']:
                    target_start_month = date['start_month']
                    target_end_month = 12
                elif year == end_year:
                elif year == date['end_year']:
                    target_start_month = 1
                    target_end_month = end_month
                    target_end_month = date['end_month']
                else:
                    target_start_month = 1
                    target_end_month = 12

            for month in range(target_start_month, target_end_month + 1):
                for month_day in range(1, calendar.monthrange(year, month)[1] + 1):
                if date['start_year'] == date['end_year'] and date['start_month'] == date['end_month']:
                    target_start_day = date['start_day']
                    target_end_day = date['end_day']
                else:
                    if year == date['start_year'] and month == date['start_month']:
                        target_start_day = date['start_day']
                        target_end_day = calendar.monthrange(year, month)[1]
                    elif year == date['end_year'] and month == date['end_month']:
                        target_start_day = 1
                        target_end_day = date['end_day']
                    else:
                        target_start_day = 1
                        target_end_day = calendar.monthrange(year, month)[1]

                for day in range(target_start_day, target_end_day + 1):
                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(month_day)) == 1:
                        month_day = "0" + str(month_day)
                    if len(str(day)) == 1:
                        day = "0" + str(day)

                    # Generate one page URL per date
                    url = category_url + str(year) + str(month) + str(month_day)
                    url = category_url + str(year) + str(month) + str(day)

                    # totalpage is discovered by exploiting Naver's page structure: request page=10000
                    # and, since that page does not exist, Naver redirects to page=totalpage
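The page=10000 trick described in the comment above can be sketched as a standalone helper (illustrative only — get_total_pages is a hypothetical name, not the method this diff elides):

import requests

def get_total_pages(list_page_url: str) -> int:
    # Naver redirects the non-existent page=10000 to the real last page,
    # so the final URL after the redirect reveals the total page count.
    response = requests.get(list_page_url + '&page=10000',
                            headers={'User-Agent': 'Mozilla/5.0'})
    return int(response.url.split('page=')[-1])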
@@ -94,12 +141,11 @@ def crawling(self, category_name):
        writer = Writer(category='Article', article_category=category_name, date=self.date)
        # Article URL format
        url_format = f'http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1={self.categories.get(category_name)}&date='
        # Collects articles from start_year/start_month through end_year/end_month.
        target_urls = self.make_news_page_url(url_format, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month'])

        print(category_name + " Urls are generated")
        print("The crawler starts")
        # Collects articles from start_year/start_month/start_day through end_year/end_month/end_day.
        target_urls = self.make_news_page_url(url_format, self.date)
        print(f'{category_name} Urls are generated')

        print(f'{category_name} is collecting ...')
        for url in target_urls:
            request = self.get_url_data(url)
            document = BeautifulSoup(request.content, 'html.parser')
@@ -186,5 +232,5 @@ def start(self):
if __name__ == "__main__":
    Crawler = ArticleCrawler()
    Crawler.set_category('생활문화')
    Crawler.set_date_range(2018, 1, 2018, 2)
    Crawler.set_date_range('2018-01', '2018-02')
    Crawler.start()
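With this change, set_date_range accepts date strings at three granularities. A minimal usage sketch (assuming the package is importable as in sample.py; 'politics' is one of the English keys from the categories map):

from korea_news_crawler.articlecrawler import ArticleCrawler

crawler = ArticleCrawler()
crawler.set_category('politics')
# Each call below replaces the previous range; only the last one takes effect.
crawler.set_date_range('2018', '2018')              # 2018-01-01 through 2018-12-31
crawler.set_date_range('2018-01', '2018-02')        # 2018-01-01 through 2018-02-28
crawler.set_date_range('2018-01-05', '2018-02-10')  # exactly these days
crawler.start()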
16 changes: 16 additions & 0 deletions korea_news_crawler/exceptions.py
@@ -53,6 +53,15 @@ def __init__(self, month):
    def __str__(self):
        return self.message

# Raised when the day is invalid
class InvalidDay(Exception):
    def __init__(self, day):
        self.message = f'{day} is an invalid day'

    def __str__(self):
        return self.message



# Raised when the start month and end month are out of order
class OverbalanceMonth(Exception):
@@ -62,6 +71,13 @@ def __init__(self, start_month, end_month):
    def __str__(self):
        return self.message

# Raised when the start day and end day are out of order
class OverbalanceDay(Exception):
    def __init__(self, start_day, end_day):
        self.message = f'{start_day}(start day) is an overbalance with {end_day}(end day)'

    def __str__(self):
        return self.message


# Raised when execution takes too long and no data can be retrieved
class ResponseTimeout(Exception):
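To illustrate the new day validation, a small sketch written for this review (not part of the diff):

from korea_news_crawler.articlecrawler import ArticleCrawler
from korea_news_crawler.exceptions import InvalidDay

crawler = ArticleCrawler()
try:
    crawler.set_date_range('2018-02-30', '2018-03-01')  # February 30 does not exist
except InvalidDay as err:
    print(err)  # prints: 30 is an invalid day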
7 changes: 3 additions & 4 deletions korea_news_crawler/sample.py
@@ -1,9 +1,8 @@
from korea_news_crawler.articlecrawler import ArticleCrawler

if __name__ == "__main__":
    Crawler = ArticleCrawler()
    # Available categories: 정치, 경제, 생활문화, IT과학, 사회, 세계
    Crawler.set_category("IT과학", "경제", "생활문화", "IT과학", "사회", "세계")
    # Start crawling from December 2017 through January 2018
    Crawler.set_date_range(2017, 12, 2018, 1)
    Crawler.set_category("IT과학", "세계")
    # Crawl from December 1, 2017 through January 13, 2018; pass dates in YYYY-MM-DD format
    Crawler.set_date_range('2017-12', '2018-01-13')
    Crawler.start()
5 changes: 3 additions & 2 deletions korea_news_crawler/writer.py
@@ -7,9 +7,10 @@ class Writer(object):
    def __init__(self, category, article_category, date):
        self.start_year = date['start_year']
        self.start_month = f'0{date["start_month"]}' if len(str(date['start_month'])) == 1 else str(date['start_month'])
        self.start_day = f'0{date["start_day"]}' if len(str(date['start_day'])) == 1 else str(date['start_day'])
        self.end_year = date['end_year']
        self.end_month = f'0{date["end_month"]}' if len(str(date['end_month'])) == 1 else str(date['end_month'])

        self.end_day = f'0{date["end_day"]}' if len(str(date['end_day'])) == 1 else str(date['end_day'])
        self.file = None
        self.initialize_file(category, article_category)

@@ -20,7 +21,7 @@ def initialize_file(self, category, article_category):
        if os.path.exists(output_path) is not True:
            os.mkdir(output_path)

        file_name = f'{output_path}/{category}_{article_category}_{self.start_year}{self.start_month}_{self.end_year}{self.end_month}.csv'
        file_name = f'{output_path}/{category}_{article_category}_{self.start_year}{self.start_month}{self.start_day}_{self.end_year}{self.end_month}{self.end_day}.csv'
        if os.path.isfile(file_name):
            raise ExistFile(file_name)

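Since the day now appears in the CSV filename, the new format can be checked with a standalone sketch (dates taken from sample.py; 'politics' is just an illustrative article_category):

# How Writer's zero-padding composes the new filename
date = {'start_year': 2017, 'start_month': 12, 'start_day': 1,
        'end_year': 2018, 'end_month': 1, 'end_day': 13}
start = f"{date['start_year']}{date['start_month']:02d}{date['start_day']:02d}"
end = f"{date['end_year']}{date['end_month']:02d}{date['end_day']:02d}"
print(f"Article_politics_{start}_{end}.csv")  # Article_politics_20171201_20180113.csv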
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,2 +1,2 @@
[metadata]
description-file = README.md
description_file = README.md
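The underscore spelling matches newer setuptools, which deprecated dash-separated keys such as description-file.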
6 changes: 3 additions & 3 deletions setup.py
@@ -1,16 +1,16 @@
from setuptools import setup

# build package command: python setup.py bdist_wheel
# release package command: twine upload dist/KoreaNewsCrawler-version-py3-none-any.whl
# release package command: twine upload dist/KoreaNewsCrawler-${version}-py3-none-any.whl

setup(
    name = 'KoreaNewsCrawler',
    version = '1.50',
    version = '1.51',
    description = 'Crawl the korean news',
    author = 'lumyjuwon',
    author_email = 'lumyjuwon@gmail.com',
    url = 'https://github.com/lumyjuwon/KoreaNewsCrawler',
    download_url = 'https://github.com/lumyjuwon/KoreaNewsCrawler/archive/1.50.tar.gz',
    download_url = 'https://github.com/lumyjuwon/KoreaNewsCrawler/archive/1.51.tar.gz',
    install_requires = ['requests', 'beautifulsoup4'],
    packages = ['korea_news_crawler'],
    keywords = ['crawl', 'KoreaNews', 'crawler'],