Merge pull request #9 from lumyjuwon/repeat_request
Repeat request & html error
lumyjuwon committed Jul 4, 2019
2 parents 615e0aa + 272185a commit 34dfe6b
Showing 1 changed file with 17 additions and 3 deletions.

korea_news_crawler/articlecrawler.py
@@ -76,6 +76,16 @@ def make_news_page_url(category_url, start_year, end_year, start_month, end_month):
         for page in range(1, totalpage + 1):
             made_urls.append(url + "&page=" + str(page))
         return made_urls
+
+    def get_url_data(self, url, max_tries=10):
+        remaining_tries = int(max_tries)
+        while remaining_tries > 0:
+            try:
+                return requests.get(url)
+            except requests.exceptions.RequestException:
+                time.sleep(60)
+            remaining_tries = remaining_tries - 1
+        raise Exception("Couldn't get the data.")
 
     def crawling(self, category_name):
         # Multi Process PID
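The new helper retries a failed GET up to max_tries times, sleeping 60 seconds after each failure before giving up. Note that requests.exceptions.RequestException is the base class for everything requests raises; catching the bare requests.exceptions module (as an earlier draft did) would itself raise a TypeError the first time a request failed, because a module is not an exception class. A minimal standalone sketch of the same pattern follows; the function name get_with_retry, the wait_seconds parameter, the short waits, and the example URL are illustrative, not part of the commit:

    import time
    import requests

    def get_with_retry(url, max_tries=10, wait_seconds=60):
        # Try the request up to max_tries times, pausing between failures.
        for attempt in range(max_tries):
            try:
                return requests.get(url)
            except requests.exceptions.RequestException:
                # Covers ConnectionError, Timeout, and other requests errors.
                time.sleep(wait_seconds)
        raise Exception("Couldn't get the data.")

    # Example: fetch a page while tolerating transient network errors.
    response = get_with_retry("https://news.naver.com", max_tries=3, wait_seconds=1)
    print(response.status_code)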
@@ -96,7 +106,8 @@ def crawling(self, category_name):
             regex = re.compile("date=(\d+)")
             news_date = regex.findall(URL)[0]
 
-            request = requests.get(URL)
+            request = self.get_url_data(URL)
+
             document = BeautifulSoup(request.content, 'html.parser')
 
             # html - newsflash_body - type06_headline, type06
@@ -115,8 +126,11 @@ def crawling(self, category_name):
                 sleep(0.01)
 
                 # Fetch the article HTML
-                request_content = requests.get(content_url)
-                document_content = BeautifulSoup(request_content.content, 'html.parser')
+                request_content = self.get_url_data(content_url)
+                try:
+                    document_content = BeautifulSoup(request_content.content, 'html.parser')
+                except Exception:
+                    continue
 
                 try:
                     # Fetch the article title
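In the crawl loop, the same helper replaces the direct requests.get call, and a try/except now skips an article whose HTML trips up the parser instead of aborting the whole category. A minimal sketch of that guard, assuming a hypothetical parse_article wrapper and a placeholder URL:

    import requests
    from bs4 import BeautifulSoup

    def parse_article(content_url):
        # Fetch one article and return its parsed document, or None on bad HTML.
        request_content = requests.get(content_url)
        try:
            return BeautifulSoup(request_content.content, 'html.parser')
        except Exception:
            return None  # caller skips articles whose HTML cannot be parsed

    document_content = parse_article("https://news.naver.com")
    if document_content is not None:
        print(document_content.title)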
