crawler.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
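'''Crawl Aozora Bunko (http://www.aozora.gr.jp/): walk the author index pages,
visit every author's book list, and collect each book's title, subtitle,
kana usage, copyright status, and HTML/zip file URLs into a JSON file.'''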
import argparse
from bs4 import BeautifulSoup
import json
import os
import requests
import time


class Crawler(object):

    def __init__(self, waiting_time=0.1):
        self.waiting_time = waiting_time

    def get_book_detail(self, book_page_url):
        '''Extract book info such as a title, a subtitle, and file urls'''
        r = requests.get(book_page_url)
        # requests falls back to ISO-8859-1 when the server sends no charset,
        # so re-encode back to the raw bytes and let BeautifulSoup detect the
        # page's actual encoding itself
        html = r.text.encode('ISO-8859-1')
        soup = BeautifulSoup(html)
        title_data = soup.find(attrs={'summary': u'タイトルデータ'})
        sakuhin_data = soup.find(attrs={'summary': u'作品データ'})
        files = soup.find_all(attrs={'bgcolor': 'white'})
        # extract the book title
        title = title_data.find(attrs={'size': '+2'}).text.encode('utf-8')
        # extract the book subtitle if it exists
        subtitle = ''
        for tr in title_data.find_all('tr'):
            if u'副題:' in tr.text:
                subtitle = tr.find_all('td')[-1].text
        # extract which kanji/kana usage the text is written in
        if u'新字新仮名' in sakuhin_data.text:
            kana_usage = u'新字新仮名'
        elif u'新字旧仮名' in sakuhin_data.text:
            kana_usage = u'新字旧仮名'
        else:
            kana_usage = u'旧字旧仮名'
        # extract file locations
        file_urls = []
        for file_data in files:
            file_name = file_data.find('a').text
            file_url = book_page_url.rsplit('/', 1)[0] + '/files/' + file_name
            file_urls.append(file_url)
        # check whether the copyright is still in force
        if u'*著作権存続*' in soup.text:
            copyright = True
        else:
            copyright = False
        res = {
            'title': title,
            'subtitle': subtitle,
            'kana_usage': kana_usage,
            'copyright': copyright
        }
        for file_url in file_urls:
            ext = os.path.splitext(file_url)[1]
            if ext == '.html':
                res['html_file'] = file_url
            elif ext == '.zip':
                res['zip_file'] = file_url
        return res

    def get_book_pages(self, author_page_url):
        '''Extract an author name and book urls'''
        r = requests.get(author_page_url)
        html = r.text.encode('ISO-8859-1')
        soup = BeautifulSoup(html)
        author_name = soup.find(attrs={'size': '+2'}).text.encode('utf-8')
        book_list = soup.find('ol')
        book_urls = []
        for book in book_list.find_all('li'):
            url = book.find('a').get('href')
            if url:
                # hrefs are relative ('../cards/...'), so drop the leading
                # '../' and prepend the site root
                book_urls.append('http://www.aozora.gr.jp/' + url[3:])
        return {'author': author_name, 'book_urls': book_urls}

    def get_author_pages(self, column_page_url):
        '''Extract author page urls'''
        r = requests.get(column_page_url)
        html = r.text.encode('ISO-8859-1')
        soup = BeautifulSoup(html)
        authors = soup.find_all('ol')
        author_links = []
        # the page holds one <ol> per kana sound; collect the author links from each
        for sound_ol in authors:
            for author in sound_ol.find_all('li'):
                author_links.append('http://www.aozora.gr.jp/index_pages/' + author.find('a').get('href'))
        return author_links

    def wait(self, start, end):
        '''Sleep so that at least waiting_time seconds pass between requests'''
        if end - start < self.waiting_time:
            time.sleep(self.waiting_time - (end - start))

    def crawl(self):
        '''Extract all author names and related file urls.
        Returns a dict mapping each author name to a list of book info dicts.'''
        url = 'http://www.aozora.gr.jp/index_pages/person_{}.html'
        column_names = ['a', 'ka', 'sa', 'ta', 'na', 'ha', 'ma', 'ya', 'ra', 'wa']
        # column_names = ['wa']  # uncomment to restrict the crawl to one column for testing
        res = {}
        for col_name in column_names:
            authors = self.get_author_pages(url.format(col_name))
            for author in authors:
                author_info = self.get_book_pages(author)
                author_name = author_info['author']
                res[author_name] = []
                book_urls = author_info['book_urls']
                print author_name
                for book in book_urls:
                    start = time.time()
                    book_info = self.get_book_detail(book)
                    res[author_name].append(book_info)
                    print ' ' + book_info['title'], book_info['subtitle'], book_info['kana_usage']
                    end = time.time()
                    self.wait(start, end)
        return res


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--object',
                        help='a file name which will contain the result in the json format')
    parser.add_argument('-w', '--waiting-time',
                        help='a waiting time to extract each book info', type=float)
    args = parser.parse_args()
    waiting_time = args.waiting_time
    if waiting_time:
        crawler = Crawler(waiting_time=waiting_time)
    else:
        crawler = Crawler()
    result = crawler.crawl()
    json_result = json.dumps(result)
    fname = args.object
    if fname is None:
        fname = 'aozora.json'
    with open(fname, 'w') as f:
        f.write(json_result)
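# Example invocation (flags as defined by the argparse setup above; the output
# file defaults to aozora.json when -o is omitted):
#   python crawler.py -o aozora.json -w 0.5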