-
Notifications
You must be signed in to change notification settings - Fork 0
/
08http_crawl.py
79 lines (67 loc) · 2.25 KB
/
08http_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
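"""
Crawl the navigation menu of https://www.w3school.com.cn/ and, for every
navigation entry, collect the course list on the left and the sidebar on the
right (the tool box and the sponsor image links). The results are saved into
a single, pretty-printed JSON file.

Author: TangYue
"""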
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class W3school:
    """Crawler that records the page structure behind each w3school menu entry.

    ``run`` fetches the home page, follows every link found inside the
    ``#menu`` element, extracts the course list, tool box and sponsor link
    from each linked page, and writes the combined result to
    ``structure.json`` in the current working directory.
    """

    def __init__(self, timeout=10):
        """Set the base URL, request headers and network timeout.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url = 'https://www.w3school.com.cn'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/84.0.4147.89 Safari/537.36 '
        }
        self.timeout = timeout
        # URL of the page most recently handed to ``_build``; kept as an
        # attribute because ``_build()`` falls back to it when called
        # without an argument.
        self.urls = self.url

    def _soup(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup tree.

        The response body is decoded as GBK before parsing with lxml.
        """
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        response.encoding = 'GBK'
        return BeautifulSoup(response.text, "lxml")

    def _build(self, url=None):
        """Extract the course list, tool box and sponsor link of one page.

        Args:
            url: Page to fetch. When omitted, ``self.urls`` is used.

        Returns:
            A dict with the key "课程表" (list of link texts inside
            ``#course``, empty when the element is absent) and, only when
            the corresponding element exists on the page, "工具箱" (mapping
            of each ``<h2>`` text in ``#tools`` to the texts of the ``<li>``
            items in the first ``<ul>`` sibling after it) and "赞助商连接"
            (sponsor link found in ``#sp_sidebar``).
        """
        if url is None:
            url = self.urls
        soup = self._soup(url)
        wrap = {}

        course = soup.select("#course")
        # Pages without a #course element yield an empty list instead of
        # aborting the whole crawl with an IndexError.
        wrap["课程表"] = (
            [link.string for link in course[0].find_all("a")] if course else []
        )

        tools = soup.select("#tools")
        if tools:
            toolbox = {}
            for heading in tools[0].find_all("h2"):
                lists = heading.find_next_siblings("ul")
                # A heading with no following <ul> simply has no entries.
                items = lists[0].find_all("li") if lists else []
                toolbox[heading.string] = [item.string for item in items]
            wrap["工具箱"] = toolbox

        sidebars = soup.select("#sp_sidebar")
        if sidebars:
            sidebar = {}
            # href=True skips anchors that carry no href attribute.
            # NOTE: every link is stored under the same key, so only the
            # href of the last anchor is kept in the output.
            for link in sidebars[0].find_all("a", href=True):
                sidebar['图片跳转链接地址'] = link["href"]
            wrap["赞助商连接"] = sidebar
        return wrap

    def run(self):
        """Crawl every menu entry and write the result to ``structure.json``.

        Raises:
            IndexError: If the home page contains no ``#menu`` element.
        """
        soup = self._soup(self.url)
        menu = soup.select("#menu")
        if not menu:
            raise IndexError("no '#menu' element found at " + self.url)

        wrap = {}
        for link in menu[0].find_all("a", href=True):
            # urljoin copes with rooted, relative and absolute hrefs alike.
            self.urls = urljoin(self.url, link["href"])
            wrap[link.string] = self._build(self.urls)

        text = json.dumps(wrap, ensure_ascii=False, sort_keys=False, indent=4,
                          separators=(',', ': '))
        # The context manager closes the file even if the write fails.
        with open('structure.json', "w", encoding="utf-8") as out:
            out.write(text)
# Script entry point: crawl the site and write structure.json.
if __name__ == "__main__":
    W3school().run()