-
Notifications
You must be signed in to change notification settings - Fork 1
/
mzitu爬取美女图片.py
123 lines (104 loc) · 3.58 KB
/
mzitu爬取美女图片.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import requests
import os
import time
import threading
from bs4 import BeautifulSoup
def download_page(url):
    '''
    Download an HTML page and return its decoded text.

    Args:
        url: The page URL to fetch.

    Returns:
        The page text (UTF-8 decoded) on HTTP 200, otherwise None —
        including on connection errors and non-200 responses.
    '''
    # The original adjacent string literals were joined without separating
    # spaces, producing a malformed User-Agent; single spaces restored.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.88 Safari/537.36',
               'Referer': 'http://www.mzitu.com/'
               }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        # Compare with ==, not "is": identity on ints only "works" because of
        # CPython's small-int caching and is not a correctness guarantee.
        if response.status_code == 200:
            return response.text
        return None  # explicit None on non-200 instead of falling off the end
    except requests.ConnectionError:
        return None
def download_pic(url, text):
    '''
    Download one image and save it under pic/<text>/.

    Args:
        url: Direct image URL; its final path component becomes the filename.
        text: Album title, used as the sub-directory name under pic/.

    Returns:
        None on connection error or non-200 response; otherwise nothing.
    '''
    # UA string fixed to match download_page: the original adjacent literals
    # were concatenated without spaces, yielding a malformed header.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.88 Safari/537.36',
               'Referer': 'http://www.mzitu.com/'
               }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Only persist a successful response; writing an error body would
        # leave a corrupt "image" file on disk.
        if response.status_code != 200:
            return None
        with open('pic/{}/{}'.format(text, url.split('/')[-1]), 'wb') as f:
            f.write(response.content)
    except requests.ConnectionError:
        return None
def get_page_list(html):
    '''
    Parse one index page and crawl every album it lists.

    For each <li> in the post list, extract the album link and its title
    (the <img> alt text), expand the link into all of the album's pages,
    and download the images.
    '''
    post_items = BeautifulSoup(html, 'html.parser') \
        .find('div', class_='postlist') \
        .find_all('li')
    for item in post_items:
        album_link = item.find('a').get('href')
        album_title = item.find('img').get('alt')
        get_pic(get_all_page(album_link), album_title)
        time.sleep(1)  # be polite between albums
def get_pic(urls, title):
    '''
    Download the main image from each page of one album.

    Args:
        urls: URLs of the individual pages of a single album.
        title: Album title, used as the destination directory name.
    '''
    # Hoisted out of the loop: one mkdir per album, not one per page.
    create_dir('pic/{}'.format(title))
    for url in urls:
        html = download_page(url)  # fetch the page
        if html is None:
            continue  # fetch failed; skip rather than crash in the parser
        soup = BeautifulSoup(html, 'html.parser')
        img_tag = soup.find(name='div', attrs={"class": "main-image"}).find('img')
        pic_link = img_tag.get('src')  # the concrete image URL
        # Bug fix: the original printed the undefined name `i` (NameError on
        # every call); report the page URL being processed instead.
        print(title, url)
        download_pic(pic_link, title)
def get_all_page(link):
    '''
    Build the URL of every page of one album.

    The album's pagination widget ends with [..., last-page, "next"], so the
    second-to-last <a> links to the last page, e.g. ".../12345/50".

    Args:
        link: URL of the album's first page.

    Returns:
        A list of page URLs covering page 1 through the last page.
    '''
    html = download_page(link)  # fetch the album's first page
    soup = BeautifulSoup(html, 'html.parser')
    page_list = soup.find('div', class_='pagenavi').find_all('a')
    last_url = page_list[-2].get('href')
    # Strip the trailing page number generically: the original hard-coded a
    # 2-digit suffix (url[-2] + url[-1]) and broke for 1- or 3-digit counts.
    base = last_url.rstrip('0123456789')
    page_num = int(last_url[len(base):])
    # Upper bound is page_num + 1: the original range(1, page_num) silently
    # dropped the album's last page.
    return [base + str(i) for i in range(1, page_num + 1)]
def create_dir(name):
    '''Create directory *name* (including parents) unless it already exists.'''
    if os.path.exists(name):
        return
    os.makedirs(name)
def execute(url):
    '''Fetch one index page and crawl every album listed on it.'''
    get_page_list(download_page(url))
def main():
    '''Entry point: crawl index pages 1 through 9 and download every album.'''
    create_dir('pic')
    # Sequential crawl. NOTE: a bounded-thread variant (max 5 daemon workers
    # over pages 1-70, hence the `threading` import) was previously sketched
    # here and can be reinstated if throughput matters.
    for cur_page in range(1, 10):
        url = 'https://www.mzitu.com/page/{}/'.format(cur_page)
        execute(url)
# Run the crawler only when executed directly (not when imported).
if __name__ == '__main__':
    main()