西刺国内普通代理.py
#!/usr/bin/env python
# coding:utf-8
import urllib2
import threading
import time
import os
from bs4 import BeautifulSoup

rawProxyList = []       # proxies scraped from the site, as [ip, port] pairs
checkedProxyList = []   # proxies that passed validation, as (ip, port, seconds)

# Pages to scrape for proxies; page 1 of www.xicidaili.com/nt/ takes no page suffix
targets = []
for i in xrange(1, 3):
    if i != 1:
        target = r"http://www.xicidaili.com/nt/%d" % i
    else:
        target = r"http://www.xicidaili.com/nt/"
    targets.append(target)
print targets
# Thread class that scrapes proxies from one target page
class ProxyGet(threading.Thread):
    def __init__(self, target):
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        print "Target site: " + self.target
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        header = {'User-Agent': user_agent}
        req = urllib2.Request(self.target, headers=header)
        resp = urllib2.urlopen(req)
        result = resp.read()
        soup = BeautifulSoup(result, 'lxml')
        for child in soup.find_all('tr'):
            cells = child.find_all('td')
            try:
                # Columns 1 and 2 of each table row hold the IP and port
                proxy = [cells[1].string.encode('utf8'), cells[2].string.encode('utf8')]
                rawProxyList.append(proxy)
            except IndexError:
                # Header rows have no <td> cells; skip them
                continue

    def run(self):
        self.getProxy()
# Thread class that validates a slice of the scraped proxies
class ProxyCheck(threading.Thread):
    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        self.testStr = "030173"   # marker expected in Baidu's homepage (its ICP licence number)

    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            print proxy
            proxyHandler = urllib2.ProxyHandler({"http": r'%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            t1 = time.time()
            try:
                resp = opener.open(self.testUrl, timeout=self.timeout)
                result = resp.read()
                timeused = time.time() - t1
                # Keep the proxy only if the marker string appears in the response
                if result.find(self.testStr) >= 0:
                    checkedProxyList.append((proxy[0], proxy[1], timeused))
            except Exception:
                # Unreachable or too-slow proxies are silently discarded
                continue

    def run(self):
        self.checkProxy()
if __name__ == "__main__":
    getThreads = []
    checkThreads = []

    # One scraper thread per target page
    for target in targets:
        t = ProxyGet(target)
        getThreads.append(t)
    for t in getThreads:
        t.start()
    for t in getThreads:
        t.join()

    print '.' * 10 + "Scraped %s proxies in total" % len(rawProxyList) + '.' * 10

    # Split the scraped proxies into 20 roughly equal slices,
    # one checker thread per slice
    chunk = (len(rawProxyList) + 19) / 20
    for i in range(20):
        t = ProxyCheck(rawProxyList[chunk * i:chunk * (i + 1)])
        checkThreads.append(t)
    for t in checkThreads:
        t.start()
    for t in checkThreads:
        t.join()

    print '.' * 10 + "%s proxies passed validation in total" % len(checkedProxyList) + '.' * 10

    # Persist the working proxies to the desktop, fastest first
    desktopPath = os.path.join(os.path.expanduser("~"), 'Desktop')
    f = open(os.path.join(desktopPath, "proxy_list.txt"), 'w')
    for proxy in sorted(checkedProxyList, key=lambda x: x[2]):
        print "checked proxy is: %s:%s" % (proxy[0], proxy[1])
        f.write("%s:%s\n" % (proxy[0], proxy[1]))
    f.close()
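
For reference, a minimal sketch of consuming the resulting proxy_list.txt with the same urllib2.ProxyHandler API the checker uses. It assumes the script above has already run and written the file to the desktop; the file name use_proxy.py is illustrative, not part of the script.

# use_proxy.py -- consumption sketch; assumes proxy_list.txt was written by
# the script above and is sorted fastest-first
import os
import urllib2

desktopPath = os.path.join(os.path.expanduser("~"), 'Desktop')
with open(os.path.join(desktopPath, "proxy_list.txt")) as f:
    best = f.readline().strip()   # first line is the fastest checked proxy

# Route a request through that proxy, exactly as ProxyCheck does
opener = urllib2.build_opener(urllib2.ProxyHandler({"http": best}))
print opener.open("http://www.baidu.com/", timeout=5).read()[:200]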