Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ The project code in this repository is crawling three different public proxy web
* http://proxyfor.eu/geo.php
* http://free-proxy-list.net
* http://rebro.weebly.com/proxy-list.html
* http://www.samair.ru/proxy/time-01.htm

After collecting the proxy data and filtering out the slowest proxies, it randomly selects one of them to query the target URL.
The request timeout is set to 30 seconds; if a proxy fails to return a response, it is removed from the application's proxy list.
Expand Down
31 changes: 31 additions & 0 deletions project/http/requests/proxy/requestProxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def __init__(self, web_proxy_list=[]):
self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php', 100.0)
self.proxy_list += self.freeProxy_url_parser('http://free-proxy-list.net')
self.proxy_list += self.weebly_url_parser('http://rebro.weebly.com/proxy-list.html')
self.proxy_list += self.samair_url_parser('http://www.samair.ru/proxy/time-01.htm')


def get_proxy_list(self):
    """Return the proxy list currently held by this instance.

    The stored list object itself is handed back (not a copy), so any
    mutation performed by the caller is visible to this instance.
    """
    current_proxies = self.proxy_list
    return current_proxies
Expand Down Expand Up @@ -127,6 +129,34 @@ def weebly_url_parser(self, web_url):
curr_proxy_list.append(proxy.__str__())
return curr_proxy_list

def samair_url_parser(self, web_url, speed_in_KBs=100.0):
    """Scrape proxy addresses from a samair.ru proxy-list page.

    The port of each proxy is not taken from the table text: every proxy
    ``<span>`` carries a CSS class, and a stylesheet linked from the page
    maps that class to the port through a quoted value. This method
    downloads the page and the stylesheet, builds the class -> port
    mapping and joins it with the span text.

    :param web_url: URL of the proxy-list page to scrape.
    :param speed_in_KBs: not used by this parser; kept so the signature
        stays backward-compatible.
    :return: list of strings of the form
        ``'http://' + <span text> + <port>``. An empty list is returned
        when the stylesheet link or the ``proxylist`` table is missing.
    """
    curr_proxy_list = []
    content = requests.get(web_url).content
    soup = BeautifulSoup(content, "html.parser")

    # The stylesheet holds the port numbers, so find its URL first.
    # <link> tags without an href are ignored instead of raising.
    style = None
    for link in soup.find_all('link'):
        href = link.get('href')
        if href and '/styles/' in href:
            style = "http://www.samair.ru" + href
            break
    if style is None:
        # Without the stylesheet the ports cannot be recovered.
        return curr_proxy_list

    # Use the decoded text so the string splitting below behaves the same
    # whether the response body is exposed as str or bytes.
    css_text = requests.get(style).text

    # Each usable rule is expected to look like: .<class>:<pseudo> {...:"<port>"}
    # i.e. a selector, one space, then a declaration with the port in
    # double quotes. Blank or differently shaped lines are skipped rather
    # than blindly dropping the last line.
    ports = {}
    for line in css_text.splitlines():
        parts = line.split(' ')
        if len(parts) < 2:
            continue
        quoted = parts[1].split('"')
        if len(quoted) < 2:
            continue
        # Selector without the leading '.' and without the ':pseudo' part.
        key = parts[0].split(':')[0][1:]
        ports[key] = quoted[1]

    table = soup.find("table", attrs={"id": "proxylist"})
    if table is None:
        return curr_proxy_list

    # The first <span> is skipped, exactly as before -- presumably it is
    # not a proxy entry; TODO confirm against the live page layout.
    for span in table.find_all("span")[1:]:
        classes = span.get('class')
        if not classes or classes[0] not in ports:
            # No port can be resolved for this span; skip it.
            continue
        curr_proxy_list.append('http://' + span.text + ports[classes[0]])

    return curr_proxy_list

def generate_proxied_request(self, url, params={}, req_timeout=30):
#if len(self.proxy_list) < 2:
# self.proxy_list += self.proxyForEU_url_parser('http://proxyfor.eu/geo.php')
Expand All @@ -137,6 +167,7 @@ def generate_proxied_request(self, url, params={}, req_timeout=30):
request = None
try:
rand_proxy = random.choice(self.proxy_list)
print "Next proxy: " + str(rand_proxy)
request = requests.get(test_url, proxies={"http": rand_proxy},
headers=req_headers, timeout=req_timeout)
except ConnectionError:
Expand Down