In [None]:
# default_exp IpPool

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
# hide
!nbdev_build_lib --fname 23_IP_Pool.ipynb

Converted 23_IP_Pool.ipynb.


# 代理池
实现效果：
1. 每10分钟批量抓取一次ip
2. 每天删除http_health和https_health均为0的ip


In [None]:
# export
import re,random,time
from concurrent.futures import ThreadPoolExecutor

import requests,redis 
from bs4 import BeautifulSoup


## 获取一个可用ip

In [None]:
# export
# Shared connection pool, created on first use and then reused by every caller.
_connection_pool = None

def connect_db() -> object:
    """Return a Redis client for localhost:6379 that decodes replies to str.

    The underlying ConnectionPool is built once and shared by all clients
    returned from this function. Building a new pool per call (as before)
    meant no connection was ever reused, which defeats the purpose of a pool
    when many worker threads call this function.

    Returns:
        A ``redis.Redis`` client bound to the shared pool.
    """
    global _connection_pool
    if _connection_pool is None:
        _connection_pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
    return redis.Redis(connection_pool=_connection_pool)

def _get_ip(protocal='http') -> str:
    """Pick one stored proxy at random, using its health value as the weight.

    Args:
        protocal: 'http' weights by the ``http_health`` field; any other
            value weights by ``https_health``.

    Returns:
        The Redis key of the chosen proxy.

    Raises:
        IndexError: if no key carries the requested health field.
    """
    rdb = connect_db()
    # The field name is the same for every key, so resolve it once.
    health_type = 'http_health' if protocal=='http' else 'https_health'

    ips = []
    healths = []
    for name in rdb.keys():
        health = rdb.hget(name,health_type)
        # hget returns None when the field is absent (for example the key was
        # removed after keys() listed it); skip it instead of crashing on
        # int(None).
        if health is None:
            continue
        ips.append(name)
        healths.append(int(health))

    if not ips:
        raise IndexError('no proxy with a health value is available in the pool')
    # random.choices cannot draw from an all-zero weight list (ValueError on
    # Python 3.9+), so fall back to a uniform pick in that case.
    if sum(healths) <= 0:
        return random.choice(ips)
    return random.choices(ips,healths)[0]

In [None]:
# Demo: draw one proxy weighted by http_health (the default protocal).
_get_ip()

'121.13.252.58:41564'

In [None]:
def make_test_data():
    'Write two sample proxies with fixed http/https health values into Redis.'
    samples = (
        ('167.172.138.162:8080', {'http_health':51,'https_health':12}),
        ('128.199.246.10:44344', {'http_health':25,'https_health':102}),
    )
    rdb = connect_db()
    for key, scores in samples:
        rdb.hmset(key, scores)

In [None]:
# Demo: store the two sample proxies, then draw one weighted by https_health.
make_test_data()
_get_ip('https')   

'188.27.137.163:30987'

## 更新健康值

In [None]:
# export
def update_health(ip,is_health=False,protocal='http'):
    """Adjust a proxy's health after a check: +1 on success, halved on failure.

    Args:
        ip: Redis key of the proxy.
        is_health: True when the proxy just worked, False when it failed.
        protocal: 'http' updates ``http_health``; any other value updates
            ``https_health``.

    Returns:
        None. Does nothing when the key or field does not exist.
    """
    rdb = connect_db()
    health_type = 'http_health' if protocal=='http' else 'https_health'

    current = rdb.hget(ip,health_type)
    if current is None:
        # Nothing stored for this proxy. int(None) would raise, and writing a
        # value here would create a hash with only one of the two fields.
        return

    if is_health:
        # HINCRBY increments on the server and returns the new value, so no
        # second read is needed for the log line.
        health = rdb.hincrby(ip,health_type,1)
        print('+ Health',protocal,ip,health)
    else:
        # Integer halving: repeated failures drive the value down to 0.
        rdb.hset(ip,health_type,int(current)//2)

In [None]:
# Demo: record one successful http check for a sample proxy (adds 1 to http_health).
update_health('128.199.246.10:44344',is_health=True)

+ Health http 128.199.246.10:44344 26


## 爬取ip

In [None]:
# export
# Proxy-list pages to scrape; parallel_crawl_ips hands each one to crawl_ip.
proxy_website_urls = [
    'https://www.kuaidaili.com/free/inha/',
    'http://www.nimadaili.com/gaoni/',
    'https://www.xicidaili.com/nn/',
    'http://www.xiladaili.com/gaoni/',
    'https://ip.jiangxianli.com/?anonymity=2',
    'https://www.7yip.cn/free/',
    'http://www.ip3366.net/free/',
    'https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1',
    'http://proxyslist.com/',
]

def match_ip(tag):
    'Return a regex match if the stripped text of `tag` is exactly a dotted-quad IP, else None.'
    candidate = tag.text.strip()
    return re.match(r'^(\d{1,3}\.){3}\d{1,3}$', candidate)
def match_port(tag):
    'Return a regex match if the stripped text of `tag` is 2 to 5 digits, else None.'
    candidate = tag.text.strip()
    return re.match(r'^\d{2,5}$', candidate)
def match_ip_with_port(tag):
    'Return a regex match if the stripped text of `tag` is exactly `ip:port`, else None.'
    candidate = tag.text.strip()
    return re.match(r'^(\d{1,3}\.){3}\d{1,3}:\d{2,5}$', candidate)

def find_port(ip_item_soup) -> str:
    """Climb from the tag holding an IP until an ancestor also contains a port.

    Args:
        ip_item_soup: the tag whose text matched `match_ip`.

    Returns:
        The stripped text of the first tag matching `match_port` under the
        nearest qualifying ancestor, or None when no port can be attributed to
        this IP.
    """
    soup = ip_item_soup.parent
    # Stop at the top of the tree: the root's parent is None, and calling
    # find_all on it would raise AttributeError.
    while soup is not None:
        # More than one IP under this ancestor means a port found here could
        # belong to a different IP, so give up.
        if len(soup.find_all(match_ip)) > 1:
            return None
        port_tag = soup.find(match_port)
        if port_tag is not None:
            return port_tag.text.strip()
        soup = soup.parent
    # Ran out of ancestors without seeing a port.
    return None

def find_ips(soup) -> iter:
    """Yield every `ip:port` string that can be parsed out of `soup`.

    Two page layouts are tried in order:
      1. a single tag whose text is already `ip:port` (e.g. 39.137.107.98:80);
      2. the IP alone in one tag, with the port looked up nearby by
         `find_port` (e.g. 39.137.107.98 | 80). IPs without a port are skipped.
    If neither layout matches, a failure message with the soup is printed and
    nothing is yielded.
    """
    # Each find_all walks the whole document, so run it once and reuse the result.
    combined = soup.find_all(match_ip_with_port)
    if combined:
        for item in combined:
            yield item.text.strip()
        return

    bare_ips = soup.find_all(match_ip)
    if bare_ips:
        for item in bare_ips:
            port = find_port(item)
            if port:
                yield item.text.strip()+':'+port
        return

    print('解析失败：',soup)
        
def crawl_ip(url, timeout=10):
    """Scrape one proxy-list page and add the proxies found there to Redis.

    New proxies start with http_health and https_health of 100. Proxies that
    are already stored keep their current health values; overwriting them (as
    before) reset every re-listed proxy to 100 on each crawl and discarded
    what validation had recorded.

    Args:
        url: page to fetch.
        timeout: seconds to wait for the HTTP request. Without a timeout a
            stalled site would block the crawling thread indefinitely.

    Returns:
        None. Prints a one-line summary, or a failure message when the request
        raises or the status code is not 200.
    """
    rdb = connect_db()
    try:
        res = requests.get(url,headers={'user-agent':'Mozilla/5.0'},timeout=timeout)
    except requests.RequestException as err:
        # Report the failure here: when run through executor.map the exception
        # would otherwise vanish, since the results are never read.
        print(url,err,'requests请求失败')
        return

    if res.status_code != 200:
        print(url,res,'requests请求失败')
        return

    soup = BeautifulSoup(res.text,'lxml')
    added = 0
    for ip in find_ips(soup):
        # HSETNX writes only when the field is absent and reports whether it
        # wrote, which also gives an exact count of newly added proxies even
        # while other crawlers run in parallel.
        if rdb.hsetnx(ip,'http_health',100):
            added += 1
        rdb.hsetnx(ip,'https_health',100)

    # DBSIZE returns the key count directly instead of listing every key.
    stock = rdb.dbsize()
    print(f'{url} 新增：{added}，库存更新为：{stock}个')

In [None]:
# Demo: crawl the third configured site (index 2 of proxy_website_urls).
crawl_ip(proxy_website_urls[2])

https://www.xicidaili.com/nn/ 新增：0，库存更新为：1703个


## 校验IP

In [None]:
# export
def validate(ip,url='http://m.sm.cn/',timeout=5):
    """Request `url` through the proxy `ip` and record the result with update_health.

    The proxy counts as healthy only when the request completes with status
    code 200; any exception or other status counts as unhealthy.

    Args:
        ip: proxy address, concatenated after '<scheme>://' to build the proxy URL.
        url: target to fetch; its scheme (the text before the first ':')
            selects which health field is updated.
        timeout: seconds passed to requests.get.
    """
    protocal = url.split(':')[0]
    # NOTE(review): the proxy URL reuses the target's scheme, so an https
    # target produces 'https://<ip>'. Confirm that is how the pooled proxies
    # are meant to be reached.
    proxies={protocal: protocal+'://'+ip}
    try:
        res = requests.get(url,
                           headers={'user-agent':'Mozilla/5.0'},
                           proxies=proxies,
                           timeout=timeout)
    except Exception:
        # Any failure while requesting marks the proxy unhealthy. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        is_health = False
    else:
        is_health = res.status_code == 200
    update_health(ip,is_health=is_health,protocal=protocal)
        

In [None]:
# Demo: probe the default URL through one sample proxy and update its health.
validate('128.199.246.10:44344')

- Health http 128.199.246.10:44344 13


In [None]:
# export
def parallel_validate(max_workers=100):
    """Run `validate` on every key in Redis using a pool of `max_workers` threads.

    Returns once all submitted checks have finished (the `with` block waits
    for the pool to shut down). Results of the individual checks are not read.
    """
    rdb = connect_db()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for key in rdb.keys():
            pool.submit(validate, key)

In [None]:
# Demo: validate every proxy currently stored in Redis with the default 100 workers.
parallel_validate()

+ Health http 101.37.118.54:8888 101
+ Health http 116.196.87.86:20183 103
+ Health http 103.43.75.98:80 101
+ Health http 82.119.170.106:8080 51
+ Health http 188.40.68.151:3128 101
+ Health http 119.41.236.180:8010 101
+ Health http 35.183.117.50:80 101
+ Health http 140.238.16.90:3128 103
+ Health http 59.56.28.254:80 101
+ Health http 203.19.88.59:80 101
+ Health http 118.172.201.59:39931 26
+ Health http 202.5.221.66:80 103
+ Health http 103.37.81.92:48248 26
+ Health http 182.171.88.245:8080 103
+ Health http 94.130.70.42:80 101
+ Health http 18.138.91.14:3128 26
+ Health http 80.187.140.26:8080 101
+ Health http 96.113.183.214:3128 103
+ Health http 110.74.195.215:44975 26
+ Health http 1.119.166.180:8080 51
+ Health http 51.158.119.88:8811 26
+ Health http 178.32.6.105:3128 51
+ Health http 197.216.2.14:8080 26
+ Health http 110.232.81.71:8080 52
+ Health http 121.237.148.153:3000 101
+ Health http 116.196.85.150:3128 103
+ Health http 180.252.181.2:80 103
+ Health http 142.93.

## 定期更新IP
> 每10min更新一次

In [None]:
# export
# Timestamp (time.time()) of the most recent crawl; 0 until the first crawl runs.
last_crawl = 0
def parallel_crawl_ips():
    """Crawl every URL in `proxy_website_urls` concurrently on 10 threads.

    Returns once all crawls have finished; their results are not read.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        for site in proxy_website_urls:
            pool.submit(crawl_ip, site)
def repeat_crawl_ips(frequency=600):
    """Crawl all proxy sites unless a crawl already ran in the current time window.

    Wall-clock time is split into consecutive windows of `frequency` seconds.
    A crawl runs only when the current time falls in a different window than
    the time stored in `last_crawl`, which is then updated.

    Args:
        frequency: window length in seconds (default 600, i.e. 10 minutes).
    """
    global last_crawl
    now = time.time()
    already_ran_in_window = (last_crawl//frequency == now//frequency)
    if already_ran_in_window:
        return
    last_crawl = now
    parallel_crawl_ips()

In [None]:
# Demo: crawl all sites unless a crawl already ran in the current 600-second window.
repeat_crawl_ips()

https://www.7yip.cn/free/ 新增：20，库存更新为：1447个
http://www.nimadaili.com/gaoni/ 新增：22，库存更新为：1449个
http://www.ip3366.net/free/ 新增：23，库存更新为：1450个
http://www.xiladaili.com/gaoni/ 新增：33，库存更新为：1460个
https://ip.jiangxianli.com/?anonymity=2 新增：14，库存更新为：1460个
https://www.kuaidaili.com/free/inha/ 新增：31，库存更新为：1461个
https://www.xicidaili.com/nn/ 新增：22，库存更新为：1461个
https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1 新增：0，库存更新为：1461个
http://proxyslist.com/ 新增：24，库存更新为：1485个


## 定期删除IP
> 每日删除health为0的IP

In [None]:
# export
# Timestamp (time.time()) of the most recent deletion pass; 0 until the first pass runs.
last_delete = 0
def delete_ips():
    """Delete every proxy whose http_health and https_health are both 0.

    Prints how many keys were removed.
    """
    rdb = connect_db()
    count = 0
    for name in rdb.keys():
        # Values are compared as the string '0' because connect_db enables
        # decode_responses, so hash values come back as str.
        if rdb.hget(name,'http_health') == '0' and rdb.hget(name,'https_health') == '0':
            # Delete the key being inspected. The previous code passed the
            # undefined name `ip`, which raised NameError on the first dead proxy.
            rdb.delete(name)
            count += 1
    print(f'删除{count}个无效IP')
    
def repeat_delete_ips(frequency=24*3600):
    """Run `delete_ips` unless a deletion pass already ran in the current time window.

    Wall-clock time is split into consecutive windows of `frequency` seconds.
    The pass runs only when the current time falls in a different window than
    the time stored in `last_delete`, which is then updated.

    Args:
        frequency: window length in seconds (default 24*3600, i.e. one day).
    """
    global last_delete
    now = time.time()
    already_ran_in_window = (last_delete//frequency == now//frequency)
    if already_ran_in_window:
        return
    last_delete = now
    delete_ips()
    

In [None]:
# Demo: run the deletion pass unless one already ran in the current 24-hour window.
repeat_delete_ips()

删除0个无效IP


## 自动维护IP池

In [None]:
# export
def get_ip(protocal='http') -> str:
    """Return one proxy from the pool, running pool maintenance first.

    Calls the throttled crawl and the throttled deletion pass, then draws a
    proxy with `_get_ip` for the given `protocal`.
    """
    for maintain in (repeat_crawl_ips, repeat_delete_ips):
        maintain()
    chosen = _get_ip(protocal)
    return chosen
    

In [None]:
# Demo: public entry point, which runs the throttled crawl and deletion before drawing a proxy.
get_ip()

'192.41.71.199:3128'