In [None]:
# default_exp IpPool

%reload_ext autoreload
%autoreload 2

# 代理池
实现效果
1. 自动抓取新IP
2. 自动删除无效IP（根据健康度）


In [None]:
# export
import re,random,time
from concurrent.futures import ThreadPoolExecutor

import requests,redis 
from bs4 import BeautifulSoup


## 获取一个可用ip

In [None]:
# export
# One shared pool per (host, port) so repeated calls reuse connections
# instead of building a brand-new pool every time.
_connection_pools = {}

def connect_db(host='localhost', port=6379) -> 'redis.Redis':
    """Return a Redis client bound to a cached connection pool.

    Args:
        host: Redis server host name.
        port: Redis server port.

    Returns:
        A ``redis.Redis`` client created with ``decode_responses=True``.
        Clients requested for the same (host, port) share one pool.
    """
    key = (host, port)
    pool = _connection_pools.get(key)
    if pool is None:
        # setdefault keeps a single winner if two threads race on first use.
        pool = _connection_pools.setdefault(
            key,
            redis.ConnectionPool(host=host, port=port, decode_responses=True),
        )
    return redis.Redis(connection_pool=pool)

def _get_ip(protocal='http') -> str:
    """Pick one proxy address at random, weighted by its health score.

    Only the 20 highest-scored members of the sorted set named by
    ``protocal`` are considered.

    Args:
        protocal: Name of the Redis sorted set to draw from.

    Returns:
        The chosen member of the sorted set.

    Raises:
        IndexError: If the sorted set has no members.
    """
    rdb = connect_db()
    candidates = rdb.zrange(protocal, -20, -1, withscores=True)
    if not candidates:
        raise IndexError(f'no proxy available in pool {protocal!r}')

    ips = [ip for ip, _score in candidates]
    # Health can go below zero; clamp so it never becomes a negative weight.
    weights = [max(score, 0) for _ip, score in candidates]
    if sum(weights) <= 0:
        # random.choices rejects an all-zero weight list; fall back to uniform.
        return random.choice(ips)
    return random.choices(ips, weights=weights, k=1)[0]

In [None]:
def make_test_data():
    """Seed the 'http' and 'https' sorted sets with one sample proxy at score 10."""
    client = connect_db()
    sample_proxy = '39.137.107.9:8080'
    for pool_name in ('http', 'https'):
        client.zadd(pool_name, {sample_proxy: 10})

In [None]:
# Uncomment the next line to seed a sample proxy before drawing one.
# make_test_data()
# Draw one address from the default 'http' sorted set.
_get_ip()   

'117.88.177.109:3000'

## 更新健康值

In [None]:
# export
def update_health(ip,is_health=False,protocal='http') -> float:
    """Adjust the score of ``ip`` in the sorted set named by ``protocal``.

    A healthy check adds 1 to the score; an unhealthy one subtracts 10.
    Returns whatever ``zincrby`` reports for the member.
    """
    if is_health:
        delta = 1
    else:
        delta = -10
    return connect_db().zincrby(protocal, delta, ip)

In [None]:
# Reward one address in the default 'http' set; the cell shows the value returned by zincrby.
update_health('128.199.246.10:44344',is_health=True)

1.0

## 爬取ip

In [None]:
# export
# Pages that publish free proxy lists; each one is crawled by crawl_ip.
proxy_website_urls = [
    'https://www.kuaidaili.com/free/inha/',
    'http://www.nimadaili.com/gaoni/',
    'http://www.xiladaili.com/gaoni/',
    'https://ip.jiangxianli.com/?anonymity=2',
    'https://www.7yip.cn/free/',
    'http://www.ip3366.net/free/',
    'https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1',
    'http://proxyslist.com/',
]

# Patterns are applied to a tag's stripped text.
_IP_PATTERN = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')
_PORT_PATTERN = re.compile(r'^\d{2,5}$')
_IP_WITH_PORT_PATTERN = re.compile(r'^(\d{1,3}\.){3}\d{1,3}:\d{2,5}$')


def match_ip(tag):
    """Return a match object when the tag's text is a dotted-quad address, else None."""
    stripped = tag.text.strip()
    return _IP_PATTERN.match(stripped)


def match_port(tag):
    """Return a match object when the tag's text is a 2-5 digit number, else None."""
    stripped = tag.text.strip()
    return _PORT_PATTERN.match(stripped)


def match_ip_with_port(tag):
    """Return a match object when the tag's text looks like ``a.b.c.d:port``, else None."""
    stripped = tag.text.strip()
    return _IP_WITH_PORT_PATTERN.match(stripped)

def find_port(ip_item_soup) -> "str | None":
    """Find the port that belongs to the tag holding an IP address.

    Climbs from the tag towards the document root, one ancestor at a time.
    The first ancestor that contains a port-looking tag supplies the port.

    Args:
        ip_item_soup: The tag whose text is the IP address.

    Returns:
        The port as a string, or None when no port can be attributed to
        this address (an ancestor already holds more than one address, or
        the root is reached without finding a port).
    """
    node = ip_item_soup.parent
    # Stop at the root: its parent is None, and calling find_all on None
    # would raise AttributeError.
    while node is not None:
        if len(node.find_all(match_ip)) > 1:
            # This ancestor spans several addresses, so any port found here
            # could belong to a different one.
            return None
        port_tag = node.find(match_port)
        if port_tag is not None:
            return port_tag.text.strip()
        node = node.parent
    return None

def find_ips(soup) -> iter:
    """Yield ``ip:port`` strings parsed from a page.

    Two layouts are supported, tried in this order:
      * a single tag holding ``39.137.107.98:80``
      * an address tag and a separate port tag (``39.137.107.98 | 80``),
        in which case ``find_port`` is asked for the matching port

    Each address is yielded once even if several tags carry the same text.
    When neither layout matches, the page is printed as a parse failure.

    Args:
        soup: Parsed document exposing ``find_all``.
    """
    seen = set()

    # Layout 1: address and port in the same tag. Search the document once.
    combined_tags = soup.find_all(match_ip_with_port)
    if combined_tags:
        for item in combined_tags:
            address = item.text.strip()
            if address not in seen:
                seen.add(address)
                yield address
        return

    # Layout 2: address and port in separate tags.
    ip_tags = soup.find_all(match_ip)
    if ip_tags:
        for item in ip_tags:
            ip = item.text.strip()
            port = find_port(item)
            if not port:
                continue
            address = ip+':'+port
            if address not in seen:
                seen.add(address)
                yield address
        return

    print('解析失败：',soup)
        
        
# 这里没想好，到底http和https都爬，然后自动切换还是手动
def crawl_ip(url,protocal='http',timeout=10):
    """Crawl one listing page and store the addresses it publishes.

    Every address not already present is added to the sorted set named by
    ``protocal`` with an initial score of 100 (``nx=True`` leaves existing
    members and their scores untouched). A summary line is printed.

    Args:
        url: Page to download.
        protocal: Name of the Redis sorted set that receives the addresses.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        None. Download failures are reported with a printed message.
    """
    rdb = connect_db()
    increase = 0

    try:
        # Without a timeout a stalled site would block its worker forever.
        res = requests.get(url,headers={'user-agent':'Mozilla/5.0'},timeout=timeout)
    except requests.RequestException as err:
        print(url,err,'requests请求失败')
        return
    if res.status_code != 200:
        print(url,res,'requests请求失败')
        return

    soup = BeautifulSoup(res.text,features='lxml')
    for ip in find_ips(soup):
        # Write to the requested set rather than a hard-coded 'http'.
        if rdb.zadd(protocal,{ip:100},nx=True):
            increase += 1
    stock = rdb.zcount(protocal,0,100000)
    print(f'{url} 新增：{increase}，库存更新为：{stock}个')

In [None]:
# Crawl a single source (the second URL in proxy_website_urls) as a smoke test.
crawl_ip(proxy_website_urls[1])

http://www.nimadaili.com/gaoni/ 新增：47，库存更新为：303个


## 校验IP

In [None]:
# export
def validate(ip,url='http://m.sm.cn/',timeout=5) -> float:
    """Probe ``url`` through the proxy ``ip`` and record the outcome.

    The scheme of ``url`` (text before the first ':') selects both the
    proxy mapping key and the sorted set whose score is updated.

    Args:
        ip: Proxy address as ``host:port``.
        url: Page requested through the proxy.
        timeout: Seconds to wait for the response.

    Returns:
        The value returned by ``update_health`` for this address.
    """
    protocal = url.split(':')[0]
    # NOTE(review): for an https url this builds an 'https://host:port' proxy
    # URL; confirm the pooled proxies actually accept that scheme.
    proxies={protocal: protocal+'://'+ip}
    try:
        res = requests.get(url,
                           headers={'user-agent':'Mozilla/5.0'},
                           proxies=proxies,
                           timeout=timeout)
    except Exception:
        # Any request failure marks the proxy unhealthy. KeyboardInterrupt and
        # SystemExit are deliberately not caught so the run can still be stopped.
        is_health = False
    else:
        is_health = res.status_code == 200
    return update_health(ip,is_health=is_health,protocal=protocal)
        

In [None]:
# Probe the default test page through one proxy; the cell shows the value returned by update_health.
validate('128.199.246.10:44344')

3.0

## 定期更新IP
> 每5分钟更新一次

In [None]:
# export
last_crawl = 0  # time.time() of the most recent crawl; 0 means none has run yet
def parallel_crawl_ips():
    """Crawl every URL in ``proxy_website_urls`` on up to 10 worker threads.

    A failure in one source is printed and does not stop the others.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [(url, executor.submit(crawl_ip, url)) for url in proxy_website_urls]
    # Leaving the with-block waits for all tasks, so every future is done here.
    # Inspect each one: an exception raised inside a worker is otherwise lost.
    for url, future in futures:
        error = future.exception()
        if error is not None:
            print(url, repr(error), '爬取失败')
def repeat_crawl_ips(frequency=300):
    """Run a full crawl if at least ``frequency`` seconds passed since the last one.

    Args:
        frequency: Minimum number of seconds between two crawls.
    """
    global last_crawl
    now = time.time()
    # Compare elapsed time instead of clock buckets: with buckets, two crawls
    # could run seconds apart on either side of a boundary.
    if now - last_crawl >= frequency:
        last_crawl = now
        parallel_crawl_ips()

In [None]:
# Trigger the throttled crawl; each source prints its own summary line.
repeat_crawl_ips()

http://www.nimadaili.com/gaoni/ 新增：43，库存更新为：346个
http://www.xiladaili.com/gaoni/ 新增：0，库存更新为：346个
http://www.ip3366.net/free/ 新增：3，库存更新为：349个
https://www.kuaidaili.com/free/inha/ 新增：1，库存更新为：350个
https://www.7yip.cn/free/ 新增：2，库存更新为：352个
https://ip.jiangxianli.com/?anonymity=2 新增：6，库存更新为：358个
http://proxyslist.com/ 新增：22，库存更新为：380个
https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1 新增：0，库存更新为：380个


## 定期删除IP
> 每日删除health过低（阈值为20）的IP

In [None]:
# Ad-hoc Redis client for the manual experiments in the next cells.
r = connect_db()

In [None]:
# Remove one address from the 'http' sorted set.
r.zrem('http','128.199.246.10:44344')

In [None]:
# Scan the 'http' sorted set for that same address.
r.zscan('http',match='128.199.246.10:44344')

In [None]:
# Show the ten highest-scored 'http' proxies with their scores.
# zrange needs a key and a start/end index; called bare it raises TypeError.
r.zrange('http', -10, -1, withscores=True)

In [None]:
# r.zcount('http',0,20)
# WARNING: destructive. Removes every 'http' member scored between 0 and 50.
r.zremrangebyscore('http',0,50)

In [None]:
# export
# 这里也是，没想好，怎么维护http和https两个库

last_delete = 0  # time.time() of the most recent cleanup; 0 means none has run yet
def delete_ips(protocal='http', max_health=20):
    """Remove every proxy whose score is at or below ``max_health``.

    Args:
        protocal: Name of the Redis sorted set to clean.
        max_health: Highest score that is still removed.

    Returns:
        The number of members removed, as reported by ``zremrangebyscore``.
    """
    rdb = connect_db()
    # Start at '-inf', not 0: update_health subtracts 10 per failure with no
    # floor, so dead proxies end up with negative scores.
    result = rdb.zremrangebyscore(protocal,'-inf',max_health)
    return result
    
def repeat_delete_ips(frequency=24*3600):
    """Run the cleanup if at least ``frequency`` seconds passed since the last one.

    Only the default pool of ``delete_ips`` is cleaned. The number of
    removed addresses is printed.

    Args:
        frequency: Minimum number of seconds between two cleanups.
    """
    global last_delete
    now = time.time()

    # Compare elapsed time instead of clock buckets: with buckets, two cleanups
    # could run seconds apart on either side of a boundary.
    if now - last_delete >= frequency:
        last_delete = now
        result = delete_ips()
        print('移除：',result,'个IP')
    

In [None]:
# Trigger the throttled cleanup; it prints how many addresses were removed.
repeat_delete_ips()

移除： 0 个IP


## 自动维护IP池

In [None]:
# export
def get_ip(protocal='http') -> str:
    """Return a proxy address after giving the pool a chance to maintain itself.

    The throttled crawl and the throttled cleanup are invoked first, in that
    order; the address is then drawn from the set named by ``protocal``.
    """
    for maintenance_step in (repeat_crawl_ips, repeat_delete_ips):
        maintenance_step()
    chosen = _get_ip(protocal)
    return chosen
    

In [None]:
# Public entry point: runs the throttled maintenance, then draws one 'http' proxy.
get_ip()

'39.137.69.8:80'

## 发布

In [None]:
# hide
!nbdev_build_lib --fname 23_IP_Pool.ipynb

In [None]:
!git add 23_IP_Pool.ipynb
!git commit -m "fix ip increase num"