In [None]:
# default_exp IPool_lite

核心就是去掉数据库，把这几百条数据放内存里查询&修改，定期备份成txt

核心路径是get_ip，所以初始化操作也在这里

In [None]:
# export
import json,random,requests,re,time
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

# 核心

## 获取一个ip

In [None]:
# export
# In-memory proxy pool: maps an 'ip:port' string to its health score (save_ips adds new entries at 100).
db = {}

In [None]:
# export
def _get_ip() -> str:
    '健康值作为权重，随机抽取一个ip'
    global db
    ips = random.choices(list(db.keys()),weights=db.values(),k=1)
    return ips[0]

In [None]:
# Demo: seed the pool with two sample entries and draw one at random, weighted by health.
# NOTE(review): '100,9.37.17.19:88' contains a comma, so it is not a valid ip:port — looks like a typo; confirm.
db = {'39.137.107.9:8080':100,'100,9.37.17.19:88':52}
_get_ip()

'100,9.37.17.19:88'

## 更新健康值

In [None]:
# export
def update_health(ip,is_good=False) -> int:
    'Adjust the health of `ip` (add 1 after a good request, halve it otherwise) and return the new value.'
    current = db[ip]
    if is_good:
        db[ip] = current + 1
    else:
        # Halving produces a float, so the stored score is not always an int.
        db[ip] = current / 2
    return db[ip]

In [None]:
# Demo: report a successful request for one pooled entry; its health goes up by 1.
ip = '100,9.37.17.19:88'
update_health(ip,is_good=True)

53

# 工具
## 周期任务

In [None]:
# export
def interval_task(fn,name,interval=300):
    '''Run `fn()` at most once every `interval` seconds (default 300 s = 5 min).

    fn: zero-argument callable to run.
    name: key identifying the task; each name has its own timer.
    interval: minimum number of seconds between two runs of the same name.

    The time of the last run is kept per name in the module-level dict
    `last_modify`. The timestamp is recorded before `fn` is called, so a
    failing `fn` is not retried until the interval has elapsed again.
    '''
    # Create the registry on first use so the function also works when it is
    # called before the cell/statement that defines `last_modify` has run.
    registry = globals().setdefault('last_modify', {})
    now = time.time()
    if name not in registry or (now-registry[name]) > interval:
        registry[name] = now
        fn()

## 并行任务

In [None]:
# export
def get_progress_bar(r,length=30) -> str:
    '''Render the ratio `r` as a text bar such as '[#######        ] 14.87%'.

    r: completed fraction, expected in [0, 1].
    length: number of characters between the brackets.

    The padding is derived from the filled part, so the bar is always exactly
    `length` characters wide; the filled part is clamped to the bar width when
    `r` falls outside [0, 1].
    '''
    filled = max(0, min(length, int(length*r)))
    return '['+'#'*filled+' '*(length-filled)+'] '+str(r*100)[:5]+'%'

def show_current_progress(done_num,total_num,start_time):
    '''Print one progress line: bar, elapsed seconds and estimated seconds left.

    done_num: number of finished tasks.
    total_num: total number of tasks.
    start_time: `time.time()` value taken when the work started.

    The remaining time is extrapolated from the average time per finished
    task; it is printed as '?' while nothing has finished yet.
    '''
    # Guard both divisions so a call with zero tasks / zero progress cannot raise.
    pct = done_num/total_num if total_num else 0.0
    cost_time = int(time.time()-start_time)
    left_time = int(cost_time/done_num*(total_num-done_num)) if done_num else '?'
    print(f'progress:{get_progress_bar(pct)} | cost:{cost_time}s | left:{left_time}s')

In [None]:
# export
def _parallel_task(fn,loop_args,max_workers=3):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for data in executor.map(fn,loop_args):
            yield data

def parallel_task(fn,loop_args,max_workers=3):
    '''Yield `fn(arg)` for each arg, run on a thread pool, printing progress along the way.

    fn: callable applied to every element of `loop_args`.
    loop_args: any iterable of arguments (lists, ranges, generators, ...).
    max_workers: size of the thread pool.

    This is a generator: no task is started until the caller iterates over
    it. A progress line is printed at most once per second, and a summary
    line with the task count and elapsed seconds is printed once all results
    have been consumed.
    '''
    # Materialise once so that unsized iterables (e.g. generators) work with len().
    loop_args = list(loop_args)
    start_time = time.time()
    total_num = len(loop_args)

    done_num = 0
    for data in _parallel_task(fn,loop_args,max_workers):
        done_num += 1
        # The lambda is invoked (if at all) inside this call, so it sees the current done_num.
        interval_task(lambda:show_current_progress(done_num,total_num,start_time),'progress',1)
        yield data

    cost_time = int(time.time()-start_time)
    print(f'{total_num} tasks, {cost_time}s')

In [None]:
# Demo: 3 of 11 tasks finished, with a start time 10 seconds in the past.
show_current_progress(3,11,time.time()-10)

progress:[########                     ] 27.27% | cost:10s | left:26s


In [None]:
# Demo: push ten tasks that each sleep a random 1-10 s through parallel_task
# and print every result as it is yielded.
def say_hi(i): 
    t = random.randint(1,10)
    time.sleep(t)
    return f'hi end {i} {t}'

for i in parallel_task(say_hi,range(10)):
    print(i)

progress:[###                           ] 10.0% | cost:9s | left:81s
hi end 0 9
hi end 1 3
hi end 2 1
hi end 3 8
progress:[###############               ] 50.0% | cost:13s | left:13s
hi end 4 10
progress:[##################            ] 60.0% | cost:17s | left:11s
hi end 5 8
hi end 6 6
progress:[########################     ] 80.0% | cost:23s | left:5s
hi end 7 10
hi end 8 7
hi end 9 1
10 tasks, 23s


## 查看优质ip

In [None]:
# export
def count_good_ips():
    'Return how many proxies in the pool have a health score strictly greater than 100.'
    return sum(1 for health in db.values() if health > 100)

In [None]:
# Demo: number of pooled proxies whose health is currently above 100.
count_good_ips()

126

# 进阶

## 爬取ip网站
> 搜索全部tr，然后解析符合的ip，不符合的就不管

In [None]:
# export
# Free proxy listing pages that crawl_ips scrapes, one URL per entry.
sites = [
    'https://www.kuaidaili.com/free/inha/',
    'http://www.nimadaili.com/gaoni/',
    'http://www.xiladaili.com/gaoni/',
    'https://ip.jiangxianli.com/?anonymity=2',
    'https://www.7yip.cn/free/',
    'http://www.ip3366.net/free/',
    'https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1',
    'http://proxyslist.com/',
]

In [None]:
# export
def match_ip(tag):
    'Return a regex match when the stripped text of `tag` is exactly four dot-separated groups of 1-3 digits, else None.'
    text = tag.text.strip()
    return re.match(r'^(\d{1,3}\.){3}\d{1,3}$',text)
def match_port(tag):
    'Return a regex match when the stripped text of `tag` is exactly 2 to 5 digits, else None.'
    text = tag.text.strip()
    return re.match(r'^\d{2,5}$',text)
def match_ip_with_port(tag):
    "Return a regex match when the stripped text of `tag` has the form 'd.d.d.d:port' (1-3 digit groups, 2-5 digit port), else None."
    text = tag.text.strip()
    return re.match(r'^(\d{1,3}\.){3}\d{1,3}:\d{2,5}$',text)

def _parse_ip(soup) -> str:
    ip_with_port = soup.find(match_ip_with_port)
    ip = soup.find(match_ip)
    port = soup.find(match_port)
    if ip_with_port: return ip_with_port.text
    elif ip and port: return ip.text+':'+port.text
#     else: print('parse ip error:',soup)

def parse_ips(url) -> iter:
    '''Download `url` and yield every 'ip:port' found in its table rows.

    url: address of a proxy listing page.

    Yields one string per <tr> from which `_parse_ip` extracts a proxy; rows
    without one are skipped. Network errors and timeouts raised by `requests`
    propagate to the caller once iteration starts (this is a generator).
    '''
    # A timeout keeps one unresponsive site from stalling the whole crawl.
    res = requests.get(url,headers={'user-agent':'Mozilla/5.0'},timeout=10)
    soup = BeautifulSoup(res.text,features='lxml')
    # soup.body is None when the parsed document has no <body> (e.g. an empty
    # response); search the whole document instead of failing on None.
    root = soup.body if soup.body is not None else soup
    for tr in root.find_all('tr'):
        ip = _parse_ip(tr)
        if ip: yield ip
        

In [None]:
# export
def save_ips(ips) -> int:
    "Add unseen proxies to the pool with health 100, dump the whole pool to 'ipool.txt' as JSON, and return how many were new."
    size_before = len(db)
    for ip in ips:
        # Known proxies keep their current health; only unseen ones are inserted.
        db.setdefault(ip, 100)
    with open('ipool.txt','w') as f:
        json.dump(db,f)
    return len(db) - size_before
            

In [None]:
# export
def crawl_ips():
    '''Scrape every page in `sites`, add the proxies found to the pool and persist it.

    Prints the number of new proxies per site and the pool size at the end.
    A site that fails (network error, parse error, ...) is reported together
    with the exception and skipped, so one broken site does not stop the crawl.
    '''
    for url in sites:
        try:
            ips = list(parse_ips(url))
            count_new = save_ips(ips)
            print(url,' 新增：',count_new)
        except Exception as e:
            # `Exception` rather than a bare except: Ctrl-C / SystemExit must still
            # be able to stop the crawl, and the cause is shown instead of hidden.
            print('error',url,repr(e))
    print('总库存：',len(db))

In [None]:
# Demo: scrape every page in `sites` once; prints new proxies per site and the total pool size.
crawl_ips()

https://www.kuaidaili.com/free/inha/  新增： 0
http://www.nimadaili.com/gaoni/  新增： 0
http://www.xiladaili.com/gaoni/  新增： 8
https://ip.jiangxianli.com/?anonymity=2  新增： 2
https://www.7yip.cn/free/  新增： 0
http://www.ip3366.net/free/  新增： 0
https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1  新增： 0
http://proxyslist.com/  新增： 0
总库存： 203


## 代理请求
`_proxy_request` 这个分支也可单独用于验证ip

In [None]:
# export
def parse_protocal(url):
    '''Return 'https' when `url` uses the https scheme, otherwise 'http'.

    Only the leading scheme is inspected (case-insensitively), so an http URL
    that merely contains "https" in its path or query string stays 'http'.
    '''
    return 'https' if url.lstrip().lower().startswith('https:') else 'http'

def _proxy_request(url,ip,method='get') -> object:
    '代理请求，并更新ip的health'
    protocal = parse_protocal(url)
    proxies = {protocal: protocal+'://'+ip}
    
    try:
        res = requests.request(method,url,
                               headers={'user-agent':'Mozilla/5.0'},
                               proxies=proxies,
                               allow_redirects=False,
                               timeout=5)
    except:
        update_health(ip)
        print('except error:',ip,db[ip])
        return
    else:
        if res and res.status_code == 200: update_health(ip,is_good=True)
        else: update_health(ip)
        print(res,ip,db[ip])
        return res

In [None]:
# export
def proxy_request(url,method='get',repeat_times=10) -> object:
    'Request `url` through a freshly drawn proxy per attempt, up to `repeat_times` times; return the first 200 response, or None if every attempt fails.'
    for attempt in range(1,repeat_times+1):
        res = _proxy_request(url,get_ip(),method)
        if res and res.status_code == 200:
            return res
        # Failed attempt: log it and try again with another proxy.
        print(url,'times:',attempt,res)

In [None]:
# Demo: fetch a page through the pool with at most 3 attempts.
# NOTE(review): proxy_request calls get_ip, which is defined further down in this
# notebook, so on a fresh kernel this cell only works after that cell has been run.
url = 'https://www.baidu.com/'
proxy_request(url,repeat_times=3)

删除： 0 剩余： 203
https://www.kuaidaili.com/free/inha/  新增： 0
http://www.nimadaili.com/gaoni/  新增： 29
http://www.xiladaili.com/gaoni/  新增： 46
https://ip.jiangxianli.com/?anonymity=2  新增： 2
https://www.7yip.cn/free/  新增： 0
http://www.ip3366.net/free/  新增： 0
https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1  新增： 0
http://proxyslist.com/  新增： 25
总库存： 305
except error: 163.204.247.147:9999 50.0
https://www.baidu.com/ times: 1 None
删除： 0 剩余： 305
https://www.kuaidaili.com/free/inha/  新增： 0
http://www.nimadaili.com/gaoni/  新增： 0
http://www.xiladaili.com/gaoni/  新增： 0
https://ip.jiangxianli.com/?anonymity=2  新增： 0
https://www.7yip.cn/free/  新增： 0
http://www.ip3366.net/free/  新增： 0
https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1  新增： 0
http://proxyslist.com/  新增： 0
总库存： 305
except error: 59.62.25.62:9000 50.0
https://www.baidu.com/ times: 2 None
except error: 68.183.188.100:3128 50.0
https://www.baidu.com/ times: 3 None


### 校验ip

In [None]:
# export
def validate_ips(url='http://www.baidu.com/',max_workers=100):
    '''Probe every proxy in the pool against `url` and print how many are good.

    url: page requested through each proxy.
    max_workers: number of threads used for the probes.

    Each probe goes through `_proxy_request`, which updates the proxy's health
    as a side effect; the responses themselves are discarded.
    '''
    ips = list(db)
    # parallel_task is a generator: nothing is requested until it is iterated,
    # so drain it explicitly.
    for _ in parallel_task(lambda ip:_proxy_request(url, ip),ips,max_workers):
        pass
    print('good ips:',count_good_ips())

In [None]:
# Demo: run the pool validation with its default URL and worker count.
validate_ips()

except error: 100,9.37.17.19:88 13.25
except error: 182.148.206.23:9999 25.0
<Response [500]> 110.249.176.26:8060 25.0
except error: 39.106.156.203:8080 25.0
except error: 183.147.221.169:9000 25.0
<Response [500]> 27.203.163.18:8060 25.0
except error: 198.98.58.178:8080 25.0
<Response [200]> 61.147.210.159:8080 102
except error:<Response [403]>  182.46.111.72:9999 58.220.95.90:940125.0
 except error: 60.184.204.182:3000 25.0
25.0
<Response [403]> 58.220.95.79:10000 25.0
except error: 211.147.226.4:8118 25.0
except error: 142.93.57.37:80 25.0
<Response [200]> 39.137.69.10:8080 102
except error: 178.19.97.1:8088 25.0
except error: 80.211.60.89:8118 25.0
<Response [200]> 91.205.174.26:80 102
<Response [200]> 52.230.6.90:8080 102
except error: 198.23.143.27:8080 25.0
except error: 36.66.34.10:8080 25.0
except error: 84.22.46.25:8080 25.0
<Response [200]> 108.74.113.180:80 51.0
<Response [200]> 203.204.200.107:80 102
<Response [200]> 45.76.34.27:8080 102
<Response [200]> 45.63.42.56:80 102

except error: 36.89.227.98:8080 25.0
except error: 219.102.192.2:8080 50.5
<Response [403]> 58.220.95.54:9400 25.0
<Response [200]> 134.209.110.247:8080 102
<Response [200]> 104.41.29.43:80 102
<Response [200]> 82.119.170.106:8080 51.0
except error: 212.83.184.67:5836 25.0
except error: 54.229.251.214:80 25.0
except error: 159.138.1.185:80 25.0
<Response [200]> 163.172.29.74:5836 51.0
except error: 122.3.252.187:3128 50.5
<Response [200]> 197.216.2.14:8080 102
<Response [200]> 191.235.70.48:80 102
except error: 212.83.169.253:5836 25.0
<Response [200]> 153.127.40.61:3128 102
except error: 1.198.73.41:9999<Response [200]> 25.0
 110.136.222.67:8080 51.0
<Response [200]> 124.65.136.2:8060 51.0
<Response [200]> 168.138.42.43:3128 51.0
except error: 212.129.39.123:5836 25.0
<Response [200]> 163.125.73.81:9797 102
<Response [200]> 123.58.17.134:3128 102
except error: 123.169.118.58:9999 25.0
<Response [200]> 183.89.113.134:8080 102
except error: 112.84.51.153:9999 25.0
<Response [200]> 140.2

## 获取&自动更新ip

In [None]:
# export
# Maps a task name to the time.time() of its last run; read and written by interval_task.
last_modify = {}

In [None]:
# export
def delete_ips():
    'Remove every proxy whose health is below 50 from the pool and print how many were removed and how many remain.'
    count_before = len(db)
    # Collect first, delete afterwards: the dict must not change size while it is iterated.
    unhealthy = [ip for ip, health in db.items() if health < 50]
    for ip in unhealthy:
        del db[ip]
    count_current = len(db)
    print('删除：',count_before-count_current,'剩余：',count_current)

def get_ip():
    '''Public entry point: return one proxy drawn at random, weighted by health.

    Before drawing, an empty pool is restored from 'ipool.txt' when that file
    exists, unhealthy proxies are purged at most once per hour, and the
    listing sites are crawled at most once every 5 minutes.
    '''
    global db
    pool_file = Path('ipool.txt')
    # Lazy initialisation: only touch the backup file while the pool is empty.
    if not db and pool_file.exists():
        with open(pool_file,'r') as f:
            db = json.load(f)

    interval_task(delete_ips,'delete',interval=3600)
    interval_task(crawl_ips,'crawl')
    return _get_ip()

In [None]:
# Demo: draw one proxy through the public entry point.
get_ip()

'118.69.50.154:80'

# 发布

In [None]:
# hide
# Build the library module from this notebook with nbdev.
!nbdev_build_lib --fname 11b_ipool_lite.ipynb

Converted 11b_ipool_lite.ipynb.


In [None]:
# Copy the generated module into another local project.
# NOTE(review): the destination is a hardcoded absolute path under /Users/Neo, so this
# cell presumably only works on the author's machine — consider a configurable path.
!cp crawler_from_scratch/IPool_lite.py /Users/Neo/learn_fastai_from_scratch/IPool_lite.py

In [None]:
# Stage the notebook and the generated module, then commit both.
!git add 11b_ipool_lite.ipynb
!git add crawler_from_scratch/IPool_lite.py

!git commit -m "fix bugs"

[master 7185b0d] fix bugs
 2 files changed, 180 insertions(+), 97 deletions(-)


# 优化

爬取ip，出问题时try except，保证程序正常运作

线上运行时，可以把各种错误写入log文件；或者用一个比较醒目的方式显示进度（但其他的log仍会持续输出）

In [None]:
# TODO: 超过10次后，会有soup解析报错，如何解决？