# MongoDB简介
[官网](https://www.mongodb.com/) | [文档入口](https://docs.mongodb.com/) | [MongoDB Python Drivers文档](https://docs.mongodb.com/ecosystem/drivers/python/) | [MongoDB 大学](https://university.mongodb.com/courses/M001/about)

## 安装

1. 下载[MongoDB Server](https://www.runoob.com/mongodb/mongodb-osx-install.html) | [PyMongo](https://www.runoob.com/python3/python-mongodb.html)
* 先创建数据存储地址 `mkdir -p ./data/db`
* 再启动 mongo 服务端 `mongod --dbpath=./data/db`
* 最后`import pymongo` 就可以操作了

## 概念
数据结构
* Databases 数据库
* Collections 集合（相当于关系型数据库中的表）
* Documents 文档（类JSON/BSON格式的数据，相当于表中的一行）

数据操作
* CRUD (Create Read Update Delete) 增查改删

工具
* MongoDB Atlas 云服务
* MongoDB Compass 数据管理客户端

## 数据操作
[PyMongo文档](https://pymongo.readthedocs.io/en/stable/)

In [None]:
# MongoClient is the client class; DESCENDING is the sort-direction constant
# used by the sorting examples further down.
from pymongo import MongoClient,DESCENDING
# Client for the mongod server on localhost, port 27017.
client = MongoClient('mongodb://127.0.0.1:27017')

### 新增

In [None]:
# Attribute access selects the `test_db` database and its `users` collection.
db = client.test_db
users = db.users
# Insert one document; `result.inserted_id` is inspected in the next cell.
result = users.insert_one({'name':'hawk'})

In [None]:
result.inserted_id

### 查询

In [None]:
client.list_database_names()

In [None]:
db.list_collection_names()

In [None]:
users.find_one()

In [None]:
for item in users.find({'name': 'hawk'}):
    print(item)

#### 计数

In [None]:
users.count_documents({'name': 'hawk'})

#### 排序

In [None]:
for item in users.find().sort('_id', DESCENDING):
    print(item)

#### 其他

In [None]:
for item in users.find().sort('_id', DESCENDING).skip(1).limit(2):
    print(item)

### 删除

In [None]:
users.delete_one({'name':'hawk'})

In [None]:
db.drop_collection('users')

In [None]:
client.drop_database('test_db')

### 修改
[Update Operator文档](https://docs.mongodb.com/manual/reference/operator/update/)

In [None]:
# NOTE: if the delete/drop cells above were executed first, nothing matches
# these filters; re-run the insert cell before trying the updates.
# $set writes `age` = 30 on the first document whose name is 'hawk'.
users.update_one({'name':'hawk'},{'$set':{'age':30}})
# $inc then adds 2 to `age` on the first document whose age is 30 (30 -> 32).
users.update_one({'age':30},{'$inc':{'age':2}})

In [None]:
for item in users.find():
    print(item)

### 筛选条件

In [None]:
%%html
<table><thead><tr><th>符号</th><th>含义</th><th>示例</th></tr></thead><tbody><tr><td><code>$lt</code></td><td>小于</td><td><code>{'age': {'$lt': 20}}</code></td></tr><tr><td><code>$gt</code></td><td>大于</td><td><code>{'age': {'$gt': 20}}</code></td></tr><tr><td><code>$lte</code></td><td>小于等于</td><td><code>{'age': {'$lte': 20}}</code></td></tr><tr><td><code>$gte</code></td><td>大于等于</td><td><code>{'age': {'$gte': 20}}</code></td></tr><tr><td><code>$ne</code></td><td>不等于</td><td><code>{'age': {'$ne': 20}}</code></td></tr><tr><td><code>$in</code></td><td>在范围内</td><td><code>{'age': {'$in': [20, 23]}}</code></td></tr><tr><td><code>$nin</code></td><td>不在范围内</td><td><code>{'age': {'$nin': [20, 23]}}</code></td></tr></tbody></table>

In [None]:
%%html
<table><thead><tr><th>符号</th><th>含义</th><th>示例</th><th>示例含义</th></tr></thead><tbody><tr><td><code>$regex</code></td><td>匹配正则表达式</td><td><code>{'name': {'$regex': '^M.*'}}</code></td><td><code>name</code>以M开头</td></tr><tr><td><code>$exists</code></td><td>属性是否存在</td><td><code>{'name': {'$exists': True}}</code></td><td><code>name</code>属性存在</td></tr><tr><td><code>$type</code></td><td>类型判断</td><td><code>{'age': {'$type': 'int'}}</code></td><td><code>age</code>的类型为<code>int</code></td></tr><tr><td><code>$mod</code></td><td>数字模操作</td><td><code>{'age': {'$mod': [5, 0]}}</code></td><td>年龄模5余0</td></tr><tr><td><code>$text</code></td><td>文本查询</td><td><code>{'$text': {'$search': 'Mike'}}</code></td><td><code>text</code>类型的属性中包含<code>Mike</code>字符串</td></tr><tr><td><code>$where</code></td><td>高级条件查询</td><td><code>{'$where': 'obj.fans_count == obj.follows_count'}</code></td><td>自身粉丝数等于关注数</td></tr></tbody></table>

详细规则 https://docs.mongodb.com/manual/reference/operator/query/

### 合并操作

In [None]:
# Related combined operations on a collection:
# find_one_and_delete
# find_one_and_replace
# Find one document whose age is 32 and increment its age by 2 in one call.
users.find_one_and_update({'age':32},{'$inc':{'age':2}})

# 实践案例，爬人人都是产品经理网站
> MongoDB的存储位置为./data/db

现存问题：
- [x] 爬取url太慢：用代理并行去爬
- [ ] 爬url时经常超过重试上限（overtry）：网站经常会卡死，可把workers设少一点

In [None]:
# default_exp RRPM

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
# export
import requests,re,time,sys
from bs4 import BeautifulSoup
from crawler_from_scratch import IpPool
from pymongo import MongoClient

In [None]:
# export
# Shared MongoDB handles for the crawler: local server, `crawler` database.
client = MongoClient('mongodb://127.0.0.1:27017')
db = client.crawler
# Collection that stores crawled articles. The name is spelled 'articals';
# renaming it here would point the code at a different collection.
art_coll = db.articals

# Site category slug: used to build listing-page URLs and saved with each
# article document. Functions below read (and one reassigns) this global.
category = 'pd'

In [None]:
# export
def simple_request(url, timeout=None) -> object:
    """Download ``url`` and return the ``<body>`` element of the parsed page.

    Args:
        url: Address of the page to fetch.
        timeout: Optional timeout in seconds passed to ``requests.get``.
            The default ``None`` keeps the previous behaviour (no timeout).

    Returns:
        The ``body`` attribute of the parsed document.
    """
    # A browser-like user-agent is sent instead of the requests default.
    res = requests.get(url,headers={'user-agent':'Mozilla/5.0'},timeout=timeout)
    # Name the parser explicitly, like every other BeautifulSoup call in this
    # file; without it bs4 has to guess a parser and emits a warning.
    soup = BeautifulSoup(res.text,features='lxml')
    return soup.body

## 生成待爬url list


In [None]:
#export
def crawl_artical_url_v1(category_page_url) -> iter:
    """Yield article links found on one category listing page (direct request).

    Every ``<h2 class="post-title">`` heading is inspected; the ``href`` of
    its first anchor is yielded when it contains ``'woshipm'``.
    """
    page = requests.get(category_page_url)
    body = BeautifulSoup(page.text, features='lxml').body
    for heading in body.find_all('h2', 'post-title'):
        link = heading.a['href']
        # Skip links that do not point at the site itself.
        if 'woshipm' not in link:
            continue
        yield link
    
def get_urls_v1(start,end) -> list:
    """Collect de-duplicated article links from listing pages ``start``..``end``.

    Pages are fetched one after another with ``crawl_artical_url_v1`` using
    the module-level ``category`` slug.

    Args:
        start: First page number (inclusive).
        end: Last page number (inclusive). An empty range (``end < start``)
            returns an empty list.

    Returns:
        A list of unique article URLs (order is not defined).
    """
    # Same URL construction as parallel_crawl_artical_url.
    page_urls = [f'http://www.woshipm.com/category/{category}/page/{i}' for i in range(start,end+1)]
    urls = []
    for page_url in page_urls:
        print(page_url)
        urls.extend(crawl_artical_url_v1(page_url))
    # Guard the average: an empty page range used to divide by zero here.
    per_page = len(urls)/len(page_urls) if page_urls else 0
    print('urls:',len(urls),'per_page:',per_page)
    return list(set(urls))

In [None]:
get_urls_v1(20,30)

In [None]:
#export
def crawl_artical_url(category_page_url, max_retries=10) -> list:
    """Collect article links from one category listing page via the proxy pool.

    The page is fetched with ``IpPool.proxy_get``. When no link is found the
    fetch is repeated, but at most ``max_retries`` extra times; the previous
    version recursed without limit, so a page that never yields links (for
    example a page number past the end) would never return.

    Args:
        category_page_url: URL of the listing page.
        max_retries: Number of additional attempts after the first one.

    Returns:
        List of article URLs containing ``'woshipm'``; empty if every
        attempt failed.
    """
    urls = []
    for attempt in range(max_retries + 1):
        res = IpPool.proxy_get(category_page_url)
        # NOTE(review): crawl_artical treats the proxy_get result as possibly
        # falsy, so tolerate a result that carries no text here as well.
        html = getattr(res, 'text', None)
        body = BeautifulSoup(html,features='lxml').body if html else None
        if body is not None:
            for s in body.find_all('h2','post-title'):
                # A heading without an anchor (or href) is skipped, not fatal.
                link = s.a.get('href', '') if s.a else ''
                if 'woshipm' in link: urls.append(link)
        if urls: break

        # Fetch produced nothing usable.
        if attempt < max_retries:
            print('\n\n!no urls:',category_page_url, 'retrying......')
        else:
            print('\n\n!no urls:',category_page_url, 'giving up after',max_retries,'retries')
    print('crawl urls:',category_page_url,'got:',len(urls))
    return urls

def parallel_crawl_artical_url(start,end) -> list:
    """Gather unique article links from listing pages ``start``..``end``.

    Each page URL is handed to ``crawl_artical_url`` through
    ``IpPool.parallel_task``; the per-page results are merged and
    de-duplicated. The literal ``10`` is forwarded to ``parallel_task``
    unchanged (presumably the worker count; confirm against IpPool).
    """
    global category
    page_urls = [
        f'http://www.woshipm.com/category/{category}/page/{page_no}'
        for page_no in range(start, end + 1)
    ]
    collected = []
    for page_links in IpPool.parallel_task(crawl_artical_url, page_urls, 10):
        collected.extend(page_links)
    urls = list(set(collected))
    print('urls:', len(urls))
    return urls

In [None]:
crawl_artical_url('http://www.woshipm.com/category/it/page/2')

https://www.7yip.cn/free/ 新增：0，库存更新为：2122个
http://www.ip3366.net/free/ 新增：4，库存更新为：2126个
http://www.nimadaili.com/gaoni/ 新增：37，库存更新为：2163个
http://www.xiladaili.com/gaoni/ 新增：3，库存更新为：2166个
https://www.kuaidaili.com/free/inha/ 新增：0，库存更新为：2167个
https://ip.jiangxianli.com/?anonymity=2 新增：7，库存更新为：2173个
http://proxyslist.com/ 新增：23，库存更新为：2196个
https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1 新增：1，库存更新为：2197个
移除： 0 个IP
success: http://www.woshipm.com/category/it/page/2 try times: 1
crawl urls: http://www.woshipm.com/category/it/page/2 got: 12


In [None]:
category = 'it'
urls = parallel_crawl_artical_url(1,100)

In [None]:
urls

['http://www.woshipm.com/it/3690544.html',
 'http://www.woshipm.com/it/3690303.html',
 'http://www.woshipm.com/it/3679467.html',
 'http://www.woshipm.com/it/3689204.html',
 'http://www.woshipm.com/it/3688145.html',
 'http://www.woshipm.com/it/3658393.html',
 'http://www.woshipm.com/it/3685361.html',
 'http://www.woshipm.com/it/3685687.html',
 'http://www.woshipm.com/it/3684275.html',
 'http://www.woshipm.com/it/3679947.html',
 'http://www.woshipm.com/it/3684034.html',
 'http://www.woshipm.com/it/3678488.html',
 'http://www.woshipm.com/it/3678488.html',
 'http://www.woshipm.com/it/3686449.html',
 'http://www.woshipm.com/it/3686307.html',
 'http://www.woshipm.com/it/3683307.html',
 'http://www.woshipm.com/it/3679936.html',
 'http://www.woshipm.com/it/3684661.html',
 'http://www.woshipm.com/it/3682891.html',
 'http://www.woshipm.com/it/3678595.html',
 'http://www.woshipm.com/it/3678962.html',
 'http://www.woshipm.com/it/3677359.html',
 'http://www.woshipm.com/it/3677782.html',
 'http://ww

## 代理访问&保存html

In [None]:
# export
def save_artical(response) -> int:
    """Insert one downloaded article page into ``art_coll``.

    The first run of digits in ``response.url`` becomes the document ``_id``.
    The ``<div class="single-wrapper">`` element is stored as ``html``. When
    the page has no such element the document is inserted WITHOUT an ``html``
    field; previously the literal string ``'None'`` was stored, which the
    ``{'html': {'$exists': False}}`` query in ``check_no_html`` cannot find.

    Args:
        response: Response object exposing ``url`` and ``text``.

    Returns:
        The inserted ``_id``, or ``None`` when the insert failed (the error
        is printed).
    """
    url = response.url
    _id = int(re.search(r'\d+',url).group())

    soup = BeautifulSoup(response.text,features='lxml')
    wrapper = soup.find('div','single-wrapper')

    document = {'_id':_id,
                'category':category,
                'url':url}
    if wrapper is not None:
        document['html'] = str(wrapper)
    else:
        print('\n\n!no content',url)

    try:
        artical = art_coll.insert_one(document)
    except Exception:
        # Best effort: report and carry on (e.g. the _id already exists), but
        # no longer swallow KeyboardInterrupt/SystemExit like a bare except.
        print('\n\n!error',url,sys.exc_info())
        return None
    return artical.inserted_id
    
def crawl_artical(url):
    """Download and store one article unless it is already in ``art_coll``.

    The article id is the first run of digits in ``url``. Returns ``None``
    in every case.
    """
    artical_id = int(re.search(r'\d+', url).group())

    # Already crawled: nothing to do.
    already_saved = art_coll.find_one({'_id': artical_id})
    if already_saved:
        print('exist:',url)
        return

    # Per the original note, proxy_get retries (up to 10 times) until it gets
    # a 200 response and logs otherwise; that logic lives in IpPool.
    res = IpPool.proxy_get(url)
    if not res:
        return
    save_artical(res)

In [None]:
crawl_artical('http://www.woshipm.com/rp/3660512.html')

exist: http://www.woshipm.com/rp/3660512.html


## 批量操作

In [None]:
# export
def crawl_artical_url_and_html(category_page_url):
    """Crawl one listing page: collect its article links, then fetch each one.

    The links come from ``crawl_artical_url``; ``crawl_artical`` is run over
    them through ``IpPool.parallel_task`` (arguments ``10`` and ``False`` are
    forwarded unchanged). Always returns ``True``.
    """
    artical_urls = crawl_artical_url(category_page_url)
    IpPool.parallel_task(crawl_artical, artical_urls, 10, False)
    print('complete:',category_page_url)
    return True

In [None]:
# export
def crawl_artical_by_category(category_name='blockchain', start=1, end=20):
    """Crawl every article of one site category across a range of listing pages.

    Args:
        category_name: Category slug used in the listing URL. Defaults to
            ``'blockchain'``, the value that used to be hard-coded.
        start: First listing page (inclusive); defaults to the former 1.
        end: Last listing page (inclusive); defaults to the former 20.

    Side effects:
        Rebinds the module-level ``category``, which ``save_artical`` stores
        with each document.
    """
    global category
    category = category_name
    category_page_urls = [f'http://www.woshipm.com/category/{category}/page/{i}' for i in range(start,end+1)]
    IpPool.parallel_task(crawl_artical_url_and_html,category_page_urls,20)
    # Alternative flow kept from the original notes: first gather all links
    # with parallel_crawl_artical_url(...) (or get_urls_v1(...)), then run
    # IpPool.parallel_task(crawl_artical, urls).

## 完善数据

In [None]:
def check_no_html() -> list:
    """Return the urls of stored articles that have no ``html`` field.

    The number of such articles is printed before returning.
    """
    # Alternative filter from the original notes (html present but short):
    #   {'html':{'$exists':True},'$where':"(this.html.length < 100)"}
    missing = art_coll.find({'html':{'$exists':False}})
    no_html_urls = [doc['url'] for doc in missing]
    print(len(no_html_urls))
    return no_html_urls

In [None]:
def update_html(url):
    """Re-download ``url`` and store its content div as ``html`` in ``art_coll``.

    The document is matched by its ``url`` field. If the download yields no
    text, or the page has no ``<div class="single-wrapper">``, nothing is
    written; previously this crashed or overwrote ``html`` with the string
    ``'None'``.
    """
    res = IpPool.proxy_get(url)
    # NOTE(review): crawl_artical treats the proxy_get result as possibly
    # falsy, so tolerate a result that carries no text here as well.
    page_text = getattr(res, 'text', None)
    wrapper = None
    if page_text:
        wrapper = BeautifulSoup(page_text,features='lxml').find('div','single-wrapper')
    if wrapper is None:
        print('update skipped (no content):',url)
        return
    result = art_coll.update_one({'url':url},{'$set':{'html':str(wrapper)}})
    print('update:',url,result.modified_count)

In [None]:
update_html('http://www.woshipm.com/it/1633891.html')

http://www.ip3366.net/free/ 新增：3，库存更新为：1151个
https://www.7yip.cn/free/ 新增：2，库存更新为：1171个
http://www.nimadaili.com/gaoni/ 新增：35，库存更新为：1177个https://www.kuaidaili.com/free/inha/ 新增：0，库存更新为：1177个

http://www.xiladaili.com/gaoni/ 新增：11，库存更新为：1178个
http://proxyslist.com/ 新增：25，库存更新为：1203个
https://ip.jiangxianli.com/?anonymity=2 新增：3，库存更新为：1206个
https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1 新增：0，库存更新为：1206个
update: http://www.woshipm.com/it/1633891.html 0


In [None]:
check_no_html()

0


[]

In [None]:
IpPool.parallel_task(update_html,check_no_html())

## 解析内容字段

In [None]:
# export
def parse_html(_id=3614430) -> dict:
    """Extract metadata fields from the stored html of article ``_id``.

    Args:
        _id: ``_id`` of the document in ``art_coll``.

    Returns:
        Dict with ``title``, ``author_link``, ``author_name``, ``likes``,
        ``stars``, ``views``, ``publish_date`` and ``tags``. Parsing is best
        effort: on failure the error is printed and the fields extracted
        before the failure are returned (possibly an empty dict).
    """
    update_content = {}
    try:
        soup = BeautifulSoup(art_coll.find_one({'_id':_id})['html'],features='lxml')
        update_content['title'] = soup.h2.text
        author_soup = soup.find('div','postMetaLockup--authorWithBio u-flex')
        update_content['author_link'] = author_soup.a['href']

        author_info = list(author_soup.stripped_strings)
        update_content['author_name'] = author_info[0]
        # The last four strings are read as: publish date, views, stars, likes.
        update_content['likes'] = author_info[-1]
        update_content['stars'] = author_info[-2]
        update_content['views'] = author_info[-3]
        update_content['publish_date'] = author_info[-4]

        # Look the tag list up once instead of twice.
        taglist = soup.find('div','taglist')
        update_content['tags'] = [a.text for a in taglist] if taglist else []
    except Exception as err:
        # Was a bare except: keep the best-effort behaviour, but let
        # KeyboardInterrupt/SystemExit through and show what went wrong.
        print('parse error:',_id,repr(err))
    return update_content

In [None]:
# export
def update_artical_info(_id) -> int:
    """Write the fields parsed by ``parse_html`` into article ``_id``.

    Returns:
        ``modified_count`` of the update, or ``0`` when parsing produced no
        fields and therefore no update was sent.
    """
    fields = parse_html(_id)
    # An empty $set is rejected by MongoDB servers older than 5.0 and is a
    # no-op on newer ones, so skip the round trip entirely.
    if not fields:
        return 0
    result = art_coll.update_one({'_id':_id},{'$set':fields})
    return result.modified_count

In [None]:
parse_html(1633891)

parse error: 1633891


{'title': '如何报名'}

In [None]:
update_artical_info(3614430)

0

In [None]:
# export
def update_all_artical_info():
    """Parse and store metadata for every article that has no ``likes`` field.

    ``update_artical_info`` is run over the matching ids through
    ``IpPool.parallel_task`` (third argument ``1`` forwarded unchanged), and
    the number of tasks that modified a document is printed.
    """
    pending = art_coll.find({'likes':{'$exists':False}})
    ids = [doc['_id'] for doc in pending]
    task_results = IpPool.parallel_task(update_artical_info,ids,1)
    modify_count = sum(1 for count in task_results if count > 0)
    print('modify:',modify_count)

In [None]:
!nbdev_build_lib --fname 24_MongoDB.ipynb
!mv crawler_from_scratch/RRPM.py RRPM.py 

Converted 24_MongoDB.ipynb.


In [None]:
# export
# When executed as a script (not imported), fill in parsed metadata for every
# stored article that still lacks it.
if __name__ == '__main__': update_all_artical_info()

## 检查数据完整性

## 分析数据

In [None]:
tags = art_coll.find({'tags':[]}).limit(10)

In [None]:
art_coll.count_documents({'tags':[]})

111

## 数据库导出备份

## 发布

In [None]:
!git add 24_MongoDB.ipynb
!git commit -m "fix ip increase num"

In [None]:
!nbdev_build_lib --fname 24_MongoDB.ipynb
!mv crawler_from_scratch/RRPM.py RRPM.py 