In [7]:
import time
import requests


class Net(object):
    """Thin wrapper around ``requests.request`` that guarantees a ``user-agent`` header."""

    # User agent sent when the caller does not provide one.
    DEFAULT_USER_AGENT = 'wasp'

    def request(self, method, url, headers=None, params=None, data=None, **kwargs):
        """Send an HTTP request and return the ``requests`` response object.

        Args:
            method: HTTP verb, e.g. ``'GET'`` or ``'POST'``.
            url: Target URL.
            headers: Optional header dict. When omitted, a default ``user-agent``
                plus a ``referer`` equal to ``url`` is sent. When given, it is
                copied (never modified) and a ``user-agent`` is added only if the
                caller did not supply one under any letter case.
            params: Optional query-string parameters.
            data: Optional request body.
            **kwargs: Forwarded unchanged to ``requests.request``.
        """
        if headers is None:
            merged = {'user-agent': self.DEFAULT_USER_AGENT, 'referer': url}
        else:
            # Work on a copy so the caller's dict is not modified as a side effect.
            merged = dict(headers)
            # HTTP header names are case-insensitive: do not add a second UA header
            # when the caller already passed e.g. 'User-Agent'.
            if not any(str(name).lower() == 'user-agent' for name in merged):
                merged['user-agent'] = self.DEFAULT_USER_AGENT

        return requests.request(method, url, headers=merged, params=params, data=data, **kwargs)

    def get(self, url, headers=None, params=None, **kwargs):
        """Shortcut for a GET request; see :meth:`request`."""
        return self.request('GET', url, headers=headers, params=params, data=None, **kwargs)

    def post(self, url, headers=None, data=None, **kwargs):
        """Shortcut for a POST request; see :meth:`request`."""
        return self.request('POST', url, headers=headers, params=None, data=data, **kwargs)

    
class SearchHelper(object):
    """Look up Bilibili videos for a list of keywords, on-site and via Dogedoge.

    The search methods return ``(result_list, not_found_keywords)``.
    ``result_list`` contains dicts of the following form (for regular hits all
    values are converted to ``str``):

    {
        'keyword': 'sm26747660',
        'author': '幻想乡的新月',
        'mid': 149592,
        'aid': 'av2600096',
        'title': '【东方MMD】暗黑炼狱火☆',
        'description': 'sm26747660 这个简直wwww~作者FSM'
    }
    """

    def __init__(self, SESSDATA='', BILI_JCT=''):
        """Create a helper; when both credentials are given they are sent as a cookie header.

        Args:
            SESSDATA: Value for the ``SESSDATA`` cookie.
            BILI_JCT: Value for the ``BILI_JCT`` cookie.
        """
        self.found_aid = set()  # av/bv ids of the videos collected so far
        self.not_found_kw = set()  # keywords for which nothing was found
        self.result_list = []

        self.net = Net()

        self.cookie = {}

        if SESSDATA and BILI_JCT:
            # NOTE(review): the cookie name is sent as upper-case 'BILI_JCT' --
            # confirm this is the spelling the site expects.
            self.cookie.update({'cookie': 'SESSDATA={}; BILI_JCT={};'.format(SESSDATA, BILI_JCT)})

    def _find_key(self, dict_obj, kw_set, kw_map=None, default_value=None):
        """Flatten the wanted keys of a nested dict into a one-level dict.

        Args:
            dict_obj: Possibly nested dict (dicts and lists of dicts are descended into).
            kw_set: Keys to extract.
            kw_map: Optional ``{original_key: new_key}`` renaming table.
            default_value: Value used for keys of ``kw_set`` that were not found.

        Returns:
            A dict with one entry per key of ``kw_set`` (renamed through ``kw_map``
            when given).
        """
        def dfs(node, collected):
            """Depth-first walk copying every ``key in kw_set`` pair into ``collected``."""
            for k, v in node.items():
                if k in kw_set:
                    # A wanted key is taken as-is; its value is not descended into.
                    collected[k] = v
                    continue

                if isinstance(v, list):
                    for element in v:
                        if isinstance(element, dict):
                            dfs(element, collected)

                if isinstance(v, dict):
                    dfs(v, collected)

        result_dict = {}
        dfs(dict_obj, result_dict)

        # Make sure every requested key is present.
        for key in kw_set:
            result_dict.setdefault(key, default_value)

        if kw_map is None:
            return result_dict

        # Rename keys listed in the mapping table, keep the others unchanged.
        return {kw_map.get(key, key): result_dict[key] for key in kw_set}

    def search(self, kw_list):
        """Run the on-site search first, then the off-site search for what is left.

        Errors are caught and their traceback printed, so partial results are
        still returned.

        Returns:
            ``(result_list, not_found_keywords)``: the list of video-info dicts
            (format in the class docstring) and the list of keywords without a hit.
        """
        import traceback

        try:
            self.not_found_kw = set(kw_list)  # initially assume nothing has been found

            kw_list = list(self.not_found_kw)  # de-duplicated keyword list

            print('*** 开始站内检索，预计耗时 {}s'.format(2 * len(kw_list)))

            found, not_found = self.search_from_bili(kw_list)
            print('找到 {} 个数据'.format(len(found)))

            print()
            print('*** 开始站外检索(Dogedoge)，预计耗时 {}s'.format(2 * len(not_found)))

            found, not_found = self.search_from_doge(list(self.not_found_kw))
            print('找到 {} 个数据'.format(len(found)))
            print()
        except Exception:
            print('#' * 30)
            print('ERROR:')
            traceback.print_exc()

        return self.result_list, list(self.not_found_kw)

    def search_from_bili(self, kw_list):
        """Search Bilibili's own search API (at most 5 pages per keyword).

        Returns:
            ``(result_list, not_found_keywords)`` reflecting the helper's
            accumulated state, so the method can also be used on its own.
        """
        headers = {
            'origin': 'https://search.bilibili.com',
            'referer': 'https://search.bilibili.com/',
            'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                           ' Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66')
        }

        headers.update(self.cookie)

        kw_list = list(set(kw_list))
        kw_count = len(kw_list)

        # Sleep longer between requests for larger batches (the original author
        # was unsure whether this actually helps against rate limiting).
        if kw_count < 40:
            delay = 0.5
        elif kw_count <= 80:
            delay = 1.5
        else:
            delay = 2.5

        api = 'https://api.bilibili.com/x/web-interface/search/type'
        dict_kw_set = {'aid', 'author', 'mid', 'title', 'description'}

        for kw in kw_list:
            print('Searching {} ...'.format(kw))

            for page in range(1, 6):  # fetch at most 5 pages
                params = {
                    'keyword': kw,
                    'search_type': 'video',
                    'page': page
                }

                jsoup = self.net.get(api, headers=headers, params=params).json()

                if jsoup['code'] != 0:
                    # Seen when banned for calling too often, e.g.
                    # {'code': -412, 'message': '请求被拦截', 'ttl': 1, 'data': None}
                    # Other endpoints seemed unaffected; recovery took roughly 30 min.
                    print('接口调用过于频繁，请稍后再试')
                    print(jsoup)
                    break

                data = jsoup.get('data') or {}

                if 'result' not in data:
                    # Only the first page tells us the keyword has no hit at all;
                    # on later pages a missing 'result' just means there are no more pages.
                    if page == 1:
                        self.not_found_kw.add(kw)
                        print(kw, '无数据，可能是因为被屏蔽或被删除')
                    break

                for item in data['result']:
                    aid = 'av' + str(item['aid'])

                    if aid in self.found_aid:
                        continue
                    self.found_aid.add(aid)

                    dict_item = {'keyword': kw}
                    dict_item.update(self._find_key(item, dict_kw_set))
                    dict_item['aid'] = aid

                    # Store every value as str.
                    self.result_list.append({k: str(v) for k, v in dict_item.items()})
                    self.not_found_kw.discard(kw)

                print('休眠 {}s'.format(delay))
                time.sleep(delay)

        # Return the accumulated state so the method is usable stand-alone.
        return self.result_list, list(self.not_found_kw)

    def search_from_doge(self, kw_list):
        """Search through the Dogedoge search engine and resolve Bilibili hits.

        For each keyword the first five results are inspected; results pointing
        to ``www.bilibili.com`` are resolved through the Bilibili video-info API.

        Returns:
            ``(result_list, not_found_keywords)`` reflecting the helper's
            accumulated state, so the method can also be used on its own.
        """
        import re
        from random import randint
        from lxml.html import fromstring

        url = 'https://www.dogedoge.com/results'
        vid_pattern = re.compile(r'BV\w+|av\d+')

        headers = {
            'referer': 'https://www.dogedoge.com/',
            'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                           ' Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66')
        }

        headers.update(self.cookie)

        kw_list = list(set(kw_list))  # de-duplicate
        kw_count = len(kw_list)

        # Sleep at most 2s for up to 10 keywords, at most 3s beyond that.
        max_delay = 2 if kw_count <= 10 else 3

        for kw in kw_list:
            params = {
                'q': kw,
                'lang': 'cn'
            }

            html = self.net.get(url, headers=headers, params=params).text
            elements = fromstring(html)
            element_a = elements.xpath('//a[@class="result__url js-result-extras-url"]')

            print('Searching {} ...'.format(kw))

            matched = False  # becomes True once a Bilibili video was resolved for kw

            for a in element_a[:5]:  # look at the first five results
                domain = a.xpath('string(./span[@class="result__url__domain"])')

                if not domain or 'www.bilibili.com' not in domain:
                    continue  # not Bilibili content

                redirect_url = 'https://www.dogedoge.com/' + a.get('href')
                response = self.net.get(redirect_url, headers=headers, allow_redirects=False)
                # A response without a redirect target is skipped instead of raising KeyError.
                dist = response.headers.get('location', '')

                vid = vid_pattern.findall(dist)
                if not vid:
                    continue

                if vid[0].startswith('BV'):
                    video_info = self.net.get('https://api.bilibili.com/x/web-interface/view?bvid={}'.format(vid[0])).json()
                else:
                    # Video-info endpoint keyed by av number; dogedoge mostly
                    # returns BV ids, this branch is kept just in case.
                    video_info = self.net.get('https://api.bilibili.com/x/web-interface/view?aid={}'.format(vid[0][2:])).json()

                if video_info['code'] == -403:
                    # Login-only video: {'code': -403, 'message': '访问权限不足', 'ttl': 1}
                    print(kw, vid[0], '需要登录查看')
                    # For convenience the id (possibly a BV id) is stored in the 'aid' field.
                    self.result_list.append({
                        'keyword': kw, 'aid': vid[0],
                        'author': 'unknown', 'mid': -1,
                        'description': '视频需要登录查看', 'title': '视频需要登录查看'
                    })
                elif video_info['code'] == 0:  # normal case
                    data = video_info['data']
                    dict_item = {'keyword': kw}
                    dict_kw_set = {'aid', 'name', 'mid', 'desc', 'title'}
                    dict_item.update(self._find_key(data, dict_kw_set, {'name': 'author', 'desc': 'description'}))
                    dict_item['aid'] = 'av' + str(dict_item['aid'])

                    # Store every value as str.
                    dict_item = {k: str(v) for k, v in dict_item.items()}

                    if dict_item['aid'] not in self.found_aid:
                        self.found_aid.add(dict_item['aid'])
                        self.result_list.append(dict_item)
                else:
                    # The video was deleted (-404) or the lookup failed otherwise.
                    if video_info['code'] == -404:
                        print(kw, '已被削除')
                    else:
                        print('检索 {} 失败，MSG: {} CODE:{}'.format(kw, video_info['message'], video_info['code']))

                    self.not_found_kw.add(kw)
                    break

                matched = True
                self.not_found_kw.discard(kw)
                # All related results are collected (rather than stopping at the
                # first one), pausing 0.3s between lookups.
                time.sleep(0.3)

            if not matched:
                # No search results at all, or none that resolved to a Bilibili video.
                self.not_found_kw.add(kw)

            delay = randint(1, max_delay)  # random pause of 1..max_delay seconds
            print('休眠 {}s.'.format(delay))
            time.sleep(delay)

        # Return the accumulated state so the method is usable stand-alone.
        return self.result_list, list(self.not_found_kw)
    
    
def format_result(result, only_one=False):
    """Print a report of a search result; written specifically for looking up sm ids.

    Args:
        result: ``(found, not_found)`` tuple as returned by the ``SearchHelper``
            search methods. ``found`` is a list of dicts with the keys
            ``keyword``, ``aid``, ``title``, ``author``, ``mid`` and ``description``.
            The list is not modified.
        only_one: When true, only the first confirmed video per keyword is
            printed in the main section; further confirmed ones are listed
            separately as repeats.

    Returns:
        None. Everything is written to stdout.
    """
    import re

    sm_pattern = re.compile(r'sm\d+')

    sus = []  # videos that are only possibly related
    repeated = []  # further confirmed videos of an already printed keyword

    # Track all keywords and the keywords with an exact match. A keyword may count
    # as "found" merely because some video mentions it, although it may never
    # actually have been re-uploaded.
    all_keyword = set()
    certain_keyword = set()

    found, not_found = result

    # Sort a copy so the caller's list keeps its order.
    ordered = sorted(found, key=lambda x: x['keyword'])

    print()
    print(' 检索结果 '.center(40, '='))
    for item in ordered:
        content = ' '.join([item['keyword'], str(item['aid']), '(' + item['title'] + ')', item['author'] + '(uid:' + str(item['mid']) + ')'])

        all_keyword.add(item['keyword'])

        # Extract the sm ids mentioned in the description.
        parsed_kw = sm_pattern.findall(item['description'])

        # Without any sm id, or when the first sm id differs from the keyword, the
        # video is only a suspect. Comparing the first id relies on the re-upload
        # convention that the first line of the description is the original link;
        # a description that does not follow it can be misclassified.
        if (not parsed_kw) or (parsed_kw[0] != item['keyword']):
            sus.append((content + '\n简介:\n' + item['description']))
        elif item['keyword'] not in certain_keyword:
            # First confirmed sm -> av mapping for this keyword.
            certain_keyword.add(item['keyword'])
            print(content)
        elif only_one:
            # The sm id was already printed: this result is a repeat.
            repeated.append(content)
        else:
            print(content)

    print()
    print('未找到的数据:')
    print(not_found)

    print()
    print('可能没找到的数据:')
    print(list(all_keyword - certain_keyword))

    if only_one and repeated:
        print()
        print('重复出现过的数据:')
        for c in repeated:
            print(c)

    if sus:
        print()
        print('相关数据:')
        for s in sus:
            print('*' * 20)
            print(s)
            print('*' * 20)
            print()

In [10]:
import re

# Console dump of nicovideo watch URLs (including the browser console's "VM231:1"
# prefixes and the trailing "undefined"); only the sm ids are extracted from it.
text = """https://www.nicovideo.jp/watch/sm31656640
VM231:1 https://www.nicovideo.jp/watch/sm31562541
VM231:1 https://www.nicovideo.jp/watch/sm31516712
VM231:1 https://www.nicovideo.jp/watch/sm31304637
VM231:1 https://www.nicovideo.jp/watch/sm31083636
VM231:1 https://www.nicovideo.jp/watch/sm30724164
VM231:1 https://www.nicovideo.jp/watch/sm30605360
VM231:1 https://www.nicovideo.jp/watch/sm29386586
VM231:1 https://www.nicovideo.jp/watch/sm28496311
VM231:1 https://www.nicovideo.jp/watch/sm27833467
VM231:1 https://www.nicovideo.jp/watch/sm26480605
VM231:1 https://www.nicovideo.jp/watch/sm25845068
VM231:1 https://www.nicovideo.jp/watch/sm25548031
VM231:1 https://www.nicovideo.jp/watch/sm25432232
VM231:1 https://www.nicovideo.jp/watch/sm24877487
VM231:1 https://www.nicovideo.jp/watch/sm24321167
VM231:1 https://www.nicovideo.jp/watch/sm23792233
VM231:1 https://www.nicovideo.jp/watch/sm23630150
VM231:1 https://www.nicovideo.jp/watch/sm23244310
VM231:1 https://www.nicovideo.jp/watch/sm23068627
VM231:1 https://www.nicovideo.jp/watch/sm22857272
VM231:1 https://www.nicovideo.jp/watch/sm22724381
VM231:1 https://www.nicovideo.jp/watch/sm22659384
VM231:1 https://www.nicovideo.jp/watch/sm22472754
VM231:1 https://www.nicovideo.jp/watch/sm22356102
VM231:1 https://www.nicovideo.jp/watch/sm22262591
VM231:1 https://www.nicovideo.jp/watch/sm22215451
undefined
"""

# Raw string: '\d' in a plain string literal is an invalid escape sequence.
li = re.findall(r'sm\d+', text)

# `time`, `SearchHelper` and `format_result` come from the definitions cell,
# which must have been run first.
st = time.time()

# Stand-alone test of a single search backend.
sh = SearchHelper()
# result = sh.search_from_bili(li)
result = sh.search_from_doge(li)
format_result(result, only_one=True)

print()
print('耗时 {}s'.format(round(time.time() - st, 2)))

Searching sm22215451 ...
sm22215451 BV1Ts411t7zw 需要登录查看
休眠 2s.
Searching sm31656640 ...
休眠 2s.
Searching sm31304637 ...
休眠 1s.
Searching sm31562541 ...
休眠 2s.
Searching sm25845068 ...
休眠 3s.
Searching sm25548031 ...
休眠 1s.
Searching sm27833467 ...
休眠 3s.
Searching sm22724381 ...
休眠 1s.
Searching sm23792233 ...
休眠 3s.
Searching sm22472754 ...
休眠 1s.
Searching sm23068627 ...
休眠 2s.
Searching sm26480605 ...
休眠 1s.
Searching sm28496311 ...
休眠 3s.
Searching sm30605360 ...
休眠 2s.
Searching sm29386586 ...
休眠 3s.
Searching sm25432232 ...
休眠 3s.
Searching sm24877487 ...
休眠 3s.
Searching sm31083636 ...
休眠 1s.
Searching sm22262591 ...
休眠 1s.
Searching sm23244310 ...
休眠 1s.
Searching sm30724164 ...
休眠 1s.
Searching sm22659384 ...
休眠 2s.
Searching sm23630150 ...
休眠 2s.
Searching sm24321167 ...
休眠 2s.
Searching sm22356102 ...
休眠 2s.
Searching sm31516712 ...
休眠 3s.
Searching sm22857272 ...
休眠 2s.

sm22857272 av13892479 (我是一只猫.mp4) 核弹打苍蝇(uid:2081469)
sm23068627 av28737195 ((熟)朴秀的空手入门) ぼくひで(uid:4925394

In [9]:
# Print the report for `result` again. `result` is not defined in this cell: it
# must already exist in the kernel (the search cell assigns it from
# `sh.search_from_doge(li)`).
# NOTE(review): the output kept below lists keywords that are not in that cell's
# input, so it presumably stems from an earlier run -- confirm before relying on it.
format_result(result, only_one=True)


sm24239541 av31753756 (KNN塾.序章「Ominous」) 矢澤妮可(uid:22746751)
sm26299305 av96267112 (BBCOOKIE☆劇場「母の日」.Orange sky END) ECU王様(uid:84438004)
sm26299400 av16569323 (【cookie☆】小鬼SIK) 幻想乡敬老院院长(uid:1277420)
sm26883779 av3922353 (投稿者TNOK) 德古拉之泪(uid:4006838)
sm27947745 av3526294 (SIK的自由研究) 关西クレーマー(uid:2862608)
sm27947843 av17631064 (【cookie☆】一斉在庫売り切りセール SIK GB集) 幻想乡敬老院院长(uid:1277420)
sm28818614 av4619772 (迷你UDKBB) 柚子湯姉貴(uid:150801)
sm29628917 av8209847 (NYN姉貴PB集) -枩慶-(uid:1920474)
sm29926833 av10050372 (淫夢弱點攻擊GB.P5) 赤夜千緋(uid:8599866)
sm30185205 av20639840 (NYN姐贵音声集) 游客12345679(uid:2417251)
sm30309842 av20640128 (NYN姐贵音声素材2) 游客12345679(uid:2417251)
sm30466319 av8143083 (跳着新宝岛的NYN姐贵PB+使用例.kuso改) 托奇没霜雪(uid:1277794)
sm30737829 av20595993 (SZ姐贵　音声素材.JC) 游客12345679(uid:2417251)
sm30748662 av20596428 (SZ姐贵的翻唱) 游客12345679(uid:2417251)
sm30759419 av20596816 (SZ姐贵　音声素材集(剪切完毕).shizo) 游客12345679(uid:2417251)
sm31234027 av20640238 (NYN姐贵音声素材3.参) 游客12345679(uid:2417251)
sm31280702 av20640500 (NYNキャス音声4.06小隊) 游

In [39]:
# Quick check of SearchHelper._find_key: 'a' is a top-level key, 'm' sits inside a
# list of dicts, 'z' is missing (falls back to the default), and 'm'/'z' are
# renamed through the mapping table.
sh = SearchHelper()
d = dict(a='b', c='d', x=[dict(m='n', j='k')])
print(sh._find_key(d, kw_set={'a', 'm', 'z'}, kw_map={'m': '2', 'z': '3'}))

{'3': None, '2': 'n', 'a': 'b'}


In [15]:
# Test run with 176 entries.
# With a blind, fixed 0.5s sleep the Bilibili API banned us; the sleep time is now
# chosen dynamically from the number of keywords.
# dogedoge worked fine.
# Doge-only result: 59 / 115
# It is very slow, though: taking a median of 2s per keyword gives
# 2 * 176 / 60 = 5.9 min -- still more efficient than doing it by hand (to a degree).
#
# The usage example below is disabled by keeping it in a bare string literal.
# NOTE(review): `SearchHepler` in it is a typo for `SearchHelper`; fix it before
# re-enabling the snippet.
"""
sh = SearchHepler()
li = ['sm37978072', 'sm37426464']
sh.search(li)
"""


NameError: name 'c' is not defined

In [4]:
import re

# Sample video description used to check which sm ids the pattern picks up:
# the first match is the id on the first line, later matches are ids that are
# merely mentioned further down.
text = """sm37326739
原标题：動き出した落書きICG姉貴EX　.GB
原作者：乳酸菌
原简介：おれがつかう（自給自足）

DLはこちら【https://www.mediafire.com/file/7abvwlr3mdnrw25/file】

落書きICG姉貴拡張キット【im10577988】

路島アニキ切抜素材謝謝茄子‼【sm37309546/av926652399】

投稿動画【 mylist/65591471 】


致敬sm28313948/av3982762"""

# Raw string: '\d' in a plain string literal is an invalid escape sequence.
print(re.findall(r'sm\d+', text))

['sm37326739', 'sm37309546', 'sm28313948']
