# In [61]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Use SimHei as the sans-serif font for every matplotlib figure
# (presumably so the Chinese titles/labels used in the plot below render).
plt.rcParams['font.sans-serif'] = ['SimHei']
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings("ignore")
# Placeholder: the string reads "replace cookie and headers with your own".
# NOTE(review): as written this is a set literal containing one string, not a
# dict; it is passed as `cookies=` to requests.get in BiBullet.parse, so it
# presumably has to be replaced with a real name -> value mapping before running.
cookies = {
    "cookie 和 headers替换你自己的"
}
# Placeholder, same remark as for `cookies`: passed as `headers=` to
# requests.get in BiBullet.parse; presumably must become a dict of HTTP headers.
headers = {
    "cookie 和 headers替换你自己的"
}

class BiBullet:
    """Download the danmaku (bullet comments) of a Bilibili video as a DataFrame.

    The video page is fetched to find its ``cid``; the comment XML at
    ``https://comment.bilibili.com/<cid>.xml`` is then parsed, one row per
    ``<d>`` element. Requests use the module-level ``headers`` and ``cookies``.
    """

    # Column order of every frame returned by parse().
    _COLUMNS = ["Up主视频链接", "弹幕出现时间", "弹幕池", "弹幕模式", "弹幕内容"]

    def __init__(self):
        # Display name for each mode code (second field of a <d> "p" attribute).
        self.bullet_type = {
            "1": "滚动弹幕",
            "2": "滚动弹幕",
            "3": "滚动弹幕",
            "4": "底端弹幕",
            "5": "顶端弹幕",
            "6": "逆向弹幕",
            "7": "精准定位",
            "8": "高级弹幕"
        }
        # Display name for each font-size code. Not read by any method of this
        # class; kept because it is a public attribute.
        self.bullet_font = {
            "12": "非常小",
            "16": "特小",
            "18": "小",
            "25": "中",
            "36": "大",
            "45": "很大",
            "64": "特别大"
        }
        # Display name for each pool code (sixth field of a <d> "p" attribute).
        self.bullet_pool = {
            "0": "普通池",
            "1": "字幕池",
            "2": "特殊池"
        }

    def get_url(self, url):
        """Return the danmaku DataFrame for the video page *url* (alias of parse)."""
        return self.parse(url)

    def parse(self, url):
        """Fetch the video page and its comment XML and return the danmaku rows.

        Args:
            url: Address of the video page; stored in the "Up主视频链接" column.

        Returns:
            A DataFrame with the columns listed in ``_COLUMNS``, ordered by
            appearance time, with a fresh 0..n-1 index. It is empty (but still
            has the columns) when the XML contains no ``<d>`` element.

        Raises:
            IndexError: If no ``"cid":<value>,`` fragment is found in the page.
        """
        page = requests.get(url, headers=headers, cookies=cookies)
        cid_match = re.search(r'"cid":(.*?),', page.text)
        if cid_match is None:
            # IndexError matches what indexing an empty match list raises, so
            # existing handlers keep working.
            raise IndexError(f"no cid found in the page returned for {url}")
        cid = cid_match.group(1)

        response = requests.get(
            f'https://comment.bilibili.com/{cid}.xml',
            headers=headers,
            cookies=cookies,
        )
        response.encoding = 'utf8'
        soup = BeautifulSoup(response.text, 'lxml')
        items = [(tag.attrs['p'], tag.text) for tag in soup.find_all('d')]
        return self._build_frame(url, items)

    def _build_frame(self, video_url, items):
        """Turn ``(p_attribute, text)`` pairs into the sorted result frame.

        Rows are collected in a list and the frame is built once, instead of
        appending to a DataFrame per row. Sorting uses the numeric appearance
        time (first "p" field) rather than the formatted label, so labels with
        a two-digit hour are ordered correctly.
        """
        records = []
        for p_attr, text in items:
            fields = p_attr.split(',')
            row = {"Up主视频链接": video_url}
            row.update(self.getButterParams(fields))
            row["弹幕内容"] = self._dealStr(text)
            records.append((float(fields[0]), row))
        records.sort(key=lambda record: record[0])
        return pd.DataFrame([row for _, row in records], columns=self._COLUMNS)

    def getButterParams(self, data_splits):
        """Translate the comma-split "p" attribute of one ``<d>`` element.

        Args:
            data_splits: Sequence of strings; index 0 is the appearance time
                in seconds, index 1 the mode code, index 5 the pool code.

        Returns:
            A dict with the keys "弹幕出现时间" (formatted as
            ``<h>时<mm>分<ss>秒``), "弹幕池" and "弹幕模式". Codes missing from
            the lookup tables are reported as "未知…(<code>)" instead of
            raising KeyError.
        """
        total_seconds = int(round(float(data_splits[0])))
        minutes, second = divmod(total_seconds, 60)
        hour, minute = divmod(minutes, 60)
        ret_time = f"{hour}时{minute:02d}分{second:02d}秒"

        mode_code = data_splits[1]
        pool_code = data_splits[5]
        return {
            "弹幕出现时间": ret_time,
            "弹幕池": self.bullet_pool.get(pool_code, f"未知弹幕池({pool_code})"),
            "弹幕模式": self.bullet_type.get(mode_code, f"未知模式({mode_code})"),
        }

    def _dealStr(self, x):
        """Return ``str(x)`` cleaned up for storage in a CSV cell.

        Runs of two spaces, tabs, newlines and carriage returns are removed,
        and ASCII commas are replaced by the full-width comma "，".
        """
        x = str(x).replace('  ', '').replace('\t', '').replace('\n', '').replace('\r', '').replace(',', '，')
        return x

    def save_df(self, df):
        """Append *df* to ``data/b站弹幕_more_rows.csv`` and print its row count.

        The ``data`` directory is created when missing. The header row is
        written only when the CSV file does not exist yet.
        """
        file_name = "data/b站弹幕_more_rows"
        csv_path = f'{file_name}.csv'
        print('存到文件的数据有{}条'.format(len(df)))
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        write_header = not os.path.exists(csv_path)
        df.to_csv(csv_path, encoding='utf_8_sig', mode='a', index=False, header=write_header)



def _label_to_seconds(label):
    """Convert a '<h>时<mm>分<ss>秒' time label into a total number of seconds.

    The three digit groups are parsed rather than sliced at fixed positions,
    so labels whose hour has more than one digit are handled too.
    """
    hours, minutes, seconds = (int(part) for part in re.findall(r'\d+', label))
    return hours * 3600 + minutes * 60 + seconds


# Scrape the danmaku of one video and derive a numeric "秒数" (seconds) column
# from the formatted appearance-time label.
df = BiBullet().get_url("https://www.bilibili.com/video/BV1TD42157yy")
df['秒数'] = df['弹幕出现时间'].apply(_label_to_seconds)
df.to_excel('output.xlsx', index=False)

# Number of danmaku per second of video, ordered by second.
publishdate_counts = df['秒数'].value_counts().sort_index()

# Line chart of danmaku count against video second, with fixed axis ranges.
plt.figure(figsize=(20, 5))
publishdate_counts.plot(kind='line', color='b', alpha=0.5)
plt.title('弹幕数量与视频秒数折线图\n')
plt.xlabel('\n秒数\n')
plt.ylabel('\n频数\n')
plt.grid(True)
plt.xlim(0, 9000)
plt.ylim(0, 30)
plt.xticks(range(0, 9500, 500))
plt.show()
