In [None]:
from IPython.display import display, Image, HTML

from nico_info import NicovideoInfomation
from utils import analyze_comments

In [None]:
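# Uncomment to show every row/column when a DataFrame is displayed.
# NOTE: `pd` is not imported in the visible cells; add `import pandas as pd` first.
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)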

CLASS `NicovideoInfomation(video_url: str = None, video_id: str = None)`

Parameters
- `video_url(str)` - 動画のURL
- `video_id(str)` - 動画のID
> `video_url` と `video_id` は，どちらか一方だけ指定すれば OK

`load_comments(forks, mode, hop_rate, check, tqdm_kwargs)`

Parameters
- `forks(Union[int, list])` - コメントのタイプ (0: 一般，1: 投稿者，2: かんたん)
- `mode(str)` - 読み込み方 (once: 最新のみ，roughly: 粗めに，exactly: 正確に)
- `hop_rate(float)` - 過去コメントを読み込むためのパラメータ (0〜1 の値．0 に近いほど粗く，1 に近いほど正確)
- `check(bool)` - コメントをどれだけ読み込めたかを表示するかどうか

`video_html(w, h)`

Parameters
- `h(int)` - 高さ
- `w(int)` - 幅

Returns
- `html(str)` - 動画埋め込み用の HTML

In [None]:
# Read the video URL from the user and build the info object for it.
ninfo = NicovideoInfomation(input())

# Fetch regular (0) and easy (2) comments. Per the parameter notes above,
# mode='exactly' is the precise loading mode, hop_rate tunes how past
# comments are read, and check=True reports how much has been loaded.
ninfo.load_comments(
    [0, 2],
    mode='exactly',
    hop_rate=0.2,
    check=True,
)

# Embed the video player in the notebook output.
display(HTML(ninfo.video_html()))

# All loaded comments; the cells below slice this frame by fork.
comments_df = ninfo.comments_df

In [None]:
# Keep only the rows whose index label starts with '0' (fork 0, the regular
# comments according to the load_comments notes above).
tgt_df = comments_df.loc[comments_df.index.str[0] == '0']

### コメントのプロット

In [None]:
# Columns hidden from the per-fork tables shown below.
remove_index = ['user_id', '184', 'position', 'size', 'color', 'command', 'score']

# Group the rows by the first character of their index label (the fork id).
# groupby yields the groups in ascending key order, one table per fork.
for fork, fork_df in comments_df.groupby(comments_df.index.str[0]):
    display(fork_df.drop(columns=remove_index))

### WordCloud によるコメント解析

In [None]:
from wordcloud import WordCloud

In [None]:
from pathlib import Path

# Comment texts of the currently selected fork (tgt_df is set in an earlier cell).
tgt_comments = tgt_df.comment

# The tokenizer can be either 'janome' or 'sudachi'.
results = analyze_comments(tgt_comments, tokenizer='janome')
text = ' '.join(results)

# Font files to try, in order. The comments are Japanese, so a font with
# Japanese glyphs is needed; the original notebook hard-coded the macOS path,
# which does not exist on other systems. Add your own path here if needed.
FONT_CANDIDATES = [
    '/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc',          # macOS
    '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc',  # Debian/Ubuntu (fonts-noto-cjk)
    '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf',     # Debian/Ubuntu (alternatives link)
    'C:/Windows/Fonts/meiryo.ttc',                             # Windows
]
font_path = next((p for p in FONT_CANDIDATES if Path(p).exists()), None)
if font_path is None:
    # Fail early with an actionable message instead of an opaque font-loading error.
    raise FileNotFoundError(
        'No Japanese font found; add the path of an installed font to FONT_CANDIDATES.'
    )

wordcloud = WordCloud(
    background_color='white',
    font_path=font_path,
    width=800,
    height=600,
    max_words=500
).generate(text)

# Save the rendered cloud next to the notebook, then show the saved image.
wordcloud.to_file('./wordcloud.png')
display(Image('./wordcloud.png'))

### ユーザー毎の投稿頻度

In [None]:
import matplotlib.pyplot as plt
from pprint import pprint

In [None]:
# Posting frequency per user for fork '0' (regular comments).
tgt_df = comments_df[comments_df.index.str[0] == '0']

# value_counts() tallies the comments of every user in a single pass and
# returns them sorted from most to fewest, replacing the per-user re-filtering
# of the frame. Rows with a missing user_id are not counted.
post_counts = tgt_df.user_id.value_counts()

# (user_id, count) pairs as plain Python values, most active user first.
hists = list(zip(post_counts.index.tolist(), post_counts.tolist()))

left = range(len(hists))
height = [n_posts for _uid, n_posts in hists]

if height:
    print(f'max: {max(height)}')
    print(f'min: {min(height)}')
    print(f'avg: {sum(height)/len(height):.2f}')
    print('---')
    print('many post users:')
    pprint(dict(hists[:5]))

    plt.bar(left, height)
    plt.xlabel('user rank (by number of comments)')
    plt.ylabel('number of comments')
    plt.title('Comments per user (fork 0)')
    plt.show()
else:
    # max()/min() and the average would raise on an empty selection.
    print('no comments found for fork 0')

In [None]:
# Posting frequency per user for fork '2' (easy comments).
tgt_df = comments_df[comments_df.index.str[0] == '2']

# value_counts() tallies the comments of every user in a single pass and
# returns them sorted from most to fewest, replacing the per-user re-filtering
# of the frame. Rows with a missing user_id are not counted.
post_counts = tgt_df.user_id.value_counts()

# (user_id, count) pairs as plain Python values, most active user first.
hists = list(zip(post_counts.index.tolist(), post_counts.tolist()))

left = range(len(hists))
height = [n_posts for _uid, n_posts in hists]

if height:
    print(f'max: {max(height)}')
    print(f'min: {min(height)}')
    print(f'avg: {sum(height)/len(height):.2f}')
    print('---')
    print('many post users:')
    pprint(dict(hists[:5]))

    plt.bar(left, height)
    plt.xlabel('user rank (by number of comments)')
    plt.ylabel('number of comments')
    plt.title('Comments per user (fork 2)')
    plt.show()
else:
    # A video may have no fork-2 comments at all; max()/min() and the
    # average would raise on an empty selection.
    print('no comments found for fork 2')

### スコアの低いユーザーの投稿

In [None]:
# Mean comment score per user for fork '0' (regular comments).
tgt_df = comments_df[comments_df.index.str[0] == '0']

# One grouped pass replaces re-filtering the frame once per user.
# NOTE(review): pandas' mean skips missing scores and groupby drops rows with a
# missing user_id, whereas the previous per-user numpy mean would turn them into
# NaN and make the sort order unreliable — confirm this is the desired handling.
mean_scores = tgt_df.groupby('user_id').score.mean()

# Drop users whose mean score is exactly 0, then order from lowest score up.
mean_scores = mean_scores[mean_scores != 0.].sort_values()

# (user_id, mean score) pairs, lowest score first.
scores = list(zip(mean_scores.index.tolist(), mean_scores.tolist()))

left = range(len(scores))
height = [mean_score for _uid, mean_score in scores]

plt.bar(left, height)
plt.xlabel('user rank (lowest mean score first)')
plt.ylabel('mean comment score')
plt.title('Mean score per user (fork 0)')
plt.show()

# Columns hidden when listing the comments of the lowest-scoring users.
remove_index = ['user_id', 'write_time', 'video_time', 'command', '184']
for uid, _mean_score in scores[:5]:
    print('min score user:', uid)
    display(tgt_df[tgt_df.user_id == uid].drop(columns=remove_index))