In [7]:
import json
from zipfile import ZipFile
import pandas as pd
import glob

## Video info from zipped download

In [8]:
def load_video_info(paths):
    """Read every JSON file in ``paths`` and concatenate their contents.

    Each file is expected to hold a JSON list of per-video metadata dicts
    (the lookup index built below relies on every entry having an ``"id"`` key).

    Args:
        paths: iterable of file paths, read in the order given.

    Returns:
        One list holding the entries of all files, in file order.
    """
    records = []
    for path in paths:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding (video titles can contain non-ASCII characters).
        with open(path, "r", encoding="utf-8") as info_file:
            records.extend(json.load(info_file))
    return records


info_files_path = "../data/video_info/*_vids.json"
# glob returns matches in arbitrary order; sort so that vid_info (and every
# positional index derived from it) is identical on every run.
info_files = sorted(glob.glob(info_files_path))

vid_info = load_video_info(info_files)

# for lookup later: video id -> position in vid_info
vid2ind = {v["id"]: i for i, v in enumerate(vid_info)}

In [9]:
# Spot-check: fetch one video's metadata record by id through the vid2ind index.
vid_info[vid2ind["83571434"]]

{'id': '83571434',
 'stream_id': '22827014832',
 'user_id': '30707866',
 'user_login': 'esl_lol',
 'user_name': 'ESL_LOL',
 'title': 'RERUN: fnatic vs. Millenium - Group A - IEM Katowice 2014 - League of Legends',
 'description': '',
 'created_at': '2016-08-14T14:26:23Z',
 'published_at': '2016-08-14T14:26:23Z',
 'url': 'https://www.twitch.tv/videos/83571434',
 'thumbnail_url': 'https://static-cdn.jtvnw.net/cf_vods/d2nvs31859zcd8/ad3ef727c7_esl_lol_22827014832_500493871/thumb/thumb0-%{width}x%{height}.jpg',
 'viewable': 'public',
 'view_count': 36,
 'language': 'en',
 'type': 'archive',
 'duration': '18h30m39s',
 'muted_segments': None}

In [49]:
def video_id_from_member(member_name):
    """Return the video id encoded in a zip member name such as ``dir/<id>.json``.

    Only the literal ``.json`` suffix is removed. ``str.strip(".json")`` must
    not be used for this: it removes any of the characters ``. j s o n`` from
    *both* ends, which corrupts names that start or end with one of them.

    Args:
        member_name: path of the member inside the archive ("/"-separated).

    Returns:
        The member's base name without its ``.json`` suffix.
    """
    base_name = member_name.split("/")[-1]
    suffix = ".json"
    return base_name[: -len(suffix)] if base_name.endswith(suffix) else base_name


def is_rerun_title(title):
    """Heuristically tell from its title whether a video is a rerun."""
    # might want to add more prefixes in future
    return title.startswith(("RERUN", "REBROADCAST"))


chat_files_path = "../data/videos_chat/*_vids_chat.zip"
chat_files = glob.glob(chat_files_path)

for chat_file_name in chat_files:
    # Context manager so the archive handle is closed even if a member fails to parse.
    with ZipFile(chat_file_name) as zf:
        for f_name in zf.namelist():
            if not f_name.endswith(".json"):
                continue  # directories and non-chat members

            with zf.open(f_name, "r") as in_file:
                vid_chat = json.load(in_file)

            # Raises KeyError when the archive holds chat for a video that is
            # missing from vid_info.
            v_ind = vid2ind[video_id_from_member(f_name)]
            chat_info = {
                "msg_count": len(vid_chat["comments"]),
                "is_rerun": is_rerun_title(vid_info[v_ind]["title"]),
            }

            # Enrich the metadata record in place with the chat statistics.
            vid_info[v_ind].update(chat_info)

In [16]:
# Keep only the records that carry a "msg_count" key.
vid_info_filtered = list(filter(lambda record: "msg_count" in record, vid_info))

## Video info into dataframe

In [19]:
# Columns to carry over from the metadata records into the summary table.
cols = ["id", "title", "created_at", "msg_count", "is_rerun", "duration"]

# Build the frame and parse the creation timestamps in one chained expression.
df_vid_info = pd.DataFrame(vid_info_filtered, columns=cols).assign(
    created_at=lambda frame: pd.to_datetime(frame["created_at"])
)

df_vid_info

Unnamed: 0,id,title,created_at,msg_count,is_rerun,duration
0,1136665862,🔴 LIVE - ESL Mobile Open presents Wild Rift Oc...,2021-09-02 07:18:20+00:00,1282,False,8h11m38s
1,1129853487,🔴 LIVE - ESL Mobile Open presents Wild Rift Oc...,2021-08-26 07:14:01+00:00,376,False,5h46m43s
2,1122864123,🔴 LIVE - ESL Mobile Open presents Wild Rift Oc...,2021-08-19 07:15:57+00:00,1040,False,6h11m58s
3,1115898498,🔴 LIVE - ESL Mobile Open presents Wild Rift Oc...,2021-08-12 09:42:37+00:00,304,False,5h36m13s
4,1115852655,🔴 LIVE - ESL Mobile Open presents Wild Rift Oc...,2021-08-12 07:31:14+00:00,256,False,2h10m39s
...,...,...,...,...,...,...
1361,38013566,GPL 2013 Spring Season #63 #64 Azubu TPA vs KL...,2013-03-10 11:16:44+00:00,0,False,2h42m46s
1362,38013607,"GPL 2013 Spring Season #56 #57 ahq vs SAJ, KLH...",2013-03-03 11:06:12+00:00,0,False,2h26m34s
1363,38013596,GPL 2013 Spring Season #52 KLH vs SAJ,2013-03-02 11:12:57+00:00,0,False,3h27m1s
1364,38013562,"GPL 2013 Spring Season #44 #45 #46 MLE vs ahq,...",2013-02-23 10:56:24+00:00,0,False,2h41m17s


In [20]:
# Summary statistics of the table (the recorded output covers msg_count).
df_vid_info.describe()

Unnamed: 0,msg_count
count,1366.0
mean,1607.335286
std,14107.745856
min,0.0
25%,0.0
50%,0.0
75%,165.5
max,264144.0


In [47]:
# Persist the per-video summary table as CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is
# presumably written as an extra leading column - pass index=False if that
# is not wanted when the file is read back.
vid_info_out = "../data/videos_chat/vid_info.csv"
df_vid_info.to_csv(vid_info_out)

In [21]:
# Videos with at least one chat message.
has_comments = df_vid_info["msg_count"] > 0
df_commented_vids = df_vid_info[has_comments]
f"{df_commented_vids.id.count()}/{df_vid_info.id.count()} videos with comments"

'564/1366 videos with comments'

In [35]:
# Videos whose is_rerun flag is set.
df_reruns = df_vid_info.loc[df_vid_info["is_rerun"]]
print(f"{df_reruns.id.count()}/{df_vid_info.id.count()} videos are reruns (approx)")

652/1366 videos are reruns (approx)


In [41]:
# How many of the reruns have at least one chat message.
n_reruns_with_comments = int((df_reruns["msg_count"] > 0).sum())
n_reruns = df_reruns["id"].count()
print(f"{n_reruns_with_comments}/{n_reruns} reruns have comments")

294/652 reruns have comments


In [44]:
# Mean message count over all rows of df_reruns.
print(f"{df_reruns.msg_count.mean()} comments per rerun video on average") # possible approximate cutoff for deciding which videos to filter out

689.3067484662577 comments per rerun video on average


In [36]:
# Total number of chat messages across all commented videos.
total_comments = df_commented_vids["msg_count"].sum()
print(f"{total_comments} comments in total")

2195620 comments in total


In [37]:
# NOTE: the mean is taken over df_commented_vids, not over df_vid_info, so the
# printed "per video" figure is per df_commented_vids row.
print(f"{df_commented_vids.msg_count.mean()} comments per video on average")

3892.9432624113474 comments per video on average


In [38]:
# Time span covered by the collected videos.
oldest_created = df_vid_info["created_at"].min()
newest_created = df_vid_info["created_at"].max()
print(f"oldest video from\t{oldest_created}\nmost recent video from\t{newest_created}")

oldest video from	2012-05-29 11:43:41+00:00
most recent video from	2021-09-02 07:18:20+00:00
