In [25]:
import glob
import json
import os
from datetime import datetime
from zipfile import ZipFile

In [26]:
def load_vid_info(info_files_path):
    vid_info = list()

    info_files = glob.glob(info_files_path)

    for f_name in info_files:
        with open(f_name, "r") as info_file:
            vid_info.extend(json.load(info_file))

    # for lookup later
    vid2ind = {v["id"]:i for i, v in enumerate(vid_info)}
    return vid_info, vid2ind

In [27]:
def extract_messages(chat_files_path, out_dir, bots_file):
    bot_ids = []
    with open(bots_file, "r") as in_file:
        bots = json.load(in_file)
        bot_ids = [bot["id"] for bot in bots]

    chat_files = glob.glob(chat_files_path)
    num_files_extracted = 0
    for chat_file_name in chat_files:
        zf = ZipFile(chat_file_name)
        for f_name in zf.namelist():            
            if f_name.endswith("json"):
                with zf.open(f_name, "r") as in_file:
                    vid_chat = json.load(in_file)
                    video_id = f_name.split("/")[-1].strip(".json")
                    with open(f"{out_dir}/{video_id}.txt", "w") as out_file:
                        out_file.write("\n".join([msg["message"]["body"] for msg in vid_chat["comments"] if msg["commenter"]["_id"] not in bot_ids]))
                    num_files_extracted += 1
            if num_files_extracted%250 == 0:
                print(f"{datetime.now().strftime('%Y/%m/%d_%H:%M:%S')}: extracted {num_files_extracted} chat files.")


In [28]:
vinf, v2i = load_vid_info("../data/video_info/*_vids.json")

In [29]:
channels = {c["user_login"] for c in vinf}

In [30]:
corpus_dir = "../data/videos_chat/corpus_test"
if not os.path.exists(corpus_dir):
    os.makedirs(corpus_dir)

In [31]:
extract_messages("../data/videos_chat/*_vids_chat.zip", corpus_dir, "../data/twitch_info/twitch_bots.json")

2022/04/04_15:06:57: extracted 0 chat files.
2022/04/04_15:06:58: extracted 250 chat files.
2022/04/04_15:07:13: extracted 500 chat files.
2022/04/04_15:07:23: extracted 750 chat files.
2022/04/04_15:07:32: extracted 1000 chat files.
2022/04/04_15:07:55: extracted 1250 chat files.
