In [34]:
import os
import time
import uuid
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List
import re

import requests
from tqdm.auto import tqdm

In [2]:
CLIENT_ID = "qeyx02kvx8uffv4std0did44iyoj35"
CLIENT_SECRET = "s9jf674kb2ekkqqyv1nrs8uftnxuum"
STREAMER_ID = "14371185" # https://www.twitch.tv/northernlionb

In [17]:
@dataclass
class Comment:
    commenter: str
    createdAt: datetime
    contentOffsetSeconds: int
    message: str
    video_id: str

In [4]:
expiration_time = 0
def get_access_token():
    app_access_token_response = requests.post(
        "https://id.twitch.tv/oauth2/token",
        params={
            "client_id": CLIENT_ID,
            "client_secret": CLIENT_SECRET,
            "grant_type": "client_credentials",
        }
    )

    expires_in = app_access_token_response.json()["expires_in"]
    expiration_time_ = time.time() + expires_in
    return app_access_token_response.json()["access_token"], expiration_time_

app_access_token, expiration_time = get_access_token()

In [5]:
base_api_url = "https://api.twitch.tv/helix" 
def twitch_api_get(url, params=None):
    global app_access_token, expiration_time
    if time.time() > expiration_time:
        print("Refreshing token")
        app_access_token, expiration_time = get_access_token()
    full_url = os.path.join(base_api_url, url)
    return requests.get(
        full_url,
        params=params,
        headers={
            "Client-Id": CLIENT_ID,
            "Authorization": f"Bearer {app_access_token}",
        }
    )

In [36]:
videos = twitch_api_get("videos", params={"user_id": STREAMER_ID, "type": "archive"})

In [43]:
headers = {
    "Client-ID": "kd1unb4b3q4t58fwlpcbzcbnm76a8fp",
    "Accept": "application/vnd.twitchtv.v5+json",
    'X-Device-Id': uuid.uuid4().hex,
}
DURATION_RE = re.compile(r"(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?")

result: Dict[str, List[Comment]] = defaultdict(list)
for video in tqdm(videos.json()["data"][3:]):
    has_next_page = True
    cursor = None
    video_id = video["id"]
    duration_match = DURATION_RE.match(video["duration"])
    duration = sum(int(x or 0) * 60 ** i for i, x in enumerate(reversed(duration_match.groups())))
    
    with tqdm(total=duration, unit="s", desc=video['created_at']) as video_bar:
        while has_next_page:
            query = f"""
                query {{
                    video(id: "{video_id}") {{
                        comments{f'(after: "{cursor}")' if cursor else ''} {{
                            edges {{
                                cursor
                                node {{
                                    commenter {{
                                        displayName
                                        login
                                    }}
                                    createdAt
                                    contentOffsetSeconds
                                    message {{
                                        fragments {{
                                            text
                                        }}
                                    }}
                                }}
                            }}
                            pageInfo {{
                                hasNextPage
                            }}
                        }}
                    }}
                }}
            """
            response = requests.post("https://gql.twitch.tv/gql", json={"query": query}, headers=headers)
            if response.status_code != 200 or "errors" in response.json():
                print(response.json())
                break
            comments = response.json()["data"]["video"]["comments"]
            has_next_page = comments["pageInfo"]["hasNextPage"]
            for comment in comments["edges"]:
                result[video_id].append(Comment(
                    commenter=comment["node"]["commenter"]["displayName"] or "UNKNOWN",
                    createdAt=datetime.fromisoformat(comment["node"]["createdAt"]),
                    contentOffsetSeconds=comment["node"]["contentOffsetSeconds"],
                    message=comment["node"]["message"]["fragments"][0]["text"],
                    video_id=video_id,
                ))
            if len(comments["edges"]) == 0:
                print(response.json())
                raise IOError("Bad.")
            cursor = comments["edges"][-1]["cursor"]
            video_bar.update(comments["edges"][-1]["node"]["contentOffsetSeconds"] - video_bar.n)
        

  0%|          | 0/17 [00:00<?, ?it/s]

2023-05-09T16:03:09Z:   0%|          | 0/18542 [00:00<?, ?s/s]

{'data': {'video': {'comments': {'edges': [{'cursor': 'eyJpZCI6ImZmY2ZkMzU1LWU4NGMtNDVlNS1hMzY0LTc1ZTAyNTc1Y2I3YSIsImhrIjoiYnJvYWRjYXN0OjQyMTc0MDAzOTc5Iiwic2siOiJBQUFNRDh2SkpjQVhYWkVyZlpWdXdBIn0', 'node': {'commenter': {'displayName': 'taliatate', 'login': 'taliatate'}, 'createdAt': '2023-05-09T19:43:46.764Z', 'contentOffsetSeconds': 13237, 'message': {'fragments': [{'text': 'oo oo '}, {'text': 'chjbPapa'}]}}}, {'cursor': 'eyJpZCI6ImZmY2ZkMzU1LWU4NGMtNDVlNS1hMzY0LTc1ZTAyNTc1Y2I3YSIsImhrIjoiYnJvYWRjYXN0OjQyMTc0MDAzOTc5Iiwic2siOiJBQUFNRDh2SkpjQVhYWkVyZlpWdXdBIn0', 'node': {'commenter': {'displayName': 'shinydruwu', 'login': 'shinydruwu'}, 'createdAt': '2023-05-09T19:43:46.815Z', 'contentOffsetSeconds': 13237, 'message': {'fragments': [{'text': 'ratJAM'}]}}}, {'cursor': 'eyJpZCI6ImZmY2ZkMzU1LWU4NGMtNDVlNS1hMzY0LTc1ZTAyNTc1Y2I3YSIsImhrIjoiYnJvYWRjYXN0OjQyMTc0MDAzOTc5Iiwic2siOiJBQUFNRDh2SkpjQVhYWkVyZlpWdXdBIn0', 'node': {'commenter': {'displayName': 'MinisterOfSpaceyStuff', 'login': 'minist

OSError: Bad.

In [None]:
import json

with open("comments.json", "w") as f:
    json.dump(result, f, indent=2)

2709

In [30]:
video_bar.close()