
Links to sources that I pulled from:<br>Handling the Requests ConnectionError better: https://stackoverflow.com/questions/44448625/how-to-handle-a-connection-error-gracefully-in-requests<br>Calling the Twitch API via Jupyter: https://kaivalyapowale.com/2019/12/18/twitch-dashboard/


In [1]:
import json
import requests
import pandas as pd
from pandas import json_normalize
import datetime
import time
from os import path

connection_timeout = 30

In [2]:
# A function to get all games currently being broadcast

# Needs only a credentials header
def _fetch_top_games_page(client_headers_dict, cursor=None):
    """Fetch one page of the Helix "top games" endpoint and return the decoded JSON.

    Args:
        client_headers_dict: Request headers carrying the Twitch credentials.
        cursor: Pagination cursor of the page to fetch. ``None`` or ``""``
            requests the first page.

    Returns:
        The response body decoded from JSON.

    Raises:
        Exception: If connection errors or timeouts are still occurring more
            than ``connection_timeout`` seconds after the first attempt.
        requests.exceptions.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # Let requests build the query string so the cursor is URL-encoded properly.
    query = {"first": 100}
    if cursor:
        query["after"] = cursor

    started = time.time()
    while True:
        try:
            response = requests.get(
                "https://api.twitch.tv/helix/games/top",
                headers=client_headers_dict,
                params=query,
                # Without a timeout a stalled connection would block forever.
                timeout=connection_timeout,
            )
            break
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as err:
            # Retry once per second until the retry window has elapsed.
            if time.time() > started + connection_timeout:
                raise Exception(
                    'Unable to get updates after {} seconds of ConnectionErrors'.format(connection_timeout)
                ) from err
            time.sleep(1)

    # Report authentication / rate-limit failures here rather than as a
    # confusing KeyError on a missing "data" field later on.
    response.raise_for_status()
    return response.json()


def get_all_games(client_headers_dict):
    """Return every game returned by the Helix "top games" endpoint as one DataFrame.

    Pages of up to 100 games are requested until the API returns an empty page
    or stops handing out a pagination cursor.

    Args:
        client_headers_dict: Request headers carrying the Twitch credentials.

    Returns:
        A DataFrame with one row per game id (duplicates across pages are
        dropped, keeping the first occurrence) and a ``TimeStamp_Pulled``
        column recording when each page was requested. If the API returns no
        games at all, an empty DataFrame with the columns ``id``, ``name``,
        ``box_art_url`` and ``TimeStamp_Pulled`` is returned.

    Raises:
        Exception: If a page cannot be fetched because of persistent
            connection errors (see ``connection_timeout``).
        requests.exceptions.HTTPError: If the API answers with an error status.
    """
    # Collect the pages in a list and concatenate once at the end;
    # calling pd.concat() inside the loop is very slow.
    pages = []
    cursor = None

    while True:
        # Timestamp of the request, stored with every row of this page.
        pulled_at = pd.Timestamp(datetime.datetime.now())
        payload = _fetch_top_games_page(client_headers_dict, cursor)

        page = json_normalize(payload.get("data") or [])

        # The API does not always return the full 100 items requested, so an
        # empty page (not a short one) is what marks the end of the data.
        if page.empty:
            break

        page["TimeStamp_Pulled"] = pulled_at
        pages.append(page)

        # A missing/empty pagination object or cursor means there is no next page.
        cursor = (payload.get("pagination") or {}).get("cursor")
        if not cursor:
            break

    if not pages:
        return pd.DataFrame(columns=["id", "name", "box_art_url", "TimeStamp_Pulled"])

    # Combine all pages at once; the same game can appear on two pages.
    return pd.concat(pages).drop_duplicates("id", keep="first").reset_index(drop=True)

In [3]:
# A function to retrieve all users broadcasting a specific game ID

# Needs credentials header, and a singular game ID 
# or multiple game IDs in the string format of "12345&game_id=12346&game_id=..."
def _fetch_streams_page(client_headers_dict, game_ids, cursor=None):
    """Fetch one page of the Helix "streams" endpoint and return the decoded JSON.

    Args:
        client_headers_dict: Request headers carrying the Twitch credentials.
        game_ids: Game id string appended verbatim after ``game_id=``; may
            already contain further ``&game_id=...`` pairs.
        cursor: Pagination cursor of the page to fetch. ``None`` or ``""``
            requests the first page.

    Returns:
        The response body decoded from JSON.

    Raises:
        Exception: If connection errors or timeouts are still occurring more
            than ``connection_timeout`` seconds after the first attempt.
        requests.exceptions.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # The remaining parameters are appended (and URL-encoded) by requests.
    query = {"first": 100}
    if cursor:
        query["after"] = cursor

    started = time.time()
    while True:
        try:
            response = requests.get(
                "https://api.twitch.tv/helix/streams?game_id=" + game_ids,
                headers=client_headers_dict,
                params=query,
                # Without a timeout a stalled connection would block forever.
                timeout=connection_timeout,
            )
            break
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as err:
            # Retry once per second until the retry window has elapsed.
            if time.time() > started + connection_timeout:
                raise Exception(
                    'Unable to get updates after {} seconds of ConnectionErrors'.format(connection_timeout)
                ) from err
            time.sleep(1)

    # Report authentication / rate-limit failures here rather than as a
    # confusing KeyError on a missing "data" field later on.
    response.raise_for_status()
    return response.json()


def get_all_game_broadcasters(client_headers_dict, game_ids):
    """Return all users currently broadcasting the given game id(s).

    Args:
        client_headers_dict: Request headers carrying the Twitch credentials.
        game_ids: A single game id (``int`` or ``str``), or several ids in the
            string format ``"12345&game_id=12346&game_id=..."``.

    Returns:
        A DataFrame with one row per stream and a ``TimeStamp_Pulled`` column
        recording when each page was requested. If nobody is broadcasting the
        given id(s), an empty DataFrame with the same column names is returned.

    Raises:
        TypeError: If ``game_ids`` is neither an ``int`` nor a ``str``.
        Exception: If a page cannot be fetched because of persistent
            connection errors (see ``connection_timeout``).
        requests.exceptions.HTTPError: If the API answers with an error status.
    """
    # Reject unsupported types up front; previously they fell through to an
    # UnboundLocalError at the return statement. bool is excluded explicitly
    # because it is a subclass of int.
    if isinstance(game_ids, bool) or not isinstance(game_ids, (int, str)):
        raise TypeError(
            "game_ids must be an int or a str, not {}".format(type(game_ids).__name__)
        )

    # Casts any integer game id into a string for the url parameter.
    game_ids = str(game_ids)

    # Collect the pages in a list and concatenate once at the end;
    # calling pd.concat() inside the loop is very slow.
    pages = []
    cursor = None

    while True:
        # Timestamp of the request, stored with every row of this page.
        pulled_at = pd.Timestamp(datetime.datetime.now())
        payload = _fetch_streams_page(client_headers_dict, game_ids, cursor)

        page = json_normalize(payload.get("data") or [])

        # The API does not always return the full 100 items requested, so an
        # empty page (not a short one) is what marks the end of the data.
        if page.empty:
            break

        page["TimeStamp_Pulled"] = pulled_at
        pages.append(page)

        # A missing/empty pagination object or cursor means there is no next page.
        cursor = (payload.get("pagination") or {}).get("cursor")
        if not cursor:
            break

    # Nobody is broadcasting the given game_id: return an empty DataFrame with
    # the same column names as a populated result (including TimeStamp_Pulled).
    if not pages:
        return pd.DataFrame(columns=["id", "user_id", "user_login", "user_name", "game_id", "game_name", "type", "title",
                                     "viewer_count", "started_at", "language", "thumbnail_url", "tag_ids", "is_mature",
                                     "TimeStamp_Pulled"])

    # Combine all of the pages at once.
    return pd.concat(pages).reset_index(drop=True)

In [4]:
# Function to retrieve all broadcasters from list or Series of game IDs

# Credentials header and list/Series of game IDs are required
def get_all_broadcasters(client_headers_dict, all_games_id_col):
    """Return all broadcasters for a list or Series of game ids.

    The ids are split into batches of 100, each batch is joined into the
    ``"12345&game_id=12346&game_id=..."`` string format, and
    ``get_all_game_broadcasters()`` is called once per batch.

    Args:
        client_headers_dict: Request headers carrying the Twitch credentials.
        all_games_id_col: Iterable (list, pandas Series, ...) of game ids.
            Ids are converted with ``str()``, so integers are accepted too.

    Returns:
        A DataFrame with one row per ``user_id`` (duplicates are dropped,
        keeping the first occurrence). If ``all_games_id_col`` is empty, an
        empty DataFrame with the stream column names is returned and no
        request is made.
    """
    # Materialise the ids as a plain list of strings so that slicing is
    # positional regardless of the input container or its index.
    game_ids = [str(game_id) for game_id in all_games_id_col]

    # Nothing to query: avoid sending a request with an empty game_id.
    if not game_ids:
        return pd.DataFrame(columns=["id", "user_id", "user_login", "user_name", "game_id", "game_name", "type", "title",
                                     "viewer_count", "started_at", "language", "thumbnail_url", "tag_ids", "is_mature",
                                     "TimeStamp_Pulled"])

    # Number of game ids sent per request (presumably the API's per-request cap - confirm).
    batch_size = 100

    # One DataFrame per batch of ids; concatenated once at the end.
    df_list = [
        get_all_game_broadcasters(
            client_headers_dict,
            "&game_id=".join(game_ids[start:start + batch_size]),
        )
        for start in range(0, len(game_ids), batch_size)
    ]

    # Combine all of the DataFrames from the get_all_game_broadcasters() calls.
    df = pd.concat(df_list).drop_duplicates("user_id", keep="first").reset_index(drop=True)

    # All of the broadcasters found at the time of the function call.
    return df

<font size=6>Establishing Token and Requesting Parameters</font>

In [5]:
# The client_path file is a two-line .txt file to hold my Twitch CLI App's credentials for API access.
# The first line in the file is the client ID, and the second line is the client secret.

client_path = <"path of client credentials .txt file">

file = open(client_path, "r")

lines = []
for line in file:
    lines.append(line.strip("\n"))

file.close()

# <insert your client id here>
client_id=lines[0]

# <insert your client secret here>
client_secret=lines[1]

In [6]:
# access_code_ttv sends your client credentials to obtain an access token.
# The token is stored into access_token_ttv, and is fed to the headers dictionary for the later GET queries.

# Passing the values through params= produces the same query string as before,
# but lets requests URL-encode them. The timeout keeps a stalled connection from blocking forever.
access_code_ttv = requests.post(
    "https://id.twitch.tv/oauth2/token",
    params={
        "client_id": client_id,
        "client_secret": client_secret,
        "grant_type": "client_credentials",
    },
    timeout=connection_timeout,
)

# Stop here on rejected credentials instead of failing with a KeyError on 'access_token' below.
access_code_ttv.raise_for_status()

access_token_json_ttv = access_code_ttv.json()
access_token_ttv = access_token_json_ttv['access_token']

headers = {'Client-ID' : client_id,
           'Authorization' : 'Bearer '+access_token_ttv}

<font size=6>Grabbing all games currently being streamed</font>

<font size=6>Repentance ID is 491080</font>

In [7]:
# Obtains all games currently being broadcast, in order of content with the most viewers to the least viewers.
# `headers` is the credentials dictionary built from the access token.

all_games = get_all_games(headers)
# Preview the first rows of the result as the cell output.
all_games.head()

Unnamed: 0,id,name,box_art_url,TimeStamp_Pulled
0,509658,Just Chatting,https://static-cdn.jtvnw.net/ttv-boxart/509658...,2022-04-06 14:54:24.573631
1,32982,Grand Theft Auto V,https://static-cdn.jtvnw.net/ttv-boxart/32982_...,2022-04-06 14:54:24.573631
2,32399,Counter-Strike: Global Offensive,https://static-cdn.jtvnw.net/ttv-boxart/32399_...,2022-04-06 14:54:24.573631
3,21779,League of Legends,https://static-cdn.jtvnw.net/ttv-boxart/21779-...,2022-04-06 14:54:24.573631
4,33214,Fortnite,https://static-cdn.jtvnw.net/ttv-boxart/33214-...,2022-04-06 14:54:24.573631


In [8]:
# Dimensions of the games DataFrame as (rows, columns).
all_games.shape

(3240, 4)

<font size=6>Grabbing all current broadcasters for my game of interest--The Binding of Isaac: Repentance</font>

<font size=6>Get all current streamers in total, regardless of game</font>

In [9]:
# Obtains all users broadcasting at the time the code was executed.
# The "id" column of all_games supplies the game IDs to query.

all_streamers = get_all_broadcasters(headers, all_games["id"])  

In [10]:
# Preview the first rows of the broadcasters DataFrame as the cell output.
all_streamers.head()

Unnamed: 0,id,user_id,user_login,user_name,game_id,game_name,type,title,viewer_count,started_at,language,thumbnail_url,tag_ids,is_mature,TimeStamp_Pulled
0,46089807629,459331509,auronplay,auronplay,32982,Grand Theft Auto V,live,Carreritas con Ibai (amigo de messi) y Rubius ...,147281.0,2022-04-06T15:41:38Z,es,https://static-cdn.jtvnw.net/previews-ttv/live...,[d4bb9c58-2141-4881-bcdc-3fe0505457d1],False,2022-04-06 14:54:33.250422
1,46088594701,31239503,esl_csgo,ESL_CSGO,32399,Counter-Strike: Global Offensive,live,LIVE: Heroic vs Team Liquid - ESL Pro League S...,94750.0,2022-04-06T12:31:58Z,en,https://static-cdn.jtvnw.net/previews-ttv/live...,"[36a89a80-4fcd-4b74-b3d2-2c6fd9b30c95, 6ea6bca...",False,2022-04-06 14:54:33.250422
2,46086542845,71092938,xqcow,xQcOW,509658,Just Chatting,live,🟥ALERT🟥CLICK HERE IMMEDIATELY🟥THIS IS AN IMPOR...,90588.0,2022-04-06T04:28:26Z,en,https://static-cdn.jtvnw.net/previews-ttv/live...,"[6ea6bca4-4712-4ab9-a906-e3336a9d8039, c2839af...",False,2022-04-06 14:54:33.250422
3,46089484189,83232866,ibai,ibai,32982,Grand Theft Auto V,live,"CARRERAS DE GTA CON IBAI (MADRE), RUBIUS (PADR...",62981.0,2022-04-06T14:53:35Z,es,https://static-cdn.jtvnw.net/previews-ttv/live...,[d4bb9c58-2141-4881-bcdc-3fe0505457d1],False,2022-04-06 14:54:33.250422
4,46090057389,26261471,asmongold,Asmongold,509658,Just Chatting,live,BIG DAY--TIER LISTS FINALLY--LOST ARK ABYSS HA...,50604.0,2022-04-06T16:17:16Z,en,https://static-cdn.jtvnw.net/previews-ttv/live...,[6ea6bca4-4712-4ab9-a906-e3336a9d8039],False,2022-04-06 14:54:33.250422


In [11]:
# Dimensions of the broadcasters DataFrame as (rows, columns).
all_streamers.shape

(97011, 15)

In [20]:
# Storage path is the file path in your directory where you want to store the dataframe.
# If the file exists in the directory already, then the dataframe is appended to the existing .csv.
# If the file does not exist, then it is created.

storage_path = <"path to store .csv">
if path.exists(storage_path):
    all_streamers.to_csv(storage_path, mode='a', index=False, header=0)
else:
    all_streamers.to_csv(storage_path, index=False)