# Scraping Histories of Halite Episodes

Kaggle Limits (https://www.kaggle.com/c/halite/discussion/164932)
* Maximum of 1000 requests per day; the rate limit is shared between the ListEpisodes and GetEpisodeReplay endpoints
* Exceeding limits repeatedly will lead to temporary and then permanent bans

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
from zipfile import ZipFile
import datetime
import time

In [2]:
# --- scraping parameters ---
SCRAPED_DIR = '../kaggle_games/'   # replays and the pickled tables are stored here
BUFFER = 1                         # starting value of the request counter used for throttling
NUM_TEAMS = 5                      # EPISODES is split evenly across this many teams
EPISODES = 5                       # number of new replays to download per run
MIN_FINAL_RATING = 1100            # only keep episodes whose submission's latest score exceeds this

# --- Kaggle EpisodeService endpoints ---
base_url = "https://www.kaggle.com/requests/EpisodeService/"
list_url = "{}ListEpisodes".format(base_url)
get_url = "{}GetEpisodeReplay".format(base_url)

# --- previously saved tables (file names are relative to SCRAPED_DIR) ---
USE_SAVED = True
teams_df_file, episodes_df_file, agents_df_file = 'teams_df.pkl', 'episodes_df.pkl', 'agents_df.pkl'

## Define Functions
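* getTeamEpisodes: requests the list of episodes for a team and returns the raw "rj" json response together with a per-episode dataframe
* saveEpisode: requests the replay of an individual episode and saves it as a json file
* updateTeams: takes the "rj" json response from getTeamEpisodes and adds teams that are not yet in teams_df
* updateEpisodes: takes the per-episode dataframe from getTeamEpisodes, refreshes the latest scores of known episodes in episodes_df and appends new episodes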

In [3]:
def getTeamEpisodes(team_id):
    """Request the episode list of one team and build a per-episode table.

    Posts ``{"teamId": int(team_id)}`` to ``list_url`` and turns
    ``rj['result']['episodes']`` into a DataFrame with one row per episode.

    Columns added here (all pre-filled with -1 as a "not set" placeholder):
      * tmpSubmissionId / tmpLatestMu: the team's own submission in the
        episode and the score of that submission in its most recent episode
      * startTime, matchLength: taken from createTime / endTime seconds
      * avg_score: mean of the non-None ``updatedScore`` values
      * agent{k}, episodeScore{k}, agentTeam{k}, agentCreation{k} for the
        four player slots k = 0..3
      * agentLatestMu{k}: tmpLatestMu, but only in the slot occupied by the
        team's own submission; the other slots keep -1

    Returns ``(rj, team_episodes)``: the decoded json response and the
    DataFrame, indexed by episode ``id`` (the column is kept) and sorted by
    ``avg_score`` in descending order.
    """

    # request
    # NOTE(review): no timeout and no HTTP status check on this call.
    r = requests.post(list_url, json = {"teamId":  int(team_id)})
    rj = r.json()

    # make df and initialize
    team_episodes = pd.DataFrame(rj['result']['episodes'])
    
    # -1 marks "not filled in"; the placeholder survives wherever the loops
    # below do not write a value.
    team_episodes['tmpSubmissionId'] = -1;
    team_episodes['tmpSubmissionMu'] = -1;
    team_episodes['tmpLatestMu'] = -1;
    team_episodes['startTime'] = -1;
    team_episodes['matchLength'] = -1;
    team_episodes['avg_score'] = -1;
    team_episodes['agent0'] = -1;
    team_episodes['agent1'] = -1;
    team_episodes['agent2'] = -1;
    team_episodes['agent3'] = -1;
    team_episodes['episodeScore0'] = -1;
    team_episodes['episodeScore1'] = -1;
    team_episodes['episodeScore2'] = -1;
    team_episodes['episodeScore3'] = -1;
    team_episodes['agentTeam0'] = -1;
    team_episodes['agentTeam1'] = -1;
    team_episodes['agentTeam2'] = -1;
    team_episodes['agentTeam3'] = -1;
    team_episodes['agentCreation0'] = -1;
    team_episodes['agentCreation1'] = -1;
    team_episodes['agentCreation2'] = -1;
    team_episodes['agentCreation3'] = -1;
    team_episodes['agentLatestMu0'] = -1;
    team_episodes['agentLatestMu1'] = -1;
    team_episodes['agentLatestMu2'] = -1;
    team_episodes['agentLatestMu3'] = -1;
    
    # fill in information for each episode
    # The .loc[i, ...] writes rely on the freshly built frame having the
    # default 0..n-1 index, so label i and position i coincide.
    for i in range(len(team_episodes)):
        agents = team_episodes['agents'].iloc[i]
        agent_scores = [a['updatedScore'] for a in agents if a['updatedScore'] is not None]
        
        # The team's own agent in this episode: first agent whose submission
        # belongs to team_id.
        # NOTE(review): raises IndexError if no agent matches team_id.
        team_episodes.loc[i, 'tmpSubmissionId'] = [a['submissionId'] for a in agents if a['submission']['teamId'] == team_id][0]
        team_episodes.loc[i, 'tmpSubmissionMu'] = [a['updatedScore'] for a in agents if a['submission']['teamId'] == team_id][0]
        
        team_episodes.loc[i, 'startTime'] = team_episodes.loc[i, 'createTime']['seconds']
        team_episodes.loc[i, 'matchLength'] = (team_episodes.loc[i, 'endTime']['seconds'] 
                                               - team_episodes.loc[i, 'createTime']['seconds'])    
        
        # Per-slot details are only recorded when at least one agent has a
        # score; otherwise the row keeps its -1 placeholders.
        # NOTE(review): indexes agents[0..3], i.e. assumes four agents per episode.
        if len(agent_scores) > 0:
            team_episodes.loc[i, 'avg_score'] = np.mean(agent_scores)
            for player in range(4):
                team_episodes.loc[i, 'agent'+str(player)] = agents[player]['submissionId']
                team_episodes.loc[i, 'episodeScore'+str(player)] = agents[player]['reward']
                team_episodes.loc[i, 'agentTeam'+str(player)] = agents[player]['submission']['teamId']
                team_episodes.loc[i, 'agentCreation'+str(player)] = agents[player]['submission']['dateSubmitted']['seconds']
                
    # calculate the latest score for each submission from the team
    # (the tmpSubmissionMu of the row with the greatest startTime, copied to
    # every row of that submission)
    for sub_id in team_episodes['tmpSubmissionId'].unique():
        sub_rows = team_episodes[ team_episodes['tmpSubmissionId'] == sub_id ]
        latest_start = max( [r for r in sub_rows['startTime']] )
        latest_score = sub_rows[ sub_rows['startTime'] == latest_start]['tmpSubmissionMu'].values[0]
        team_episodes.loc[sub_rows.index, 'tmpLatestMu'] = latest_score
        
    # Copy the latest score into the slot that holds the team's own
    # submission; slots of other teams keep -1.
    for player in range(4):
        agentidx = team_episodes['agent'+str(player)]==team_episodes['tmpSubmissionId']
        team_episodes.loc[agentidx,'agentLatestMu'+str(player)] = team_episodes.loc[agentidx]['tmpLatestMu']    
        
    # Drop the raw response columns that have been unpacked above.
    team_episodes.drop(columns=['competitionId','createTime','endTime','adminNotes','type','agents','tmpSubmissionMu'],inplace=True)
    team_episodes.sort_values('avg_score', ascending = False, inplace=True)
    team_episodes.set_index('id',drop=False,inplace=True)
    
    return rj, team_episodes

In [4]:
def saveEpisode(epid, scraped_dir):
    """Download the replay of one episode and save it as ``<epid>.json``.

    Parameters:
        epid: id of the episode; sent to the GetEpisodeReplay endpoint as
            ``int(epid)`` and used as the file name.
        scraped_dir: directory the replay file is written to.

    Raises:
        requests.HTTPError: when the endpoint answers with an error status.
        KeyError / ValueError: when the response carries no replay payload.
    """
    # request
    response = requests.post(get_url, json={"EpisodeId": int(epid)})
    response.raise_for_status()

    # Decode before opening the target file, so a failed download does not
    # leave an empty <epid>.json behind that would later be counted as an
    # already-downloaded episode.
    replay = response.json()['result']['replay']

    # save replay into the directory the caller asked for
    with open(os.path.join(scraped_dir, '{}.json'.format(epid)), 'w') as f:
        f.write(replay)

In [5]:
def updateTeams(teams_df, rj):
    """Add teams from a ListEpisodes response that are not yet in ``teams_df``.

    Parameters:
        teams_df: DataFrame of known teams, indexed by team id.
        rj: decoded json response; the teams are read from
            ``rj['result']['teams']``.

    Returns:
        A new DataFrame containing the previously known teams plus the new
        ones, sorted by ``publicLeaderboardRank``.  ``teams_df`` is returned
        unchanged when the response holds no teams or when its columns do
        not match those of ``teams_df`` (a message is printed in that case).
    """
    teams = rj['result']['teams']
    if not teams:
        # nothing to add; also avoids a KeyError from dropping columns of an empty frame
        return teams_df

    teams_df_new = pd.DataFrame(teams).drop(
        columns=['competitionId', 'teamLeaderId', 'isBenchmark',
                 'createdAfterDeadline', 'medal', 'dateMedalAwarded',
                 'privateLeaderboardSubmissionId', 'privateLeaderboardScore',
                 'privateLeaderboardRank', 'competition', 'teamMembers',
                 'inboundMergeRequests', 'outboundMergeRequests',
                 'publicLeaderboardScore'])
    teams_df_new['lastSubmissionDate'] = [submitdate['seconds']
                                          for submitdate in teams_df_new['lastSubmissionDate']]

    # De-duplicate BEFORE merging: a team listed more than once would
    # otherwise be multiplied by the label lookup on a duplicated index.
    # The row with the most recent submission date is the one that is kept.
    teams_df_new = (teams_df_new
                    .sort_values(by='lastSubmissionDate', ascending=False)
                    .drop_duplicates(subset='id', keep='first')
                    .set_index('id', drop=False))

    same_columns = (len(teams_df.columns) == len(teams_df_new.columns)
                    and (teams_df.columns == teams_df_new.columns).all())
    if not same_columns:
        print('teams dataframe did not match')
        return teams_df

    # update teams list to include teams that were not previously on the list
    unseen = teams_df_new[~teams_df_new.index.isin(teams_df.index)]
    teams_df = pd.concat((unseen, teams_df))
    teams_df.sort_values('publicLeaderboardRank', inplace=True)
    return teams_df

In [6]:
def updateEpisodes(episodes_df, team_episodes):
    """Merge one team's episode table into the global episode table.

    Parameters:
        episodes_df: DataFrame of all known episodes, indexed by episode id.
            Latest-score cells of already known episodes are updated in place.
        team_episodes: per-team DataFrame as returned by getTeamEpisodes,
            indexed by episode id.  It is not modified.

    Returns:
        ``episodes_df`` with the rows of previously unknown episodes
        appended, or ``episodes_df`` unchanged (after printing a message)
        when the columns of the two tables do not match.
    """
    # work on a copy without the helper columns instead of dropping them
    # from the caller's frame
    team_episodes = team_episodes.drop(columns=['tmpSubmissionId', 'tmpLatestMu'])

    same_columns = (len(episodes_df.columns) == len(team_episodes.columns)
                    and (episodes_df.columns == team_episodes.columns).all())
    if not same_columns:
        print('episodes dataframe did not match')
        return episodes_df

    latest_mu_columns = ['agentLatestMu' + str(player) for player in range(4)]
    known_ids = set(episodes_df['id'])

    # first update the latest score values for episodes that are already included;
    # only positive values are copied, so -1 placeholders never overwrite a known score
    for episode in known_ids & set(team_episodes['id']):
        for column in latest_mu_columns:
            latest = team_episodes.loc[episode, column]
            if latest > 0:
                episodes_df.loc[episode, column] = latest

    # next add new episodes
    new_rows = team_episodes[~team_episodes['id'].isin(episodes_df['id'])]
    return pd.concat((episodes_df, new_rows))

## List of teams

Starting with an arbitrary team ID, request the list of all its episodes and build the initial table of teams from the response.

In [7]:
# Load the team and episode tables from a previous run, or bootstrap them
# from the episode list of a single seed team.
if USE_SAVED:
    # NOTE: pickle files execute code on load; only read files written by this notebook.
    teams_df = pd.read_pickle(SCRAPED_DIR+teams_df_file)
    episodes_df = pd.read_pickle(SCRAPED_DIR+episodes_df_file)
else:
    rj, episodes_df = getTeamEpisodes(4820508)

    teams_df = pd.DataFrame(rj['result']['teams'])
    
    teams_df.drop(columns=['competitionId','teamLeaderId','isBenchmark',
                   'createdAfterDeadline','medal','dateMedalAwarded',
                   'privateLeaderboardSubmissionId','privateLeaderboardScore',
                   'privateLeaderboardRank','competition','teamMembers',
                   'inboundMergeRequests','outboundMergeRequests',
                   'publicLeaderboardScore'],inplace=True)
    episodes_df.drop(columns=['tmpSubmissionId','tmpLatestMu'],inplace=True)
    teams_df['lastSubmissionDate']=[submitdate['seconds'] for submitdate in teams_df['lastSubmissionDate']]
    # keep one row per team: the one with the most recent submission date
    teams_df.sort_values(by='lastSubmissionDate',ascending=False, inplace = True)
    teams_df.drop_duplicates(subset='id', keep = 'first', inplace = True)
    teams_df.sort_values('publicLeaderboardRank', inplace = True)
    # Index teams by id, like the frames built in updateTeams: both the
    # "already known" check there and the later teams_df.loc[<team id>]
    # lookups rely on the index holding team ids.
    teams_df.set_index('id',drop=False,inplace=True)
    episodes_df.set_index('id',drop=False,inplace=True)
    
    teams_df.to_pickle(SCRAPED_DIR+teams_df_file)
    episodes_df.to_pickle(SCRAPED_DIR+episodes_df_file)
        
teams_df

Unnamed: 0_level_0,id,teamName,submissionCount,lastSubmissionDate,publicLeaderboardSubmissionId,publicLeaderboardRank
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5118779,5118779,mzotkiew,13,1595620437,16647790,1
5119134,5119134,Fei Wang,88,1596002251,16662467,2
5133228,5133228,Robiland,37,1596160963,16680981,3
4714287,4714287,Tom Van de Wiele,2,1594416886,16466155,4
5129729,5129729,Plundering Planetary Privateers,71,1596134490,16673704,5
...,...,...,...,...,...,...
5264397,5264397,Hasnain Ajmal,1,1594748443,16518809,864
5150750,5150750,Catch-22,1,1592880270,16237898,865
5119877,5119877,Ali Raza,1,1592314783,16146015,866
5349490,5349490,ashish pokhriyal,1,1596450094,16767746,868


In [8]:
episodes_df

Unnamed: 0_level_0,id,replayUrl,state,startTime,matchLength,avg_score,agent0,agent1,agent2,agent3,...,agentTeam2,agentTeam3,agentCreation0,agentCreation1,agentCreation2,agentCreation3,agentLatestMu0,agentLatestMu1,agentLatestMu2,agentLatestMu3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1916961,1916961,gs://kaggle-episode-replays/1916961.json,completed,1596010057,248,1359.943794,16466155,16616549,16609926,16647790,...,5129729,5118779,1594416886,1595421396,1595380107,1595620437,1331.365781,1292.012720,1299.978873,1453.651135
1932401,1932401,gs://kaggle-episode-replays/1932401.json,completed,1596063510,193,1355.589201,16634895,16660601,16618728,16553920,...,5129729,4820508,1595538506,1595720195,1595431464,1594984376,1491.452829,1355.316715,1315.311467,1294.602509
1938092,1938092,gs://kaggle-episode-replays/1938092.json,completed,1596083311,266,1355.519852,16466155,16669467,16616549,16647790,...,4820508,5118779,1594416886,1595779057,1595421396,1595620437,-1.000000,1516.011699,1292.012720,-1.000000
1935753,1935753,gs://kaggle-episode-replays/1935753.json,completed,1596075032,183,1352.591563,16659844,16665075,16634895,16609926,...,5118779,5129729,1595710987,1595754906,1595538506,1595380107,1316.004119,1323.126279,1491.452829,1299.978873
1933654,1933654,gs://kaggle-episode-replays/1933654.json,completed,1596067832,261,1350.310780,16680981,16647790,16665075,16617759,...,5119134,4820508,1595857987,1595620437,1595754906,1595426949,-1.000000,1362.742470,-1.000000,1294.580957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1978484,1978484,gs://kaggle-episode-replays/1978484.json,completed,1596220165,45,371.783751,16150971,16155781,16443070,16180879,...,5124142,5132249,1592338393,1592373959,1594237381,1592523758,387.661996,-1.000000,-1.000000,-1.000000
1980156,1980156,gs://kaggle-episode-replays/1980156.json,completed,1596225983,85,361.127576,16261872,16516090,16150971,16576240,...,5116375,5137213,1593020962,1594731680,1592338393,1595151860,-1.000000,-1.000000,387.661996,-1.000000
2042207,2042207,gs://kaggle-episode-replays/2042207.json,completed,1596429940,219,360.275347,16752690,16179499,16430301,16150971,...,5201960,5116375,1596352868,1592512366,1594138912,1592338393,-1.000000,-1.000000,-1.000000,387.661996
2066996,2066996,gs://kaggle-episode-replays/2066996.json,completed,1596509220,82,356.484884,16210069,16609636,16240152,16150971,...,5153207,5116375,1592707788,1595377708,1592895690,1592338393,-1.000000,-1.000000,-1.000000,387.661996


## Organize existing list of games in directory structure

In [9]:
# Collect the names of all files below SCRAPED_DIR (sub-directories included).
all_files = []
for root, dirs, files in os.walk(SCRAPED_DIR, topdown=False):
    all_files += files

# A file counts as a downloaded replay when the part before the first dot is
# all digits (the episode id) and the part right after it is 'json'.
seen_episodes = []
for file_name in all_files:
    stem, dot, remainder = file_name.partition('.')
    if dot and stem.isdigit() and remainder.split('.')[0] == 'json':
        seen_episodes.append(int(stem))

print('{} games in existing library'.format(len(seen_episodes)))

21678 games in existing library


## Download new episodes 

In [10]:
# Initialize variables
pulled_teams = {}
pulled_episodes = []
start_time = datetime.datetime.now()
r = BUFFER;

In [11]:
# scrape data using the parameters
# Scrape replays team by team (in teams_df order) until EPISODES new replays
# have been saved or every known team has been tried.
while len(pulled_episodes) < EPISODES:
    # pull another team: the first one that has not been tried yet
    top_teams = [i for i in teams_df.id if i not in pulled_teams]
    if not top_teams:
        break
    team_id = top_teams[0]

    # get team data (one ListEpisodes request, counted before it is sent)
    r += 1
    team_json, team_episodes_df = getTeamEpisodes(team_id)
    teams_df = updateTeams(teams_df, team_json)
    episodes_df = updateEpisodes(episodes_df, team_episodes_df.copy())
    print('{} games for {}'.format(len(team_episodes_df), teams_df.loc[teams_df.id == team_id].iloc[0].teamName))

    team_episodes_df = team_episodes_df[team_episodes_df.tmpLatestMu > MIN_FINAL_RATING]
    print('   {} in score range from {} submissions'.format(len(team_episodes_df), len(team_episodes_df.tmpSubmissionId.unique() ) ) )

    team_episodes_df = team_episodes_df[~team_episodes_df.id.isin(pulled_episodes + seen_episodes)]
    print('      {} remain to be downloaded\n'.format(len(team_episodes_df)))

    # pull games: an even share per team, capped by what is still missing overall
    target_team_games = min(int(np.ceil(EPISODES / NUM_TEAMS)),
                            EPISODES - len(pulled_episodes))

    pulled_teams[team_id] = 0

    for epid in team_episodes_df.id:
        if pulled_teams[team_id] >= target_team_games:
            break
        if epid in pulled_episodes or epid in seen_episodes:
            continue

        # Count the request before sending it, so that failed requests are
        # also held against the rate limit.
        r += 1
        try:
            saveEpisode(epid, SCRAPED_DIR)
        except Exception as err:
            # best effort: report the failure and move on to the next episode
            # (KeyboardInterrupt is deliberately not caught)
            print('  file {}.json could not be retrieved: {!r}'.format(epid, err))
        else:
            pulled_episodes.append(epid)
            pulled_teams[team_id] += 1
            try:
                size = os.path.getsize(SCRAPED_DIR + '{}.json'.format(epid)) / 1e6
                print('Saved Episode #{} @ {:.1f}MB'.format(epid, size))
            except OSError:
                print('  file {}.json did not seem to save'.format(epid))

        # Throttle to at most one request per elapsed second on average.
        # total_seconds() is used because timedelta.seconds wraps around after
        # one day; the value is read once so the sleep can never be negative.
        elapsed = (datetime.datetime.now() - start_time).total_seconds()
        if r > elapsed:
            time.sleep(r - elapsed)

    print(); print()

4640 games for mzotkiew
   1631 in score range from 6 submissions
      369 remain to be downloaded

Saved Episode #2070946 @ 2.1MB


25823 games for Fei Wang
   9689 in score range from 45 submissions
      5689 remain to be downloaded

Saved Episode #2070956 @ 2.0MB


13100 games for Robiland
   6519 in score range from 21 submissions
      2448 remain to be downloaded

Saved Episode #2070300 @ 1.9MB


791 games for Tom Van de Wiele
   312 in score range from 1 submissions
      1 remain to be downloaded

Saved Episode #2071052 @ 1.9MB


25015 games for Plundering Planetary Privateers
   6514 in score range from 33 submissions
      1940 remain to be downloaded

Saved Episode #2070965 @ 1.9MB




## Organize Dataframe with Agents


In [12]:
# make a list of all episodes
agents_df = pd.DataFrame(data=episodes_df['agent0'].values, index=None, columns=['agent'])
agents_df['teamId'] = episodes_df['agentTeam0'].values
agents_df['dateSubmitted'] = episodes_df['agentCreation0'].values
agents_df['LatestMu'] = episodes_df['agentLatestMu0'].values
agents_df.drop_duplicates(subset='agent',inplace=True)

for player in range(1,4):
    player_agents_df = pd.DataFrame(data=episodes_df['agent'+str(player)].values, index=None, columns=['agent'])
    player_agents_df['teamId'] = episodes_df['agentTeam'+str(player)].values
    player_agents_df['dateSubmitted'] = episodes_df['agentCreation'+str(player)].values
    player_agents_df['LatestMu'] = episodes_df['agentLatestMu'+str(player)].values
    agents_df.append(player_agents_df)
    agents_df.drop_duplicates(subset='agent',inplace=True)

agents_df.sort_values('LatestMu',ascending=False, inplace=True)
agents_df['teamName'] = [teams_df.loc[id]['teamName'] if id in teams_df['id'].values else '' for id in agents_df.teamId.values ]
agents_df['dateString'] = [datetime.datetime.fromtimestamp(secdate).strftime("%A, %B %d, %Y %I:%M:%S") for secdate in agents_df['dateSubmitted'].values]
agents_df.index = agents_df['agent'].values

In [13]:
# Keep one row per team (the one with the most recent submission date) and
# restore the leaderboard order.
teams_df.sort_values(by='lastSubmissionDate', ascending=False, inplace=True)
teams_df.drop_duplicates(subset='id', keep='first', inplace=True)
teams_df.sort_values('publicLeaderboardRank', inplace=True)

# Persist the three tables next to the replays.
for table, table_file in ((teams_df, teams_df_file),
                          (episodes_df, episodes_df_file),
                          (agents_df, agents_df_file)):
    table.to_pickle(SCRAPED_DIR + table_file)

In [14]:
# Write the agents table to SCRAPED_DIR (repeats the last call of the previous cell).
agents_df.to_pickle(SCRAPED_DIR+agents_df_file)