# Data Acquisition

In [3]:
from datetime import datetime
import time
import opendota
import pandas as pd
import numpy as np
import json
import os

In [4]:
def get_match_list(datetime_start_unix, dateime_end_unix):
  '''Queries the OpenDota "public_matches" table between two datetimes using PostgreSQL and saves a csv called matches_yyyymmdd_hhmmss.csv'''
  client = opendota.OpenDota()

  result_size = 500000
  start_time = datetime_start_unix
  i=0

  # Divides the requests into 500000 chunks, due to size limitations when querying the OpenDota API
  while result_size == 500000:
    i=i+1
    output_cur = client.explorer(f'SELECT * FROM public_matches where start_time > {start_time} AND start_time < {dateime_end_unix} LIMIT 500000')
    start_time = output_cur[-1]['start_time']
    result_size = len(output_cur)
    df_output_cur = pd.DataFrame(output_cur)
    
    if i == 1:
      df_output = df_output_cur
    else:
      df_output = pd.concat([df_output, df_output_cur], axis=0)

    print('Loop ' + str(i) +' - Size ' + str(len(df_output))) 
  
  return df_output

In [7]:
# Generate match list
# Dates for fixed hero time:
# Marci added 28/10/2021, so 29/10/2021 = 1635462000
# Primal Beast added 23/02/2022, so 22/02/2022 = 1645488000
# Above time window is ~4 months

# Dates for Primal Beast added
# Primal Beast added 23/02/2022, so 24/02/2022 = 1645660800
# Up to 2 months after, so 24/04/2022 = 1650758400

### Sometimes the first few runs do not work due to no response for the OpenDota API - keep trying and it will eventually work
df_match_list = get_match_list(1645660800,1650758400)
now = datetime.now()
df_match_list.to_csv(f'match_list_{now.year:04d}{now.month:02d}{now.day:02d}_{now.hour:02d}{now.minute:02d}{now.second:02d}.csv')

Loop 1 - Size 500000
Loop 2 - Size 1000000
Loop 3 - Size 1500000
Loop 4 - Size 2000000
Loop 5 - Size 2500000
Loop 6 - Size 3000000
Loop 7 - Size 3500000
Loop 8 - Size 4000000
Loop 9 - Size 4500000
Loop 10 - Size 5000000
Loop 11 - Size 5500000
Loop 12 - Size 5600752


In [5]:
def download_match_heroes(match_ids):
    '''Uses the PyOpenDota function "get_match" to return match information about each "match_id" in iterable argument "matches"
    Saves heroes picked/banned for each match_id in a json file called match-id.json'''
    client = opendota.OpenDota()
    total = len(match_ids)
    count = 0
    for match_id in match_ids:
        count += 1
        print(str(count) + '/' + str(total))
        try:
            match = client.get_match(match_id)
            matchjson = json.dumps(match)
            f = open(f'match_jsons\\{match_id}.json','w')
            f.write(matchjson)
            f.close()
        except:
            pass


In [None]:
# Fetch selected heroes for each match_id
matches = pd.read_csv('match_list_.csv') # replace argument with match_list .csv file
download_match_heroes(matches['match_id'])

In [6]:
def check_hero_picks(picks_bans):
    '''Function takes a list of pick+ban dictionaries and returns boolean with whether there were a total of 10 picks and 5 on each team
    True: 10 total picks, 5 team0 picks, 5 team1 picks, picks were in first ten records in list
    False: otherwise'''

    picks = [np.nan]*10
    valid = False
    picks_total = 0
    picks_team0 = 0
    picks_team1 = 0
    count=0
    first_ten = True
    for pickban in picks_bans:
        if pickban['is_pick']==True:
            picks_total+=1
            if count>9:
                first_ten = False
            if pickban['team']==0:
                if picks_team0<5:
                    picks[picks_team0] = pickban['hero_id']
                picks_team0+=1
            elif pickban['team']==1:
                if picks_team1<5:
                    picks[picks_team1+5] = pickban['hero_id']
                picks_team1+=1
        count+=1

    if (picks_total==10) & (picks_team0==5) & (picks_team1==5) & first_ten:
        valid = True
    
    output = {'valid': valid, 'picks': picks}

    return output

In [13]:
def compile_match_heroes():
    matches = pd.read_csv('match_list_.csv') # replace argument with match list csv
    num_matches = len(matches)
    dir_match_jsons = 'match_jsons/'
    # Results df
    df_heroes = pd.DataFrame(columns=['match_id','hero0','hero1','hero2','hero3','hero4','hero5','hero6','hero7','hero8','hero9','heroes_valid'])

    count = 0
    for idx, match in matches.iterrows():
        match_id = match['match_id']
        if match_id>6447015601:
            break
        # Try see if json for match exists
        try:
            picks_bans = json.load(open(f'{dir_match_jsons}{match_id}.json'))['picks_bans']
            picksbans_checked = check_hero_picks(picks_bans)
            df_heroes.loc[count,:] = [match_id] + picksbans_checked['picks'] + [picksbans_checked['valid']]
            
        # If no match json, include row of nans
        except:
            # print('exception')
            df_heroes.loc[count,:] = [match_id] + [np.nan]*10 + [False]
        count+=1
        print(f'{count}/{num_matches} - {match_id}')

    now = datetime.now()        
    df_heroes.to_csv(f'match_heroes_{now.year:04d}{now.month:02d}{now.day:02d}_{now.hour:02d}{now.minute:02d}{now.second:02d}.csv')

In [None]:
# Compile jsons into updated "matches" table
compile_match_heroes()

In [15]:
# Merge matches and heroes datasets
df_match_list = pd.read_csv('match_list_.csv') # replace argument with match list csv
df_match_heroes = pd.read_csv('match_heroes_.csv') # replace argument with match hero csv

df_match_merged = df_match_list.merge(df_match_heroes, on='match_id', how='left')
now = datetime.now()
df_match_merged.to_csv(f'match_merged_{now.year:04d}{now.month:02d}{now.day:02d}_{now.hour:02d}{now.minute:02d}{now.second:02d}.csv')