# Data Acquisition

In [1]:
from datetime import datetime
import time
import opendota
import pandas as pd
import numpy as np
import json
import os

In [5]:
# Render floats in DataFrame output as fixed-point numbers with no decimal places
pd.set_option('display.float_format', lambda value: f'{value:.0f}')

In [135]:
def get_match_list(datetime_start_unix, dateime_end_unix):
    '''Query the OpenDota "public_matches" table for matches between two Unix timestamps.

    The window is exclusive on both ends (start_time > start and start_time < end).
    Results are fetched in pages because the OpenDota explorer limits how much a
    single query can return. Nothing is written to disk; the caller saves the result.

    Args:
        datetime_start_unix: Lower bound of the window (Unix seconds, exclusive).
        dateime_end_unix: Upper bound of the window (Unix seconds, exclusive).
            The misspelt name is kept so keyword callers keep working.

    Returns:
        A DataFrame with one row per match (all columns of public_matches), or an
        empty DataFrame when no match falls inside the window.
    '''
    client = opendota.OpenDota()

    chunk_size = 500000
    # Every page shares this deterministic ordering. Without ORDER BY the database
    # may return rows in any order, so the last row of a page is not a safe cursor.
    order_and_limit = f'ORDER BY start_time, match_id LIMIT {chunk_size}'
    lower_bound = f'start_time > {datetime_start_unix}'

    chunks = []
    total_rows = 0
    loop = 0

    while True:
        loop += 1
        output_cur = client.explorer(
            f'SELECT * FROM public_matches WHERE {lower_bound} '
            f'AND start_time < {dateime_end_unix} {order_and_limit}'
        )

        # An empty page means the window is exhausted (or was empty to begin with).
        if not output_cur:
            break

        chunks.append(pd.DataFrame(output_cur))
        total_rows += len(output_cur)
        print(f'Loop {loop} - Size {total_rows}')

        # A short page is the last page.
        if len(output_cur) < chunk_size:
            break

        # Keyset cursor on (start_time, match_id): matches that share the boundary
        # second with the last row are neither skipped nor fetched twice.
        last = output_cur[-1]
        lower_bound = f"(start_time, match_id) > ({last['start_time']}, {last['match_id']})"

    if not chunks:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the growing frame every loop.
    return pd.concat(chunks, axis=0)

In [7]:
# Build the match list for one time window and save it.
#
# Window with a fixed hero pool (~4 months):
#   Marci added 28/10/2021, so start at 29/10/2021 = 1635462000
#   Primal Beast added 23/02/2022, so end at 22/02/2022 = 1645488000
#
# Window after Primal Beast was added:
#   Primal Beast added 23/02/2022, so start at 24/02/2022 = 1645660800
#   Up to 2 months after, so end at 24/04/2022 = 1650758400

# NOTE: the first few runs may fail when the OpenDota API does not respond - rerun until it works
df_matches = get_match_list(datetime_start_unix=1635462000, dateime_end_unix=1645488000)
df_matches.to_csv('matches.csv', index=False)

Loop 1 - Size 500000
Loop 2 - Size 1000000
Loop 3 - Size 1500000
Loop 4 - Size 2000000
Loop 5 - Size 2500000
Loop 6 - Size 3000000
Loop 7 - Size 3500000
Loop 8 - Size 4000000
Loop 9 - Size 4500000
Loop 10 - Size 5000000
Loop 11 - Size 5500000
Loop 12 - Size 5600752


In [9]:
def get_match_picks(match_ids):
    """Query OpenDota for the hero picks of the given matches, one output row per match.

    Match ids are sent in batches of `query_size` per query. Picks are grouped by
    match id within each batch, so rows of one match do not need to be adjacent in
    the query result.

    Args:
        match_ids: Iterable of match ids (anything `str()` turns into the numeric id).

    Returns:
        A float DataFrame with len(match_ids) rows and the columns match_id,
        hero0_pick..hero9_pick, hero0_slot..hero9_slot. Rows are filled in the order
        matches first appear in the query results. Matches for which the query
        returned nothing leave all-zero rows at the end of the frame.
    """
    # Multiple match_ids per query reduce the number of queries and speed up
    # extraction (OpenDota only returns around 1000 at a time).
    query_size = 1000
    max_picks = 10  # picks stored per match: hero0..hero9

    columns = (
        ['match_id']
        + [f'hero{i}_pick' for i in range(max_picks)]
        + [f'hero{i}_slot' for i in range(max_picks)]
    )
    match_id_strs = [str(match_id) for match_id in match_ids]
    num_matches = len(match_id_strs)

    # Fill a plain array and wrap it once; per-cell DataFrame writes are very slow.
    data = np.zeros([num_matches, len(columns)])

    # Nothing to query: return the empty frame without creating an API client.
    if num_matches == 0:
        return pd.DataFrame(data=data, columns=columns)

    client = opendota.OpenDota()
    row = -1  # index of the last output row that has been filled

    # Slice fixed-size batches, so the final partial batch is always queried
    # even when earlier matches returned no picks.
    for batch_start in range(0, num_matches, query_size):
        batch = match_id_strs[batch_start:batch_start + query_size]
        match_ids_current_str = ','.join(batch)  # comma-separated ids for the IN (...) clause
        query = f'SELECT * FROM public_player_matches WHERE match_id IN ({match_ids_current_str})'
        picks = client.explorer(query)

        # Group the picks per match, keeping first-seen order
        # (there *should* be 10 picks per match id).
        picks_by_match = {}
        for pick in picks:
            picks_by_match.setdefault(int(pick['match_id']), []).append(pick)

        for match_id_cur, match_picks in picks_by_match.items():
            row += 1
            data[row, 0] = match_id_cur
            # Picks beyond the tenth have no column to go into and are skipped.
            for hero, pick in enumerate(match_picks[:max_picks]):
                hero_id = pick['hero_id']
                player_slot = pick['player_slot']
                # A missing (None) value is stored as NaN instead of raising.
                data[row, 1 + hero] = np.nan if hero_id is None else hero_id
                data[row, 1 + max_picks + hero] = np.nan if player_slot is None else player_slot

        print(f'{row+1} matches picks extracted')

    return pd.DataFrame(data=data, columns=columns)

In [None]:
# Query debugger
# out = client.explorer("SELECT * FROM public_player_matches WHERE match_id IN (6246229802,6246229803,6246229805,6246229904,6246229807,6246230009,6246229915,6246229809,6246229905,6246229907)")
# client = opendota.OpenDota() #6447015200
# out = client.explorer("SELECT * FROM public_player_matches WHERE match_id IN (6246229802)")
# print(out)

In [45]:
# Load the previously saved match list
df_matches = pd.read_csv(filepath_or_buffer='../data/matches_2.csv')

In [None]:
# Extract and save matches 5000 at a time
num_matches = len(df_matches)
save_every = 5000

# Starts at row 45000 - presumably resuming after earlier chunks were already saved; confirm before rerunning
starts = np.arange(45000, num_matches, save_every)

for start in starts:
    print(f'Start: {start}')
    end = min(start+save_every, num_matches)
    # Retry until the chunk is extracted; the OpenDota API intermittently fails to respond
    while True:
        try:
            df_picks = get_match_picks(df_matches['match_id'][start:end].values)
        except Exception as err:
            # Catch Exception (not a bare except) so Ctrl+C still stops the loop,
            # and report the error so a permanent failure is visible instead of a silent spin
            print(f'Retrying matches {start}-{end-1} after error: {err!r}')
            time.sleep(1)  # brief pause before hitting the API again
            continue
        break

    df_picks.to_csv(f'picks/picks_{start}_{end-1}.csv', index=False)

In [None]:
# Combine picks csvs
dir_picks = '../data/picks/' # picks directory with 5000 matches per csv

# Not used below in this cell; kept in case other cells rely on these names
columns = ['match_id'] + [f'hero{i}_pick' for i in range(0,10)] + [f'hero{i}_slot' for i in range(0,10)]
num_matches = len(df_matches)

# Read every csv in a stable (sorted) order; skip anything that is not a csv,
# such as checkpoint folders, which would make read_csv fail
picks_frames = []
for filename in sorted(os.listdir(dir_picks)):
    if not filename.endswith('.csv'):
        continue
    print(filename)
    picks_frames.append(pd.read_csv(dir_picks + filename))

# Concatenate once; concatenating inside the loop re-copies the growing frame every time
df_picks = pd.concat(picks_frames, axis=0) if picks_frames else pd.DataFrame()
df_picks = df_picks.reset_index(drop=True)

In [61]:
# Save the combined picks table as a single csv
df_picks.to_csv(path_or_buf='../data/picks.csv', index=False)

In [63]:
# Join the picks onto the match list (left join keeps every match) and save the result
df_matches = pd.read_csv(filepath_or_buffer='../data/matches.csv')
df_picks = pd.read_csv(filepath_or_buffer='../data/picks.csv')
df_combined = pd.merge(df_matches, df_picks, how='left', on='match_id')
df_combined.to_csv(path_or_buf='../data/combined.csv', index=False)