# Notebook for scraping players at different ELOs

This notebook outlines how to scrape high level games from different regions. It is broken into 3 parts:
1. Querying RIOT for information about summoners, and getting a list of interesting matches.
2. Querying RIOT for the information about those matches.
3. Saving (and potentially loading) those matches.

Load libraries

In [3]:
import requests, json
import numpy as np
from src import API_io
import importlib
import pandas as pd
import time
from src import feature_calc
import pdb

In [6]:
working_dir = 'C:\\Users\\Me\\Documents\\GitHub\\lolML'
with open(working_dir+ '\\api_key.txt', 'r') as api_file:
    api_key =  api_file.read()

## Get a list of low level matches

Set the region. Get Kaceytron's match history. (Note, I need to start with a Bronze V user, so I use Kaceytron as she is a famous streamer and is in Bronze V. I think she's hilarious.)

In [35]:
region_key = 'na'
match_requests_params = API_io.make_RIOT_request_params(api_key, 'solo', 2015)
BronzeV_ID = API_io.get_summoner_ids_from_names('Kaceytron', api_key, region_key) [0]
BronzeV_url = API_io.make_matchhistory_url(BronzeV_ID, region_key)
BronzeV_match_history = requests.get(BronzeV_url, match_requests_params).json()

Extract the match ID from the match history JSON

In [12]:
match_IDs = np.unique([x['matchId'] for x in BronzeV_match_history['matches'] ])
match_IDs.shape

(394,)

Load a bunch of matches to get more summonerIDs

In [13]:
match_info_params = API_io.make_RIOT_request_params(api_key, timeline_flag=False)
match_urls = [API_io.make_match_info_url(x, region_key) for x in match_IDs] # True flag means we get the timeline
match_urls[:2]

['https://na.api.pvp.net/api/lol/na/v2.2/match/1705169301',
 'https://na.api.pvp.net/api/lol/na/v2.2/match/1705197279']

Loop through the matches, and get the summoner IDs of more players

In [14]:
def get_summonerIds_from_match(cur_match):
    return [x['player']['summonerId'] for x in cur_match['participantIdentities']]

## Get high ELO scraper

In [12]:
region_key = 'na'
high_elo_IDs = API_io.get_master_challenger_Ids(api_key, region_key)
len(high_elo_IDs)

1219

In [19]:
importlib.reload(API_io)
match_requests_params = API_io.make_RIOT_request_params(api_key, 'solo', 2016, True, 'PRE')
summoner_urls = [API_io.make_matchhistory_url(x, region_key =region_key) for x in high_elo_IDs]
print(summoner_urls[:2])

['https://na.api.pvp.net/api/lol/na/v2.2/matchlist/by-summoner/65389094', 'https://na.api.pvp.net/api/lol/na/v2.2/matchlist/by-summoner/51585316']


In [20]:
match_histories = [API_io.get_limited_request(x, match_requests_params) for x in summoner_urls[:500] ]

In [21]:
match_IDs = API_io.parse_match_json_for_matchIDs(match_histories, region_key)
pd.Series(match_IDs).to_csv(region_key + ' Match IDs.csv')
match_IDs.shape

(1687,)

### Query for the games

In [15]:
num_matches = 100
summoner_IDs = np.zeros([num_matches, 10]) # 10 players / match
match_range = np.arange(num_matches)
# this is the worst code EVER
for match_index, cur_match in enumerate(match_range):
    try:
        match_info = requests.get(match_urls[cur_match], match_info_params).json()
        summoner_IDs[match_index] = get_summonerIds_from_match(match_info)
    except requests.exceptions.HTTPError as err:
        print('HHTPError in game ' + str(match_IDs[cur_match]))
    except:
        import sys
        err = sys.exc_info()[0]
        print('Error: ' + str(err) + ' in game ' + str(match_IDs[cur_match]))
    time.sleep(1.2) # RIOT API is throttled to ~0.83 requests / second

In [16]:
summoner_IDs = np.unique(summoner_IDs.ravel())
summoner_IDs = summoner_IDs.astype(int)
summoner_IDs = summoner_IDs[1:]

Now that we have a bunch of IDs, get all their match histories

In [17]:
match_requests_params = API_io.make_RIOT_request_params(api_key, True, True)
summoner_match_history_urls = [API_io.make_matchhistory_url(x, region_key =region_key) for x in summoner_IDs]

In [18]:
match_histories = [API_io.get_limited_request(x, match_requests_params) for x in summoner_match_history_urls[:100] ]

In [19]:
match_IDs = API_io.parse_match_json_for_matchIDs(match_histories, region_key)
pd.Series(match_IDs).to_csv(region_key + ' Match IDs.csv')
match_IDs.shape

(36585,)

In [20]:
match_info_params = API_io.make_RIOT_request_params(api_key, timeline_flag=True)
match_urls = [API_io.make_match_info_url(x, region_key) for x in match_IDs] # True flag means we get the timeline
match_urls[:2]

['https://na.api.pvp.net/api/lol/na/v2.2/match/1704155497',
 'https://na.api.pvp.net/api/lol/na/v2.2/match/1704155828']

## Query RIOT for match information
Determine number of matches to query. Set up the dataframes for querying

In [146]:
match_range = np.arange(30000, 36584)
col_names = feature_calc.col_names
timeline_end = 55
time_indices = np.arange(5, timeline_end, 5)
timelines_df = [ pd.DataFrame(index = match_range, columns= col_names) for x in time_indices]

Actually run the queries

In [147]:
# this is the worst code EVER
for match_index, cur_match in enumerate(match_range):
    try:
        match_info = requests.get(match_urls[cur_match], match_info_params).json()
        for time_index, last_min in enumerate(time_indices):
            try:
                timelines_df[time_index].loc[match_index] = feature_calc.calc_features_single_match(match_info, last_min)
            except:
                continue
    except requests.exceptions.HTTPError as err:
        print('HHTPError in game ' + str(match_IDs[cur_match]))
    except:
        import sys
        err = sys.exc_info()[0]
        print('Error: ' + str(err) + ' in game ' + str(match_IDs[cur_match]))
    time.sleep(1.25) # RIOT API is throttled to ~0.83 requests / second

Error: <class 'ValueError'> in game 1947796173
Error: <class 'ValueError'> in game 1951601164
Error: <class 'ValueError'> in game 1952839830
Error: <class 'ValueError'> in game 1952882663
Error: <class 'ValueError'> in game 1952883019
Error: <class 'ValueError'> in game 1952892495
Error: <class 'ValueError'> in game 1952895417
Error: <class 'ValueError'> in game 1952896586
Error: <class 'ValueError'> in game 1952914752
Error: <class 'ValueError'> in game 1952924543
Error: <class 'ValueError'> in game 1952948771
Error: <class 'ValueError'> in game 1952973336
Error: <class 'ValueError'> in game 1955406467
Error: <class 'ValueError'> in game 1967279693
Error: <class 'ValueError'> in game 1970924734
Error: <class 'ValueError'> in game 1976844363
Error: <class 'ValueError'> in game 1979354612
Error: <class 'ValueError'> in game 1989104279
Error: <class 'ValueError'> in game 1989104520
Error: <class 'ValueError'> in game 1989176301
Error: <class 'ValueError'> in game 1989178948
Error: <class

In [150]:
timelines_df = [x.dropna() for x in timelines_df]
timelines_df = [feature_calc.retype_columns(x) for x in timelines_df ]
timelines_df = [x.set_index('matchId') for x in timelines_df ]

KeyError: 'matchId'

Check how many matches were downloaded.

In [151]:
[x.shape for x in timelines_df]

[(6534, 32),
 (6534, 32),
 (6534, 32),
 (6519, 32),
 (5926, 32),
 (4941, 32),
 (3564, 32),
 (2184, 32),
 (1097, 32),
 (449, 32)]

If you are scraping a lot of matches in chunks, combine previous matches with new matches.

In [153]:
#combined_df = timelines_df
combined_df = [timelines_df[x].append(combined_df[x] ) for x in np.arange(len(timelines_df)) ]

In [154]:
[x.shape for x in combined_df]

[(36036, 32),
 (36036, 32),
 (36035, 32),
 (35969, 32),
 (32611, 32),
 (27777, 32),
 (20487, 32),
 (13097, 32),
 (6914, 32),
 (3018, 32)]

### Save the matches to file.

In [155]:
import pickle
with open('Maybe corrupted Low ELO combined_df.pickle', 'wb') as pickle_file:
    pickle.dump(combined_df, pickle_file)

In [177]:
with open('combined_df.pickle', 'rb') as pickle_file:
    combined_df = pickle.load(pickle_file)