In [53]:
import requests
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import os
import sqlite3
import numpy as np


# Approach: Use Selenium to Load JSON
First, a quick discussion of the [website](https://battlefieldtracker.com/bfv/leaderboards/stats/all/Wins?type=stats&page=1) layout. The previous link is to the first leaderboard page sorted by the number of games users have won. This does not in and of itself provide us with much information. But you can click on each user on the leaderboard and see an associated profile. This profile contains a lot of useful information. We would like to collect this information for every user on every page of the leaderboard.

The data on the website is loaded using Javascript. This means that simply sending GET requests and parsing HTML using Python's `requests` library will not work - the HTML will not contain the desired data as it will not have been loaded. A plausible approach is to just use Selenium to load the leaderboard pages, get the HTML once data has been loaded onto the page with Javascript, find profile URLs from the leaderboard page HTML, load the profile pages with Selenium, then go through the profile page HTML to find data. There are two problems with this approach, both associated with page loading times. First, page loading times vary. To scrape a dynamic page in this fashion, first Selenium starts loading the page. Then the program waits for some set time. After waiting, Selenium passes the loaded page's HTML to BeautifulSoup for parsing. But if the page isn't loaded, the program could crash or at best there will be missing data. To prevent this, the program has to wait for several seconds for pages to load, which becomes prohibitively slow to scrape the whole site. Second, the pages just generally load slowly. This method would have taken several days even with a reasonably low loss rate of around 1% of the data.

A second approach requires determining if the website gets its data from an API. If it does, you may use the API to load the data in JSON format. The advantages here are that you do not have to spend time loading the actual pages, there is data in the API that is not loaded on the page, and the API stores data in a more human-readable format than the HTML does. To find the API, load the page in a browser (I use Firefox for this). Open the developer tools panel with the F12 key. Click the Network tab on the panel and the XHR button on the row below that. Reload the page. The list should populate, and if you look for requests with Type JSON, you should eventually find the API URLs. You can use a tool like Insomnia to quickly generate code to make requests to the API with Python. The problem in this case is that the APIs did not accept these requests.

The third and final approach combines the previous two. I use the API, but I use Selenium to load the response to API requests in the browser rather than the pages that display data from the API. These are in JSON format and load nearly instantly. This eliminates or reduces the downsides of the first approach (slow, lost data) and retains most of the benefits of the second approach (faster, no lost data, more features). For scale, this probably speeds up the scraping process by about an order of magnitude, with no lost data.

# Leaderboard
The first step is to get data from the leaderboard page itself. Since so much data can be retrieved from the API that loads the profile pages, all that is needed from the leaderboard are the names of players and the platforms they play on. The players' usernames and platforms are combined to make a feature I call "player_id." These take the form (platform_abbreviation)/(username). In addition to serving as identifiers for players, they are needed to get unique information about players from the API.

In [54]:
# NOTE(review): `sd` is not referenced by any other code visible in this notebook export - confirm whether it is still needed.
sd = {}
def parse_player(player_json):
    '''
    Builds the identifier used to look a player up in the profile API.
    Parameters: player_json, the entry for a single player taken from the leaderboard API response.
    Returns: player_id (of the form "<platform>/<username>"), player_platform
    '''
    username = player_json['id']
    platform_slug = player_json['owner']['metadata']['platformSlug']

    return f'{platform_slug}/{username}', platform_slug

def parse_leaderboard(leaderboard_url, stat_dict):
    '''
    Retrieves the player_id and platform for every player on a leaderboard page.

    The leaderboard API response is opened in the Selenium-controlled browser
    (the module-level `driver`) and the JSON is read back out of the <pre>
    element of the resulting page.

    Parameters: leaderboard_url (url for the leaderboard API for a page), stat_dict (dictionary of player performance metrics and other player information; updated in place)
    Returns: stat_dict, with every player found appended to its 'player_id' and 'platform' lists. Both keys are always present on return, even if the page lists no players.
    '''

    # Load the URL
    driver.get(leaderboard_url)
    leaderboard_content = driver.page_source.encode('utf-8').strip()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed alongside BeautifulSoup.
    soup = bs(leaderboard_content, 'html.parser')
    leaderboard_json = json.loads(soup.body.pre.text)

    # Create both columns up front: a response that has 'data' but no items
    # would otherwise leave them missing and break later lookups of
    # stat_dict['player_id'].
    player_ids = stat_dict.setdefault('player_id', [])
    platforms = stat_dict.setdefault('platform', [])

    # Responses without a 'data' key carry no players; nothing to add.
    if 'data' in leaderboard_json:
        # Get names from leaderboard
        for player_json in leaderboard_json['data']['items']:
            player_id, player_platform = parse_player(player_json)
            player_ids.append(player_id)
            platforms.append(player_platform)

    return stat_dict

# Profile
The second step is to scrape data from the player profiles. The profile consists of two broad categories of data. First, overall information, which is referred to as player "history." These are metrics calculated across classes, like kills, score, etc. Second, there are specific metrics broken out by class and vehicle use. All of these are stored in easy-to-navigate JSON strings, but they each require different API requests. In both categories, stats are given in raw format (ex. 10000 in the score feature means the player's total score while playing has been 10000) and percentile format (ex. 1.9 in the score feature means the player is in the top 1.9% of highest-scoring players). These are given different suffixes - `_value` and `_percentile` - to make them identifiable.

## Overall History
The first thing I retrieve is player history information. This is a simple matter of looping over different performance metrics and storing information in a dictionary.

In [55]:
def parse_history_for_player(history_json, stat_dict, history_categories):
    '''
    Record one player's overall ("history") stats in stat_dict.
    Parameters: history_json (mapping of stat name to an entry holding 'value' and 'percentile'), stat_dict (dictionary of feature name to list of values; extended in place), history_categories (names of the stats to record)
    Returns: stat_dict, with one entry appended to '<stat>_value' and '<stat>_percentile' for every requested stat (NaN when the stat is absent from history_json)
    '''
    present_stats = history_json.keys()

    for category in history_categories:
        if category not in present_stats:
            # Stat not reported for this player: keep the columns aligned with NaN
            value = percentile = np.nan
        else:
            entry = history_json[category]
            value, percentile = entry['value'], entry['percentile']

        # setdefault creates the column the first time a stat is seen
        stat_dict.setdefault(category + '_value', []).append(value)
        stat_dict.setdefault(category + '_percentile', []).append(percentile)

    return stat_dict


## Class stats
Next, I retrieve player class information. As with the overall history, this is a simple matter of looping over different performance metrics and storing information in a dictionary.

In [56]:
def parse_class_stats(class_json, stat_dict, class_categories):
    '''
    Retrieve class-specific stats for a specific player for a single class.
    Parameters: class_json (entry for one class, with its name under ['metadata']['name'] and its stats under ['stats']), stat_dict (dictionary of feature name to list of values; extended in place), class_categories (names of the stats to record)
    Returns: stat_dict, with one entry appended to '<class>_<stat>_value' and '<class>_<stat>_percentile' for every requested stat (NaN when the stat is not available)
    '''
    class_name = class_json['metadata']['name']
    class_stats = class_json['stats']
    # Per the original author, the first entry is player rank, which we don't need
    usable_stats = list(class_stats.keys())[1:]

    # Add class stats to stat_dict
    for category in class_categories:
        # Check if the desired stat is present in the JSON
        if category in usable_stats:
            entry = class_stats[category]
            percentile, value = entry['percentile'], entry['value']
        else:
            percentile = value = np.nan

        column_prefix = f'{class_name}_{category}' # ex Assault_kills

        # Value column first, then percentile; each is created on first use
        for suffix, item in (('_value', value), ('_percentile', percentile)):
            stat_dict.setdefault(column_prefix + suffix, []).append(item)

    return stat_dict

def parse_classes_for_player(classes_json, stat_dict, class_categories):
    '''
    Retrieve class-specific stats for a specific player for all classes.
    Parameters: classes_json (iterable of per-class entries for one player), stat_dict (dictionary of feature name to list of values; extended in place), class_categories (names of the stats to record for each class)
    Returns: stat_dict, after adding the player's class stats and padding shorter class columns with NaN
    '''
    for single_class_json in classes_json:
        stat_dict = parse_class_stats(single_class_json, stat_dict, class_categories)

    # Some players do not have data for certain classes, so the class columns can
    # end up with different lengths. Collect every column whose name mentions a class.
    known_classes = ['medic', 'assault', 'support', 'recon', 'tanker', 'pilot']
    class_columns = [
        column
        for class_label in known_classes
        for column in stat_dict
        if class_label in column.lower()
    ]

    # Lengths are measured once, before any padding is applied
    column_lengths = [len(stat_dict[column]) for column in class_columns]
    if len(set(column_lengths)) > 1:
        longest = max(column_lengths)
        for column, length in zip(class_columns, column_lengths):
            if length < longest:
                # Since we do this for each player, should never need to add more than 1 NaN per player
                stat_dict[column].append(np.nan)

    return stat_dict


# Combine Code and Automate Browser

In [57]:
def _load_api_json(api_url):
    '''
    Open an API url in the Selenium-controlled browser (module-level `driver`)
    and return the JSON found in the <pre> element of the resulting page.
    '''
    driver.get(api_url)
    page_content = driver.page_source.encode('utf-8').strip()
    # Explicit parser so the result does not depend on which optional parsers are installed
    soup = bs(page_content, 'html.parser')
    return json.loads(soup.body.pre.text)


def parse_player_stats(stat_dict, history_categories, class_categories):
    '''
    Fetch and record the overall-history and per-class stats of every player listed in stat_dict.

    Parameters: stat_dict (dictionary holding parallel 'player_id' and 'platform' lists; stat columns are added to it), history_categories (overall stats to record), class_categories (per-class stats to record)
    Returns: stat_dict. Players for which neither API response contains 'data' are removed from 'player_id' and 'platform'; every remaining player contributes one entry to each stat column.
    '''
    # Work from snapshots: the id/platform lists are rebuilt below so that the
    # player who actually lacks data is the one dropped (not whoever is last).
    candidate_ids = list(stat_dict['player_id'])
    candidate_platforms = list(stat_dict['platform'])

    id_columns = ('player_id', 'platform')
    history_columns = {
        stat + suffix
        for stat in history_categories
        for suffix in ('_value', '_percentile')
    }

    kept_ids = []
    kept_platforms = []
    for position, player_id in enumerate(tqdm(candidate_ids)):
        # Get overall history for player
        history_json = _load_api_json(f"https://api.tracker.gg/api/v2/bfv/standard/profile/{player_id}?")
        # Get class info for the user in question
        classes_json = _load_api_json(f"https://api.tracker.gg/api/v2/bfv/standard/profile/{player_id}/segments/class")

        # Some data is simply unavailable for access
        has_history = 'data' in history_json
        has_classes = 'data' in classes_json

        # Drop player if no associated information
        if has_history or has_classes:
            kept_ids.append(player_id)
            kept_platforms.append(candidate_platforms[position])

            # With no history available, an empty mapping makes the helper
            # record NaN for every category, keeping the columns aligned.
            history_stats = history_json['data']['segments'][0]['stats'] if has_history else {}
            stat_dict = parse_history_for_player(history_stats, stat_dict, history_categories)

            if has_classes:
                stat_dict = parse_classes_for_player(classes_json['data'], stat_dict, class_categories)
            else:
                # No class data for this player: give every class column seen so
                # far a NaN. Assumes every column that is neither an id column
                # nor a history column is a class column - TODO confirm if
                # callers ever pass additional columns.
                for column, values in stat_dict.items():
                    if column not in id_columns and column not in history_columns:
                        values.append(np.nan)

        # Pause between players, as in the original (presumably to limit the request rate)
        time.sleep(2)

    stat_dict['player_id'] = kept_ids
    stat_dict['platform'] = kept_platforms

    return stat_dict

In [1]:
def read_categories(categories_to_scrape_file):
    '''
    Read the stat categories to scrape from a text file.

    The first line of the file lists the overall-history categories and the
    second line lists the per-class categories, separated by whitespace.

    Parameters: categories_to_scrape_file, path to the categories file.
    Returns: history_categories, class_categories (two lists of strings)
    Raises: IndexError if the file contains no line break (i.e. there is no second line).
    '''
    with open(categories_to_scrape_file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so stray spaces or tabs never yield an empty
    # category name (which would create bogus '_value' columns).
    history_categories = lines[0].split()
    class_categories = lines[1].split()
    return history_categories, class_categories

def scrape_page(leaderboard_url, stat_dict, history_categories, class_categories):
    '''
    Scrape one leaderboard page: collect the players listed at leaderboard_url, then fetch the stats of each of them.
    Parameters: leaderboard_url (leaderboard API url for the page), stat_dict (dictionary the results are stored in), history_categories and class_categories (stats to record)
    Returns: the stat_dict produced by parse_player_stats
    '''
    players_on_page = parse_leaderboard(leaderboard_url, stat_dict)
    return parse_player_stats(players_on_page, history_categories, class_categories)

def _completed_skip_values(data_dir):
    '''Return the skip offsets of the bfvstats_skip<N>.csv checkpoint files found in data_dir.'''
    prefix, suffix = 'bfvstats_skip', '.csv'
    skips = []
    for file_name in os.listdir(data_dir):
        # Ignore anything that is not a checkpoint (hidden files, other exports, ...)
        if file_name.startswith(prefix) and file_name.endswith(suffix):
            number = file_name[len(prefix):-len(suffix)]
            if number.isdecimal():
                skips.append(int(number))
    return skips


def scrape_site(history_categories, class_categories, total_profiles=78800, page_size=100, data_dir='data'):
    '''
    Scrape the leaderboard page by page, checkpointing the results after every page.

    After each page the accumulated results are written to
    <data_dir>/bfvstats_skip<skip>.csv and the page's rows are appended to the
    'bfvstats' table through the module-level sqlite connection `con`. On
    start-up, scraping resumes after the highest checkpoint found in data_dir.

    Parameters:
        history_categories, class_categories: stats to record (see read_categories).
        total_profiles: stop once this many leaderboard entries have been skipped/scraped.
        page_size: number of leaderboard entries requested per page.
        data_dir: directory holding the CSV checkpoints; created if missing.
    Returns: None
    '''
    os.makedirs(data_dir, exist_ok=True)

    # determine how many profiles to skip
    skip = 0
    completed = _completed_skip_values(data_dir)
    if completed:
        skip = max(completed) + page_size
        print(f'Skipping first {skip} profiles.')

    while skip < total_profiles:
        leaderboard_url = (
            'https://api.tracker.gg/api/v1/bfv/standard/leaderboards'
            f'?type=stats&platform=all&board=WINS&skip={skip}&take={page_size}'
        )

        # scrape page
        stat_dict = scrape_page(leaderboard_url, {}, history_categories, class_categories)

        # Report column lengths when they disagree, to help diagnose the
        # DataFrame construction error that follows in that case.
        key_lens = {key: len(values) for key, values in stat_dict.items()}
        if len(set(key_lens.values())) > 1:
            for key, length in key_lens.items():
                print(key, length)

        current_iter = pd.DataFrame.from_dict(stat_dict)
        current_path = os.path.join(data_dir, f'bfvstats_skip{skip}.csv')
        previous_path = os.path.join(data_dir, f'bfvstats_skip{skip - page_size}.csv')

        # Load previous progress, if any. The file system is checked on every
        # iteration so checkpoints written earlier in this same run are found.
        if os.path.exists(previous_path):
            previous_iter = pd.read_csv(previous_path, index_col=0)
            combined_df = pd.concat([previous_iter, current_iter]).reset_index(drop=True)
            combined_df = combined_df.drop_duplicates(subset=['player_id'])
            combined_df.to_csv(current_path)
        else:
            current_iter.to_csv(current_path)

        # Only the current page goes to the database; earlier pages were appended already
        current_iter.to_sql('bfvstats', con=con, if_exists='append')

        skip += page_size

# Module-level SQLite connection; scrape_site appends each scraped page to it via DataFrame.to_sql.
con = sqlite3.connect('bfvstats.db')
# NOTE(review): `cur` is not used by any code visible in this notebook export.
cur = con.cursor()

# Start the Chrome session that the parsing functions share through the module-level `driver`.
# The chromedriver path comes from ChromeDriverManager().install() (presumably fetched by webdriver_manager if needed).
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
# First line of the file -> history categories, second line -> class categories (see read_categories).
history_categories, class_categories = read_categories('categories_to_scrape.txt')

# Run the full scrape. NOTE(review): neither `driver` nor `con` is closed afterwards - confirm whether cleanup is wanted here.
scrape_site(history_categories, class_categories)

NameError: name 'sqlite3' is not defined