In [27]:
import requests
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from tqdm import tqdm

In [28]:
options = Options()
options.add_argument("start-maximized")
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)


In [29]:
base_url = 'https://battlefieldtracker.com/bfv/leaderboards/stats/all/Wins?type=stats&page=1'

driver.get(base_url)

# Wait for page to load
time.sleep(7)

content = driver.page_source.encode('utf-8').strip()
soup = bs(content, 'html.parser')

Note that this code opened a web browser. Please do not close it, if you plan on following along with the code and executing the cells. The code below automates the browser so data can be scraped from various pages.

The HTML here is actually pretty well organized. And represents something like the image below. The difference between the image and the page generally is that the page contains 100 rows.

<img src='leaderboard_example.png'>

Each player is a row in a table on the page. I want just the body, where the rows are, i.e. I can ignore the thead.

In [30]:
player_table = soup.find("table",{'class':'trn-table'}).tbody
player_rows = player_table.find_all('tr')

Each row in the table is itself a table. The rows of each player row proceed horizontally, left to right, as opposed to vertically. They are all named decently, too. There are a number of features to keep here: rank, username, stat highlight (the stat the players are organized by, here, wins) and stat-collapse (here, rounds played.) In addition, the player's platform can be retrieved from the relative path to their profile.

In [31]:
player_data = player_rows[-1].find_all('td')
player_data

[<td class="rank" data-v-4dbc6408="" data-v-754518f9=""><span data-v-4dbc6408="" data-v-754518f9="" data-v-7a5012a4="">100</span></td>,
 <td class="username" data-v-4dbc6408="" data-v-754518f9=""><div class="text" data-v-4dbc6408="" data-v-754518f9=""><a class="" data-v-4dbc6408="" data-v-754518f9="" href="/bfv/profile/xbl/TightWolf"><div class="avatar" data-v-4dbc6408=""><img alt="TightWolf's Avatar" class="picture" data-v-4dbc6408="" loading="lazy" src="https://imgsvc.trackercdn.com/url/size(32)/https%3A%2F%2Fimages-eds-ssl-ssl.xboxlive.com%2Fimage%3Furl%3DKT_QTPJeC5ZpnbX.xahcbrZ9enA_IV9WfFEWIqHGUb5P30TpCdy9xIzUMuqZVCfbdZLL26ddnKGsETOKJmRbdWb220u73XyNom2XdDEuyB6dU77ty4zFaQxQhVulcL_LGD3VOFkaORqVdn8utqXGw3XeVJ8ksfnDVMejNXSAPro-%26format%3Dpng%26format%3Dpng%26w%3D240%26h%3D240/image.jpg"/> <!-- --></div> <svg class="platform-icon platform-icon platform-xbl" data-v-4dbc6408="" data-v-b17d31f6="" viewbox="0 0 88 88"><path d="M39.73 86.91c-6.628-.635-13.338-3.015-19.102-6.776-4.83-3.15-5.

In [32]:
player_rows[0].find_all('td')[0].text

'\n  1\n'

In [33]:
bftracker_url = 'https://www.battlefieldtracker.com'
player_profile = bftracker_url + player_data[1].a['href']
player_profile

'https://www.battlefieldtracker.com/bfv/profile/xbl/TightWolf'

In [34]:
bftracker_url = 'https://www.battlefieldtracker.com'
stat_dict = {'Rank': [], 'Username':[], 'Platform':[], 'Wins':[], 'Rounds Played':[], 'Profile':[]}

def parse_player_row(player_data, stat_dict):
    # Identify and store features
    player_rank = int(player_data[0].text.strip())
    player_username = player_data[1].find('span', {'class':'trn-ign__username'}).text.strip()
    player_platform = player_data[1].a['href'].split('/')[-2]
    player_wins = int(player_data[-2].text.strip().replace(',', ''))
    player_rounds_played =  int(player_data[-1].text.strip().replace(',', ''))
    player_profile = bftracker_url + player_data[1].a['href']
    
    # Add features to dictionary
    stat_dict['Rank'].append(player_rank)
    stat_dict['Username'].append(player_username)
    stat_dict['Platform'].append(player_platform)
    stat_dict['Wins'].append(player_wins)
    stat_dict['Rounds Played'].append(player_rounds_played)
    stat_dict['Profile'].append(player_profile)
    return stat_dict

def parse_player_rows(player_rows, stat_dict):
    for i,row in enumerate(player_rows):
        player_data = row.find_all('td')
        stat_dict = parse_player_row(player_data, stat_dict)
    return stat_dict
stat_dict = parse_player_rows(player_rows, stat_dict) 
print(stat_dict)

{'Rank': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100], 'Username': ['themadbat2', 'lHluslHlKuslHl', 'jawasandcrawler', 'Z1nYoRiTa', 'Arjen10_oOorWhat', 'BoomKapoow', 'D3athD3al3r GR', 'xXhaydar_ayedXx', 'RagePooch', 'MaxiqYT', 'I Mr GoRi I', 'DASE 101', 'MELEE_OVERDRIVE', 'ChosenOne718', 'Frindly-Fire-4', 'EL-mari-ano98', 'xK1nG R3L1Cx', 'Predatory--Grin', 'Tiger766', 'UmairKingz', 'Cyborg JRS', 'TaruShiba_tv', 'Itendtokill', 'xXTrYagAiMXx', 'KRAG_Nerugui', 'XxGtOwNgOoN702xX', 'ViolentGeeks', 'noraneko-tom-3k', 'mugiwara890', 'BillyTheKid3434', 'LUKESCORPION', 'STIG_YY', 'warbonet', 'fares_8086', 'EnergisedKnight', 'DatelessBook58', 'ZigZag07',

In [35]:
profiles = stat_dict['Profile']
req = requests.get('https://battlefieldtracker.com/bfv/profile/psn/themadbat2/overview').content
soup2 = bs(req)

In [36]:
driver.get(profiles[0])

# Wait for page to load
time.sleep(5)

content = driver.page_source.encode('utf-8').strip()
ex_profile_soup = bs(content, 'html.parser') # an example profile

There are a lot of player stats on each profile page, both overall and broken down by class. The easiest way toget these is using css selectors, for the most part. Note that here, as with class specific stats, I have no interest in whether the player was in the top X% of the category. The part of the page I am processing now looks like this:

<img src='lifetime_stats.png'>

In [37]:
def get_player_lifetime_stats(profile_soup, stat_dict):
    # Collect lifetime stats for a player
    # Get lifetime hours played across all classes
    print(profile_soup.select('span.playtime')[0].text.strip().split(' '))
    lifetime_hours = float(profile_soup.select('span.playtime')[0].text.strip().split(' ')[0][:-1].replace(',', ''))
    if 'lifetime_hours' in stat_dict:
            stat_dict['lifetime_hours'].append(lifetime_hours)
    else:
        stat_dict['lifetime_hours'] = [lifetime_hours]

    # Identify the divs with the lifetime stats
    overall_stat_table = profile_soup.select('div.main')[0]
    stat_names_values = overall_stat_table.select('div.numbers > span')
    stat_names_values[0]['class']

    # Loop to collect stats
    i = 0
    while i < len(stat_names_values):
        stat_name = stat_names_values[i].text
        stat_value = float(stat_names_values[i+1].text.replace(',', '').replace('%', '').strip())
        if stat_name in stat_dict:
            stat_dict[stat_name].append(stat_value)
        else:
            stat_dict[stat_name] = [stat_value]
        i += 2

    return stat_dict

print('Results from example profile')
get_player_lifetime_stats(ex_profile_soup, {})

Results from example profile
['6,614h', 'Play', 'Time']


{'lifetime_hours': [6614.0],
 'Score/Min': [378.97],
 'K/D': [2.62],
 'Kills': [508165.0],
 'Kills/Min': [1.28],
 'Win %': [72.6],
 'Wins': [20538.0],
 'Deaths': [194116.0],
 'Assists': [104553.0],
 'Damage': [58636178.0],
 'Heals': [2288101.0],
 'Revives': [156255.0],
 'Resupplies': [168974.0]}

Next, I collect the stats for each player broken out by class, as well as by vehicle use. That is in a table like the below. The HTML organizing this information is a bit less readable, but there fortunately only two actual HTML tables on the page, and the former contains the desired data. Each row of the table consists of several table data elements. Each of those table data elements has a span with class name "name," which contains the values I want to retrieve. I define a list of stat categories I want to retrieve manually. Then for each class, I add the name of the class under consideration to the stat name, ex. medic_score.

<img src='class_stats.png'>

In [38]:
stat_categories = ['Score', 'Score/Min', 'Kills', 'Kills/Min', 'K/D']
class_stat_table = ex_profile_soup.find_all('tbody')[0].find_all('tr') # each element here is a row in the table pictured above

Below is an example row

In [39]:
class_row = class_stat_table[0].find_all('td')
class_name = class_row[0].select('span.name')[0].text # get the name of the class the row represents
class_hours = float(class_row[0].select('span.sub')[0].text[:-1].replace(',', ''))
stat_dict = {} # Reset stat dict to limit output
# add class time played to stat dictionary
if f'{class_name}_hours' in stat_dict:
        stat_dict[f'{class_name}_hours'].append(class_hours)
else:
    stat_dict[f'{class_name}_hours'] = [class_hours]

# get the stats
for i, td in enumerate(class_row[2:]):
    stat_name = f'{class_name}_{stat_categories[i]}'
    class_stat = td.select('span.name')[0].text.replace(',', '')
    if stat_name in stat_dict:
        stat_dict[stat_name].append(class_stat)
    else:
        stat_dict[stat_name] = [class_stat]

print(stat_dict)

{'Medic_hours': [3748.0], 'Medic_Score': ['89795488'], 'Medic_Score/Min': ['399.25'], 'Medic_Kills': ['335041'], 'Medic_Kills/Min': ['1.49'], 'Medic_K/D': ['2.70']}


We can see from the above that this adds the class specific stats from the row. All that is left to do for the player profile is to loop over the other classes as well. I clear the dictionary of stats to make it a bit more readable.

In [40]:
def get_player_class_stats(profile_soup, stat_dict):
    stat_categories = ['Score', 'Score/Min', 'Kills', 'Kills/Min', 'K/D']
    class_stat_table = profile_soup.find_all('tbody')[0].find_all('tr') # each element here is a row in the table pictured above
    class_row = class_stat_table[0].find_all('td')
    for row in class_stat_table:
        class_row = row.find_all('td')
        class_name = class_row[0].select('span.name')[0].text # get the name of the class the row represents
        
        # add class time played to stat dictionary
        class_hours = class_row[0].select('span.sub')[0].text
        time_split = class_hours.split(' ')
        if len(time_split) > 1:
            class_hours = time_split
            # if time played in XXh YYm format
            if 'h' in class_hours[0]:
                class_hours = float(class_hours[0][:-1]) * 60 + float(class_hours[1][:-1])
                class_hours /= 60
            # if time played in XXm YYs format
            elif 'm' in class_hours[0]:
                class_hours = float(class_hours[0][:-1]) + float(class_hours[1][:-1])/60
                class_hours /= 60
        else:
            class_hours = class_hours[:-1].replace(',', '')
        
                
        if f'{class_name}_hours' in stat_dict:
                stat_dict[f'{class_name}_hours'].append(class_hours)
        else:
            stat_dict[f'{class_name}_hours'] = [class_hours]

        # get the stats
        for i, td in enumerate(class_row[2:]):
            stat_name = f'{class_name}_{stat_categories[i]}'
            class_stat = td.select('span.name')[0].text.replace(',', '')
            if stat_name in stat_dict:
                stat_dict[stat_name].append(class_stat)
            else:
                stat_dict[stat_name] = [class_stat]
    return stat_dict

get_player_class_stats(ex_profile_soup, {})

{'Medic_hours': ['3748'],
 'Medic_Score': ['89795488'],
 'Medic_Score/Min': ['399.25'],
 'Medic_Kills': ['335041'],
 'Medic_Kills/Min': ['1.49'],
 'Medic_K/D': ['2.70'],
 'Recon_hours': ['1223'],
 'Recon_Score': ['28043764'],
 'Recon_Score/Min': ['382.16'],
 'Recon_Kills': ['92421'],
 'Recon_Kills/Min': ['1.26'],
 'Recon_K/D': ['2.55'],
 'Assault_hours': ['1023'],
 'Assault_Score': ['20594554'],
 'Assault_Score/Min': ['335.31'],
 'Assault_Kills': ['47079'],
 'Assault_Kills/Min': ['0.77'],
 'Assault_K/D': ['2.47'],
 'Support_hours': ['550'],
 'Support_Score': ['9516912'],
 'Support_Score/Min': ['288.08'],
 'Support_Kills': ['30079'],
 'Support_Kills/Min': ['0.91'],
 'Support_K/D': ['2.46'],
 'Tanker_hours': ['36'],
 'Tanker_Score': ['1774991'],
 'Tanker_Score/Min': ['813.47'],
 'Tanker_Kills': ['4388'],
 'Tanker_Kills/Min': ['2.01'],
 'Tanker_K/D': ['0.00'],
 'Pilot_hours': ['32'],
 'Pilot_Score': ['344479'],
 'Pilot_Score/Min': ['176.11'],
 'Pilot_Kills': ['634'],
 'Pilot_Kills/Min': [

The next step is to do this for all of the players on a results page. An effective way to do this limits the number of times I have to send requests to the server. First, I load a leaderboard page. From this page, I get all of the profiles for the players on the page. Then I go to each profile and collect data from the profile. Conveniently, code for each part of this task has already been written.

### Parsing a single leaderboard page's information

In [41]:
def get_leaderboard_page(leaderboard_url):
    '''
    Takes a link to a leaderboard page, loads it in the browser and returns the HTML for information loaded on the page in a format that is easy to parse
    '''
    driver.get(leaderboard_url)

    # Wait for page to load
    time.sleep(7)

    content = driver.page_source.encode('utf-8').strip()
    leaderboard_soup = bs(content, 'html.parser')
    return soup

def parse_leaderboard_page(leaderboard_url, stat_dict):
    # Load the leaderboard page
    leaderboard_soup = get_leaderboard_page(leaderboard_url)

    # Get the rows on the page, each representing basic information about a player
    player_table = soup.find("table",{'class':'trn-table'}).tbody
    player_rows = player_table.find_all('tr')

    stat_dict = parse_player_rows(player_rows, stat_dict)

    return stat_dict



### Parsing the Profiles

In [42]:
stat_dict.keys()

dict_keys(['Medic_hours', 'Medic_Score', 'Medic_Score/Min', 'Medic_Kills', 'Medic_Kills/Min', 'Medic_K/D'])

In [43]:
def get_profile_page(profile_url):
    '''
    Takes a link to a profile page, loads it in the browser and returns the HTML for information loaded on the page in a format that is easy to parse
    '''
    driver.get(profile_url)

    # Wait for page to load
    time.sleep(5)

    content = driver.page_source.encode('utf-8').strip()
    profile_soup = bs(content, 'html.parser')

    return profile_soup

def parse_profiles(stat_dict):
    '''
    For each profile on a leaderboard page, add data from the profile to the stat_dict
    '''
    profiles = stat_dict['Profile']
    for profile_url in tqdm(profiles):
        profile_soup = get_profile_page(profile_url)

        # Get the player's overall stats
        stat_dict = get_player_lifetime_stats(profile_soup, stat_dict)

        # Get the player's class stats
        stat_dict = get_player_class_stats(profile_soup, stat_dict)

        time.sleep(5)
    return stat_dict

### Parse Leaderboard Page and Associated Profiles

In [44]:
base_url = 'https://battlefieldtracker.com/bfv/leaderboards/stats/all/Wins?type=stats&page=1'
stat_dict = {'Rank': [], 'Username':[], 'Platform':[], 'Wins':[], 'Rounds Played':[], 'Profile':[]}

def parse_leaderboard_page_and_profiles(leaderboard_url, stat_dict):
    # Get basic leaderboard info
    stat_dict = parse_leaderboard_page(base_url, stat_dict)

    # get info from profiles
    stat_dict = parse_profiles(stat_dict)
    return stat_dict

parse_leaderboard_page_and_profiles(base_url, stat_dict)

  0%|          | 0/100 [00:00<?, ?it/s]

['6,614h', 'Play', 'Time']


  1%|          | 1/100 [00:12<20:02, 12.14s/it]

['3,794h', 'Play', 'Time']


  2%|▏         | 2/100 [00:23<19:03, 11.67s/it]

['6,357h', 'Play', 'Time']


  3%|▎         | 3/100 [00:35<19:27, 12.04s/it]

['6,314h', 'Play', 'Time']


  4%|▍         | 4/100 [00:48<19:35, 12.25s/it]

['5,450h', 'Play', 'Time']


  5%|▌         | 5/100 [01:00<19:16, 12.17s/it]

['5,272h', 'Play', 'Time']


  6%|▌         | 6/100 [01:13<19:14, 12.29s/it]

['6,626h', 'Play', 'Time']


  7%|▋         | 7/100 [01:25<18:57, 12.23s/it]

['5,921h', 'Play', 'Time']


  8%|▊         | 8/100 [01:37<18:43, 12.21s/it]

['4,375h', 'Play', 'Time']


  9%|▉         | 9/100 [01:50<18:49, 12.42s/it]

['4,770h', 'Play', 'Time']


 10%|█         | 10/100 [02:02<18:25, 12.28s/it]

['5,873h', 'Play', 'Time']


 11%|█         | 11/100 [02:15<18:31, 12.49s/it]

['3,267h', 'Play', 'Time']


 12%|█▏        | 12/100 [02:27<18:13, 12.43s/it]

['4,140h', 'Play', 'Time']


 13%|█▎        | 13/100 [02:39<17:55, 12.36s/it]

['3,908h', 'Play', 'Time']


 14%|█▍        | 14/100 [02:52<17:42, 12.35s/it]

['5,545h', 'Play', 'Time']


 15%|█▌        | 15/100 [03:04<17:38, 12.46s/it]

['5,824h', 'Play', 'Time']


 16%|█▌        | 16/100 [03:16<17:13, 12.30s/it]

['5,831h', 'Play', 'Time']


 17%|█▋        | 17/100 [03:29<17:12, 12.44s/it]

['5,497h', 'Play', 'Time']


 18%|█▊        | 18/100 [03:40<16:30, 12.08s/it]

['5,111h', 'Play', 'Time']


 19%|█▉        | 19/100 [03:53<16:29, 12.21s/it]

['4,668h', 'Play', 'Time']


 20%|██        | 20/100 [04:05<16:12, 12.16s/it]

['4,595h', 'Play', 'Time']


 21%|██        | 21/100 [04:17<16:06, 12.23s/it]

['4,952h', 'Play', 'Time']


 22%|██▏       | 22/100 [04:30<16:20, 12.57s/it]

['3,811h', 'Play', 'Time']


 23%|██▎       | 23/100 [04:43<16:16, 12.68s/it]

['3,842h', 'Play', 'Time']


 24%|██▍       | 24/100 [04:57<16:21, 12.91s/it]

['4,108h', 'Play', 'Time']


 25%|██▌       | 25/100 [05:09<16:01, 12.82s/it]

['4,391h', 'Play', 'Time']


 26%|██▌       | 26/100 [05:23<15:54, 12.90s/it]

['5,126h', 'Play', 'Time']


 27%|██▋       | 27/100 [05:35<15:38, 12.85s/it]

['4,850h', 'Play', 'Time']


 28%|██▊       | 28/100 [05:52<16:58, 14.15s/it]

['3,138h', 'Play', 'Time']


 29%|██▉       | 29/100 [06:06<16:41, 14.11s/it]

['5,247h', 'Play', 'Time']


 30%|███       | 30/100 [06:20<16:19, 14.00s/it]

['5,416h', 'Play', 'Time']


 31%|███       | 31/100 [06:34<16:00, 13.93s/it]

['4,044h', 'Play', 'Time']


 32%|███▏      | 32/100 [06:47<15:33, 13.72s/it]

['3,940h', 'Play', 'Time']


 33%|███▎      | 33/100 [07:01<15:15, 13.67s/it]

['7,415h', 'Play', 'Time']


 34%|███▍      | 34/100 [07:14<15:01, 13.66s/it]

['4,718h', 'Play', 'Time']


 35%|███▌      | 35/100 [07:26<14:06, 13.03s/it]

['5,522h', 'Play', 'Time']


 36%|███▌      | 36/100 [07:41<14:25, 13.52s/it]

['5,920h', 'Play', 'Time']


 37%|███▋      | 37/100 [07:54<14:06, 13.44s/it]

['4,113h', 'Play', 'Time']


 38%|███▊      | 38/100 [08:07<13:45, 13.31s/it]

['3,814h', 'Play', 'Time']


 39%|███▉      | 39/100 [08:20<13:34, 13.35s/it]

['3,909h', 'Play', 'Time']


 40%|████      | 40/100 [08:34<13:22, 13.37s/it]

['3,931h', 'Play', 'Time']


 41%|████      | 41/100 [08:46<12:40, 12.89s/it]

['4,283h', 'Play', 'Time']


 42%|████▏     | 42/100 [08:59<12:37, 13.06s/it]

['3,693h', 'Play', 'Time']


 43%|████▎     | 43/100 [09:10<11:58, 12.60s/it]

['7,035h', 'Play', 'Time']


 44%|████▍     | 44/100 [09:24<11:57, 12.81s/it]

['2,739h', 'Play', 'Time']


 45%|████▌     | 45/100 [09:37<11:56, 13.02s/it]

['2,878h', 'Play', 'Time']


 46%|████▌     | 46/100 [09:49<11:22, 12.63s/it]

['3,923h', 'Play', 'Time']


 47%|████▋     | 47/100 [10:01<10:54, 12.34s/it]

['3,206h', 'Play', 'Time']


 48%|████▊     | 48/100 [10:12<10:29, 12.10s/it]

['3,632h', 'Play', 'Time']


 49%|████▉     | 49/100 [10:25<10:31, 12.39s/it]

['5,458h', 'Play', 'Time']


 50%|█████     | 50/100 [10:39<10:34, 12.68s/it]

['4,407h', 'Play', 'Time']


 51%|█████     | 51/100 [10:52<10:31, 12.89s/it]

['4,495h', 'Play', 'Time']


 52%|█████▏    | 52/100 [11:04<10:03, 12.57s/it]

['2,806h', 'Play', 'Time']


 53%|█████▎    | 53/100 [11:17<09:59, 12.75s/it]

['5,051h', 'Play', 'Time']


 54%|█████▍    | 54/100 [11:30<09:53, 12.90s/it]

['2,746h', 'Play', 'Time']


 55%|█████▌    | 55/100 [11:43<09:39, 12.88s/it]

['3,412h', 'Play', 'Time']


 56%|█████▌    | 56/100 [11:55<09:13, 12.59s/it]

['4,811h', 'Play', 'Time']


 57%|█████▋    | 57/100 [12:14<09:14, 12.89s/it]


IndexError: list index out of range

In [None]:
time_played = '6h 20'
time_played = time_played.split(' ')
time_played = float(time_played[0][:-1]) * 60 + float(time_played[1])
time_played /= 60
time_played

6.333333333333333

# Find Number of Leaderboard Page
I did not know how many pages the leaderboard has. Candidly, I found this out manually through trial and error, which only took a couple of minutes. There are 791.

You could do this programmatically, but given it is polite to wait in between sending requests to servers, this would take longer.

A potential programatic approach could go as follows: each leaderboard url ends in a page number. Call this i. You could increase i in increasingly larger step sizes until you no longer load pages with HTML in the form expected. Then I could be decreased in increasingly larger step size until it loaded a page with increasingly larger step sizes until it found a page of the form desired. This could be repeated until it converged on the last page. This is only one fairly naive approach. Generally, it's a search through a sequence of integers that is unbounded on one side.