In [2]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

import bs4
import re

import pandas as pd

## For a player

In [3]:
options = webdriver.EdgeOptions()
# options.add_argument('headless')
options.add_argument('inprivate')
driver = webdriver.Edge(options= options)

In [4]:
driver.get('https://qlstats.net/player/330308')

### Cookie screen

In [5]:
def is_cookie_screen(driver : selenium.webdriver):
    '''A function to check if the given webpage is the 'accept cookies' screen.
    Regex matches the body of the '''
    element = driver.find_element(By.TAG_NAME, value = 'body')
    cookie_screen = re.compile(r'.*(To continue using qlstats, you need to agree to the use of cookies.\nAgree).*')
    if cookie_screen.search(element.text) is None:
        return False
    return True


In [6]:
#press the button
if is_cookie_screen(driver):
    try:
        button = driver.find_element(By.TAG_NAME, 'button')
        button.click()
    except:
        "Cookies could not be accepted, please recheck"

### Scraping the player page

In [7]:
from bs4 import BeautifulSoup

In [8]:
soup = BeautifulSoup(driver.page_source, 'lxml')


In [9]:
# finding player name
player_name = soup.select_one('h2').text
print('Player name:\n{}'.format(player_name))

#for unavailable IDs:
if re.match(r"Sorry, that player wasn't found!", player_name):
    print("Player not found, can't proceed")

else:
    pass
    #TODO Scrape here

Player name:
FOG


### ELSE tag here onwards

- p_tab_list
- player_info
- games_played
- type_stats

dictionaries

In [10]:
# parse information from the p tab
p_tab_text = soup.select_one('p').text
p_tab_list = list(map(str.strip, p_tab_text.split("\n")))
p_tab_list = [x for x in p_tab_list if x != '']

p_tab_list

['Region: Europe',
 'Player ID: 330308',
 'Steam ID: 76561198062475631',
 'Joined: 2021-04-28   16:22:30',
 'Status: active']

#### Split and store in dictionary

In [21]:
player_info = {'name' : player_name}
for x in p_tab_list:
    elements = x.split(": ")
    player_info[elements[0]] = elements[1]

player_info

{'name': 'FOG',
 'Region': 'Europe',
 'Player ID': '330308',
 'Steam ID': '76561198062475631',
 'Joined': '2021-04-28   16:22:30',
 'Status': 'active'}

In [22]:
# press the overall button on the player page
button = driver.find_element(By.CLASS_NAME, 'tab-overall')
button.click()

In [23]:
games_played = {}

gametypes = soup.find('ul', id = 'gbtab').text
gametypes = [x.strip() for x in gametypes.split('\n') if x != '']

for i in range(len(gametypes)//2):
    t, n = gametypes[i*2], gametypes[2*i+1][1:-1]
    games_played[t] = int(n)

games_played.pop('overall')
games_played

{'ca': 491, 'duel': 19, 'ffa': 1}

### RegEx Extractors

In [29]:
stat_extractor = re.compile(r"Win Rate: (.*) Kill Ratio: (.*) Cap Ratio: (.*) Rating: (.*) B-Rating: (.*) Rank: (.*) Last Played: (.*) Games Played: (.*) Favorite Map: (.*)")
winrate_extractor = re.compile(r"([0-9]+\.?[0-9]*).*\((\d+).*, ([0-9]+).*")
kdRatio_extractor = re.compile(r"(\d+\.?\d+) \((\d+).*, (\d+).*")
capRatio_extractor = re.compile(r'(.*)')
elo_extractor = re.compile(r"([0-9]+) ± ([0-9]+).*, ([0-9]+).*")
rank_extractor = re.compile(r"([0-9]+) of ([0-9]+).*")

extractors = [winrate_extractor, kdRatio_extractor, capRatio_extractor, elo_extractor, elo_extractor, rank_extractor]

In [44]:
def parse_gametype_stats(data:tuple):
    '''Function for parsing the stats info given a tuple of
    - winrate
    - cap ratio
    - elo
    - b-elo
    - rank
    -last played
    - games played
    - fav map
    
    Returns a dictionary containing parsed info and keys:
    winrate, cap_ratio, elo (ordered-tuple as value), b-elo (ordered-tuple as value), rank (ordered-tuple as value), last_played, num_games, fav_map'''
    ret_dict = {}
    keys = ['winrate', 'kd_ratio', 'cap_ratio', 'elo', 'b-elo', 'rank', 'last_played', 'num_games', 'fav_map']
    for i in range(6):
        try:
            ext = extractors[i]
            to_parse = data[i]
            ret_dict[keys[i]] = ext.search(to_parse).groups()
        except:
            ret_dict[keys[i]] = '-'
    return ret_dict
    

In [47]:
def parse_stats(soup, gametypes):
    '''Parse and stores stats by gametype into a dictionary.
    
    Arguments: 
    - soup: the BS4 object
    - gametypes: a list of gametypes for extracting data. This can be acquired by scraping the 'gbtab' 
    list on a player's page on QLStats.
    
    Returns: A dictionary with keys as gametypes and values as the stats associated with them.
    The stats are extracted using RegEx.'''
    stats = {}
    # parse winrate and elo info
    for gt in gametypes:
        type_stats = soup.find('div', id = 'tab-{}'.format(gt)).text
        type_stats = type_stats.split('\n')
        type_stats = [x.strip() for x in type_stats if x.strip() != '']
        stats_as_string = " ".join(type_stats)

        res = stat_extractor.search(stats_as_string)
        #there is winrate, kill ratio, cap ratio, rating, b-rating, rank, last played games played favorite map
        stat_dict = parse_gametype_stats(res.groups())
        stats[gt] = stat_dict

    return stats

parse_stats(soup, games_played.keys())

{'ca': {'winrate': ('65.78', '323', '168'),
  'kd_ratio': ('1.52', '6936', '4554'),
  'cap_ratio': ('-',),
  'elo': ('1408', '37', '3326'),
  'b-elo': ('1211', '48', '116'),
  'rank': ('257', '1515')},
 'duel': {'winrate': ('63.16', '12', '7'),
  'kd_ratio': ('1.17', '261', '223'),
  'cap_ratio': ('-',),
  'elo': ('1505', '134', '513'),
  'b-elo': '-',
  'rank': ('134', '482')},
 'ffa': {'winrate': ('0.0', '0', '1'),
  'kd_ratio': ('2.27', '25', '11'),
  'cap_ratio': ('-',),
  'elo': ('1198', '147', '3'),
  'b-elo': ('1275', '84', '10'),
  'rank': '-'}}

Flow is something like this:
- Winrate
- K/D ratio
- Cap Ratio
- ELO
- B-ELO
- Rank
- Last Played
- Games played
- Favourite map

Overall tab just has the latest/maximal values from the other tabs.

In [16]:
soup.findAll('table', class_= 'table table-hover table-condensed')

[<table class="table table-hover table-condensed">
 <thead>
 <tr>
 <th>#</th>
 <th></th>
 <th>Played</th>
 <th>Type</th>
 <th>Server</th>
 <th>Map</th>
 <th>Result</th>
 <th>Opponent</th>
 <th>Rating</th>
 <th title="Rating ± Uncertainty">Old Glicko</th>
 <th title="Rating / Uncertainty">Glicko Change</th>
 </tr>
 </thead>
 <tbody>
 </tbody>
 </table>]

### Shut down the browser

In [17]:
# driver.close()