In [68]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

import bs4
import re

import pandas as pd

## For a player

In [69]:
options = webdriver.EdgeOptions()
# options.add_argument('headless')
options.add_argument('inprivate')
driver = webdriver.Edge(options= options)

In [70]:
driver.get('https://qlstats.net/player/330308')

### Cookie screen

In [71]:
def is_cookie_screen(driver : selenium.webdriver):
    '''A function to check if the given webpage is the 'accept cookies' screen.
    Regex matches the body of the '''
    element = driver.find_element(By.TAG_NAME, value = 'body')
    cookie_screen = re.compile(r'.*(To continue using qlstats, you need to agree to the use of cookies.\nAgree).*')
    if cookie_screen.search(element.text) is None:
        return False
    return True


In [72]:
#press the button
if is_cookie_screen(driver):
    try:
        button = driver.find_element(By.TAG_NAME, 'button')
        button.click()
    except:
        "Cookies could not be accepted, please recheck"

### Scraping the player page

In [73]:
from bs4 import BeautifulSoup

In [74]:
soup = BeautifulSoup(driver.page_source, 'lxml')


In [75]:
# finding player name
player_name = soup.select_one('h2').text
print('Player name:\n{}'.format(player_name))

#for unavailable IDs:
if re.match(r"Sorry, that player wasn't found!", player_name):
    print("Player not found, can't proceed")

else:
    pass
    #TODO Scrape here

Player name:
FOG


## ELSE tag here onwards

- p_tab_list
- player_info
- games_played
- stats

The flow for gametypes is as follows:
- Winrate
- K/D ratio
- Cap Ratio
- ELO
- B-ELO
- Rank
- Last Played
- Games played
- Favourite map

### RegEx Extractors

In [76]:
stat_extractor = re.compile(r"Win Rate: (.*) Kill Ratio: (.*) Cap Ratio: (.*) Rating: (.*) B-Rating: (.*) Rank: (.*) Last Played: (.*) Games Played: (.*) Favorite Map: (.*)")
winrate_extractor = re.compile(r"([0-9]+\.?[0-9]*).*\((\d+).*, ([0-9]+).*")
kdRatio_extractor = re.compile(r"(\d+\.?\d+) \((\d+).*, (\d+).*")
capRatio_extractor = re.compile(r'(.*)')
elo_extractor = re.compile(r"([0-9]+) ± ([0-9]+).*, ([0-9]+).*")
rank_extractor = re.compile(r"([0-9]+) of ([0-9]+).*")

extractors = [winrate_extractor, kdRatio_extractor, capRatio_extractor, elo_extractor, elo_extractor, rank_extractor]

In [77]:
# parse information from the p tab
def extract_player_info(soup):
    p_tab_text = soup.select_one('p').text
    p_tab_list = list(map(str.strip, p_tab_text.split("\n")))
    p_tab_list = [x for x in p_tab_list if x != '']
    return p_tab_list



def extract_player_name_details(p_tab_list):
    player_info = {'name' : player_name}
    for x in p_tab_list:
        elements = x.split(": ")
        player_info[elements[0]] = elements[1]
    return player_info



# # press the overall button on the player page
# button = driver.find_element(By.CLASS_NAME, 'tab-overall')
# button.click()

def extract_game_info(soup):
    games_played = {}

    gametypes = soup.find('ul', id = 'gbtab').text
    gametypes = [x.strip() for x in gametypes.split('\n') if x != '']

    for i in range(len(gametypes)//2):
        t, n = gametypes[i*2], gametypes[2*i+1][1:-1]
        games_played[t] = int(n)

    games_played.pop('overall')
    return games_played


def parse_gametype_stats(data:tuple):
    '''Function for parsing the stats info given a tuple of
    - winrate
    - cap ratio
    - elo
    - b-elo
    - rank
    -last played
    - games played
    - fav map
    
    Returns a dictionary containing parsed info and keys:
    winrate, cap_ratio, elo (ordered-tuple as value), b-elo (ordered-tuple as value), rank (ordered-tuple as value), last_played, num_games, fav_map'''
    ret_dict = {}
    keys = ['winrate', 'kd_ratio', 'cap_ratio', 'elo', 'b-elo', 'rank', 'last_played', 'num_games', 'fav_map']
    for i in range(6):
        try:
            ext = extractors[i]
            to_parse = data[i]
            ret_dict[keys[i]] = ext.search(to_parse).groups()
        except:
            ret_dict[keys[i]] = '-'
    return ret_dict

def parse_stats(soup, gametypes):
    '''Parse and stores stats by gametype into a dictionary.
    
    Arguments: 
    - soup: the BS4 object
    - gametypes: a list of gametypes for extracting data. This can be acquired by scraping the 'gbtab' 
    list on a player's page on QLStats.
    
    Returns: A dictionary with keys as gametypes and values as the stats associated with them.
    The stats are extracted using RegEx.'''
    stats = {}
    # parse winrate and elo info
    for gt in gametypes:
        type_stats = soup.find('div', id = 'tab-{}'.format(gt)).text
        type_stats = type_stats.split('\n')
        type_stats = [x.strip() for x in type_stats if x.strip() != '']
        stats_as_string = " ".join(type_stats)

        res = stat_extractor.search(stats_as_string)
        #there is winrate, kill ratio, cap ratio, rating, b-rating, rank, last played games played favorite map
        stat_dict = parse_gametype_stats(res.groups())
        stats[gt] = stat_dict

    return stats





p_tab_list = extract_player_info(soup)
player_info = extract_player_name_details(p_tab_list)
games_played = extract_game_info(soup)
stats = parse_stats(soup, games_played.keys())
print(p_tab_list)
print(player_info)
print(games_played)
print(stats)

['Region: Europe', 'Player ID: 330308', 'Steam ID: 76561198062475631', 'Joined: 2021-04-28   16:22:30', 'Status: active']
{'name': 'FOG', 'Region': 'Europe', 'Player ID': '330308', 'Steam ID': '76561198062475631', 'Joined': '2021-04-28   16:22:30', 'Status': 'active'}
{'ca': 491, 'duel': 19, 'ffa': 1}
{'ca': {'winrate': ('65.78', '323', '168'), 'kd_ratio': ('1.52', '6936', '4554'), 'cap_ratio': ('-',), 'elo': ('1408', '37', '3326'), 'b-elo': ('1211', '48', '116'), 'rank': ('257', '1515')}, 'duel': {'winrate': ('63.16', '12', '7'), 'kd_ratio': ('1.17', '261', '223'), 'cap_ratio': ('-',), 'elo': ('1505', '134', '513'), 'b-elo': '-', 'rank': ('134', '482')}, 'ffa': {'winrate': ('0.0', '0', '1'), 'kd_ratio': ('2.27', '25', '11'), 'cap_ratio': ('-',), 'elo': ('1198', '147', '3'), 'b-elo': ('1275', '84', '10'), 'rank': '-'}}


Overall tab just has the latest/maximal values from the other tabs.

In [78]:
soup.findAll('table', class_= 'table table-hover table-condensed')

[<table class="table table-hover table-condensed">
 <thead>
 <tr>
 <th>#</th>
 <th></th>
 <th>Played</th>
 <th>Type</th>
 <th>Server</th>
 <th>Map</th>
 <th>Result</th>
 <th>Opponent</th>
 <th>Rating</th>
 <th title="Rating ± Uncertainty">Old Glicko</th>
 <th title="Rating / Uncertainty">Glicko Change</th>
 </tr>
 </thead>
 <tbody>
 </tbody>
 </table>]

## Scrape match data

In [81]:
#click on 'more' button
try:
    btn = driver.find_element(By.LINK_TEXT, 'More...')
    btn.click()

    btn = driver.find_element(By.CSS_SELECTOR, '[alt="overall"]')
    btn.click()
except:
    print('Could not click the button')

cheh


In [84]:
pd.read_html(driver.current_url, attrs = {'class': 'table table-hover table-condensed'})

[   Unnamed: 0              Played Type  \
 0        view  about 10 hours ago   ca   
 1        view  about 10 hours ago   ca   
 2        view  about 11 hours ago   ca   
 3        view  about 11 hours ago   ca   
 4        view  about 11 hours ago   ca   
 5        view  about 11 hours ago   ca   
 6        view  about 11 hours ago   ca   
 7        view           1 day ago   ca   
 8        view           1 day ago   ca   
 9        view          3 days ago   ca   
 10       view          3 days ago   ca   
 11       view          3 days ago   ca   
 12       view          4 days ago   ca   
 13       view          5 days ago   ca   
 14       view          5 days ago   ca   
 15       view          5 days ago   ca   
 16       view          5 days ago   ca   
 17       view          5 days ago   ca   
 18       view          5 days ago   ca   
 19       view          5 days ago   ca   
 
                                                Server             Map  \
 0   (India+Singapore

### Shut down the browser

In [67]:
# driver.close()