# Scrape a particular match's information from a particular URL

In [1]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
import re

import pandas as pd

In [2]:
options = webdriver.EdgeOptions()
# options.add_argument('headless')
options.add_argument('inprivate')
driver = webdriver.Edge(options= options)

In [74]:
# https://qlstats.net/game/8489604  # game with multiple particpants of the same name
driver.get('https://qlstats.net/game/8476678')      # regular CA game

driver.get('https://qlstats.net/game/8489604')      # duplicate participants

#driver.get('https://qlstats.net/game/330187')       # duel

## Cookie screen

In [4]:
def is_cookie_screen(driver : selenium.webdriver):
    '''A function to check if the given webpage is the 'accept cookies' screen.
    Regex matches the body of the '''
    element = driver.find_element(By.TAG_NAME, value = 'body')
    cookie_screen = re.compile(r'.*(To continue using qlstats, you need to agree to the use of cookies.\nAgree).*')
    if cookie_screen.search(element.text) is None:
        return False
    return True


In [5]:
#press the button
if is_cookie_screen(driver):
    try:
        button = driver.find_element(By.TAG_NAME, 'button')
        button.click()
    except:
        "Cookies could not be accepted, please recheck"

## Scrape

For match data, the winning team's table occurs first. Therefore, while pandas' read_html works pretty well, it cannot determine which is the winning/losing team.

We need to parse the information using the divs surrounding the table.

The winning team's information isn't shown in game modes like duel. The accuracy and match data table isn't shown in case of old duels.

### RegEx

In [75]:
match_details_finder = re.compile(r"Played: (.*) Game Type: (.*) Server: (.*) Map: (.*) Duration: (.*) Rating Status: (.*)")

### Soup

In [76]:
soup = BeautifulSoup(driver.page_source, 'lxml')

In [77]:
match_text = " ".join(soup.find('p').text.split())
#match_text = [x.strip() for x in match_text if x.strip() != '']
match_text

"Played: 2023-01-09 20:54:10 Game Type: Clan Arena (ca) Server: (India+Singapore+UAE) Topgun's CA Server w/crouchslide Map: thunderstruck Duration: 6:15 Rating Status: Not rated (not enough qualifying players)"

In [78]:
# time of the match, gametype, server, map, duration, if rated
match_details = match_details_finder.match(match_text).groups()
match_details

('2023-01-09 20:54:10',
 'Clan Arena (ca)',
 "(India+Singapore+UAE) Topgun's CA Server w/crouchslide",
 'thunderstruck',
 '6:15',
 'Not rated (not enough qualifying players)')

In [79]:
# in case it's a team-based game...
winner = soup.find('div', {'class' : 'teamname'}).text
winner = [x.strip() for x in winner.split('\n') if x.strip() != '']
winner = winner[0]
winner

'Red'

### Winning team

In [80]:
winner_team = pd.read_html(driver.current_url)[0]
winner_team

Unnamed: 0,Nick,Time,Rounds,Kills,Deaths,Damage Dealt,Damage Taken,Score,Perf,Old Glicko,Glicko Change
0,HYDRAZINE,0:06:15,16,9,7,4752,4292,55,,,
1,(2) Theatricality,0:03:19,9,11,2,2818,2306,35,,,
2,Theatricality,0:02:56,7,2,5,1322,1757,14,,,


### Losing team

In [81]:
loser_team = pd.read_html(driver.current_url)[1]
loser_team

Unnamed: 0,Nick,Time,Rounds,Kills,Deaths,Damage Dealt,Damage Taken,Score,Perf,Old Glicko,Glicko Change
0,A S U R A,0:06:15,16,5,12,3573,4271,40,,,
1,(2) Theatricality,0:02:56,7,6,3,2463,1634,30,,,
2,Theatricality,0:03:19,9,3,7,2501,2521,26,,,


### Accuracy and weapons stats

You want to extract accuracy, kills, hits and shots.

The overall tab is just a bad summary of everything, might as well ignore it.

In [82]:
to_extract = ['acc', 'kills', 'hits', 'shots']

In [91]:
btn = driver.find_element(By.CSS_SELECTOR, 'h3[data-info="{}"]'.format('all'))
btn.click()

In [92]:
player_info = {}

acc_table = soup.find('table', attrs = {'id':'accuracyTable'})
for rows in acc_table.find_all('tr'):
    for data in rows.find_all('td'):
        link = data.find('a', href = True)
        if(link != None):
            href = link['href']
            player_name = link.text
            print(href, player_name)
            player_info[player_name] = href
            continue
        print(data.text)
        #print(data)
        print("CHEE")

/player/312322 HYDRAZINE
Kills / AccHits / Shots
CHEE
1 / 31%31 / 100
CHEE
0 / 78%14 / 18
CHEE
4 / 42%605 / 1446
CHEE
4 / 31%4 / 13
CHEE
/player/63562 A S U R A
Kills / AccHits / Shots
CHEE
0 / 22%13 / 60
CHEE
1 / 62%43 / 69
CHEE
4 / 32%161 / 509
CHEE
0 / 50%7 / 14
CHEE
/player/284886 (2) Theatricality
Kills / AccHits / Shots
CHEE
8 / 48%190 / 400
CHEE
0 / 71%40 / 56
CHEE
12 / 43%882 / 2062
CHEE
14 / 44%22 / 50
CHEE
/player/149897 Theatricality
Kills / AccHits / Shots
CHEE
0 / 40%16 / 40
CHEE
0 / 67%58 / 86
CHEE
2 / 36%256 / 706
CHEE
8 / 40%32 / 80
CHEE


In [85]:
#find weapons and column names

columns = ['PlayerName']
for header in soup.find('table', attrs = {'id':'accuracyTable'}).find_all('th'):
    print('*'+header.text+'*')
    if(header.text != ''): columns.append(header.text.strip())
columns

**
* SG*
* RL*
* LG*
* RG*


['PlayerName', 'SG', 'RL', 'LG', 'RG']

In [86]:
pd.read_html(driver.current_url, attrs = {'id':'accuracyTable'}, flavor = 'lxml')

[          Unnamed: 0  Unnamed: 1  SG  RL  LG  RG
 0          HYDRAZINE         NaN NaN NaN NaN NaN
 1          A S U R A         NaN NaN NaN NaN NaN
 2  (2) Theatricality         NaN NaN NaN NaN NaN
 3      Theatricality         NaN NaN NaN NaN NaN]

In [87]:
pd.read_html('https://qlstats.net/game/7830735')

[           Nick     Time  Kills  Deaths  Damage Dealt  Damage Taken  Score  \
 0  =*=Topgun=*=  0:10:37      2       1          2516          2618      2   
 1         Kursi  0:10:37      1       2          2518          2513      1   
 
    Perf  Old Glicko Glicko Change  
 0     1  1469 ± 318     +26 / -22  
 1     0    945 ± 97       -4 / -1  ,
      Unnamed: 0  Unnamed: 1  MG  SG  GL  RL  LG  RG  PG
 0  =*=Topgun=*=         NaN NaN NaN NaN NaN NaN NaN NaN
 1         Kursi         NaN NaN NaN NaN NaN NaN NaN NaN]