In [117]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

import bs4
import re

import pandas as pd
from dateutil import parser
from tqdm import tqdm
from urllib.parse import urljoin
import numpy as np

tqdm.pandas()

In [2]:
options = webdriver.EdgeOptions()
# options.add_argument('headless')
options.add_argument('inprivate')
driver = webdriver.Edge(options= options)

In [7]:
BASE_URL = 'https://qlstats.net/'

In [9]:
driver.get(BASE_URL)

### Bypass cookie screen

In [4]:
def is_cookie_screen(driver : selenium.webdriver):
    '''A function to check if the given webpage is the 'accept cookies' screen.
    Regex matches the body of the '''
    element = driver.find_element(By.TAG_NAME, value = 'body')
    cookie_screen = re.compile(r'.*(To continue using qlstats, you need to agree to the use of cookies.\nAgree).*')
    if cookie_screen.search(element.text) is None:
        return False
    return True
#press the button


if is_cookie_screen(driver):
    try:
        button = driver.find_element(By.TAG_NAME, 'button')
        button.click()
    except:
        "Cookies could not be accepted, please recheck"

## Read Matches data

In [6]:
matches = pd.read_csv("data/match_details_post_feb23.csv")
matches

Unnamed: 0,href,Time,Type,Map,Score,Rated
0,/game/8892316,2023-08-14 00:32:27,ca,eviscerated,10:9,A
1,/game/8892261,2023-08-14 00:15:36,ca,hearth,10:6,A
2,/game/8892223,2023-08-14 00:02:27,ca,asylum,9:10,A
3,/game/8892179,2023-08-14 23:44:34,ca,overek,10:5,A
4,/game/8892128,2023-08-14 23:27:45,ca,quarantine,10:7,A
...,...,...,...,...,...,...
1809,/game/8539834,2023-02-04 22:30:55,ca,quarantine,7:10,A
1810,/game/8539812,2023-02-04 22:14:59,ca,overek,10:3,A
1811,/game/8539797,2023-02-04 22:01:27,ca,campgrounds,3:10,A
1812,/game/8539779,2023-02-04 21:46:15,ca,trinity,10:8,A


In [13]:
urljoin(BASE_URL, matches['href'][0])

'https://qlstats.net/game/8892316'

In [14]:
driver.get(urljoin(BASE_URL, matches['href'][0]))

## Scraping time

In [28]:
soup.find('p')

<p style="display:inline-block">
    Played: <span class="abstime" data-epoch="1691953347" title="about 19 hours ago">00:32:27</span><br/>
    Game Type: Clan Arena (ca)<br/>
    Server: <a href="/server/5935" name="Server info page for (India+Singapore+UAE) Topgun's CA Server w/crouchslide">(India+Singapore+UAE) Topgun's CA Server w/crouchslide</a><br/>
    Map: <a href="/map/24" name="Map info page for eviscerated">eviscerated</a><br/>
    Duration: 13:33
    <br/>Rating Status:
    
    Rated
    </p>

### Regex for scraping match info

In [35]:
keys = ["time", 'gametype', 'server', 'map', 'duration', 'if_rated']

exp = re.compile(r'Played: (.*) Game Type: (.*) Server: (.*) Map: (.*) Duration: (.*) Rating Status: (.*)')
exp.findall("Played: 00:32:27 Game Type: Clan Arena (ca) Server: (India+Singapore+UAE) Topgun's CA Server w/crouchslide Map: eviscerated Duration: 13:33 Rating Status: Rated")

[('00:32:27',
  'Clan Arena (ca)',
  "(India+Singapore+UAE) Topgun's CA Server w/crouchslide",
  'eviscerated',
  '13:33',
  'Rated')]

In [29]:
soup = bs4.BeautifulSoup(driver.page_source)
" ".join([x.strip() for x in soup.find('p').text.splitlines() if x.strip() != ''])

"Played: 00:32:27 Game Type: Clan Arena (ca) Server: (India+Singapore+UAE) Topgun's CA Server w/crouchslide Map: eviscerated Duration: 13:33 Rating Status: Rated"

### Extracting IDs and associating them to players

In [59]:
soup.findAll('td', attrs = {'class':'player-nick'})

[<td class="player-nick">
 <a href="/player/171710" title="Go to the info page for this player">
 <span class="nick"><span class="ql7"><span class="ql5">geebee<span class="ql7"></span></span></span></span>
 </a>
 </td>,
 <td class="player-nick">
 <a href="/player/205146" title="Go to the info page for this player">
 <span class="nick"><span class="ql7">ibu_hatela</span></span>
 </a>
 </td>,
 <td class="player-nick">
 <a href="/player/370599" title="Go to the info page for this player">
 <span class="nick"><span class="ql7"><span class="ql1">Stealth<span class="ql7"></span></span></span></span>
 </a>
 </td>,
 <td class="player-nick">
 <a href="/player/312322" title="Go to the info page for this player">
 <span class="nick"><span class="ql7">HYDRAZINE</span></span>
 </a>
 </td>,
 <td class="player-nick">
 <a href="/player/17598" title="Go to the info page for this player">
 <span class="nick"><span class="ql7">bhoot</span></span>
 </a>
 </td>,
 <td class="player-nick">
 <a href="/player/

### Extracting tables

In [74]:
accuracy_table = soup.find("div", attrs={'id':'chartRow'})
acc_table = pd.read_html(accuracy_table.prettify())[0].drop(columns = 'Unnamed: 1').rename(columns = {"Unnamed: 0": "Player"})
acc_table

Unnamed: 0,Player,SG,GL,RL,LG,RG,PG,HMG
0,HYDRAZINE,1 / 11% 9 / 80,0 / 0% 0 / 6,1 / 42% 15 / 36,6 / 42% 688 / 1628,7 / 60% 28 / 47,,
1,geebee,4 / 14% 19 / 140,0 / 25% 1 / 4,2 / 39% 24 / 62,1 / 44% 380 / 866,9 / 48% 32 / 66,,
2,ibu_hatela,7 / 32% 184 / 580,0 / 12% 7 / 57,4 / 40% 33 / 83,3 / 39% 153 / 389,5 / 38% 15 / 40,0 / 0% 0 / 12,0 / 0% 0 / 3
3,bhoot,2 / 10% 4 / 40,0 / 11% 7 / 66,6 / 34% 55 / 162,2 / 22% 116 / 529,0 / 24% 4 / 17,0 / 5% 5 / 108,
4,CHOOCHI DESTROYER,1 / 17% 20 / 120,,2 / 40% 44 / 110,2 / 24% 138 / 570,4 / 40% 16 / 40,,
5,Stealth,,,1 / 30% 39 / 130,4 / 25% 123 / 500,1 / 11% 2 / 19,,


In [89]:
splitter = re.compile(r"(.*) / (\d+)\% (\d+) / (\d+)")
splitter.findall("1 / 11% 9 / 80")

def split_string(acc_string):
    if pd.isna(acc_string):
        return [pd.NA]*4
    return splitter.findall(acc_string)

In [101]:
['{}_{}'.format(weapon, x) for x in cats]

['SG_kills', 'SG_acc', 'SG_hits', 'SG_shots']

In [118]:
acc_split_table = acc_table['Player']
weapon = 'SG'
cats = ['kills', 'acc', 'hits', 'shots']
np.vectorize(split_string)(acc_table.iloc[:, 1:])



ValueError: setting an array element with a sequence.

In [79]:
acc_table.iloc[:, 1:]

Unnamed: 0,SG,GL,RL,LG,RG,PG,HMG
0,1 / 11% 9 / 80,0 / 0% 0 / 6,1 / 42% 15 / 36,6 / 42% 688 / 1628,7 / 60% 28 / 47,,
1,4 / 14% 19 / 140,0 / 25% 1 / 4,2 / 39% 24 / 62,1 / 44% 380 / 866,9 / 48% 32 / 66,,
2,7 / 32% 184 / 580,0 / 12% 7 / 57,4 / 40% 33 / 83,3 / 39% 153 / 389,5 / 38% 15 / 40,0 / 0% 0 / 12,0 / 0% 0 / 3
3,2 / 10% 4 / 40,0 / 11% 7 / 66,6 / 34% 55 / 162,2 / 22% 116 / 529,0 / 24% 4 / 17,0 / 5% 5 / 108,
4,1 / 17% 20 / 120,,2 / 40% 44 / 110,2 / 24% 138 / 570,4 / 40% 16 / 40,,
5,,,1 / 30% 39 / 130,4 / 25% 123 / 500,1 / 11% 2 / 19,,


In [38]:
table_rows = soup.findAll('div', attrs = {'class': 'row'})
len(table_rows)

6

In [48]:
pd.read_html(table_rows[0].prettify())[2]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,SG,GL,RL,LG,RG,PG,HMG
0,HYDRAZINE,Kills / Acc Hits / Shots,1 / 11% 9 / 80,0 / 0% 0 / 6,1 / 42% 15 / 36,6 / 42% 688 / 1628,7 / 60% 28 / 47,,
1,geebee,Kills / Acc Hits / Shots,4 / 14% 19 / 140,0 / 25% 1 / 4,2 / 39% 24 / 62,1 / 44% 380 / 866,9 / 48% 32 / 66,,
2,ibu_hatela,Kills / Acc Hits / Shots,7 / 32% 184 / 580,0 / 12% 7 / 57,4 / 40% 33 / 83,3 / 39% 153 / 389,5 / 38% 15 / 40,0 / 0% 0 / 12,0 / 0% 0 / 3
3,bhoot,Kills / Acc Hits / Shots,2 / 10% 4 / 40,0 / 11% 7 / 66,6 / 34% 55 / 162,2 / 22% 116 / 529,0 / 24% 4 / 17,0 / 5% 5 / 108,
4,CHOOCHI DESTROYER,Kills / Acc Hits / Shots,1 / 17% 20 / 120,,2 / 40% 44 / 110,2 / 24% 138 / 570,4 / 40% 16 / 40,,
5,Stealth,Kills / Acc Hits / Shots,,,1 / 30% 39 / 130,4 / 25% 123 / 500,1 / 11% 2 / 19,,


In [49]:
pd.read_html(table_rows[1].prettify())

ValueError: No tables found