In [1]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By

import bs4
import re

import pandas as pd
from dateutil import parser
from tqdm import tqdm

tqdm.pandas()

In [2]:
options = webdriver.EdgeOptions()
# options.add_argument('headless')
options.add_argument('inprivate')
driver = webdriver.Edge(options= options)

In [3]:
driver.get('https://qlstats.net/games?server_id=5935')

### Bypass cookie screen

In [4]:
def is_cookie_screen(driver : selenium.webdriver):
    '''A function to check if the given webpage is the 'accept cookies' screen.
    Regex matches the body of the '''
    element = driver.find_element(By.TAG_NAME, value = 'body')
    cookie_screen = re.compile(r'.*(To continue using qlstats, you need to agree to the use of cookies.\nAgree).*')
    if cookie_screen.search(element.text) is None:
        return False
    return True
#press the button


if is_cookie_screen(driver):
    try:
        button = driver.find_element(By.TAG_NAME, 'button')
        button.click()
    except:
        "Cookies could not be accepted, please recheck"

### Scrape server matches info

In [5]:
server_scraping_df = pd.DataFrame()

while(True):

    soup = bs4.BeautifulSoup(driver.page_source)
    table = soup.find('table', attrs = {'class': 'table table-hover table-condensed'})

    header = table.find('thead')
    #find columns in head
    head = []
    for col in header.findAll('th'):
        head.append(col.text)

    # find indices of '', 'type', 'map', 'score', 'rated'
    columns = ['', 'Time', 'Type', 'Loc', 'Server', 'Map', 'Score', 'Rated']
    named_columns = ['href'] + columns[1:]

    search_indices = [head.index(x) for x in columns]
    search_key_vals = dict(zip(search_indices, columns))

    # scraping content
    scraped = []
    content = table.find('tbody')
    for row in content.findAll('tr'):
        tr_info = []
        all_content = row.findAll('td')
        for idx in search_indices:
            element = all_content[idx]
        
            if search_key_vals[idx] == '':
                # view button
                tr_info.append(element.find('a').get('href'))
            
            elif search_key_vals[idx] == 'Time':
                tr_info.append(parser.parse(element.find('span', attrs = {'class':'abstime'}).text))

            elif search_key_vals[idx] in ['Type', 'Map', 'Score', 'Rated', 'Loc', 'Server']:
                # other columns
                tr_info.append(element.text.strip())

        #append to scraped
        scraped.append(tr_info)

    page_scrape_df = pd.DataFrame(scraped, columns = named_columns)
    server_scraping_df = pd.concat([server_scraping_df, page_scrape_df])

    try:
        btn = driver.find_element(by = By.NAME, value = 'Next Page')
        btn.click()
    except:
        print("Next button not found, exiting...")
        break

Next button not found, exiting...


In [15]:
server_scraping_df

Unnamed: 0,href,Time,Type,Loc,Server,Map,Score,Rated
0,/game/8893785,2023-08-14 17:40:37,ca,CN,-=Moon CA(Blue)=-,campgrounds,10:9,A
1,/game/8893784,2023-08-14 17:39:39,duel,US,"#1 Chicago, IL (CHI) - qcon_duel",hektik,2:-999,A
2,/game/8893783,2023-08-14 17:38:27,ca,DE,relax ca #2,overkill,4:10,A
3,/game/8893782,2023-08-14 17:38:15,ffa,RU,|)ark Fiber FFA | RU,bloodrun,50:30,
4,/game/8893781,2023-08-14 17:38:11,duel,AU,#7 <> Sydney VQL - Multi Gametype,bloodrun,8:4,A
...,...,...,...,...,...,...,...,...
18,/game/8872869,2023-08-04 15:09:53,ca,US,֍֍֍ BoneCrusher | CHI ֍֍֍,overkill,10:2,A
19,/game/8872869,2023-08-04 15:09:53,ca,US,֍֍֍ BoneCrusher | CHI ֍֍֍,overkill,10:2,A
20,/game/8872868,2023-08-04 15:09:03,ffa,DE,HOMER Germany Side | Frankfurt cool PING,bloodrun,55:44,A
21,/game/8872867,2023-08-04 15:08:37,ca,DE,relax ca #2,quarantine,10:3,A


In [16]:
server_scraping_df.to_csv("./data/match_details.csv", index = False)

## Testing

In [None]:
urls = [x.get('href') for x in soup.find('table').find_all('a', attrs = {'class':'btn'}, recursive = True)]
urls

['/game/8892316',
 '/game/8892261',
 '/game/8892223',
 '/game/8892179',
 '/game/8892128',
 '/game/8892086',
 '/game/8892043',
 '/game/8892011',
 '/game/8891984',
 '/game/8889851',
 '/game/8889819',
 '/game/8889786',
 '/game/8889753',
 '/game/8889728',
 '/game/8889690',
 '/game/8889662',
 '/game/8889624',
 '/game/8887674',
 '/game/8887645',
 '/game/8887609']

In [None]:
dates = [parser.parse(x.text) for x in soup.find_all('span', attrs = {'class':'abstime'})]
dates

[datetime.datetime(2023, 8, 14, 0, 32, 27),
 datetime.datetime(2023, 8, 14, 0, 15, 36),
 datetime.datetime(2023, 8, 14, 0, 2, 27),
 datetime.datetime(2023, 8, 14, 23, 44, 34),
 datetime.datetime(2023, 8, 14, 23, 27, 45),
 datetime.datetime(2023, 8, 14, 23, 10, 20),
 datetime.datetime(2023, 8, 14, 22, 49, 34),
 datetime.datetime(2023, 8, 14, 22, 31, 49),
 datetime.datetime(2023, 8, 14, 22, 19, 18),
 datetime.datetime(2023, 8, 12, 23, 53, 54),
 datetime.datetime(2023, 8, 12, 23, 39, 16),
 datetime.datetime(2023, 8, 12, 23, 20, 17),
 datetime.datetime(2023, 8, 12, 23, 2, 45),
 datetime.datetime(2023, 8, 12, 22, 48, 1),
 datetime.datetime(2023, 8, 12, 22, 30, 11),
 datetime.datetime(2023, 8, 12, 22, 10, 55),
 datetime.datetime(2023, 8, 12, 21, 46, 57),
 datetime.datetime(2023, 8, 11, 23, 49, 15),
 datetime.datetime(2023, 8, 11, 23, 38, 53),
 datetime.datetime(2023, 8, 11, 23, 23, 36)]

In [None]:
next_btn = soup.find('a', attrs = {"name": "Next Page"}).get('href')
next_btn

'/games?server_id=5935&start_game_id=8887608'