# CS:GO DEMOS DATA SCRAPING

In [6]:
from bs4 import BeautifulSoup
from selenium import webdriver  
from fake_useragent import UserAgent
import requests
import time

# Base HLTV url
base_url = 'https://www.hltv.org'

### Setting up functions for parsing the response data from HLTV

In [7]:

# Returns a BeautifulSoup object of the given URL
# required reloading for updating the userAgent to avoid CloudFlare protection
def get_soup(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    ua = UserAgent()
    userAgent = ua.random
    options.add_argument(f'user-agent={userAgent}')
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    driver_soup = BeautifulSoup(driver.page_source)
    driver.quit()
    return driver_soup



# Retrieves the mathpages links from the response from the filtering process
# and creates and appends to mathpage_list
def get_matchpage_links(soup):
    matchpage_list = []                                             # Create a empty list to store the matchpage links
    match_soup = soup.find_all("div", {"class": "result-con"})      # Soup containing all matches on the page
    for match in match_soup:                                        # Looping thru all mathches found and getting the matchpage links
        div = match.find('a')
        href = div['href']
        matchpage_list.append(href)                                 # Appending links to list
    pagination = soup.find("div", {"class": "pagination-component pagination-top"})
    pagination = int(pagination.span.text.strip().split(' ')[-1])   # Checking for pagination info to see if there are more results
    if pagination > 100:                                            # Looping thru remaining pages
        for i in range(100, pagination, 100):
            offset=i
            soup = get_soup(filtering_url)
            match_soup = soup.find_all("div", {"class": "result-con"})
            for match in match_soup:
                div = match.find('a')
                href = div['href']
                matchpage_list.append(href)
    return matchpage_list


## Get the download link from pages and saves the .dem file to the selected directory
def get_download(matchpage_link, sleep_time=30):
    soup = get_soup(matchpage_link)
    match_a = soup.find("a", {"class": "stream-box"})
    try:
        end_url = match_a['data-demo-link']
        download_url = base_url + end_url
        options = webdriver.ChromeOptions()
        ua = UserAgent()
        userAgent = ua.random
        options.add_argument(f'user-agent={userAgent}')
        options.add_argument('--headless=new')
        prefs= {"download.default_directory":"C:\Programming\csgo_demo\demos_rar"}
        options.add_experimental_option("prefs",prefs)
        driver = webdriver.Chrome(options=options)
        driver.get(download_url)
        print(f'downloading match {matchpage_link}')
        print(f'sleeping for {sleep_time} seconds waiting for download')
        time.sleep(sleep_time)
        print('finished sleeping')
        driver.quit()
    except:
        print('Demo url not found or cloudflare protection')
    



### Setting up the filter settings and loading the data from the response

In [None]:


# Filter settings for matches
offset = 0
start_date = '2023-01-01'
end_date = '2023-05-30'
map_name = 'de_mirage'
min_star = 2
filtering_url = f'https://www.hltv.org/results?offset={offset}&startDate={start_date}&endDate={end_date}&stars={str(min_star)}&map={map_name}'

print(filtering_url)
soup = get_soup(filtering_url)
matchpage_list = get_matchpage_links(soup)


In [9]:
matchpage_list
len(matchpage_list)

['/matches/2364477/liquid-vs-grayhound-iem-dallas-2023',
 '/matches/2364474/mouz-vs-heroic-iem-dallas-2023',
 '/matches/2364472/g2-vs-og-iem-dallas-2023',
 '/matches/2364474/mouz-vs-heroic-iem-dallas-2023',
 '/matches/2364472/g2-vs-og-iem-dallas-2023',
 '/matches/2364437/into-the-breach-vs-big-cct-2023-online-finals-1',
 '/matches/2364579/apeks-vs-monte-esl-challenger-league-season-45-europe',
 '/matches/2364426/ninjas-in-pyjamas-vs-into-the-breach-cct-2023-online-finals-1',
 '/matches/2364360/big-vs-ninjas-in-pyjamas-esl-challenger-katowice-2023-europe-closed-qualifier',
 '/matches/2364291/vitality-vs-apeks-blasttv-paris-major-2023',
 '/matches/2364286/heroic-vs-faze-blasttv-paris-major-2023',
 '/matches/2363890/faze-vs-natus-vincere-blasttv-paris-major-2023',
 '/matches/2363885/faze-vs-bad-news-eagles-blasttv-paris-major-2023',
 '/matches/2363884/ence-vs-ninjas-in-pyjamas-blasttv-paris-major-2023',
 '/matches/2363882/furia-vs-g2-blasttv-paris-major-2023',
 '/matches/2363880/heroic-vs

# Downloading selected demos



In [10]:
for match in matchpage_list[0:20]:
    match_url = base_url + match
    print(match_url)
    get_download(match_url)


https://www.hltv.org/matches/2364477/liquid-vs-grayhound-iem-dallas-2023
downloading match https://www.hltv.org/matches/2364477/liquid-vs-grayhound-iem-dallas-2023
sleeping for 30 seconds waiting for download
finished sleeping
https://www.hltv.org/matches/2364474/mouz-vs-heroic-iem-dallas-2023
downloading match https://www.hltv.org/matches/2364474/mouz-vs-heroic-iem-dallas-2023
sleeping for 30 seconds waiting for download
finished sleeping
https://www.hltv.org/matches/2364472/g2-vs-og-iem-dallas-2023
downloading match https://www.hltv.org/matches/2364472/g2-vs-og-iem-dallas-2023
sleeping for 30 seconds waiting for download
finished sleeping
https://www.hltv.org/matches/2364474/mouz-vs-heroic-iem-dallas-2023
downloading match https://www.hltv.org/matches/2364474/mouz-vs-heroic-iem-dallas-2023
sleeping for 30 seconds waiting for download
finished sleeping
https://www.hltv.org/matches/2364472/g2-vs-og-iem-dallas-2023
Demo url not found or cloudflare protection
https://www.hltv.org/matches