In [1]:
import os
import pickle
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def send_request(url, timeout=140):
    """Fetch ``url`` through the ScrapingBee API and return the HTTP response.

    Parameters
    ----------
    url : str
        Address of the page to scrape; forwarded to ScrapingBee as the
        ``url`` query parameter.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled connection would
        block the scraping run forever. The default is deliberately generous
        because proxied requests can be slow (NOTE(review): tune to the
        limits of your ScrapingBee plan).

    Returns
    -------
    requests.Response
        The raw response object (not parsed HTML); callers read ``.content``
        and parse it themselves. The status code is not checked here.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including ``Timeout`` when
        ``timeout`` expires).

    Notes
    -----
    Reads the API key from the module-level name ``your_api_key``, which must
    be defined before the first call.
    """
    return requests.get(
        url='https://app.scrapingbee.com/api/v1/',
        params={
            'api_key': your_api_key,
            'url': url,
            'block_resources': 'False',
        },
        timeout=timeout,
    )

In [3]:
def get_cs_team_id_name(year):
    """Scrape the HLTV team-stats overview for ``year`` and return team IDs and names.

    Parameters
    ----------
    year : int or str
        Calendar year; the page is requested for ``{year}-01-01`` to
        ``{year}-12-31``.

    Returns
    -------
    (list of str, list of str)
        Two parallel lists: the HLTV team IDs and the team names in the form
        used inside HLTV URLs. Both are taken from the ``/stats/teams/<id>/<name>``
        link found in each ``td.teamCol-teams-overview`` cell. Cells that do
        not contain such a link are skipped, so the lists may be empty (for
        example when the request did not return the expected page).
    """
    url = f'https://www.hltv.org/stats/teams?startDate={year}-01-01&endDate={year}-12-31'

    page = send_request(url)
    soup = BeautifulSoup(page.content, "html.parser")

    # Captures "<id>/<name>" from the team link inside a cell.
    pattern = re.compile(r"/stats/teams/(.*?)\?startDate")

    ID = []
    name = []
    for cell in soup.find_all('td', class_="teamCol-teams-overview"):
        # Only the first link of a cell is used; joining several matches
        # would corrupt the name.
        match = pattern.search(str(cell))
        if match is None:
            continue
        parts = match.group(1).split('/')
        if len(parts) < 2:
            # No "<id>/<name>" pair in the link: nothing usable in this cell.
            continue
        ID.append(parts[0])
        name.append(parts[1])

    return ID, name

In [4]:
def _tag_text_lines(tags):
    """Return the non-blank text fragments of the given BeautifulSoup tags, in document order."""
    lines = []
    for tag in tags:
        lines.extend(
            text for text in tag.get_text("\n", strip=True).split("\n") if text.strip()
        )
    return lines


def _parse_lineups(html):
    """Parse the lineups page into a dict of equal-length per-lineup lists, or None if unusable."""
    soup = BeautifulSoup(html, "html.parser")
    lines = _tag_text_lines(soup.find_all("div", class_="lineup-container"))

    # Drop the date lines (anything containing a month abbreviation followed by
    # a space) and the bare '-' / '&' separators, so that every lineup occupies
    # a fixed block of 22 lines.
    months = ('Jan ', 'Feb ', 'Mar ', 'Apr ', 'May ', 'Jun ',
              'Jul ', 'Aug ', 'Sep ', 'Oct ', 'Nov ', 'Dec ')
    lines = [
        text for text in lines
        if text not in ('-', '&') and not any(month in text for month in months)
    ]

    # Line 18 of each block is split on '/' into wins, draws and losses, in that order.
    wins, draws, losses = [], [], []
    try:
        for record in lines[18::22]:
            parts = record.split('/')
            wins.append(int(parts[0].strip()))
            draws.append(int(parts[1].strip()))
            losses.append(int(parts[2].strip()))
    except (ValueError, IndexError):
        # The page does not follow the expected layout.
        return None

    lineups = {
        "Maps": lines[16::22],
        "Wins": wins,
        "Draws": draws,
        "Losses": losses,
    }
    # Player names sit at offsets 1, 4, 7, 10 and 13 of each block.
    for slot in range(5):
        lineups[f"Player {slot + 1}"] = lines[1 + 3 * slot::22]

    # No lineup at all, or a truncated last block: treat the page as unusable.
    if not wins or len({len(values) for values in lineups.values()}) != 1:
        return None
    return lineups


def _parse_player_stats(html):
    """Parse the players page into {player name: (maps, rating, country)}, or None if unusable."""
    soup = BeautifulSoup(html, "html.parser")
    lines = _tag_text_lines(soup.find_all("tbody"))

    # Country is read from the alt text of every image found in a table cell.
    countries = [img.get('alt') for cell in soup.find_all("td") for img in cell.find_all('img')]

    # Each player row is read as 6 text lines: name first, maps second, rating last.
    # Purely numeric entries in the name position are discarded.
    names = [text for text in lines[::6] if not text.isdigit()]
    n_players = len(names)
    maps = lines[1::6][:n_players]
    ratings = lines[5::6][:n_players]

    if n_players == 0 or not (len(maps) == len(ratings) == len(countries) == n_players):
        return None
    return dict(zip(names, zip(maps, ratings, countries)))


def get_cs_team_info(team_name, team_id, year):
    """Return a dataframe describing every lineup a team fielded in a given year.

    Parameters
    ----------
    team_name : str
        Team name in the form used in HLTV URLs.
    team_id : str
        HLTV team ID.
    year : int or str
        Calendar year; pages are requested for ``{year}-01-01`` to ``{year}-12-31``.

    Returns
    -------
    pandas.DataFrame or None
        One column per lineup, named ``{team_name}_{year}_{k}`` (k starting at 1),
        with the rows:

        1. ``Maps``, ``Wins``, ``Draws``, ``Losses`` of the lineup
        2. ``Player 1`` .. ``Player 5``: player names of the lineup
        3. ``Player N maps``, ``Player N rating``, ``Player N country`` for each player

        Lineup start/end dates are not included (date lines are dropped while parsing).
        Player maps and ratings come from the team's players page for the whole
        requested period, not from a particular lineup, so a player's map count
        can exceed the sum of the maps of the lineups they appear in.
        If a lineup player is not listed on the players page, that player's
        maps/rating/country cells are left as NaN.

        ``None`` is returned when either page yields no data or does not match
        the expected layout.
    """
    lineups_url = f'https://www.hltv.org/stats/teams/lineups/{team_id}/{team_name}?startDate={year}-01-01&endDate={year}-12-31'
    players_url = f'https://www.hltv.org/stats/teams/players/{team_id}/{team_name}?startDate={year}-01-01&endDate={year}-12-31'

    lineups = _parse_lineups(send_request(lineups_url).content)
    if lineups is None:
        return None

    # Requested only once the lineups page proved usable, to avoid spending
    # an API call on a team/year that will be discarded anyway.
    players = _parse_player_stats(send_request(players_url).content)
    if players is None:
        return None

    stat_rows = (
        [f"Player {slot} maps" for slot in range(1, 6)]
        + [f"Player {slot} rating" for slot in range(1, 6)]
        + [f"Player {slot} country" for slot in range(1, 6)]
    )

    # Rows are the per-lineup fields, columns the lineups; the per-player stat
    # rows are appended empty (NaN) and filled in below.
    df = pd.DataFrame(lineups).transpose()
    df = df.reindex(df.index.tolist() + stat_rows)

    for col in df.columns:
        for slot in range(1, 6):
            stats = players.get(df.at[f"Player {slot}", col])
            if stats is None:
                continue
            maps_played, rating, country = stats
            # Label-based single-cell writes: chained indexing
            # (df[col][label] = ...) does not reliably modify df.
            df.at[f"Player {slot} maps", col] = maps_played
            df.at[f"Player {slot} rating", col] = rating
            df.at[f"Player {slot} country", col] = country

    df.columns = [f'{team_name}_{year}_{number + 1}' for number in range(df.shape[1])]

    return df


In [11]:
# Scraping pipeline - builds one dataframe with every lineup of every collected team
# for all given years and saves it to cs_data.csv.
# API usage: one request per year to list the teams, then up to two requests
# (lineups page + players page) per distinct team per year.
# The ScrapingBee API key is read from the SCRAPINGBEE_API_KEY environment variable,
# so that it never ends up stored in the notebook.
# An unused free ScrapingBee plan provides 200 API calls.
your_api_key = os.environ.get('SCRAPINGBEE_API_KEY')
if not your_api_key:
    raise RuntimeError("Set the SCRAPINGBEE_API_KEY environment variable before running the scraper.")

years = [str(year) for year in range(2018, 2023)]

# Distinct teams over all years, in order of first appearance.
all_IDs = []
all_names = []
seen_IDs = set()

for year in years:
    IDs, names = get_cs_team_id_name(year)
    for ID, name in zip(IDs, names):
        if ID not in seen_IDs:
            seen_IDs.add(ID)
            all_IDs.append(ID)
            all_names.append(name)

data = pd.DataFrame()
frames = []

for year in years:
    for ID, name in zip(all_IDs, all_names):
        df = get_cs_team_info(name, ID, year)
        if df is None:
            # Nothing usable for this team/year.
            continue
        frames.append(df)
        data = pd.concat(frames, axis=1, sort=False)
        # Checkpoint after every successful scrape so an interrupted run
        # keeps what was already paid for in API calls.
        data.to_csv('cs_data.csv', index=True)

In [60]:
# Dimensions of the combined frame as (rows, columns).
data.shape

(24, 525)

In [64]:
# Display the combined frame.
data

Unnamed: 0,grayhound_2018_1,grayhound_2018_2,grayhound_2018_3,astralis_2018_1,furia_2018_1,furia_2018_2,tainted-minds_2018_1,tainted-minds_2018_2,tainted-minds_2018_3,valiance_2018_1,...,unique_2020_2,winstrike_2020_1,winstrike_2020_2,winstrike_2020_3,syman_2020_1,syman_2020_2,heretics_2020_1,chaos_2020_1,chaos_2020_2,chaos_2020_3
Maps,60,100,61,254,54,145,33,79,68,49,...,45,22,76,73,95,56,146,49,70,27
Wins,43,77,49,200,38,105,18,50,54,35,...,22,13,40,41,52,25,83,31,49,17
Losses,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Draws,17,23,12,54,16,38,15,29,14,14,...,23,9,36,32,43,31,63,18,20,10
Player 1,dexter,dexter,dexter,Xyp9x,arT,spacca,ap0c,yam,yam,huNter-,...,fenvicious,hooch,bondik,bondik,mou,mou,kioShiMa,MarKE,steel,steel
Player 2,erkaSt,Gratisfaction,BURNRUOk,dupreeh,yuurih,arT,zewsy,zewsy,zewsy,LETN1,...,PASHANOJ,bondik,NickelBack,HObbit,neaLaN,kreaz,xms,vanity,vanity,smooya
Player 3,malta,erkaSt,Gratisfaction,gla1ve,VINI,yuurih,ofnu,ofnu,ofnu,nexa,...,R0b3n,NickelBack,KrizzeN,KrizzeN,Keoz,neaLaN,Maka,Jonji,Jonji,vanity
Player 4,sterling,malta,erkaSt,device,KSCERATO,VINI,USTILO,BURNRUOk,chuch,ottoNd,...,zorte,Lack1,Lack1,Lack1,n0rb3r7,Keoz,Lucky,Xeppaa,Xeppaa,cam
Player 5,DickStacy,DickStacy,DickStacy,Magisk,ableJ,KSCERATO,Sico,INS,INS,EspiranTo,...,Polt,El1an,El1an,El1an,kade0,n0rb3r7,Nivera,leaf,leaf,ben1337
Player 1 maps,227,227,227,258,203,156,46,153,153,291,...,69,38,178,178,161,161,200,49,168,168


In [66]:
# Write the combined frame, including its row labels, to cs_data.csv.
data.to_csv('cs_data.csv', index=True)