In [None]:
import os
import re

import country_converter as coco
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def send_request(url, timeout=180):
    """Fetch ``url`` through the ScrapingBee API and return the raw response.

    The API key is taken from the ``SCRAPINGBEE_API_KEY`` environment variable
    so that it does not have to be written into the notebook; when the variable
    is unset the placeholder literal ``'you-api-key'`` is sent instead.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or None, optional
        Seconds ``requests`` waits for the API before giving up. ``None``
        waits indefinitely.

    Returns
    -------
    requests.Response
        The unparsed HTTP response. Callers check ``.status_code`` and parse
        ``.content`` themselves.

    Raises
    ------
    requests.exceptions.Timeout
        If the API does not answer within ``timeout`` seconds.
    """
    response = requests.get(
        url='https://app.scrapingbee.com/api/v1/',
        params={
            'api_key': os.environ.get('SCRAPINGBEE_API_KEY', 'you-api-key'),
            'url': url,
            'block_resources': 'False'
        },
        # Without a timeout a stalled connection would block the notebook forever.
        timeout=timeout,
    )
    return response

In [None]:
def get_lol_team_id(season):
    """Return the IDs (strings) of teams that played more than 50 games in ``season``.

    The team list for all splits and all tournaments of the season is fetched
    via ``send_request`` and parsed with BeautifulSoup.

    Parameters
    ----------
    season : str or int
        Season number, interpolated into the URL as ``season-S{season}``.

    Returns
    -------
    list of str
        One ID per team whose game count is strictly greater than 50, in page order.
    """
    url = f'https://gol.gg/teams/list/season-S{season}/split-ALL/tournament-ALL/'

    page = send_request(url)

    soup = BeautifulSoup(page.content, "html.parser")

    # The ID is the path segment between "/team-stats/" and "/split-ALL"
    # inside the markup of each team cell.
    id_pattern = re.compile(r"/team-stats/(.*?)\/split-ALL")
    IDs = [
        ', '.join(id_pattern.findall(str(cell)))
        for cell in soup.find_all('td', class_="tablesaw-cell-persist")
    ]

    # Keep every 29th "text-center" cell starting at index 2.
    # NOTE(review): presumably each table row has 29 such cells and the third
    # one holds the number of games -- confirm against the current page layout.
    centered_cells = [str(cell) for cell in soup.find_all('td', class_="text-center")]
    text_pattern = re.compile(r">(.*?)\<")
    games = [
        int(', '.join(text_pattern.findall(cell)))
        for cell in centered_cells[2::29]
    ]

    # Pair each team ID with its game count. The original zipped the undefined
    # name ``ID`` here, which raised NameError (or silently used a leaked global).
    return [team_id for team_id, n_games in zip(IDs, games) if n_games > 50]

In [None]:
def _text_lines(elements):
    """Return the non-blank text lines of the given BeautifulSoup elements, in order."""
    collected = []
    for element in elements:
        element_lines = element.get_text("\n", strip=True).split("\n")
        collected.extend(line for line in element_lines if line.strip())
    return collected


def get_lol_team_info(team_season_id):
    """Return a one-column dataframe describing a team in a given season.

    The team page is fetched via ``send_request``. The five players with the
    most games are kept, and the result holds:

    1. Season, win rate and games played by the team
    2. Player names
    3. Player role, KDA, VSPM and games played
    4. Player country of origin (converted with ``coco.convert`` to ``name_short``)

    Parameters
    ----------
    team_season_id : str
        Team-season ID, interpolated into the team-stats URL.

    Returns
    -------
    pandas.DataFrame or None
        A dataframe whose single column is named after the team and whose index
        is ``'Season'``, ``'Win rate'``, ``'Games'`` followed by
        ``'Player k'``, ``'Player k role'``, ``'Player k KDA'``,
        ``'Player k VSPM'``, ``'Player k games'`` and ``'Player k country'``
        rows. ``None`` when the response status code is not 200.
    """
    url = f'https://gol.gg/teams/team-stats/{team_season_id}/split-ALL/tournament-ALL/'

    page = send_request(url)

    if page.status_code != 200:
        return None

    soup = BeautifulSoup(page.content, "html.parser")

    # Text lines of the team summary panels; indexed by position further down.
    lines = _text_lines(soup.find_all('div', class_="col-12 col-sm-6 rowbreak pb-4"))

    player_names = _text_lines(soup.select('a[href*="player-stats"]'))

    # The role keyword is read out of the markup of each role icon.
    role_pattern = re.compile(r'\b(TOP|JUNGLE|MID|ADC|SUPPORT)\b')
    roles = [
        ', '.join(role_pattern.findall(str(img)))
        for img in soup.select('img[src*="role"]')
    ]

    # NOTE(review): the stride of 5 assumes five stat cells per player, with KDA
    # first and VSPM third -- confirm against the current page layout.
    player_stats = _text_lines(soup.find_all('td', class_="text-center footable-visible"))
    KDA = player_stats[::5]
    VSPM = player_stats[2::5]

    # A cell may contain several numbers; they are summed into one count per player.
    games = []
    for td in soup.find_all('td', class_="footable-visible footable-last-column"):
        counts = re.findall(r'(?:(?<=\n\n\s{7})|\b)(\d+)(?=\n\n|$)', td.get_text())
        games.append(sum(map(int, counts)))

    # Country names come from the alt text of the flag images.
    countries = []
    for img in soup.select('img[src*="img/pays"]'):
        countries.extend(re.findall(r'img alt="([^"]+)"', str(img)))
    countries = coco.convert(names=countries, to='name_short')

    # NOTE(review): positional indices below depend on the order of the summary
    # text lines; the last six characters of the first line are cut off to
    # obtain the team name.
    team_name = lines[0][:-6]
    season = lines[5]
    win_rate = float(lines[8].replace('%', '')) / 100
    team_games = sum(int(num) for num in re.findall(r'\d+', lines[7]))

    player_info = pd.DataFrame({
        'Players': player_names,
        'Roles': roles,
        'KDA': KDA,
        'VSPM': VSPM,
        'N_games': games,
        'Country': countries,
    })
    # Keep the five players with the most games, relabelled 0..n-1.
    player_info = (
        player_info
        .sort_values(by='N_games', ascending=False)
        .head(5)
        .reset_index(drop=True)
    )

    # Repeated names get a numeric suffix: name, name_2, name_3, ...
    # (the original never advanced its counter, so a third occurrence
    # collided with the second as "name_2").
    seen = {}

    def _disambiguate(player):
        seen[player] = seen.get(player, 0) + 1
        return player if seen[player] == 1 else f"{player}_{seen[player]}"

    player_info['Players'] = [_disambiguate(player) for player in player_info['Players']]

    # (row-label suffix, player_info column) pairs, in output order.
    fields = [
        ('', 'Players'),
        (' role', 'Roles'),
        (' KDA', 'KDA'),
        (' VSPM', 'VSPM'),
        (' games', 'N_games'),
        (' country', 'Country'),
    ]
    n_players = len(player_info)

    row_names = ['Season', 'Win rate', 'Games']
    for suffix, _ in fields:
        row_names.extend(f'Player {i + 1}{suffix}' for i in range(n_players))

    df = pd.DataFrame(index=row_names, columns=[team_name])
    df.loc['Season'] = season
    df.loc['Win rate'] = win_rate
    df.loc['Games'] = team_games

    for suffix, column in fields:
        for i in range(n_players):
            df.loc[f'Player {i + 1}{suffix}'] = player_info.loc[i, column]

    return df

In [2]:
# Scraping pipeline: builds one dataframe with a column per selected team
# (identified by its team-season ID) for the given seasons and saves it to .csv.
# The number of scraping API calls is likely in the range of 150n - 200n,
# where n is the number of given seasons.
# An unused free ScrapingBee account provides 200 API calls.

seasons = [str(season) for season in range(8, 13)]

all_IDs = []
for season in seasons:
    all_IDs.extend(get_lol_team_id(season))

# Collect the per-team frames and concatenate once at the end instead of
# growing the dataframe inside the loop. Failed requests yield None and are skipped.
team_frames = []
for ID in all_IDs:
    df = get_lol_team_info(ID)
    if df is not None:
        team_frames.append(df)

data = pd.concat(team_frames, axis=1, sort=False) if team_frames else pd.DataFrame()

# NOTE(review): index=False leaves the row labels ('Season', 'Win rate',
# 'Player 1', ...) out of the file -- confirm that consumers of the CSV rely
# on row position only.
data.to_csv('lol_data.csv', index=False)

In [None]:
# Display the combined dataframe assembled by the scraping pipeline.
data