In [28]:
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup
import country_converter as coco

In [29]:
def send_request(url, timeout=30):
    """Send a GET request to ``url`` with a browser-like User-Agent header.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Defaults to 30 seconds so that a
        stalled connection cannot block the scraping loop indefinitely.

    Returns
    -------
    requests.Response
        The response exactly as returned by ``requests.get``. The status code
        is not checked here; callers inspect ``status_code`` themselves.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within ``timeout``.
    """
    # Send a browser-style User-Agent instead of the default one used by requests.
    headers = {'User-Agent': 'Mozilla/5.0'}
    return requests.get(url, headers=headers, timeout=timeout)

In [41]:
def get_lol_team_id(season, min_games=40):
    """Return the team-season IDs of teams that played more than ``min_games`` games.

    The IDs are scraped from the gol.gg team list of season ``S{season}``,
    requested with ``split-ALL`` and ``tournament-ALL``.

    Parameters
    ----------
    season : str or int
        Season number, interpolated into the URL as ``season-S{season}``.
    min_games : int, optional
        Threshold on the number of games. The comparison is strict: a team
        needs *more than* ``min_games`` games to be kept. Defaults to 40.

    Returns
    -------
    list of str
        IDs taken from the ``/team-stats/<ID>/split-ALL`` links on the page.
        Empty if the page could not be fetched (non-200 response).
    """
    url = f'https://gol.gg/teams/list/season-S{season}/split-ALL/tournament-ALL/'

    page = send_request(url)
    if page.status_code != 200:
        # Nothing useful can be parsed from an error page; report it and yield no teams.
        print(f"Team list for season S{season} returned HTTP {page.status_code}")
        return []

    soup = BeautifulSoup(page.content, "html.parser")

    # Every anchor whose markup contains a team-stats link contributes one ID;
    # anchors without such a link produce an empty string and are dropped.
    id_pattern = re.compile(r"/team-stats/(.*?)\/split-ALL")
    candidates = [', '.join(id_pattern.findall(str(anchor))) for anchor in soup.find_all('a')]
    team_ids = [candidate for candidate in candidates if candidate != ""]

    # Take every 29th "text-center" cell starting from the third one.
    # NOTE(review): this presumably relies on each team row having 29 such cells
    # with the games count in the third - verify against the current page layout.
    cells = [str(cell) for cell in soup.find_all('td', class_="text-center")]
    games_pattern = re.compile(r">(.*?)\<")
    games_played = [int(', '.join(games_pattern.findall(cell))) for cell in cells[2::29]]

    # zip() pairs IDs and game counts by position and stops at the shorter list.
    return [team_id for team_id, n_games in zip(team_ids, games_played) if n_games > min_games]

In [42]:
def _number_repeated_names(names):
    """Return ``names`` with the 2nd, 3rd, ... occurrence of a name suffixed ``_2``, ``_3``, ..."""
    seen = {}
    labelled = []
    for name in names:
        seen[name] = seen.get(name, 0) + 1
        labelled.append(name if seen[name] == 1 else f"{name}_{seen[name]}")
    return labelled


def get_lol_team_info(team_season_id):
    """Scrape one gol.gg team-stats page into a single-column dataframe.

    The five players with the most games on the page are kept. The returned
    frame has one column, named after the team, and these rows:

    1. ``Season``, ``Win rate``, ``Games`` for the team
    2. ``Player i`` - player name (a repeated name gets a ``_2``, ``_3``, ... suffix)
    3. ``Player i role``
    4. ``Player i KDA``, ``Player i VSPM``, ``Player i games``
    5. ``Player i country``

    Parameters
    ----------
    team_season_id : str
        Team-season ID, interpolated into the team-stats URL.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` if the page does not answer with HTTP 200, or if the scraped
        player columns do not all have the same length.
    """
    url = f'https://gol.gg/teams/team-stats/{team_season_id}/split-ALL/tournament-ALL/'

    page = send_request(url)

    if page.status_code != 200:
        print("Whoopsie")
        return None

    soup = BeautifulSoup(page.content, "html.parser")

    # Non-empty text lines of the team summary blocks, in page order.
    lines = []
    for block in soup.find_all('div', class_="col-12 col-sm-6 rowbreak pb-4"):
        block_lines = block.get_text("\n", strip=True).split("\n")
        lines.extend(line for line in block_lines if line.strip())

    # Link texts of every anchor pointing at a player-stats page.
    player_names = []
    for link in soup.select('a[href*="player-stats"]'):
        link_lines = link.get_text("\n", strip=True).split("\n")
        player_names.extend(line for line in link_lines if line.strip())

    # The role is read from the markup of each role icon.
    role_pattern = re.compile(r'\b(TOP|JUNGLE|MID|ADC|SUPPORT)\b')
    roles = [', '.join(role_pattern.findall(str(icon))) for icon in soup.select('img[src*="role"]')]

    # Keep the short (<= 6 characters) "text-center" cells, drop the first three,
    # then read the remainder in groups of three: offset 0 is used as KDA and
    # offset 2 as VSPM.
    # NOTE(review): presumably the three dropped cells belong to the team summary - verify.
    cells = re.findall(r'<td class="text-center">(.*?)</td>', str(soup))
    cells = [cell for cell in cells if len(cell) <= 6][3:]
    KDA = cells[::3]
    VSPM = cells[2::3]

    # Games per player: every third <div> inside the rows of the stats table,
    # summing all integers the pattern finds in that div's text.
    table = soup.find('table', {'class': 'table_list footable toggle-square-filled'})
    divs = [div for row in table.find_all('tr') for div in row.find_all('div')]
    games_pattern = r'(?:(?<=\n\n\s{7})|\b)(\d+)(?=\n\n|$)'
    games = [sum(map(int, re.findall(games_pattern, div.get_text()))) for div in divs[2::3]]

    # Country names come from the alt text of the flag images and are
    # normalised to short names by country_converter.
    countries = []
    for flag in soup.select('img[src*="img/pays"]'):
        countries.extend(re.findall(r'img alt="([^"]+)"', str(flag)))
    countries = coco.convert(names=countries, to='name_short')

    # Fixed positions in the summary lines; the last six characters of the
    # first line are cut off to obtain the team name.
    team_name = lines[0][:-6]
    season = lines[5]
    win_rate = float(lines[8].replace('%', '')) / 100
    # All integers found in line 7 are added up to give the team's game count.
    team_games = sum(int(num) for num in re.findall(r'\d+', lines[7]))

    try:
        player_info = pd.DataFrame({'Players': player_names, 'Roles': roles, 'KDA': KDA, 'VSPM': VSPM, 'N_games': games, 'Country': countries})
        player_info = player_info.sort_values(by='N_games', ascending=False)
        player_info = player_info.head(5).reset_index(drop=True)
    except ValueError:
        # pandas raises ValueError when the scraped columns differ in length,
        # i.e. the page did not have the expected layout; skip this team.
        return None

    player_info['Players'] = _number_repeated_names(player_info['Players'])

    # (row-name suffix, player_info column) in the order the rows are laid out.
    fields = [
        ('', 'Players'),
        (' role', 'Roles'),
        (' KDA', 'KDA'),
        (' VSPM', 'VSPM'),
        (' games', 'N_games'),
        (' country', 'Country'),
    ]
    n_players = len(player_info)

    row_names = ['Season', 'Win rate', 'Games']
    for suffix, _ in fields:
        row_names.extend(f'Player {i + 1}{suffix}' for i in range(n_players))

    df = pd.DataFrame(index=row_names, columns=[team_name])
    df.loc['Season'] = season
    df.loc['Win rate'] = win_rate
    df.loc['Games'] = team_games

    for suffix, column in fields:
        for i in range(n_players):
            df.loc[f'Player {i + 1}{suffix}'] = player_info.loc[i, column]

    return df

In [47]:
# Scraping pipeline - builds one table with a column per selected team-season
# (looked up by team-season ID) and saves it to lol_data.csv.

seasons = [str(season) for season in range(8, 14)]  # "8" ... "13"

all_IDs = []
for season in seasons:
    IDs = get_lol_team_id(season)
    all_IDs.extend(IDs)

# Collect the per-team frames and concatenate them once, instead of growing
# `data` and rewriting the CSV on every iteration.
team_frames = []
try:
    for ID in all_IDs:
        df = get_lol_team_info(ID)
        if df is not None:  # None marks a page that could not be fetched or parsed
            team_frames.append(df)
finally:
    # Runs even if a request or parsing step raises, so the teams scraped so
    # far still end up in `data` and on disk.
    data = pd.concat(team_frames, axis=1, sort=False) if team_frames else pd.DataFrame()
    data.to_csv('lol_data.csv', index=True)

In [48]:
# Dimensions of the assembled table as (rows, columns).
data.shape

(33, 727)

In [50]:
# Display the assembled table: each column is labelled with a team name and
# the rows hold that team's season, win rate, games and per-player fields.
data

Unnamed: 0,100 Thieves,Afreeca Freecs,Detonation FocusMe,Dire Wolves,FC Schalke 04,Fnatic,G2 Esports,Isurus Gaming,Jin Air Green Wings,Kaos Latin Gamers,...,ViV Esport,Vivo Keyd Stars,Vivo Keyd Stars Academy,Weibo Gaming,White Dragons,Wildcard Gaming,Wizards Club,WLGaming Esports,Zero Tenacity,ZETA
Season,S8,S8,S8,S8,S8,S8,S8,S8,S8,S8,...,S13,S13,S13,S13,S13,S13,S13,S13,S13,S13
Win rate,0.54,0.59,0.72,0.74,0.51,0.69,0.58,0.66,0.39,0.7,...,0.46,0.42,0.63,0.61,0.48,0.55,0.62,0.65,0.72,0.47
Games,69,121,61,72,55,87,86,56,97,76,...,41,45,64,118,63,84,77,68,60,45
Player 1,Aphromoo,TusiN,Evi,Triple,Vizicsacsi,Hylissang,Wunder,QQ,SoHwan,Nate,...,XERSUS,Guigo,Tyrin,TheShy,Caucha,Keel,DahVys,Bruness,Color,kaylem
Player 2,ssumday,Kiin,Steal,k1ng,Nukeduck,Broxah,Hjarnan,Kindless,Teddy,Slow,...,Joinze,Grevthar,Sting,xiaohu,Worst,Soligo,IceBreaker,Bananitoo,Ryuzaki,Nyx
Player 3,Ryu,kurO,viviD,Cupcake,Upset,Caps,Wadid,Newbie,UmTi,Tierwulf,...,Serendip,Grell,Leleko,Light,D4SH,Lens,Tsiperakos,HeSSZero,Kabbie,mihai
Player 4,Cody Sun,Spirit,Ceros,Shernfire,VandeR,Rekkles,Jankos,Emp,Grace,Fix,...,Twelve,Trigo,Telas,Crisp,Noma,Duoking1,Simpy,Vladi,SlowQ,Bolyy1
Player 5,Meteos,Kramer,Yutapon,BioPanther,Amazing,sOAZ,Perkz,Buggax,Nova,Plugo,...,Caliste,Damage,ST3PZ,WeiWei,Calmsky,Zamudo,Lindgarde,Jeyrus,Noah,J0J0C
Player 1 role,SUPPORT,SUPPORT,TOP,MID,TOP,SUPPORT,TOP,JUNGLE,TOP,TOP,...,SUPPORT,TOP,TOP,TOP,TOP,JUNGLE,JUNGLE,JUNGLE,TOP,TOP
Player 2 role,TOP,TOP,JUNGLE,ADC,MID,JUNGLE,ADC,ADC,ADC,SUPPORT,...,JUNGLE,MID,JUNGLE,MID,ADC,MID,TOP,SUPPORT,JUNGLE,JUNGLE


In [52]:
# Write the assembled table to lol_data.csv; index=True keeps the row labels as the first column.
data.to_csv('lol_data.csv', index=True)  