In [1]:
from utility import scraper
from bs4 import BeautifulSoup as soup
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np

import re
import string

# Outline
1. Scrape Tier List Table (UGG)
2. Scrape Champion Roles (Fandom Wiki)
3. Scrape Champion Strategies (Tencent)
4. Scrape Champion Details (Fandom Wiki)

## 1. Scrape Tier List Table (UGG)
- Should be updated daily

### 1.1 Extract Raw Data

In [None]:
def _tier_list_cell_text(row,column):
    """Return the <span> text of the ``rt-td <column>`` cell of a tier-list row.

    NaN is returned when the cell, or the <span> inside it, is not present.
    """
    cell_class = re.compile(f'.*rt-td {column}.*')
    try:
        return row.find('div',{'class':cell_class}).find('span').text
    except (AttributeError,TypeError):
        # find() returned None somewhere along the chain: the cell is absent
        return float('nan')


def _tier_list_counters(row):
    """Return the comma-joined counter champion names of a tier-list row, or NaN."""
    try:
        against = row.find('div',{'class':'against-container'}).findAll('div',{'class':'against'})
        # the champion name is the fourth '/'-separated piece of each counter link
        return ','.join(entry.find('a')['href'].split('/')[3].capitalize() for entry in against)
    except (AttributeError,TypeError,KeyError,IndexError):
        # container, link or href missing, or the link has an unexpected shape
        return float('nan')


def extract_from_tier_list_row(row,divison):
    """Extract one champion's statistics from a tier-list table row.

    Parameters
    ----------
    row : parsed HTML element for one table row; it must support
        ``find``/``findAll`` the way BeautifulSoup tags do.
    divison : str
        Rank division the row belongs to; stored capitalized.

    Returns
    -------
    dict
        Keys ``divison``, ``lane``, ``champion``, ``tier``, ``win_rate``,
        ``pick_rate``, ``ban_rate``, ``counters`` and ``matches``. The statistic
        values are the raw cell texts; a statistic or the counters list that
        cannot be found is NaN.

    Raises
    ------
    TypeError, AttributeError
        If the lane icon or the champion name is missing; these two fields
        are mandatory and are deliberately not defaulted.
    """
    return {'divison':divison.capitalize(),
            'lane':row.find('img',{'class':'tier-list-role'})['alt'].capitalize(),
            'champion':row.find('strong',{'class':'champion-name'}).text,
            'tier':_tier_list_cell_text(row,'tier'),
            'win_rate':_tier_list_cell_text(row,'winrate'),
            'pick_rate':_tier_list_cell_text(row,'pickrate'),
            'ban_rate':_tier_list_cell_text(row,'banrate'),
            'counters':_tier_list_counters(row),
            'matches':_tier_list_cell_text(row,'matches')}


def generate_divison_tier_table(champ_tier_list_soup,divison):
    """Build the tier-list table of one division, highest win rate first.

    Parameters
    ----------
    champ_tier_list_soup : parsed tier-list page (supports ``find``).
    divison : str
        Division name forwarded to ``extract_from_tier_list_row``.

    Returns
    -------
    pandas.DataFrame
        One row per ``rt-tr-group`` element, sorted by ``win_rate`` in
        descending order. An empty frame is returned when the page holds
        no rows.

    Raises
    ------
    ValueError
        If the page has no ``tier-list-page`` container at all.
    """
    tier_list_table = champ_tier_list_soup.find('div',{'class':'tier-list-page'})
    if tier_list_table is None:
        raise ValueError(f'tier list table not found for division {divison!r}')
    champ_rows = tier_list_table.findAll('div',{'class':'rt-tr-group'})

    records = [extract_from_tier_list_row(row,divison) for row in tqdm(champ_rows)]
    df_division = pd.DataFrame(records)
    if df_division.empty:
        # nothing was scraped: sorting would fail on the missing 'win_rate' column
        return df_division

    def _win_rate_sort_key(win_rates):
        # win_rate holds scraped text (presumably like '52.31%'); compare it as a
        # number so that e.g. '9.8%' does not rank above '52.3%'
        numeric = pd.to_numeric(win_rates.astype(str).str.strip().str.rstrip('%'),errors='coerce')
        # if nothing parses, fall back to ordering by the raw values
        return win_rates if numeric.isna().all() else numeric

    return df_division.sort_values(by='win_rate',ascending=False,key=_win_rate_sort_key)

def scrape_ugg_tier_list(divisions=None):
    """Scrape the u.gg tier list of every requested division into one table.

    Parameters
    ----------
    divisions : iterable of str, optional
        Values for the ``rank`` query parameter. Defaults to the nine
        divisions from iron to challenger.

    Returns
    -------
    pandas.DataFrame
        The per-division tables stacked in the order given, with a fresh
        0..n-1 index. Empty when ``divisions`` is empty.
    """
    if divisions is None:
        divisions = ['iron','bronze','silver','gold','platinum','diamond','master','grandmaster','challenger']

    division_tables = []
    for division in tqdm(divisions):
        # generate the table for the given division
        url = f'https://u.gg/lol/tier-list?rank={division}'
        champ_tier_list_soup = scraper.get_page_selenium(url)
        division_tables.append(generate_divison_tier_table(champ_tier_list_soup,divison=division))

    if not division_tables:
        # pd.concat refuses an empty list
        return pd.DataFrame()
    # concatenate once instead of re-copying the growing table on every iteration
    return pd.concat(division_tables,ignore_index=True)

In [None]:
# Run the full scrape; the result holds the rows of every division in one DataFrame.
df_tier_list = scrape_ugg_tier_list()

In [None]:
# Cache the raw scrape as CSV (row index not written).
# NOTE(review): assumes the ./temp directory already exists — confirm before running.
df_tier_list.to_csv('./temp/ugg_tier_list_raw.csv',index=False)

### 1.2 Preprocess
- Correct names in `counters`

In [None]:
def adjust_name(name):
    """Collapse a champion name to a single capitalized word.

    All ASCII punctuation and spaces are removed, then the result is
    lower-cased with only its first letter upper-cased
    (e.g. ``"Kog'Maw"`` becomes ``"Kogmaw"``).
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    compact = name.translate(punctuation_stripper).replace(' ','')
    return compact.lower().capitalize()

def correct_counters(counters,correction_champ_names,name_correction_dict):
    """Replace collapsed champion names inside a counters string with display names.

    Parameters
    ----------
    counters : str or NaN
        Comma-joined counter names. The scraper stores NaN when the counters
        could not be read; such non-string values are returned unchanged
        instead of raising ``TypeError``.
    correction_champ_names : iterable of str
        Collapsed names to look for.
    name_correction_dict : dict
        Maps each collapsed name to its replacement.

    Returns
    -------
    str or the original non-string value
    """
    if not isinstance(counters,str):
        return counters
    for name in correction_champ_names:
        if name in counters:
            counters = counters.replace(name,name_correction_dict[name])
    return counters

In [None]:
# Display names containing a space or an apostrophe are the ones that appear
# collapsed inside `counters`; map each collapsed form back to the display name.
champ_names = pd.Series(df_tier_list['champion'].unique())
corrected_champ_names = champ_names[champ_names.map(lambda champ: any(mark in champ for mark in (' ',"'")))]
correction_champ_names = corrected_champ_names.apply(adjust_name)

name_correction_dict = {collapsed: display for collapsed,display in zip(correction_champ_names,corrected_champ_names)}

In [None]:
# Rewrite every counters entry with the display names gathered above.
df_tier_list['counters'] = df_tier_list['counters'].apply(
    correct_counters,
    args=(correction_champ_names,name_correction_dict),
)

In [None]:
# Persist the tier list with corrected counter names (row index not written).
# NOTE(review): assumes the ./preprocessed_data directory already exists — confirm.
df_tier_list.to_csv('./preprocessed_data/ugg_tier_list.csv',index=False)

**Notes**:
- Allow multiple selection for `Divison`
    - Group by `lane` + `champion`
    - To re-compute the numerical values (e.g. `win_rate`,`pick_rate`,`ban_rate`), simply take the weighted average using `matches` as the weights
    - To re-compute the tier, first map each tier to a numerical value (e.g. D->1 and S+->6) and take the weighted average, then map back to a tier OR train a classification model and save it to predict in real time

## 2. Scrape Champion Roles (Fandom Wiki)

In [None]:
# Download and parse the wiki's champion list page.
url = 'https://leagueoflegends.fandom.com/wiki/List_of_champions'
wiki_champion_page_soup = scraper.get_page(url)
# Second <tbody> on the page — presumably the champion table; TODO confirm if the page layout changes.
champion_table = wiki_champion_page_soup.findAll('tbody')[1]
# Direct child rows only, dropping the first one — presumably the header row; TODO confirm.
champion_rows = champion_table.findAll('tr',recursive=False)[1:]

In [None]:
# Read name, primary role and secondary role from the `data-sort-value`
# attribute of the first three cells of every champion row.
champion_name_list, primary_role_list, secondary_role_list = [], [], []

for champion_row in champion_rows:
    champion_name_list.append(champion_row.find('td')['data-sort-value'])
    row_cells = champion_row.findAll('td')
    primary_role_list.append(row_cells[1]['data-sort-value'])
    secondary_role_list.append(row_cells[2]['data-sort-value'])

In [None]:
# One row per champion with its two roles.
df_roles = pd.DataFrame(dict(
    champion=champion_name_list,
    primary_role=primary_role_list,
    secondary_role=secondary_role_list,
))

In [None]:
# The roles table needs no further preprocessing, so the same frame is written
# to both the cache folder and the preprocessed-data folder (row index not written).
# NOTE(review): assumes both directories already exist — confirm.
df_roles.to_csv('./temp/wiki_champ_roles.csv',index=False)
df_roles.to_csv('./preprocessed_data/wiki_champ_roles.csv',index=False)

## 3. Scrape Champion Strategies (Tencent)
- Should be updated whenever there's a new champion

### 3.1 Scrape Champion Catalog Page
- Champion name (in Chinese)
- Champion page url

In [None]:
def scrape_champion_catalog():
    """Scrape the Tencent champion catalog page.

    Returns
    -------
    pandas.DataFrame
        Columns ``champion_chinese`` (the ``title`` attribute of each list
        entry's first link) and ``champion_url`` (the entry's ``href``
        prefixed with ``https://lol.qq.com/data/``), one row per ``<li>``
        of the ``imgtextlist`` list.
    """
    catalog_url = 'https://lol.qq.com/data/info-heros.shtml'
    catalog_page = scraper.get_page_selenium(catalog_url)
    entries = catalog_page.find('ul',{'class':'imgtextlist'}).findAll('li')

    # (title, absolute url) for every catalog entry
    pairs = [
        (entry.find('a')['title'],
         'https://lol.qq.com/data/' + entry.find('a',href=True)['href'])
        for entry in entries
    ]

    return pd.DataFrame({'champion_chinese':[title for title,_ in pairs],
                         'champion_url':[link for _,link in pairs]})

In [None]:
# Scrape the catalog: one row per champion with its Chinese title and page URL.
df_champion_strats = scrape_champion_catalog()

In [None]:
# Cache the catalog so the per-champion scrape can restart from disk (row index not written).
df_champion_strats.to_csv('./temp/tencent_champion_catalog.csv',index=False)

### 3.2 Scrape Individual Champion Page

In [None]:
# Reload the cached catalog, so this section can run without re-scraping the catalog page.
df_champion_strats = pd.read_csv('./temp/tencent_champion_catalog.csv')

In [None]:
def extract_strats(url,max_retries=3):
    """Scrape the ally and enemy tips of one Tencent champion page.

    Parameters
    ----------
    url : str
        Champion page URL.
    max_retries : int, optional
        Number of attempts before giving up (default 3).

    Returns
    -------
    dict
        ``{'play_strats': ..., 'counter_strats': ...}``. Each value is the
        text of every tip paragraph, each prefixed with a newline, joined
        into one string. Both values are NaN when every attempt failed.
    """
    for _ in range(max_retries):
        try:
            champ_page_soup = scraper.get_page_selenium(url)
            tips = champ_page_soup.find('div',{'class':re.compile('.*arttips.*')})
            ally_tips = tips.find('dl',{'id':re.compile('.*allytips.*')})
            enemy_tips = tips.find('dl',{'id':re.compile('.*enemytips.*')})

            # every tip keeps a leading '\n' (also the first one): the clean-up
            # step later in this notebook matches the literal value '\n-'
            play_strats = ''.join('\n' + tip.text.strip() for tip in ally_tips.findAll('p'))
            counter_strats = ''.join('\n' + tip.text.strip() for tip in enemy_tips.findAll('p'))
            return {'play_strats':play_strats,'counter_strats':counter_strats}
        except Exception:
            # best-effort scrape: retry on any failure, but let KeyboardInterrupt
            # and SystemExit through so a long run can still be stopped
            continue
    print('Error:'+url)
    return {'play_strats':float('nan'),'counter_strats':float('nan')}

In [None]:
# Map every champion page URL to the dict returned by extract_strats.
# The dict is filled one URL at a time, so the results gathered so far stay
# available in url_strats_dict if the loop is interrupted.
url_strats_dict = dict()
for url in tqdm(df_champion_strats['champion_url'].values):
    url_strats_dict[url] = extract_strats(url)

In [None]:
# One row per URL; cells equal to '\n-' are turned into NaN and the URL index
# becomes a regular `champion_url` column.
df_url_strats = (
    pd.DataFrame.from_dict(url_strats_dict, orient='index')
    .replace('\n-',float('nan'))
    .reset_index()
    .rename(columns={'index':'champion_url'})
)

In [None]:
# Cache the scraped strategies keyed by `champion_url` (row index not written).
df_url_strats.to_csv('./temp/tencent_url_strats.csv',index=False)

### 3.3 Preprocess Strats Table

Inner join two tables.

In [None]:
# Attach the scraped strategies to the catalog; only URLs present in both tables are kept.
df_strats =  pd.merge(df_champion_strats, df_url_strats, on="champion_url", how="inner")

In [None]:
# Cache the joined (still untranslated) table (row index not written).
df_strats.to_csv('./temp/tencent_champion_strats.csv',index=False)

Get name mapping and map names to English.

In [None]:
# Download the Chinese/English champion name table.
url = 'https://weixia.info/league-of-legends-name.html'
champ_translation_soup = scraper.get_page(url)
# Every <tr> of the first <tbody> on the page — presumably one champion per row; TODO confirm.
champion_names = champ_translation_soup.find('tbody').findAll('tr')

In [None]:
# The 4th cell of each row is used as the Chinese name and the 5th as the
# English name; both are whitespace-stripped.
champion_names_cn, champion_names_eng = [], []
for champion in champion_names:
    name_cells = champion.findAll('td')
    champion_names_cn.append(name_cells[3].text.strip())
    champion_names_eng.append(name_cells[4].text.strip())
cn_eng_name_mapping = {cn_name: eng_name for cn_name,eng_name in zip(champion_names_cn,champion_names_eng)}

In [None]:
# Use the second space-separated token of `champion_chinese` as the Chinese name.
# NOTE(review): raises IndexError for a value without a space — presumably every
# entry looks like "<title> <name>"; confirm against the scraped catalog.
df_strats['champion_name_cn'] = df_strats['champion_chinese'].apply(lambda x: x.split(' ')[1])
# Look up the English name; names absent from cn_eng_name_mapping become NaN.
df_strats['champion_name_eng'] = df_strats['champion_name_cn'].map(cn_eng_name_mapping)

In [None]:
def replace_names(txt,cn_eng_name_mapping):
    """Replace every Chinese champion name found in ``txt`` with its English name.

    Parameters
    ----------
    txt : str or any other value
        Text to rewrite. Non-string values (e.g. NaN for a failed scrape)
        are returned unchanged.
    cn_eng_name_mapping : dict
        Chinese name -> English name.

    Returns
    -------
    str or the original non-string value

    Notes
    -----
    Longer names are replaced first, so a name that contains a shorter name
    is not broken up by the shorter one. Empty keys are skipped, because
    ``str.replace('', x)`` would insert ``x`` between every character.
    """
    if not isinstance(txt,str):
        return txt
    for name in sorted(cn_eng_name_mapping,key=len,reverse=True):
        if name:
            txt = txt.replace(name,cn_eng_name_mapping[name])
    return txt

In [None]:
# Swap Chinese champion names for English ones in both strategy columns.
for strats_column in ('play_strats','counter_strats'):
    df_strats[strats_column] = df_strats[strats_column].apply(
        replace_names,
        args=(cn_eng_name_mapping,),
    )

In [None]:
from utility.translation import google_translate

# Translate both strategy columns into new `*_en` columns.
# Rows whose strategy is not text (NaN from a failed scrape or from the '\n-'
# placeholder clean-up) are passed through untouched instead of being sent to
# the translator, mirroring how replace_names treats non-string values.
for source_column in ('play_strats','counter_strats'):
    df_strats[f'{source_column}_en'] = df_strats[source_column].apply(
        lambda text: google_translate(text) if isinstance(text,str) else text
    )

In [None]:
# Cache the translated table, original and translated columns alike (row index not written).
df_strats.to_csv('./temp/tencent_champion_strats_translated.csv',index=False)

In [None]:
# Keep the name columns and the translated strategies, give them their final
# column names, and write the result to the preprocessed-data folder.
final_column_names = {'champion_name_cn':'champion_cn','champion_name_eng':'champion',
                      'play_strats_en':'play_strats','counter_strats_en':'counter_strats'}
df_champ_strats = (
    df_strats[['champion_name_cn','champion_name_eng','play_strats_en','counter_strats_en']]
    .copy(deep=True)
    .rename(columns=final_column_names)
)
df_champ_strats.to_csv('./preprocessed_data/tencent_champion_strats.csv',index=False)

**Note**:
- This table still needs manual editing
    - Impute missing English names and strats by looking up guides
    - Go over the translations to fix obvious errors

## 4. Scrape Champion Details (Fandom Wiki)

### 4.1 Scrape Champion Catalog Page
- Champion name
- Champion page url

In [None]:
# Download and parse the wiki's champion list page (the same page used for the roles in section 2).
url = 'https://leagueoflegends.fandom.com/wiki/List_of_champions'
wiki_champion_page_soup = scraper.get_page(url)
# Second <tbody> on the page — presumably the champion table; TODO confirm if the page layout changes.
champion_table = wiki_champion_page_soup.findAll('tbody')[1]
# Direct child rows only, dropping the first one — presumably the header row; TODO confirm.
champion_rows = champion_table.findAll('tr',recursive=False)[1:]

In [None]:
# From the first cell of every row take the champion name (`data-sort-value`)
# and the link to the champion's own page, made absolute with the wiki host.
champion_name_list, champion_url_list = [], []

for champion_row in champion_rows:
    champion_name_list.append(champion_row.find('td')['data-sort-value'])
    name_cell = champion_row.find('td')
    url = name_cell.find('a',href=True)['href']
    champion_url_list.append(f'https://leagueoflegends.fandom.com{url}')

In [None]:
# One row per champion with the URL of its wiki page.
df_wiki_links = pd.DataFrame(dict(
    champion=champion_name_list,
    url=champion_url_list,
))

In [None]:
# Cache the champion -> URL table so the per-champion scrape can restart from disk (row index not written).
df_wiki_links.to_csv('./temp/wiki_champ_urls.csv',index=False)

### 4.2 Scrape Individual Champion Page

In [2]:
# Reload the cached champion -> URL table, so this section can run without re-scraping the list page.
df_wiki_links = pd.read_csv('./temp/wiki_champ_urls.csv')

In [10]:
def extract_champion_details(url,max_retries=3):
    """Scrape classes and rating distribution from one wiki champion page.

    Parameters
    ----------
    url : str
        Champion page URL.
    max_retries : int, optional
        Number of attempts before giving up (default 3).

    Returns
    -------
    dict
        ``primary_class`` (str), ``secondary_class`` (str, or NaN when the
        page lists none) and ``ratings`` (dict mapping each stat-wheel
        label to its integer value; the first label is prefixed with the
        champion's adaptive type). All three are NaN when every attempt
        failed.
    """
    for _ in range(max_retries):
        try:
            champ_page_soup = scraper.get_page(url)

            # extract primary and secondary classes
            classes = champ_page_soup.find('div',{'data-source':'legacy'}).find('div').findAll('span')
            primary_class = classes[0].findAll('a')[1].text.strip()
            try:
                secondary_class = classes[1].findAll('a')[1].text.strip()
            except Exception:
                # no second class entry on the page
                secondary_class = float('nan')

            # extract adaptive type
            adaptivetype = champ_page_soup.find('div',{'data-source':'adaptivetype'}).find('div').find('span').findAll('a')[0].text.strip()

            # extract rating distributions (look the stat wheel up once)
            stat_wheel = champ_page_soup.find('div',{'class':'stat-wheel'})
            values = [int(value) for value in stat_wheel['data-values'].split(';')]
            ratings = [element.text.strip() for element in stat_wheel.findAll('th')]
            ratings[0] = f'{adaptivetype} {ratings[0]}'
            rating_dist = dict(zip(ratings,values))

            return {'primary_class':primary_class,'secondary_class':secondary_class,'ratings':rating_dist}
        except Exception:
            # best-effort scrape: retry on any failure, but let KeyboardInterrupt
            # and SystemExit through so a long run can still be stopped
            continue
    print(f'Error{url}')
    return {'primary_class':float('nan'),'secondary_class':float('nan'),'ratings':float('nan')}

In [11]:
# Map every champion page URL to the dict returned by extract_champion_details.
# The dict is filled one URL at a time, so the results gathered so far stay
# available in champion_details_dict if the loop is interrupted.
champion_details_dict = dict()
for url in tqdm(df_wiki_links['url'].values):
    champion_details_dict[url] = extract_champion_details(url)
# One row per URL; the URLs form the index of the resulting frame.
df_url_details = pd.DataFrame.from_dict(champion_details_dict,orient='index')

  0%|          | 0/154 [00:00<?, ?it/s]

making html request:https://leagueoflegends.fandom.com/wiki/Aatrox
making html request:https://leagueoflegends.fandom.com/wiki/Ahri
making html request:https://leagueoflegends.fandom.com/wiki/Akali
making html request:https://leagueoflegends.fandom.com/wiki/Alistar
making html request:https://leagueoflegends.fandom.com/wiki/Amumu
making html request:https://leagueoflegends.fandom.com/wiki/Anivia
making html request:https://leagueoflegends.fandom.com/wiki/Annie
making html request:https://leagueoflegends.fandom.com/wiki/Aphelios
making html request:https://leagueoflegends.fandom.com/wiki/Ashe
making html request:https://leagueoflegends.fandom.com/wiki/Aurelion_Sol
making html request:https://leagueoflegends.fandom.com/wiki/Azir
making html request:https://leagueoflegends.fandom.com/wiki/Bard
making html request:https://leagueoflegends.fandom.com/wiki/Blitzcrank
making html request:https://leagueoflegends.fandom.com/wiki/Brand
making html request:https://leagueoflegends.fandom.com/wiki/B

making html request:https://leagueoflegends.fandom.com/wiki/Thresh
making html request:https://leagueoflegends.fandom.com/wiki/Tristana
making html request:https://leagueoflegends.fandom.com/wiki/Trundle
making html request:https://leagueoflegends.fandom.com/wiki/Tryndamere
making html request:https://leagueoflegends.fandom.com/wiki/Twisted_Fate
making html request:https://leagueoflegends.fandom.com/wiki/Twitch
making html request:https://leagueoflegends.fandom.com/wiki/Udyr
making html request:https://leagueoflegends.fandom.com/wiki/Urgot
making html request:https://leagueoflegends.fandom.com/wiki/Varus
making html request:https://leagueoflegends.fandom.com/wiki/Vayne
making html request:https://leagueoflegends.fandom.com/wiki/Veigar
making html request:https://leagueoflegends.fandom.com/wiki/Vel%27Koz
making html request:https://leagueoflegends.fandom.com/wiki/Vi
making html request:https://leagueoflegends.fandom.com/wiki/Viego
making html request:https://leagueoflegends.fandom.com/w

In [15]:
# Cache the scraped details. At this point the page URLs are the frame's index,
# so the index must be written (as a `url` column); dropping it would leave the
# cached rows with no key to join back to the champion table.
df_url_details.to_csv('./temp/wiki_url_details.csv',index_label='url')

### 4.3 Preprocess Champion Details Table

In [17]:
# Turn the URL index into a regular `url` column so it can serve as the join key.
df_url_details = (
    df_url_details
    .reset_index()
    .rename(columns={'index':'url'})
)

In [19]:
# Attach the scraped details to the champion names; only URLs present in both tables are kept.
df_details =  pd.merge(df_wiki_links, df_url_details, on="url", how="inner")

In [23]:
# Write the final details table without the URL column (row index not written).
# NOTE(review): assumes the ./preprocessed_data directory already exists — confirm.
df_details[['champion','primary_class','secondary_class','ratings']].to_csv('./preprocessed_data/wiki_champion_details.csv',index=False)