In [1]:
from utility import scraper
from bs4 import BeautifulSoup as soup
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np

import re
import string

from utility.s3_file_transfer import upload_df_to_s3

In [2]:
# Directory for intermediate CSVs: the cleaned tier list is written here and
# later read back together with the wiki_* CSVs.
preprocessed_data_dir = './preprocessed_data'
# Directory for the final, typed table (champion_stats.csv).
processed_data_dir = './processed_data'

## 1. Scrape Tier List Table (UGG)
- Updated daily to weekly

### 1.1 Extract Raw Data

In [3]:
# Exceptions that mean "the expected element/attribute is not in this row":
# AttributeError/TypeError come from chaining on a ``None`` returned by
# ``find``, KeyError from a missing tag attribute, IndexError from an href
# with fewer path segments than expected. Anything else is a real error and
# must not be silently converted into NaN.
_MISSING_ELEMENT_ERRORS = (AttributeError, TypeError, KeyError, IndexError)


def _extract_stat_text(row, column_class):
    """Return the <span> text of the cell whose class contains ``rt-td <column_class>``, or NaN."""
    pattern = re.compile(f'.*rt-td {column_class}.*')
    try:
        return row.find('div',{'class':pattern}).find('span').text
    except _MISSING_ELEMENT_ERRORS:
        return float('nan')


def _extract_counters(row):
    """Return the comma-joined, capitalized champion slugs from the 'against' links, or NaN."""
    try:
        against_divs = row.find('div',{'class':'against-container'}).findAll('div',{'class':'against'})
        # the champion slug is the 4th '/'-separated piece of the link's href
        return ','.join(
            against.find('a')['href'].split('/')[3].capitalize()
            for against in against_divs
        )
    except _MISSING_ELEMENT_ERRORS:
        return float('nan')


def extract_from_tier_list_row(row,divison):
    """Extract one champion's tier-list statistics from a table row.

    Parameters
    ----------
    row : tag-like object
        Must expose ``find`` (and, for the counters container, ``findAll``)
        in the way BeautifulSoup tags do.
    divison : str
        Division name; stored capitalized under the ``'divison'`` key
        (the spelling is kept because later cells rename that column).

    Returns
    -------
    dict
        Keys ``divison, lane, champion, tier, win_rate, pick_rate, ban_rate,
        counters, matches``. The statistics are returned as the raw cell text;
        any statistic whose element is missing is ``float('nan')``.

    Raises
    ------
    TypeError / AttributeError
        If the lane image or champion name element is missing; these two
        fields are deliberately not defaulted.
    """
    divison = divison.capitalize()
    lane = row.find('img',{'class':'tier-list-role'})['alt'].capitalize()
    champion = row.find('strong',{'class':'champion-name'}).text

    # evaluated in the same order as the columns appear in the result
    tier = _extract_stat_text(row,'tier')
    win_rate = _extract_stat_text(row,'winrate')
    pick_rate = _extract_stat_text(row,'pickrate')
    ban_rate = _extract_stat_text(row,'banrate')
    counters = _extract_counters(row)
    matches = _extract_stat_text(row,'matches')

    return {'divison':divison,'lane':lane,'champion':champion,
            'tier':tier,'win_rate':win_rate,'pick_rate':pick_rate,
            'ban_rate':ban_rate,'counters':counters,'matches':matches}


def generate_divison_tier_table(champ_tier_list_soup,divison):
    """Build one division's tier-list DataFrame from a parsed tier-list page.

    Parameters
    ----------
    champ_tier_list_soup : tag-like object
        Parsed page; must expose ``find`` the way BeautifulSoup does.
    divison : str
        Division name forwarded to ``extract_from_tier_list_row``.

    Returns
    -------
    pandas.DataFrame
        One row per ``rt-tr-group`` element, ordered by win rate from highest
        to lowest (rows without a win rate last). Empty if the page has no rows.

    Raises
    ------
    ValueError
        If the ``tier-list-page`` container is not present in the page.
    """
    tier_list_table = champ_tier_list_soup.find('div',{'class':'tier-list-page'})
    if tier_list_table is None:
        raise ValueError(
            f"'tier-list-page' container not found for division {divison!r}; "
            "the page may not have loaded"
        )
    champ_rows = tier_list_table.findAll('div',{'class':'rt-tr-group'})

    records = [extract_from_tier_list_row(row,divison) for row in tqdm(champ_rows)]
    df_division = pd.DataFrame(records)
    if df_division.empty:
        # nothing to sort; sorting would fail because 'win_rate' does not exist
        return df_division

    # win_rate is still raw text such as '55.4%'. Sorting that text directly is
    # lexicographic ('9.8%' would outrank '55.4%'), so order by its numeric value.
    win_rate_numeric = pd.to_numeric(
        df_division['win_rate'].astype(str).str.rstrip('%'),
        errors='coerce',
    )
    ordered_index = win_rate_numeric.sort_values(ascending=False).index
    return df_division.loc[ordered_index]

def scrape_ugg_tier_list(divisions=None):
    """Scrape the u.gg tier list for each division and stack the results.

    Parameters
    ----------
    divisions : iterable of str, optional
        Values for the ``rank`` query parameter. Defaults to all nine
        divisions from iron to challenger.

    Returns
    -------
    pandas.DataFrame
        The per-division tables concatenated with a fresh integer index;
        an empty DataFrame when ``divisions`` is empty.
    """
    if divisions is None:
        divisions = ['iron','bronze','silver','gold','platinum','diamond','master','grandmaster','challenger']

    # Collect the per-division frames and concatenate once at the end;
    # concatenating inside the loop re-copies every earlier row each time.
    division_frames = []
    for division in tqdm(divisions):
        url = f'https://u.gg/lol/tier-list?rank={division}'
        champ_tier_list_soup = scraper.get_page_selenium(url)
        division_frames.append(generate_divison_tier_table(champ_tier_list_soup,divison=division))

    if not division_frames:
        return pd.DataFrame()
    return pd.concat(division_frames,ignore_index=True)

In [4]:
# Run the full scrape (one page fetch per division) and keep the combined raw table.
df_tier_list = scrape_ugg_tier_list()

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/223 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

  0%|          | 0/221 [00:00<?, ?it/s]

  0%|          | 0/216 [00:00<?, ?it/s]

  0%|          | 0/213 [00:00<?, ?it/s]

  0%|          | 0/207 [00:00<?, ?it/s]

  0%|          | 0/214 [00:00<?, ?it/s]

  0%|          | 0/216 [00:00<?, ?it/s]

  0%|          | 0/201 [00:00<?, ?it/s]

In [5]:
# Snapshot the untouched scrape result so preprocessing can be redone without re-scraping.
# NOTE(review): assumes the ./temp directory already exists — confirm or create it beforehand.
df_tier_list.to_csv('./temp/ugg_tier_list_raw.csv',index=False)

### 1.2 Preprocess
- Correct names in `counters`

In [6]:
def adjust_name(name):
    """Normalize a champion name: drop ASCII punctuation and spaces, then
    lowercase everything and capitalize the first character.

    Example: "Vel'Koz" -> "Velkoz".
    """
    dropped_chars = set(string.punctuation)
    dropped_chars.add(' ')
    kept_chars = [char for char in name if char not in dropped_chars]
    return ''.join(kept_chars).lower().capitalize()

def correct_counters(counters,correction_champ_names,name_correction_dict):
    """Replace normalized champion names in a comma-separated counters string.

    Parameters
    ----------
    counters : str or any
        Comma-separated champion names. Non-string values (e.g. the NaN stored
        when counters could not be scraped) are returned unchanged.
    correction_champ_names : iterable of str
        The normalized names that should be corrected.
    name_correction_dict : dict
        Maps each normalized name to its proper display name.

    Returns
    -------
    str or the original non-string value
        The counters string with every exactly-matching name replaced.
    """
    if not isinstance(counters,str):
        # missing counters: nothing to correct, and `in` / `replace` would fail
        return counters

    correctable = set(correction_champ_names)
    # Replace whole names only, so a correctable name that happens to be a
    # substring of a different champion's name is left intact.
    return ','.join(
        name_correction_dict[champ] if champ in correctable else champ
        for champ in counters.split(',')
    )

In [7]:
# Display names containing a space or an apostrophe are the ones whose
# normalized form (see adjust_name) differs from how they should be shown.
champ_names = pd.Series(df_tier_list['champion'].unique())
has_space_or_apostrophe = champ_names.apply(
    lambda champ: any(mark in champ for mark in (' ', "'"))
)
corrected_champ_names = champ_names[has_space_or_apostrophe]
correction_champ_names = corrected_champ_names.apply(adjust_name)

# normalized name -> proper display name
name_correction_dict = {
    normalized: display
    for normalized, display in zip(correction_champ_names,corrected_champ_names)
}

In [8]:
# Rewrite the 'counters' column in place, swapping normalized champion names for their display names.
df_tier_list['counters'] = df_tier_list['counters'].apply(lambda x: correct_counters(x,correction_champ_names,name_correction_dict))

In [9]:
# Persist the name-corrected (but still string-typed) tier list.
df_tier_list.to_csv(f'{preprocessed_data_dir}/ugg_tier_list.csv',index=False)

### 1.3 Process and Upload

In [10]:
def convert_to_float(percent):
    """Turn a percentage string such as '55.4%' into a fraction rounded to
    four decimals (0.554). Any non-string input yields NaN.
    """
    if not isinstance(percent,str):
        return float('nan')

    number_text = percent.replace('%','')
    fraction = float(number_text) / 100
    return round(fraction,4)

In [11]:
# Reload the preprocessed tier list from disk, plus the two wiki-derived tables
# used below to cross-check the champion names.
df_tier_list = pd.read_csv(f'{preprocessed_data_dir}/ugg_tier_list.csv')
df_champ_roles = pd.read_csv(f'{preprocessed_data_dir}/wiki_champ_roles.csv')
df_champ_details = pd.read_csv(f'{preprocessed_data_dir}/wiki_champion_details.csv')

In [12]:
# Distinct champion names per source, for the consistency check in the next cell.
tier_list_set = set(df_tier_list['champion'].unique())
champ_roles_set = set(df_champ_roles['champion'].unique())
champ_details_set = set(df_champ_details['champion'].unique())

In [13]:
# All three sources must list exactly the same champions. Fail loudly with the
# differences instead of silently skipping the conversion, which would leave
# df_champ_stats undefined (or stale) for the cells below.
if not (tier_list_set == champ_roles_set == champ_details_set):
    raise ValueError(
        'Champion names are inconsistent across sources. '
        f'tier list vs roles: {sorted(map(str, tier_list_set ^ champ_roles_set))}; '
        f'roles vs details: {sorted(map(str, champ_roles_set ^ champ_details_set))}'
    )

# map tiers into int dtype (lower number = stronger tier)
tier_mapping = {'S+':0,'S':1,'A':2,'B':3,'C':4,'D':5}

# turn matches into an integer dtype; tolerate values that are missing or were
# already parsed as numbers instead of assuming every cell is a '4,834'-style string
matches_numeric = pd.to_numeric(
    df_tier_list['matches'].astype(str).str.replace(',','',regex=False),
    errors='coerce',
)
if matches_numeric.notna().all():
    matches_clean = matches_numeric.astype('int64')
else:
    # nullable integer dtype keeps missing counts as <NA> rather than failing
    matches_clean = matches_numeric.astype('Int64')

# Build a new frame rather than mutating df_tier_list, so re-running this cell
# always starts from the string-typed data that was read from disk.
df_champ_stats = (
    df_tier_list
    .assign(
        matches=matches_clean,
        # convert rates to float dtype
        win_rate=df_tier_list['win_rate'].apply(convert_to_float),
        pick_rate=df_tier_list['pick_rate'].apply(convert_to_float),
        ban_rate=df_tier_list['ban_rate'].apply(convert_to_float),
        tier=df_tier_list['tier'].map(tier_mapping),
    )
    .rename(columns={'divison':'rank'})
)

In [14]:
# Preview the first rows of the typed table to check the conversions.
df_champ_stats.head()

Unnamed: 0,rank,lane,champion,tier,win_rate,pick_rate,ban_rate,counters,matches
0,Iron,Supp,Swain,0,0.554,0.046,0.071,"Taric,Brand,Vel'Koz,Sett,Zilean,Alistar,Veigar",4834
1,Iron,Jungle,Dr. Mundo,1,0.5497,0.05,0.043,"Shyvana,Sejuani,Nidalee,Elise,Master Yi,Udyr,A...",5263
2,Iron,Mid,Xin Zhao,2,0.5482,0.009,0.027,"Vel'Koz,Anivia,Morgana,Pantheon,Azir,Viktor,Tr...",934
3,Iron,Supp,Brand,0,0.5444,0.075,0.168,"Alistar,Sona,Pyke,Sett,Malphite,Blitzcrank,Galio",7928
4,Iron,Mid,Diana,0,0.5418,0.06,0.076,"Viego,Xin Zhao,Galio,Brand,Fizz,Pyke,Morgana",6382


In [15]:
# save to local
df_champ_stats.to_csv(f'{processed_data_dir}/champion_stats.csv',index=False)
# upload to s3
upload_df_to_s3(df_champ_stats,'peter-ff15-data/champion_stats.csv')