In [1]:
import ast
import json
import re
import string

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from tqdm import tqdm

from utility import scraper
from utility.s3_file_transfer import upload_df_to_s3

## 1. Scrape Champion Roles (Fandom Wiki)
- Updated when there's a new champion release
- Can be updated manually

In [2]:
# Download the champion catalog page from the Fandom wiki.
url = 'https://leagueoflegends.fandom.com/wiki/List_of_champions'
wiki_champion_page_soup = scraper.get_page(url)

# Take the second <tbody> on the page, then keep its direct <tr> children
# except the first one (presumably the header row - confirm against the page).
tbody_elements = wiki_champion_page_soup.findAll('tbody')
champion_table = tbody_elements[1]
direct_rows = champion_table.findAll('tr', recursive=False)
champion_rows = direct_rows[1:]

making html request:https://leagueoflegends.fandom.com/wiki/List_of_champions


In [3]:
# One entry per champion row: its name plus a primary and optional secondary role.
champion_name_list, primary_role_list, secondary_role_list = [], [], []

for champion_row in champion_rows:
    # The name is read from the first cell, the class list from the second.
    champion_name = champion_row.find('td')['data-sort-value']
    classes = champion_row.findAll('td')[1]['data-sort-value']

    # Without a comma the raw value is the only role; otherwise the first two
    # comma-separated entries become the primary and secondary roles.
    if ',' not in classes:
        primary_role, secondary_role = classes, None
    else:
        role_parts = classes.split(',')
        primary_role, secondary_role = role_parts[0].strip(), role_parts[1].strip()

    champion_name_list.append(champion_name)
    primary_role_list.append(primary_role)
    secondary_role_list.append(secondary_role)

In [4]:
# Assemble the three scraped lists into one table, one row per champion.
role_columns = {
    'champion': champion_name_list,
    'primary_role': primary_role_list,
    'secondary_role': secondary_role_list,
}
df_roles = pd.DataFrame(role_columns)

In [5]:
# Preview the roles table, then write the same CSV to both output locations.
display(df_roles.head())
for roles_csv_path in ('./temp/wiki_champ_roles.csv',
                       './preprocessed_data/wiki_champ_roles.csv'):
    df_roles.to_csv(roles_csv_path, index=False)

Unnamed: 0,champion,primary_role,secondary_role
0,Aatrox,Juggernaut,
1,Ahri,Burst,
2,Akali,Assassin,
3,Akshan,Marksman,Assassin
4,Alistar,Vanguard,


## 2. Scrape Champion Details (Fandom Wiki)
- Updated when there's a new champion release
- Can be updated manually

### 2.1 Scrape Champion Catalog Page
- Champion name
- Champion page url

In [6]:
# Request the champion catalog page again so this section can run on its own.
url = 'https://leagueoflegends.fandom.com/wiki/List_of_champions'
wiki_champion_page_soup = scraper.get_page(url)

# The champion table is taken to be the second <tbody>; only its direct <tr>
# children are used, and the first of those is skipped.
catalog_tbodies = wiki_champion_page_soup.findAll('tbody')
champion_table = catalog_tbodies[1]
table_rows = champion_table.findAll('tr', recursive=False)
champion_rows = table_rows[1:]

making html request:https://leagueoflegends.fandom.com/wiki/List_of_champions


In [7]:
# The first cell of each row carries both the champion name (as its
# data-sort-value) and a site-relative link to the champion's page.
first_cells = [champion_row.find('td') for champion_row in champion_rows]

champion_name_list = [cell['data-sort-value'] for cell in first_cells]
champion_url_list = [
    f"https://leagueoflegends.fandom.com{cell.find('a', href=True)['href']}"
    for cell in first_cells
]

In [8]:
# Pair every champion name with the URL of its wiki page.
link_columns = {'champion': champion_name_list, 'url': champion_url_list}
df_wiki_links = pd.DataFrame(link_columns)

In [9]:
# Preview the link table and cache it as CSV (without the positional index).
display(df_wiki_links.head())
wiki_urls_csv = './temp/wiki_champ_urls.csv'
df_wiki_links.to_csv(wiki_urls_csv, index=False)

Unnamed: 0,champion,url
0,Aatrox,https://leagueoflegends.fandom.com/wiki/Aatrox...
1,Ahri,https://leagueoflegends.fandom.com/wiki/Ahri/LoL
2,Akali,https://leagueoflegends.fandom.com/wiki/Akali/LoL
3,Akshan,https://leagueoflegends.fandom.com/wiki/Akshan...
4,Alistar,https://leagueoflegends.fandom.com/wiki/Alista...


### 2.2 Scrape Individual Champion Page

In [10]:
# Reload the cached champion/URL table from disk.
wiki_urls_csv = './temp/wiki_champ_urls.csv'
df_wiki_links = pd.read_csv(wiki_urls_csv)

In [11]:
def extract_champion_details(url):
    """Scrape the classes and rating distribution from one champion wiki page.

    The page is requested and parsed up to three times; the first attempt that
    parses without error is returned.

    Parameters
    ----------
    url : str
        Address of the champion page, passed to ``scraper.get_page``.

    Returns
    -------
    dict
        Keys ``'primary_class'``, ``'secondary_class'`` and ``'ratings'``.
        ``'ratings'`` maps each stat-wheel header to an ``int``; the first
        header is prefixed with the page's adaptive type.
        ``'secondary_class'`` is NaN when it cannot be read from the page.
        When every attempt fails, ``Error<url>`` is printed and all three
        values are NaN.

    Notes
    -----
    Failures are caught with ``except Exception`` rather than a bare
    ``except`` so that ``KeyboardInterrupt`` / ``SystemExit`` still stop a long
    scraping loop instead of being silently retried.
    """
    max_attempts = 3
    for _ in range(max_attempts):
        try:
            champ_page_soup = scraper.get_page(url)

            # extract primary and secondary classes
            class_spans = champ_page_soup.find('div', {'data-source': 'legacy'}).find('div').findAll('span')
            primary_class = class_spans[0].findAll('a')[1].text.strip()
            try:
                secondary_class = class_spans[1].findAll('a')[1].text.strip()
            except Exception:
                # No readable second class on this page.
                secondary_class = float('nan')

            # extract adaptive type
            adaptivetype = champ_page_soup.find('div', {'data-source': 'adaptivetype'}).find('div').find('span').findAll('a')[0].text.strip()

            # extract rating distributions (look the stat wheel up only once)
            stat_wheel = champ_page_soup.find('div', {'class': 'stat-wheel'})
            values = [int(value) for value in stat_wheel['data-values'].split(';')]
            ratings = [header.text.strip() for header in stat_wheel.findAll('th')]
            # Qualify the first header with the adaptive type read above.
            ratings[0] = f'{adaptivetype} {ratings[0]}'
            rating_dist = dict(zip(ratings, values))

            return {'primary_class': primary_class,
                    'secondary_class': secondary_class,
                    'ratings': rating_dist}
        except Exception:
            # Best effort: any request/parsing failure just triggers a retry.
            continue

    print(f'Error{url}')
    return {'primary_class': float('nan'),
            'secondary_class': float('nan'),
            'ratings': float('nan')}

In [12]:
# Scrape every champion page (with a progress bar), keyed by page URL.
champion_details_dict = {
    page_url: extract_champion_details(page_url)
    for page_url in tqdm(df_wiki_links['url'].values)
}
# The dict keys (URLs) become the index; the detail fields become columns.
df_url_details = pd.DataFrame.from_dict(champion_details_dict, orient='index')

  0%|          | 0/159 [00:00<?, ?it/s]

making html request:https://leagueoflegends.fandom.com/wiki/Aatrox/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Ahri/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Akali/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Akshan/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Alistar/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Amumu/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Anivia/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Annie/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Aphelios/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Ashe/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Aurelion_Sol/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Azir/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Bard/LoL
making html request:https://leagueoflegends.fandom.com/wiki/Blitzcrank/LoL
maki

In [13]:
# Preview the scraped details, then cache them as CSV.
display(df_url_details.head())
# The frame was built with orient='index', so at this point the page URLs exist
# only in the index. Writing with index=False dropped them and left cached rows
# that could not be matched back to a champion, so the index is now written as
# a 'url' column. If the URLs have already been moved into a 'url' column, the
# remaining positional index is skipped instead.
urls_in_index = 'url' not in df_url_details.columns
df_url_details.to_csv('./temp/wiki_url_details.csv', index=urls_in_index, index_label='url')

Unnamed: 0,primary_class,secondary_class,ratings
https://leagueoflegends.fandom.com/wiki/Aatrox/LoL,Fighter,Tank,"{'Physical Damage': 3, 'Toughness': 3, 'Contro..."
https://leagueoflegends.fandom.com/wiki/Ahri/LoL,Mage,Assassin,"{'Magic Damage': 3, 'Toughness': 1, 'Control':..."
https://leagueoflegends.fandom.com/wiki/Akali/LoL,Assassin,,"{'Physical Damage': 3, 'Toughness': 1, 'Contro..."
https://leagueoflegends.fandom.com/wiki/Akshan/LoL,Marksman,Assassin,"{'Physical Damage': 3, 'Toughness': 1, 'Contro..."
https://leagueoflegends.fandom.com/wiki/Alistar/LoL,Tank,Support,"{'Magic Damage': 1, 'Toughness': 3, 'Control':..."


### 2.3 Preprocess Champion Details Table

In [14]:
# Move the page URLs out of the index into a regular 'url' column so the frame
# can be merged on it.
# The guard makes the cell safe to re-run: the previous in-place
# reset_index()/rename() pair inserted another positional 'index' column on
# every re-run and renamed it to 'url' as well, leaving duplicate 'url' columns.
if 'url' not in df_url_details.columns:
    df_url_details = df_url_details.rename_axis('url').reset_index()

In [15]:
# Attach the scraped details to each champion by matching on the page URL;
# as an inner join, rows found in only one of the two tables are dropped.
df_details = df_wiki_links.merge(df_url_details, how="inner", on="url")

In [16]:
# Persist only the champion-level columns (the url column is left out).
detail_columns = ['champion', 'primary_class', 'secondary_class', 'ratings']
df_details[detail_columns].to_csv('./preprocessed_data/wiki_champion_details.csv', index=False)

## 3. Process and Upload

In [17]:
def merge_classes(class1,class2):
    """Combine two class labels into one 'class1/class2' label.

    When ``class2`` is not a string (for example NaN or None), ``class1`` is
    returned unchanged.
    """
    has_second_class = isinstance(class2, str)
    return f'{class1}/{class2}' if has_second_class else class1

In [18]:
# Input folder holding the preprocessed wiki CSVs, and output folder for the
# final processed table.
preprocessed_data_dir, processed_data_dir = './preprocessed_data', './processed_data'

In [19]:
# Load the two preprocessed wiki tables: champion roles and champion details.
roles_csv = f'{preprocessed_data_dir}/wiki_champ_roles.csv'
details_csv = f'{preprocessed_data_dir}/wiki_champion_details.csv'
df_champ_roles = pd.read_csv(roles_csv)
df_champ_details = pd.read_csv(details_csv)

In [20]:
# Left-join each champion's primary role onto the details table and expose it
# under the shorter column name 'role'.
role_lookup = df_champ_roles[['champion', 'primary_role']]
df_champ_properties = (
    df_champ_details
    .merge(role_lookup, how='left', on='champion')
    .rename(columns={'primary_role': 'role'})
)

In [21]:
# merge primary/secondary classes into a single 'Primary/Secondary' label
df_champ_properties['class'] = df_champ_properties.apply(lambda x: merge_classes(x['primary_class'],x['secondary_class']),axis=1)
# keep selective columns
df_champ_properties = df_champ_properties[['champion','class','primary_class','secondary_class','role','ratings']]

# The ratings column holds the text form of a Python dict (it was written to
# CSV from a dict), so it is parsed with ast.literal_eval rather than by
# swapping quotes and calling json.loads. Non-string cells (NaN left behind by
# a failed scrape) become an empty dict instead of raising AttributeError.
rating_dicts = [
    ast.literal_eval(raw_ratings) if isinstance(raw_ratings, str) else {}
    for raw_ratings in df_champ_properties['ratings']
]
df_ratings = pd.DataFrame(rating_dicts, index=df_champ_properties.index)

# Rename by label instead of by position: positional assignment is only right
# when a physical-damage champion happens to be the first row.
# NOTE(review): 'Mobility' and 'Utility' are inferred from the output column
# names - confirm against the scraped stat-wheel headers.
rating_column_names = {
    'Physical Damage': 'physical_damage',
    'Toughness': 'tankiness',
    'Control': 'control',
    'Mobility': 'mobility',
    'Utility': 'utility',
    'Magic Damage': 'magic_damage',
}
unexpected_labels = [label for label in df_ratings.columns if label not in rating_column_names]
if unexpected_labels:
    raise ValueError(f'Unrecognised rating labels: {unexpected_labels}')

# expand ratings into columns (fixed order) and replace na with 0
df_ratings = (
    df_ratings
    .rename(columns=rating_column_names)
    .reindex(columns=list(rating_column_names.values()))
    .fillna(0)
    .astype(float)
)
df_champ_properties = pd.concat([df_champ_properties.drop(['ratings'], axis=1),df_ratings],axis=1)

In [27]:
# Show a preview of the final champion properties table.
display(df_champ_properties.head())

# Write the table to the processed-data folder (no positional index).
properties_csv = f'{processed_data_dir}/champion_properties.csv'
df_champ_properties.to_csv(properties_csv, index=False)

# Hand the same frame to the S3 upload helper with its destination string.
upload_df_to_s3(df_champ_properties, 'peter-ff15-data/champion_properties.csv')

Unnamed: 0,champion,class,primary_class,secondary_class,role,physical_damage,tankiness,control,mobility,utility,magic_damage
0,Aatrox,Fighter/Tank,Fighter,Tank,Juggernaut,3.0,3.0,2.0,2.0,2.0,0.0
1,Ahri,Mage/Assassin,Mage,Assassin,Burst,0.0,1.0,2.0,3.0,1.0,3.0
2,Akali,Assassin,Assassin,,Assassin,3.0,1.0,1.0,3.0,1.0,0.0
3,Akshan,Marksman/Assassin,Marksman,Assassin,Marksman,3.0,1.0,1.0,3.0,2.0,0.0
4,Alistar,Tank/Support,Tank,Support,Vanguard,0.0,3.0,3.0,1.0,2.0,1.0
