In [None]:
import ast
import json
import re
import string

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from tqdm.notebook import tqdm

from utility import scraper
from utility.s3_file_transfer import upload_df_to_s3

## 1. Scrape Champion Roles (Fandom Wiki)
- Updated when there's a new champion release
- Can be updated manually

In [None]:
# Download the champion catalogue page from the Fandom wiki.
url = 'https://leagueoflegends.fandom.com/wiki/List_of_champions'
wiki_champion_page_soup = scraper.get_page(url)

# The champion table is taken as the second <tbody> on the page; only its direct
# <tr> children are kept and the first one is skipped (presumably the header row).
champion_table = wiki_champion_page_soup.find_all('tbody')[1]
champion_rows = champion_table.find_all('tr', recursive=False)[1:]

In [None]:
champion_name_list, primary_role_list, secondary_role_list = [], [], []

# Name, primary role and secondary role are read from the `data-sort-value`
# attribute of the first, second and third <td> of each row.
for champion_row in champion_rows:
    name_cell = champion_row.find('td')
    champion_name_list.append(name_cell['data-sort-value'])

    row_cells = champion_row.find_all('td')
    primary_role_list.append(row_cells[1]['data-sort-value'])
    secondary_role_list.append(row_cells[2]['data-sort-value'])

In [None]:
# One row per champion with its primary and secondary role.
df_roles = pd.DataFrame(dict(
    champion=champion_name_list,
    primary_role=primary_role_list,
    secondary_role=secondary_role_list,
))

In [None]:
# Write the same roles table to both the temp and the preprocessed locations.
for roles_csv_path in ('./temp/wiki_champ_roles.csv',
                       './preprocessed_data/wiki_champ_roles.csv'):
    df_roles.to_csv(roles_csv_path, index=False)

## 2. Scrape Champion Details (Fandom Wiki)
- Updated when there's a new champion release
- Can be updated manually

### 2.1 Scrape Champion Catalog Page
- Champion name
- Champion page url

In [None]:
# Download the champion catalogue page from the Fandom wiki.
url = 'https://leagueoflegends.fandom.com/wiki/List_of_champions'
wiki_champion_page_soup = scraper.get_page(url)

# The champion table is taken as the second <tbody> on the page; only its direct
# <tr> children are kept and the first one is skipped (presumably the header row).
champion_table = wiki_champion_page_soup.find_all('tbody')[1]
champion_rows = champion_table.find_all('tr', recursive=False)[1:]

In [None]:
champion_name_list, champion_url_list = [], []

for champion_row in champion_rows:
    # The first <td> carries both the champion name and the link to its page.
    name_cell = champion_row.find('td')
    champion_name_list.append(name_cell['data-sort-value'])

    # The href is appended to the wiki origin (presumably it is site-relative).
    url = name_cell.find('a', href=True)['href']
    champion_url_list.append(f'https://leagueoflegends.fandom.com{url}')

In [None]:
# One row per champion with the URL of its individual wiki page.
df_wiki_links = pd.DataFrame(dict(
    champion=champion_name_list,
    url=champion_url_list,
))

In [None]:
# Persist the champion -> page URL table (row index is not written).
df_wiki_links.to_csv(path_or_buf='./temp/wiki_champ_urls.csv', index=False)

### 2.2 Scrape Individual Champion Page

In [None]:
# Load the champion -> page URL table back from the saved CSV.
df_wiki_links = pd.read_csv(filepath_or_buffer='./temp/wiki_champ_urls.csv')

In [None]:
def extract_champion_details(url):
    """Scrape class and rating information from a single champion wiki page.

    The page is fetched and parsed up to three times; the first attempt that
    parses successfully is returned.

    Parameters
    ----------
    url : str
        Address of the champion page, passed to ``scraper.get_page``.

    Returns
    -------
    dict
        ``{'primary_class': str, 'secondary_class': str or NaN,
        'ratings': dict[str, int]}``. ``secondary_class`` is NaN when the page
        lists no second class. If all three attempts fail, every value is NaN
        and the URL plus the last error are printed.
    """
    last_error = None
    for _ in range(3):
        try:
            champ_page_soup = scraper.get_page(url)

            # extract primary and secondary classes
            classes = champ_page_soup.find('div', {'data-source': 'legacy'}).find('div').find_all('span')
            primary_class = classes[0].find_all('a')[1].text.strip()
            try:
                secondary_class = classes[1].find_all('a')[1].text.strip()
            except (IndexError, AttributeError):
                # no second class listed on the page
                secondary_class = float('nan')

            # extract adaptive type
            adaptivetype = (
                champ_page_soup.find('div', {'data-source': 'adaptivetype'})
                .find('div').find('span').find_all('a')[0].text.strip()
            )

            # extract rating distributions: values come from the `data-values`
            # attribute (';'-separated), labels from the <th> cells of the wheel
            stat_wheel = champ_page_soup.find('div', {'class': 'stat-wheel'})
            values = [int(value) for value in stat_wheel['data-values'].split(';')]
            ratings = [element.text.strip() for element in stat_wheel.find_all('th')]
            # the first label is prefixed with the adaptive type
            ratings[0] = f'{adaptivetype} {ratings[0]}'
            rating_dist = dict(zip(ratings, values))

            return {'primary_class': primary_class,
                    'secondary_class': secondary_class,
                    'ratings': rating_dist}
        except Exception as error:
            # Only ordinary errors trigger a retry; KeyboardInterrupt/SystemExit
            # propagate so a long scraping loop can still be stopped.
            last_error = error

    print(f'Error {url}: {last_error!r}')
    return {'primary_class': float('nan'),
            'secondary_class': float('nan'),
            'ratings': float('nan')}

In [None]:
# Scrape every champion page once; results are keyed by page URL so the
# resulting frame is indexed by URL.
champion_urls = df_wiki_links['url'].tolist()
champion_details_dict = {}
for url in tqdm(champion_urls):
    champion_details_dict[url] = extract_champion_details(url)

df_url_details = pd.DataFrame.from_dict(champion_details_dict, orient='index')

In [None]:
# Keep the URL in the cached file: at this point the rows are keyed only by the
# URL index, so writing with index=False would leave no way to tell which
# champion each row belongs to. If the URL already lives in a column (cell
# re-run after the preprocessing step), the index is skipped to avoid
# writing it twice.
df_url_details.to_csv('./temp/wiki_url_details.csv',
                      index='url' not in df_url_details.columns,
                      index_label='url')

### 2.3 Preprocess Champion Details Table

In [None]:
# Move the URL index into a regular `url` column so it can be used as a join key.
# Guarded so that re-running this cell is a no-op: an unconditional
# reset_index + rename would add a second `url` column on the second run and
# make the later merge on "url" fail.
if 'url' not in df_url_details.columns:
    df_url_details = df_url_details.rename_axis('url').reset_index()

In [None]:
# Attach the scraped details to each champion via the shared page URL;
# the inner join keeps only URLs present in both tables.
df_details = df_wiki_links.merge(df_url_details, how="inner", on="url")

In [None]:
# Only these columns are written to the preprocessed file (the url column is left out).
detail_columns = ['champion', 'primary_class', 'secondary_class', 'ratings']
df_details.loc[:, detail_columns].to_csv('./preprocessed_data/wiki_champion_details.csv', index=False)

## 3. Process and Upload

In [None]:
def merge_classes(class1,class2):
    """Combine two class labels into a single ``'class1/class2'`` string.

    ``class2`` is only appended when it is a string; for any other value
    (e.g. NaN or None) ``class1`` is returned unchanged.
    """
    has_second_class = isinstance(class2, str)
    return f'{class1}/{class2}' if has_second_class else class1

In [None]:
# Input directory (preprocessed CSVs) and output directory (processed CSVs).
preprocessed_data_dir, processed_data_dir = './preprocessed_data', './processed_data'

In [None]:
# Load the two preprocessed tables produced by the scraping sections.
df_champ_details = pd.read_csv(filepath_or_buffer=f'{preprocessed_data_dir}/wiki_champion_details.csv')
df_champ_roles = pd.read_csv(filepath_or_buffer=f'{preprocessed_data_dir}/wiki_champ_roles.csv')

In [None]:
# Left-join the primary role onto the details table by champion name,
# exposing it under the shorter column name `role`.
df_champ_properties = (
    df_champ_details
    .merge(df_champ_roles[['champion', 'primary_role']], on='champion', how='left')
    .rename(columns={'primary_role': 'role'})
)

In [None]:
# merge classes into a single "primary/secondary" label
df_champ_properties['class'] = df_champ_properties.apply(lambda x: merge_classes(x['primary_class'],x['secondary_class']),axis=1)
# keep selective columns
df_champ_properties = df_champ_properties[['champion','class','primary_class','secondary_class','role','ratings']]

# expand ratings into columns and replace na with 0
# Each ratings cell holds the text form of a Python dict, so it is parsed with
# ast.literal_eval rather than by swapping quotes and feeding it to json.
# Cells that are not strings (NaN, which extract_champion_details emits when a
# page could not be scraped) become an empty record instead of raising, and
# therefore end up as all-zero ratings after fillna.
rating_records = [
    ast.literal_eval(raw_ratings) if isinstance(raw_ratings, str) else {}
    for raw_ratings in df_champ_properties['ratings']
]
df_ratings = pd.DataFrame(rating_records, index=df_champ_properties.index).fillna(0)
# NOTE(review): names are assigned by position, so this relies on the rating
# labels appearing in exactly this order across the data; confirm if the wiki changes.
df_ratings.columns = ['physical_damage','tankiness','control','mobility','utility','magic_damage']
df_champ_properties = pd.concat([df_champ_properties.drop(columns=['ratings']), df_ratings], axis=1)

In [None]:
# Preview the first five rows of the final table.
df_champ_properties.head(n=5)

In [None]:
# save to local
processed_csv_path = f'{processed_data_dir}/champion_properties.csv'
df_champ_properties.to_csv(processed_csv_path, index=False)
# upload to s3
# upload_df_to_s3(df_champ_properties,'peter-ff15-data/champion_properties.csv')