Title: World of Warcraft PvP Leaderboards
Date: 2020-02-08 05:20
Modified: 2020-02-08 05:20
Category: Data Science
Tags: modeling, visualization, SQL
Slug: world-of-warcraft
Status: draft

In [None]:
import os
import sqlite3
import string
import time

from gazpacho import Soup
import pandas as pd
from requests import get
from tqdm import tqdm

# Example pages this scraper reads:
#   leaderboard: https://worldofwarcraft.com/en-us/game/pvp/leaderboards/2v2?page=10
#   character:   https://worldofwarcraft.com/en-us/character/us/zuljin/demoncouch
# Site root; prefixed to leaderboard paths and to the relative character links.
URL_BASE = 'https://worldofwarcraft.com'
# Leaderboard names; each one is used as a URL path segment and as a CSV file stem.
BOARDS = ['2v2', '3v3', 'battlegrounds']

# CREATE TABLE template for one leaderboard table; {0} is replaced with the
# table name via str.format. The character page URL is the primary key.
SQL_LEADERBOARD = """
CREATE TABLE {0} (
    player TEXT,
    realm TEXT,
    class TEXT,
    faction TEXT,
    wins INTEGER,
    losses INTEGER,
    rating INTEGER,
    rank INTEGER,
    url TEXT PRIMARY KEY
);
"""

# CREATE TABLE statement for the `players` table holding the fields scraped
# from each character page; also keyed by the character page URL.
SQL_PLAYERS = """
CREATE TABLE players (
    name TEXT,
    title TEXT,
    details TEXT,
    achievement INTEGER,
    ilvl INTEGER,
    url TEXT PRIMARY KEY
);
"""

def sanitize_filename(name):
    """Return *name* rewritten so it can be used as part of a file name.

    ``?`` and ``=`` become ``-``. Any other character that is not
    alphanumeric, or that is not in ``string.printable``, becomes ``_``.
    All remaining characters are kept unchanged.
    """
    def _clean(ch):
        # Query-string punctuation gets its own replacement character.
        if ch in '?=':
            return '-'
        # Keep only printable-ASCII letters and digits as they are.
        if ch.isalnum() and ch in string.printable:
            return ch
        return '_'

    return ''.join(_clean(ch) for ch in name)

def fetch_url(url):
    """Return the HTML for *url*, using an on-disk cache under ``cache/``.

    The cache file name is built from the last two ``/``-separated parts of
    the URL (sanitized), so two URLs that share those parts share one cache
    entry. On a cache miss the page is downloaded, stored, and the function
    sleeps for one second to rate-limit requests.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The page body as text. Bodies of non-successful HTTP responses are
        returned too, but are not written to the cache.

    Raises:
        requests.RequestException: If the download itself fails or times out.
    """
    tokens = url.split('/')
    name = sanitize_filename(tokens[-1])
    realm = sanitize_filename(tokens[-2])
    path = f'cache/{name}-{realm}.html'

    try:
        with open(path, 'r', encoding='utf-8') as fp:
            return fp.read()
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable cache entry: fall through and download it.
        pass

    print('fetching ' + url)
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = get(url, timeout=30)
    html = response.text
    if response.ok:
        # Only cache successful responses so an error page is retried later.
        os.makedirs('cache', exist_ok=True)
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(html)
    time.sleep(1)
    return html

def build_leaderboard_url(board, page):
    """Return the URL of page number *page* of the *board* PvP leaderboard."""
    return f'{URL_BASE}/en-us/game/pvp/leaderboards/{board}?page={page}'

def _as_list(found):
    """Normalize a ``Soup.find`` result (None, one element, or a list) to a list."""
    if found is None:
        return []
    if isinstance(found, list):
        return found
    return [found]


def fetch_leaderboard(board, pages=10):
    """Scrape the first *pages* pages of one PvP leaderboard.

    Args:
        board: Leaderboard name used in the URL (e.g. ``'2v2'``).
        pages: Number of leaderboard pages to read, starting at page 1.

    Returns:
        A list of dicts, one per parsed row, with the keys ``rank``,
        ``rating``, ``player``, ``class``, ``faction``, ``realm``, ``wins``,
        ``losses`` and ``url``. Rows that do not have the expected columns
        or values are skipped.
    """
    leaderboard = []
    for page in tqdm(range(1, pages + 1)):  # each page has 100 players
        html = fetch_url(build_leaderboard_url(board, page))
        soup = Soup(html)
        for row in _as_list(soup.find('div', {'class': 'SortTable-row'})):
            cols = _as_list(row.find('div', {'class': 'SortTable-col'}))
            try:
                link = _as_list(cols[2].find('a'))[0]
                leader = {
                    'rank': int(cols[0].attrs['data-value']),
                    'rating': int(cols[1].text),
                    'player': cols[2].attrs['data-value'],
                    'class': cols[3].attrs['data-value'],
                    'faction': cols[4].attrs['data-value'],
                    'realm': cols[5].attrs['data-value'],
                    'wins': int(cols[6].attrs['data-value']),
                    'losses': int(cols[7].attrs['data-value']),
                    'url': URL_BASE + link.attrs['href'],
                }
            except (AttributeError, IndexError, KeyError, TypeError, ValueError):
                # Not a data row (missing columns, attributes or numbers);
                # skip it rather than abort the whole page.
                continue
            leaderboard.append(leader)
    return leaderboard

def _first_match(found):
    """Return the first element of a ``Soup.find`` result, or None if empty."""
    if isinstance(found, list):
        return found[0] if found else None
    return found


def _header_text(header, tag, css_class):
    """Return the text of the first *tag* with *css_class* in *header*, else None."""
    node = _first_match(header.find(tag, {'class': css_class}))
    return None if node is None else node.text


def fetch_player(url):
    """Scrape the header fields of one character page.

    Each field is extracted independently, so a character that lacks one
    element (for example a title) still gets its other fields filled in.

    Args:
        url: Absolute URL of the character page.

    Returns:
        A dict with the keys ``name``, ``title``, ``achievement``, ``ilvl``,
        ``details`` and ``url``. Any field that could not be found is None;
        if the page cannot be fetched or parsed every field except ``url``
        is None.
    """
    player = {
        'name': None,
        'title': None,
        'achievement': None,
        'ilvl': None,
        'details': None,
        'url': url
    }
    try:
        found = Soup(fetch_url(url)).find('div', {'class': 'CharacterHeader'})
    except Exception as exc:
        # Best effort: one unreachable or malformed page must not stop the
        # scrape. Unlike a bare except, Ctrl-C still propagates.
        print(f'skipping {url}: {exc}')
        return player

    # find() may hand back several matches; the first one is taken to be the
    # character header itself (assumption -- confirm against the live markup).
    header = _first_match(found)
    if header is None:
        return player

    player['name'] = _header_text(header, 'a', 'CharacterHeader-name')
    player['achievement'] = _header_text(header, 'a', 'CharacterHeader-achievement')
    player['details'] = _header_text(header, 'div', 'CharacterHeader-details')
    player['title'] = _header_text(header, 'div', 'CharacterHeader-title')

    # Only the first whitespace-separated token of the ilvl text is kept.
    ilvl_parts = (_header_text(header, 'a', 'CharacterHeader-ilvl') or '').split()
    if ilvl_parts:
        player['ilvl'] = ilvl_parts[0]
    return player

def scrape_all():
    """Scrape every leaderboard in ``BOARDS`` plus the pages of its players.

    Writes one ``data/<board>.csv`` per leaderboard (sorted by rank, then
    player) and a single ``data/players.csv`` with one row per distinct
    character page URL. The ``data`` directory is created if needed.
    """
    leader_columns = ['rank', 'rating', 'player', 'class', 'faction',
                      'realm', 'wins', 'losses', 'url']
    player_columns = ['name', 'title', 'achievement', 'ilvl', 'details', 'url']

    os.makedirs('data', exist_ok=True)
    players = {}
    for board in BOARDS:
        leaderboard = fetch_leaderboard(board)
        # Explicit columns keep the header (and the sort) valid even when a
        # board yields no rows.
        df = pd.DataFrame(leaderboard, columns=leader_columns)
        df = df.sort_values(['rank', 'player'])
        # Written under data/ because that is where save_to_db reads it from.
        df.to_csv(f'data/{board}.csv', index=False)

        for leader in tqdm(leaderboard):
            # Key on the URL, not the name: characters on different realms
            # can share a name, and url is the primary key of the tables.
            url = leader['url']
            if url not in players:
                players[url] = fetch_player(url)

    df = pd.DataFrame(list(players.values()), columns=player_columns)
    df.to_csv('data/players.csv', index=False)

def save_to_db():
    """Load the scraped CSV files into the SQLite database ``data/wow.db``.

    Each table is dropped if it exists, recreated from its CREATE TABLE
    statement, and then filled from the matching CSV. Rows are appended to
    the freshly created table so the declared schema (including the ``url``
    primary key) is kept. Rows with a duplicate ``url`` are dropped, keeping
    the first, so the primary key constraint cannot fail.

    Raises:
        FileNotFoundError: If one of the CSV files is missing.
        sqlite3.Error: If a statement fails.
    """
    tables = [
        ('arena_2v2', 'data/2v2.csv', SQL_LEADERBOARD.format('arena_2v2')),
        ('arena_3v3', 'data/3v3.csv', SQL_LEADERBOARD.format('arena_3v3')),
        ('battlegrounds', 'data/battlegrounds.csv',
         SQL_LEADERBOARD.format('battlegrounds')),
        ('players', 'data/players.csv', SQL_PLAYERS),
    ]

    con = sqlite3.connect('data/wow.db')
    try:
        for table, csv_path, create_sql in tables:
            df = pd.read_csv(csv_path).drop_duplicates(subset='url')
            # Table names come from the constant list above, never from
            # user input, so formatting them into the statement is safe.
            con.execute(f'DROP TABLE IF EXISTS {table}')
            con.execute(create_sql)
            # 'append' inserts into the table just created; 'replace' would
            # drop it and let pandas invent a schema without the primary key.
            df.to_sql(name=table, con=con, if_exists='append', index=False)
        con.commit()
    finally:
        # Close the connection even when a CSV is missing or a statement fails.
        con.close()

def demo_usage():
    """Print the battlegrounds leaderboard joined with the player details.

    Reads ``data/wow.db``, joins ``battlegrounds`` to ``players`` on the
    character URL, and prints the resulting DataFrame.
    """
    connection = sqlite3.connect('data/wow.db')
    sql = '''
        SELECT name, rank, rating, wins, losses, achievement, ilvl
        FROM battlegrounds, players
        WHERE battlegrounds.url = players.url
    '''
    joined = pd.read_sql(sql, connection)
    connection.close()
    print(joined)

if __name__ == '__main__':
    # Run from the author's project directory when it exists; on any other
    # machine keep working in the current directory instead.
    project_dir = '/home/mlehotay/DSI-TOR-6/submit-michael/project_5'
    try:
        os.chdir(project_dir)
    except OSError:
        pass

    # Create the working directories separately from the chdir attempt, so a
    # failed chdir no longer skips creating the cache directory.
    os.makedirs('cache', exist_ok=True)
    os.makedirs('data', exist_ok=True)

    scrape_all()
    save_to_db()
    demo_usage()