In [None]:
import itertools
import os
import re
import subprocess

import pandas as pd
from bs4 import BeautifulSoup
# Team codes substituted into the basketball-reference.com WNBA team-page URL
# by the download loop. (The variable is named `schools`, but the entries are
# WNBA team abbreviations.)
schools = [
    'LVA', 'LAS', 'SEA', 'MIN', 'PHO', 'DAL',
    'CON', 'WAS', 'CHI', 'NYL', 'IND', 'ATL',
]

In [None]:
# Download (or reuse cached copies of) each team's season page and parse every
# stats table on it into a DataFrame, stored as tables[team][table_id].
folder = 'WNBA'  # local directory used as a cache for the downloaded pages
year = 2019      # season to scrape; also used to name the pickle written later

# Index (into the table's list of <tr> rows) of the row holding the column
# names, for tables where that is not the first row.
header_row_by_table = {'on_off': 1, 'on_off_p': 1, 'shooting': 2, 'pbp': 1,
                       'playoffs_shooting': 2, 'playoffs_pbp': 1, 'contracts': 1}

# Tables whose first header column is kept and whose row label is read from
# the <th> cell of each row instead of being skipped.
label_in_th_tables = {'contracts', 'injury', 'on_off', 'on_off_p', 'roster'}

# Tables that are not collected at all.
skipped_tables = {'wnba_playoffs', 'team_and_opponent'}


def _to_numeric_where_possible(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Stands in for ``pd.to_numeric(..., errors='ignore')``, which is deprecated
    in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


try:
    os.mkdir(folder)
    print("Directory {} created".format(folder))
except FileExistsError:
    pass

tables = {}
for team in schools:
    target = os.path.join(folder, team + str(year) + '.html')
    print(target)

    # get the files (only when no cached copy exists)
    if not os.path.exists(target):
        subprocess.call(['wget', '-O', target,
                         'https://www.basketball-reference.com/wnba/teams/{}/{}.html'.format(team, year)])
        # A missing or near-empty file is treated as a failed download: discard
        # it so a later run retries, and move on to the next team.
        if not os.path.exists(target) or os.path.getsize(target) < 10:
            if os.path.exists(target):
                os.remove(target)
            continue

    # load the data
    tables[team] = {}

    with open(target, 'rt', encoding="ISO-8859-1") as fp:
        data = fp.read()

    # Team name comes from the first "<year> <name> Stats" occurrence in the
    # page; fall back to the team code rather than crashing when it is absent.
    name_match = re.search('{} (.*) Stats'.format(year), data)
    if name_match is None:
        print('NO TEAM NAME FOUND IN ' + target)
        tables[team]['name'] = team
    else:
        tables[team]['name'] = name_match.group(1)

    # collect all the tables:
    #   - <div> blocks wrapped in HTML comments,
    #   - the roster table container,
    #   - every table container that carries an id (this also matches the
    #     roster again; the duplicate simply overwrites the same key below).
    commented_divs = re.findall(r'<!--[ \n]*(<div[\s\S\r]+?</div>)[ \n]*-->', data)
    roster_divs = re.findall(r'(<div class="table_outer_container">[ \n]*<div class="overthrow table_container" id="div_roster">[\s\S\r]+?</table>[ \n]*</div>[ \n]*</div>)', data)
    container_divs = re.findall(r'(<div class="table_outer_container">[ \n]*<div class="overthrow table_container" id=".*">[\s\S\r]+?</table>[ \n]*</div>[ \n]*</div>)', data)

    m = roster_divs + commented_divs + container_divs
    print(target, len(m))
    for test_table in m:
        # Remembered outside the try so the failure message never has to touch
        # the (possibly broken) parse result again.
        table_id = None
        try:
            soup = BeautifulSoup(test_table, features="lxml")
            table_id = str(soup.find('table').get('id'))

            if table_id in skipped_tables:
                continue

            all_rows = soup.find_all('tr')

            # use get_text() to extract the column names into a list
            header_cells = all_rows[header_row_by_table.get(table_id, 0)].find_all('th')
            headers = [th.get_text() for th in header_cells]

            # By default the first column (the ranking order from Basketball
            # Reference) and the first row are skipped; tables in
            # label_in_th_tables keep both.
            keeps_first_col = table_id in label_in_th_tables
            start_col = 0 if keeps_first_col else 1

            headers = headers[start_col:]
            rows = all_rows[start_col:]
            player_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]

            if keeps_first_col:
                # Row labels live in <th> cells for these tables; prepend them
                # to the <td> values, skipping the first row.
                player_names = [[th.get_text() for th in row.find_all('th')] for row in rows]
                player_stats = [a + b for a, b in zip(player_names[1:], player_stats[1:])]
            headers[0] = 'Name'
            stats = pd.DataFrame(player_stats, columns=headers).set_index('Name')

            if table_id == 'contracts':
                stats = stats.drop(['Player'])
                # Keep only the rows before the first blank-named row
                # (presumably footer/summary rows follow it - TODO confirm).
                blank_positions = [pos for pos, label in enumerate(stats.index) if label == '']
                if blank_positions:
                    stats = stats.iloc[:blank_positions[0]]

            # drop rows without a name (rows shorter than the header get None)
            stats = stats[~stats.index.isin([None])]

            # convert every column that is entirely numeric; leave the rest as text
            stats = stats.apply(_to_numeric_where_possible)

            stats = stats.fillna('')

            if 'on_off' in table_id:
                # Remove header rows that ended up as data (labelled 'Player');
                # a label comparison works however many such rows there are.
                stats = stats.loc[stats.index != 'Player']
                stats = stats.loc[stats['Split'] != '']
                # Only some rows carry the player name; repeat each name over
                # three consecutive rows (assumes exactly three Split rows per
                # player - TODO confirm).
                named_labels = [label for label in stats.index if label != '']
                stats.index = list(itertools.chain.from_iterable(
                    itertools.repeat(label, 3) for label in named_labels))

            tables[team][table_id] = stats
        except Exception:
            # Report which table failed, then propagate the original error.
            print('FAILED TO PARSE ' + str(table_id))
            raise
            

In [None]:
import pickle

# Persist the nested {team: {table_id: DataFrame}} dictionary for this season
# so later analysis can reload it without re-parsing the HTML.
pickle_path = 'wnba_{}.pkl'.format(year)
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(tables, pickle_file)

In [None]:
# Sanity check: list what was stored for the 'LVA' team - the 'name' entry plus
# one key per table id that was parsed from its page.
tables['LVA'].keys()