In [530]:
import numpy as np
import pandas as pd
import warnings
import codecs
import os

import aut_vinc_bballCrawler as bc

In [531]:
def get_header(pergamedat):
    """Parse the header info."""
    header = []
    for th in pergamedat[0].findAll('th'):
        if not th.getText() in header:
            header.append(th.getText())
    return header


def pull_data_assemble_df(url):
    """Returns pandas DataFrame."""
    pg = bc.getSoupFromURL(url).findAll('table')
    rows = [r for r in pg[0].findAll('tr')[1:]
            if len(r.findAll('td')) > 0]
    parsed_table = [[col.getText() for col in row.findAll('td')]
                    for row in rows]
    df_table = pd.io.parsers.TextParser(parsed_table,
                                        names=get_header(pg)[1:30]).get_chunk()
    df_table.TOV = np.round(1/ptable_16.TOV, decimals=2)
    return df_table


def clean_scrubs(dataframe):
    """Players who dont play have funky missing vals.
    This drops player if nan and if Inf in TOV.
    Both happen because of player lack of production
    because of lack of mins."""
    df = dataframe.drop(['Pos', 'Tm'], 1)   # don't need these columns
    df.iloc[np.where(df.isnull())] = 0   # nan vals because no attempts, make 0.
    df = df.iloc[list(np.where(
                np.isinf(dataframe.TOV)==False)[0]), :]   # drops if Inf in TOV
    return df


def get_dupl_names(dataframe):
    """Find duplicate names.
    This happens when player was traded.
    Each team he played for has an entry.
    """
    return dataframe.Player[dataframe.Player.duplicated()].unique()


def make_uniq_tabl(dataframe, duplicate_list):
    return dataframe.iloc[np.where(np.in1d(dataframe.Player,
                                           duplicate_list)==False)[0], :]


def append_to_df(playername, pd_frame):
    mean_v = pd_frame.iloc[np.where(pd_frame.Player == playername)[0], 1:].mean()
    return pd.concat((pd.Series(playername, index=['Player']),
                      np.round(mean_v, decimals=3)))


def get_playerframe_clean(dataframe, duplicate_list):
    uniq_tbl = make_uniq_tabl(dataframe, duplicate_list)
    for playername in duplicate_list:
        uniq_tbl = uniq_tbl.append(append_to_df(playername, dataframe),
                                   ignore_index=True)
    return uniq_tbl

In [532]:
url = 'http://www.basketball-reference.com/leagues/NBA_2016_per_game.html'
dff = clean_scrubs(pull_data_assemble_df(url))
df16 = get_playerframe_clean(dff, get_dupl_names(dff))
df16.insert(1, 'Season', np.repeat(2016, df16.shape[0]))

url = 'http://www.basketball-reference.com/leagues/NBA_2015_per_game.html'
dff = clean_scrubs(pull_data_assemble_df(url))
df15 = get_playerframe_clean(dff, get_dupl_names(dff))
df15.insert(1, 'Season', np.repeat(2015, df15.shape[0]))

url = 'http://www.basketball-reference.com/leagues/NBA_2014_per_game.html'
dff = clean_scrubs(pull_data_assemble_df(url))
df14 = get_playerframe_clean(dff, get_dupl_names(dff))
df14.insert(1, 'Season', np.repeat(2014, df14.shape[0]))

url = 'http://www.basketball-reference.com/leagues/NBA_2013_per_game.html'
dff = clean_scrubs(pull_data_assemble_df(url))
df13 = get_playerframe_clean(dff, get_dupl_names(dff))
df13.insert(1, 'Season', np.repeat(2013, df13.shape[0]))

In [533]:
df16.to_csv('aut_vincere_2016_trimmeddat.csv', index=False)
df15.to_csv('aut_vincere_2015_trimmeddat.csv', index=False)
df14.to_csv('aut_vincere_2014_trimmeddat.csv', index=False)
df13.to_csv('aut_vincere_2013_trimmeddat.csv', index=False)

In [534]:
dfmult = pd.concat([df16, df15, df14, df13])
dfmult.to_csv('aut_vincere_multiyear_trimmeddat.csv', index=False)

In [535]:
dfmult[dfmult.loc[:, 'Player']=='LeBron James']

Unnamed: 0,Player,Season,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PS/G
193,LeBron James,2016,31,76,76,35.6,9.7,18.6,0.52,1.1,...,0.731,1.5,6.0,7.4,6.8,1.4,0.6,0.3,1.9,25.3
196,LeBron James,2015,30,69,69,36.1,9.0,18.5,0.488,1.7,...,0.71,0.7,5.3,6.0,7.4,1.6,0.7,3.33,2.0,25.3
184,LeBron James,2014,29,77,77,37.7,10.0,17.6,0.567,1.5,...,0.75,1.1,5.9,6.9,6.3,1.6,0.3,1.43,1.6,27.1
181,LeBron James,2013,28,76,76,37.9,10.1,17.8,0.565,1.4,...,0.753,1.3,6.8,8.0,7.3,1.7,0.9,0.71,1.4,26.8


In [536]:
players_intersect = list(set(df16.Player).intersection(df15.Player).intersection(df14.Player))

In [537]:
np.where(np.in1d(df16.Player, players_intersect))

(array([  0,   2,   3,   4,   5,   6,   8,   9,  10,  11,  16,  18,  19,
         21,  22,  23,  24,  25,  27,  28,  29,  30,  31,  32,  33,  34,
         35,  36,  37,  38,  39,  40,  41,  42,  43,  46,  47,  48,  50,
         51,  53,  54,  55,  56,  57,  58,  60,  62,  63,  64,  65,  66,
         67,  68,  71,  72,  74,  75,  76,  77,  79,  80,  83,  84,  86,
         90,  91,  92,  93,  94,  96,  97,  98,  99, 100, 102, 103, 104,
        105, 106, 108, 109, 110, 111, 112, 113, 114, 116, 117, 120, 122,
        127, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 142,
        143, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 160, 164,
        165, 166, 167, 168, 169, 171, 172, 174, 175, 177, 181, 182, 186,
        187, 191, 192, 193, 194, 195, 196, 197, 198, 199, 202, 204, 205,
        207, 208, 209, 211, 212, 214, 215, 216, 217, 218, 219, 220, 221,
        224, 225, 226, 227, 228, 229, 230, 232, 233, 234, 235, 237, 240,
        242, 243, 244, 246, 252, 253, 254, 255, 258

In [538]:
dfmult

Unnamed: 0,Player,Season,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PS/G
0,Quincy Acy,2016,25.0,59.000,29.000,14.800,2.000,3.600,0.556,0.300,...,0.735,1.100,2.100,3.200,0.500,0.500,0.400,2.000,1.700,5.200
1,Jordan Adams,2016,21.0,2.000,0.000,7.500,1.000,3.000,0.333,0.000,...,0.600,0.000,1.000,1.000,1.500,1.500,0.000,1.000,1.000,3.500
2,Steven Adams,2016,22.0,80.000,80.000,25.200,3.300,5.300,0.000,0.000,...,0.000,2.700,3.900,6.700,0.800,0.500,1.100,0.910,2.800,8.000
3,Arron Afflalo,2016,30.0,71.000,57.000,33.400,5.000,11.300,0.443,1.300,...,0.840,0.300,3.400,3.700,2.000,0.400,0.100,0.830,2.000,12.800
4,Alexis Ajinca,2016,27.0,59.000,17.000,14.600,2.500,5.300,0.476,0.000,...,0.839,1.300,3.300,4.600,0.500,0.300,0.600,1.110,2.300,6.000
5,Cole Aldrich,2016,27.0,60.000,5.000,13.300,2.200,3.800,0.000,0.000,...,0.000,1.400,3.400,4.800,0.800,0.800,1.100,0.910,2.300,5.500
6,LaMarcus Aldridge,2016,30.0,74.000,74.000,30.600,7.200,14.100,0.513,0.000,...,0.858,2.400,6.200,8.500,1.500,0.500,1.100,0.770,2.000,18.000
7,Cliff Alexander,2016,20.0,8.000,0.000,4.500,0.600,1.300,0.000,0.000,...,0.000,0.300,0.500,0.800,0.000,0.100,0.300,10.000,0.100,1.300
8,Lavoy Allen,2016,26.0,79.000,28.000,20.200,2.400,4.700,0.000,0.000,...,0.000,2.100,3.300,5.400,1.000,0.300,0.500,1.110,1.900,5.400
9,Tony Allen,2016,34.0,64.000,57.000,25.300,3.400,7.300,0.458,0.200,...,0.652,1.600,3.000,4.600,1.100,1.700,0.300,0.830,2.700,8.400
