In [1]:
import pprint
import sklearn
import json
import csv

# datasets

# player stats: https://www.kaggle.com/drgilermo/nba-players-stats/data
# player salaries: https://data.world/datadavis/nba-salaries/workspace/file?filename=nba_salaries_1990_to_2018.csv
# team stats: http://www.nbaminer.com/team_advanced_stats/

PLAYER_STATS = 'datasets/player_stats.csv'
PLAYER_SALARIES = 'datasets/player_salaries.csv'
TEAM_ADVANCED = 'datasets/team_advanced.csv'
TEAM_FOUR = 'datasets/team_four.csv'
TEAM_BASIC = 'datasets/team_basic.csv'
TEAM_RARE = 'datasets/team_rare.csv'

pp = pprint.PrettyPrinter(indent=4, width=500)

In [2]:
class Player(object):
    def __init__(self, name):
        self.name = name
        self.stats_by_year_end = dict() # year : [PlayerStats object]
        self.salary_by_year_end = dict() # year : salary
        self.year_ends_played = list()
    
    def add_stats_by_year_end(self, year_end, stats):
        # year_end is an int
        # stats is type PlayerStats
        self.year_ends_played.append(year_end)
        self.year_ends_played.sort()
        if year_end in self.stats_by_year_end:
            raise Exception('found year_end already in stats')
        else:
            self.stats_by_year_end[year_end] = stats
    
    def add_salary_by_year_end(self, year_end, salary):
        # year_end is an int
        # salary is an int (USD)
        self.year_ends_played.append(year_end)
        self.year_ends_played.sort()
        if year_end in self.salary_by_year_end:
#             print('tried to add {}'.format(year_end))
#             print(self.salary_by_year_end)
#             raise Exception('found year_end already in salaries')
            current = self.salary_by_year_end[year_end]
            self.salary_by_year_end[year_end] = min(current, salary)
        else:
            self.salary_by_year_end[year_end] = salary
    
    def to_dict(self):
        return {
            'name': self.name,
            'stats_by_year_end': self.stats_by_year_end,
            'salary_by_year_end': self.salary_by_year_end,
            'year_ends_played': self.year_ends_played,
        }
        
class PlayerStats(object):
    def __init__(self, per=0, ts=0, three_p=0, two_p=0, ft_p=0, oreb=0, dreb=0, ast=0, stl=0, blk=0, to=0, pf=0, pts=0):
        self.per = per
        self.ts = ts
        self.three_p = three_p
        self.two_p = two_p
        self.ft_p = ft_p
        self.oreb = oreb
        self.dreb = dreb
        self.ast = ast
        self.stl = stl
        self.blk = blk
        self.to = to
        self.pf = pf
        self.pts = pts
    
    def pretty(self, name, year_end):
        first = 'per: {}, ts: {}, three_p: {}, two_p: {}, ft_p: {}' \
            .format(self.per, self.ts, self.three_p, self.two_p, self.ft_p)
        second = 'oreb: {}, dreb: {}, ast: {}, stl: {}, blk: {}, to: {}, pf: {}, pts: {}' \
            .format(self.oreb, self.dreb, self.ast, self.stl, self.blk, self.to, self.pf, self.pts)
        print('------------- START -------------')
        print(name)
        print(year_end)
        print(first)
        print(second)
        print('-------------- END --------------\n')
    
    def no_empty_stat(self):
        all_stats = [self.per, self.ts, self.three_p, self.two_p, self.ft_p, self.oreb, self.dreb,\
                     self.ast, self.stl, self.blk, self.to, self.pf, self.pts]
        return '' not in all_stats 
    
    def to_dict(self):
        return {
            'per': self.per,
            'ts': self.ts,
            'three_p': self.three_p,
            'two_p': self.two_p,
            'ft_p': self.ft_p,
            'oreb': self.oreb,
            'dreb': self.dreb,
            'ast': self.ast,
            'stl': self.stl,
            'blk': self.blk,
            'to': self.to,
            'pf': self.pf,
            'pts': self.pts,
        }

    def from_dict(dict_object):
        return PlayerStats(
            per = dict_object['per'],
            ts = dict_object['ts'],
            three_p = dict_object['three_p'],
            two_p = dict_object['two_p'],
            ft_p = dict_object['ft_p'],
            oreb = dict_object['oreb'],
            dreb = dict_object['dreb'],
            ast = dict_object['ast'],
            stl = dict_object['stl'],
            blk = dict_object['blk'],
            to = dict_object['to'],
            pf = dict_object['pf'],
            pts = dict_object['pts'],
        )

In [3]:
with open(PLAYER_STATS) as file:
    data = csv.reader(file)
    next(data, None)
    data = [list(map(lambda x:x.lower(), line)) for line in list(data) if line[0] != '']

# name : Player object
players = dict()

for person_year in data:
    person_name = person_year[0] if '*' not in person_year[0] else person_year[0][:-1]
    year_end = int(person_year[1])
    player = players[person_name] if person_name in players else Player(person_name)
    
    idxs_to_correct = [8, 9, 35, 38, 42]
    for idx in idxs_to_correct:
        person_year[idx] = person_year[idx] if len(person_year[idx]) > 0 else '0'
        
    stat = PlayerStats(
        per = float(person_year[8]),
        ts = float(person_year[9]), 
        three_p = float(person_year[35]), 
        two_p = float(person_year[38]), 
        ft_p = float(person_year[42]), 
        oreb = int(person_year[43]), 
        dreb = int(person_year[44]), 
        ast = int(person_year[46]), 
        stl = int(person_year[47]), 
        blk = int(person_year[48]), 
        to = int(person_year[49]), 
        pf = int(person_year[50]), 
        pts = int(person_year[51])
    )
    players[person_name] = player
    player.stats_by_year_end[year_end] = stat

count = 0
for name, player in players.items():
    if name != player.name:
        count += 1

print('{} entries wrong'.format(count))

0 entries wrong


In [4]:
with open(PLAYER_SALARIES) as file:
    data = csv.reader(file)
    next(data, None)
    data = [list(map(lambda x:x.lower(), line)) for line in list(data) if line[0] != '']
    
name_mappings = {
    'louis williams': 'lou williams',
    'peter john ramos': 'peter john',
    'nene': 'nene hilario',
    'jose barea': 'j.j. barea',
    'hot rod williams': 'hot rod',
    'metta world peace': 'metta world',
    'keith van horn': 'keith van',
    'nick van exel': 'nick van',
    'luc mbah a moute': 'luc mbah',
    'james michael mcadoo': 'james michael',
    'patrick mills': 'patty mills',
    'vinny del negro': 'vinny del'
}
missing = set()

for person_year in data:
    person_name = person_year[0].replace('\\', '')
    salary = person_year[1]
    year_end = int(person_year[2])
    
    if person_name in players and year_end-1 in players[person_name].stats_by_year_end:
#         print('adding shit for {} year {} now'.format(person_name, year_end))
        players[person_name].add_salary_by_year_end(year_end, salary)
    
    elif person_name in name_mappings and name_mappings[person_name] in players \
    and year_end-1 in players[name_mappings[person_name]].stats_by_year_end:
#         print('adding shit for {} year {} now'.format(person_name, year_end))
        players[name_mappings[person_name]].add_salary_by_year_end(year_end, salary)
    
    else:
        missing.add(person_name)
#         print('{} not in players'.format(person_name))

print('set of names IN salary NOT IN stats dataset: \n\n {}'.format(missing))

set of names IN salary NOT IN stats dataset: 

 {'chris mccullough', 'anthony parker', 'dee brown', 'michael doleac', 'gilbert arenas', 'donnell harvey', 'antonio lang', 'ron baker', 'festus ezeli', 'eric dawson', 'malik monk', 'josh powell', 'derrick williams', 'robert traylor', 'brent scott', 'matt wenstrom', 'jamario moon', 'chris wright', 'eddie house', 'ronald dupree', 'dillon brooks', 'eduardo najera', 'nicolas laprovittola', 'keith booth', 'marcus camby', 'keith closs', 'roshown mcleod', 'a.j. hammons', 'vernon macklin', 'joe stephens', 'terence morris', 'andrae patterson', 'zoran dragic', 'dickey simpkins', 'aaron miles', 'chris robinson', 'brian oliver', 'omar cook', 'ryan stack', 'jeremy richardson', 'bobby simmons', 'brandon ashley', 'randy woods', 'blake griffin', 'vincent yarbrough', 'james johnson', 'alex acker', 'ricky rubio', 'iman shumpert', 'cezary trybanski', "makhtar n'diaye", 'lionel chalmers', 'todd mundt', 'julius randle', 'marcus liberty', 'akil mitchell', 'guy 

In [5]:
# final filtering 
# - remove people with <= 6 years in NBA
# - remove stats entries that won't be used for prediction

filtered_players = dict()
for name, player_obj in players.items():
    years_by_sal = sorted(list(player_obj.salary_by_year_end.keys()))
    years_by_stats = sorted(list(player_obj.stats_by_year_end.keys()))
    if len(years_by_sal) <= 6 or len(years_by_stats) <= 6:
        continue
    
    filtered_players[name] = player_obj
    
    print('{} \n stat {} \n sal {} \n\n'.format(player_obj.name, years_by_stats, years_by_sal))

nick anderson 
 stat [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002] 
 sal [1991, 1992, 1993, 1994, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003] 


b.j. armstrong 
 stat [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000] 
 sal [1991, 1992, 1993, 1994, 1996, 1997, 1998, 1999] 


charles barkley 
 stat [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000] 
 sal [1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000] 


dana barros 
 stat [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2004] 
 sal [1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002] 


mookie blaylock 
 stat [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002] 
 sal [1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002] 


muggsy bogues 
 stat [1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001] 
 sal [1991, 1992, 1993, 1994, 1995, 1996, 1998, 1999

In [6]:
# sanity check
print(players['lebron james'].to_dict())

{'name': 'lebron james', 'stats_by_year_end': {2004: <__main__.PlayerStats object at 0x1515741c18>, 2005: <__main__.PlayerStats object at 0x1515780e80>, 2006: <__main__.PlayerStats object at 0x15157c94a8>, 2007: <__main__.PlayerStats object at 0x1515809cf8>, 2008: <__main__.PlayerStats object at 0x110c5a320>, 2009: <__main__.PlayerStats object at 0x110c9a4a8>, 2010: <__main__.PlayerStats object at 0x1118fb9b0>, 2011: <__main__.PlayerStats object at 0x11193b8d0>, 2012: <__main__.PlayerStats object at 0x111979c88>, 2013: <__main__.PlayerStats object at 0x1119c1390>, 2014: <__main__.PlayerStats object at 0x111a0b208>, 2015: <__main__.PlayerStats object at 0x151585fef0>, 2016: <__main__.PlayerStats object at 0x15158a7940>, 2017: <__main__.PlayerStats object at 0x15158f0550>}, 'salary_by_year_end': {2005: '4320360', 2006: '4621800', 2007: '5828090', 2008: '13041250', 2009: '14410581', 2010: '15779912', 2015: '20644400', 2016: '22971000', 2017: '30963450', 2018: '33285709', 2012: '16022500',

In [7]:
to_dump = dict()
for name in filtered_players:
    player_obj = players[name].to_dict()
    player_obj['stats_by_year_end'] = \
        {year:stats_obj.to_dict() for year,stats_obj in player_obj['stats_by_year_end'].items()}
    to_dump[name] = player_obj

with open('nba.json', 'w') as file:
    json.dump(to_dump, file)

In [8]:
len(to_dump)

609