In [1]:
import json
import pprint
import csv

# Shared pretty-printer used by later cells; the very large width (700) lets long
# entries stay on a single output line instead of being wrapped.
pp = pprint.PrettyPrinter(indent=4, width = 700)

In [2]:
# Number of salary buckets computed per year (4 buckets = quartiles).
NUM_BUCKETS = 4

In [3]:
import numpy as np

def find_cutoffs_by_num_buckets(data, num_buckets):
    """Return the upper cutoff value of each of ``num_buckets`` equal-percentile buckets.

    The i-th cutoff (1-based) is the ``100 * i / num_buckets`` percentile of
    ``data``, so the last cutoff is always the maximum of ``data``.

    Args:
        data: Non-empty iterable of numbers. It does not need to be sorted.
        num_buckets: Number of buckets to split the data into; must be >= 1.

    Returns:
        A list of ``num_buckets`` numpy floats in non-decreasing order.

    Raises:
        ValueError: If ``num_buckets`` is smaller than 1 or ``data`` is empty.
    """
    if num_buckets < 1:
        raise ValueError('num_buckets must be at least 1, got {}'.format(num_buckets))

    values = np.array(list(data))
    if values.size == 0:
        raise ValueError('data must contain at least one value')

    # Compute every percentile directly from its index rather than by repeatedly
    # adding 100 / num_buckets: accumulated rounding error could leave the final
    # percentile slightly below 100, which would put the largest value above the
    # top cutoff. With this form the last percentile is exactly 100.0.
    percentiles = [100.0 * bucket / num_buckets for bucket in range(1, num_buckets + 1)]

    # np.percentile does not require sorted input, so no explicit sort is needed.
    return list(np.percentile(values, percentiles))
    
# Quick sanity check: print the quartile cutoffs of the integers 0 through 10.
print(find_cutoffs_by_num_buckets([0,1,2,3,4,5,6,7,8,9,10], 4))

[2.5, 5.0, 7.5, 10.0]


In [4]:
# Accumulator for the per-player, per-season rows that a later cell fills in.
player_data_by_year = []

# Load the raw player records. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's default text encoding.
with open('nba.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [5]:
# Group every recorded salary under its year key, across all players.
all_salaries_by_year = {}
for player_obj in data.values():
    for year, salary in player_obj['salary_by_year_end'].items():
        all_salaries_by_year.setdefault(year, []).append(salary)

# year -> list of NUM_BUCKETS cutoffs computed from that year's salaries
# (each salary is converted to int before the cutoffs are computed).
cutoffs_by_year = {
    year: find_cutoffs_by_num_buckets([int(raw) for raw in salaries], NUM_BUCKETS)
    for year, salaries in all_salaries_by_year.items()
}

pp.pprint(cutoffs_by_year)

{   '1991': [500000.0, 850000.0, 1400000.0, 4250000.0],
    '1992': [518750.0, 941000.0, 1662500.0, 3786000.0],
    '1993': [625000.0, 1100000.0, 1900000.0, 5720000.0],
    '1994': [739500.0, 1265500.0, 2271500.0, 5740000.0],
    '1995': [845000.0, 1500000.0, 2587475.0, 7300000.0],
    '1996': [955000.0, 1702000.0, 2881500.0, 18724000.0],
    '1997': [876000.0, 1703000.0, 3087500.0, 30140000.0],
    '1998': [999430.0, 1933440.0, 3466125.0, 33140000.0],
    '1999': [1000000.0, 1923000.0, 3725000.0, 18500000.0],
    '2000': [1100000.0, 2000000.0, 4082500.0, 17142858.0],
    '2001': [1200000.0, 2290000.0, 4806012.5, 19610000.0],
    '2002': [1188810.0, 2746752.5, 5312500.0, 22400000.0],
    '2003': [1100000.0, 3000000.0, 5694231.0, 25200000.0],
    '2004': [1070000.0, 2940625.0, 5832812.5, 28000000.0],
    '2005': [1504272.0, 3371393.0, 6187500.0, 27696430.0],
    '2006': [1661250.75, 3500000.0, 6850328.75, 20000000.0],
    '2007': [1750000.0, 3661207.0, 6807000.0, 21000000.0],
    '2008'

In [6]:
# Build one row per (player, season played):
#   [name, year_end, salary, salary_bucket, stat values...]
# then drop incomplete rows and sanity-check the assigned bucket numbers.

# Number of placeholder cells added when a season has no stats entry; equals the
# number of stat columns (4-16) in the row layout documented below.
NUM_STAT_COLUMNS = 13


def _assign_salary_bucket(salary, cutoffs):
    """Return the 1-based bucket number of ``salary`` for the given cutoffs.

    Walks ``cutoffs`` in order and moves up one bucket for every cutoff that
    ``salary`` strictly exceeds, stopping at the first cutoff it does not
    exceed. A salary equal to a cutoff therefore stays in the lower bucket, and
    a salary above every cutoff yields ``len(cutoffs) + 1``.
    """
    bucket = 1
    for cutoff in cutoffs:
        if salary > cutoff:
            bucket += 1
        else:
            break
    return bucket


# Start from an empty list inside this cell so that re-running the cell rebuilds
# the rows instead of appending a second copy to whatever is already there.
player_data_by_year = []

for player_obj in data.values():
    player_name = player_obj['name']
    salary_by_year_end = player_obj['salary_by_year_end']
    stats_by_year_end = player_obj['stats_by_year_end']

    for year_end in player_obj['year_ends_played']:
        player_year_data = [player_name, year_end]

        # Salary and bucket: the salary is looked up under the FOLLOWING year's
        # key (year_end + 1) and bucketed with that same year's cutoffs.
        salary_key = str(year_end + 1)
        if salary_key in salary_by_year_end:
            salary = int(salary_by_year_end[salary_key])
            player_year_data.append(salary)
            player_year_data.append(_assign_salary_bucket(salary, cutoffs_by_year[salary_key]))
        else:
            player_year_data.extend([None, None])

        # Stats: values are appended in the order the stats dict yields them.
        # NOTE(review): the column names below assume that order is the same for
        # every player and season -- confirm against nba.json.
        stats_key = str(year_end)
        if stats_key in stats_by_year_end:
            player_year_data.extend(stats_by_year_end[stats_key].values())
        else:
            player_year_data.extend([None] * NUM_STAT_COLUMNS)

        player_data_by_year.append(player_year_data)

# Row count before filtering, followed by (up to) the first three rows.
print(len(player_data_by_year))
for player_year in player_data_by_year[:3]:
    print(player_year)

# Keep only complete rows: any row containing a None (missing salary, missing
# stats, or a null stat value) is dropped.
player_data_by_year = [player_year for player_year in player_data_by_year if None not in player_year]

# A valid bucket lies in 1..NUM_BUCKETS; anything else means the salary fell
# outside the cutoffs computed for its year.
incorrect = [player_year for player_year in player_data_by_year if not 1 <= player_year[3] <= NUM_BUCKETS]

# Row layout -- keep in sync with the CSV header written at the end of the notebook.
# each row =
# (0 NAME, 1 YEAR, 2 SALARY, 3 SALARY BUCKET, 4 AST, 5 BLK, 6 DREB, 7 FT_P, 8 OREB, 9 PER, 10 PF, 11 PTS, 12 STL,
# 13 THREE_P, 14 TO, 15 TS, 16 TWO_P)

print('{} wrong entries for quartile'.format(len(incorrect)))

7390
['a.c. green', 1991, 1750000, 4, 71, 23, 315, 0.738, 201, 13.8, 117, 750, 59, 0.2, 99, 0.556, 0.507]
['a.c. green', 1992, 1750000, 3, 117, 36, 456, 0.744, 306, 16.7, 141, 1116, 91, 0.214, 111, 0.556, 0.495]
['a.c. green', 1993, 1885000, 3, 116, 39, 424, 0.739, 287, 16.3, 149, 1051, 88, 0.348, 116, 0.603, 0.55]
0 wrong entries for quartile


In [7]:
# Spot check: pretty-print the last three rows of player_data_by_year.
pp.pprint(player_data_by_year[-3:])

[['zydrunas ilgauskas', 2007, 10142156, 4, 123, 98, 357, 0.807, 242, 18.0, 257, 925, 48, 0.0, 141, 0.527, 0.486], ['zydrunas ilgauskas', 2008, 10841615, 4, 104, 120, 419, 0.802, 263, 18.7, 247, 1029, 34, 0.0, 135, 0.522, 0.475], ['zydrunas ilgauskas', 2009, 11541074, 4, 64, 84, 333, 0.799, 157, 18.0, 183, 838, 28, 0.385, 90, 0.523, 0.477]]


In [8]:
# Write the header and all player/season rows to CSV.
# newline='' is what the csv module requires for files it writes to; without it
# extra blank lines can appear between rows on platforms that translate '\n'.
# The encoding is explicit so the output does not depend on the platform default.
with open('nba_player_year.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',')
    # Header order must match the order of the values inside each row.
    writer.writerow(['name', 'year', 'salary', 'salary_bucket', 'ast', 'blk', 'dreb', 'ft_p', 'oreb', 'per',
                     'pf', 'pts', 'stl', 'three_p', 'to', 'ts', 'two_p'])
    writer.writerows(player_data_by_year)