In [1]:
import pandas as pd
import time
from nba_api.stats.endpoints import LeagueDashPlayerStats
from nba_api.stats.endpoints import CommonPlayerInfo
from nba_api.stats.static import players
import random

In [2]:
# Download per-game regular-season stats for every season and collect one
# DataFrame per season in `all_stats` (concatenated in a later cell).
all_stats = [] # will be iteratively appended to
failed_seasons = [] # seasons whose request raised, so gaps are visible afterwards
for year in range(1980, 2025): # approx 4.5 min to run
    # seasons in api are formatted '1980-81' and so on (two-digit end year, e.g. '1999-00')
    season_str = f"{year}-{(year + 1) % 100:02d}"
    print(f"Working on year: {season_str}")
    try:
        data = LeagueDashPlayerStats(
            season = season_str,
            season_type_all_star = "Regular Season",
            per_mode_detailed = "PerGame",
            league_id_nullable = "00" # presumably the NBA league id -- confirm against nba_api docs
        ).get_data_frames()
        season_df = data[0] # only the first returned frame is used
        season_df["SEASON"] = season_str # tag rows so seasons stay distinguishable after concat
        all_stats.append(season_df)
    except Exception as e:
        # Best-effort: keep going with the next season, but report what went
        # wrong instead of discarding the exception.
        failed_seasons.append(season_str)
        print(f"Error on season: {season_str} ({type(e).__name__}: {e})")
    time.sleep(0.8) # pause between requests
if failed_seasons:
    print(f"Seasons that failed and are missing from all_stats: {failed_seasons}")

Working on year: 1980-81
Working on year: 1981-82
Working on year: 1982-83
Working on year: 1983-84
Working on year: 1984-85
Working on year: 1985-86
Working on year: 1986-87
Working on year: 1987-88
Working on year: 1988-89
Working on year: 1989-90
Working on year: 1990-91
Working on year: 1991-92
Working on year: 1992-93
Working on year: 1993-94
Working on year: 1994-95
Working on year: 1995-96
Working on year: 1996-97
Working on year: 1997-98
Working on year: 1998-99
Working on year: 1999-00
Working on year: 2000-01
Working on year: 2001-02
Working on year: 2002-03
Working on year: 2003-04
Working on year: 2004-05
Working on year: 2005-06
Working on year: 2006-07
Working on year: 2007-08
Working on year: 2008-09
Working on year: 2009-10
Working on year: 2010-11
Working on year: 2011-12
Working on year: 2012-13
Working on year: 2013-14
Working on year: 2014-15
Working on year: 2015-16
Working on year: 2016-17
Working on year: 2017-18
Working on year: 2018-19
Working on year: 2019-20


In [4]:
# Stack the per-season frames into a single table with a fresh 0..n-1 index.
pd_stats = pd.concat(all_stats, ignore_index = True)

# Keep only rows whose MIN value is at least 20; .copy() makes the filtered
# frame independent of pd_stats.
played_enough = pd_stats['MIN'] >= 20.0
pd_stats_final = pd_stats[played_enough].copy()

# Reorder columns so SEASON comes first, all others keep their relative order.
other_columns = [name for name in pd_stats_final.columns if name != 'SEASON']
pd_stats_final = pd_stats_final[['SEASON'] + other_columns]

# Persist the filtered table without the row index.
pd_stats_final.to_csv("base_player_stats.csv", index = False)

  pd_stats = pd.concat(all_stats, ignore_index = True)


In [5]:
# Accumulator for the per-player attribute dicts filled by the loops below.
p_attributes = []

# Distinct player ids (as ints) among the filtered stat rows.
player_id_column = pd_stats_final["PLAYER_ID"].astype(int)
pids = player_id_column.unique()

# Static player list and an id -> full name lookup built from it.
# NOTE(review): p_map is not referenced again in the visible cells.
roster = players.get_players()
p_map = dict((entry['id'], entry['full_name']) for entry in roster)

Split the player ids into fourths because of the API bottleneck.
Requests get rate-limited if the sleep time is very low; with a sleep time of 0.8-1.5 seconds, fetching all the unique ids takes over 1300 seconds.

In [6]:
# Cut the id array in two at its midpoint (integer division, so the second
# half receives the extra element when the length is odd) ...
half_at = len(pids) // 2
first_half, second_half = pids[:half_at], pids[half_at:]

# ... then cut each half in two the same way to obtain the four quarters.
q1_end = len(first_half) // 2
first_quarter, second_quarter = first_half[:q1_end], first_half[q1_end:]

q3_end = len(second_half) // 2
third_quarter, fourth_quarter = second_half[:q3_end], second_half[q3_end:]

In [None]:
# Fetch position/height/weight for every player id in the first quarter.
# Start from an empty list so that re-running this cell cannot append the same
# players a second time.
p_attributes = []
for pid in first_quarter:
    try:
        print(f"{pid}") # progress marker: id currently being requested
        info = CommonPlayerInfo(player_id = pid,
                                timeout = 60).get_normalized_dict()
        pdata = info['CommonPlayerInfo'][0] # first record of the CommonPlayerInfo result set
        attributes = {'PLAYER_ID': pid, 'POSITION': pdata.get('POSITION'), 'HEIGHT': pdata.get('HEIGHT'), 'WEIGHT': pdata.get('WEIGHT')}
        p_attributes.append(attributes)
        time.sleep(random.uniform(0.8, 1.5)) # randomized pause between requests
    except Exception as e:
        # Still stop at the first failure, but say which id failed and why so
        # that a truncated result is not mistaken for a complete one.
        print(f"Stopping at player {pid}: {type(e).__name__}: {e}")
        time.sleep(5)
        break

920
275
947
297
457
358
193
324
952
213
686
288
717
422
769
182
902
898
448
682
184
258
216
735
179
935
433
384
61
435
76
787
891
369
164
201
423
168
904
185
363
221
361
17
722
905
107
757
105
344
198
330
239
764
133
1051
244
209
23
192
157
400
934
365
96
87
129
1122
57
28
224
922
376
677
1065
911
280
56
718
248
45
786
49
779
299
255
246
21
65
731
165
265
906
765
270
73
1134
375
469
467
420
204
760
711
754
247
693
317
304
271
436
252
173
383
72
954
708
134
788
234
703
111
913
84
195
283
953
919
26
51
907
948
53
95
707
349
899
55
761
1074
511
714
893
63
782
42
302
177
203
98
89
178
901
136
121
442
461
739
951
397
364
296
894
22
109
123
393
915
166
938
964
208
417
64
390
281
937
251
406
949
762
431
428
914
950
70
120
210
371
896
103
339
67
389
763
145
696
895
238
137
452
219
124
702
1005
445
781
243
228
673
924
1533
1498
183
1517
699
1510
1477
1538
1497
685
713
456
724
1504
353
1507
965
923
695
386
15
1540
956
458
93
147
1114
961
1496
977
441
1444
1508
688
1565
932
1073
1500
694
955
278


In [None]:
# Turn the collected attribute dicts into a table (one row per dict) and
# write it to q1.csv without the row index.
pd_attributes1 = pd.DataFrame(data = p_attributes)
pd_attributes1.to_csv(path_or_buf = "q1.csv", index = False)

In [6]:
# Fetch position/height/weight for every player id in the second quarter.
# The list is reset first so only this quarter's players end up in it.
p_attributes = []
for pid in second_quarter:
    try:
        print(f"{pid}") # progress marker: id currently being requested
        info = CommonPlayerInfo(player_id = pid,
                                timeout = 60).get_normalized_dict()
        pdata = info['CommonPlayerInfo'][0] # first record of the CommonPlayerInfo result set
        attributes = {'PLAYER_ID': pid, 'POSITION': pdata.get('POSITION'), 'HEIGHT': pdata.get('HEIGHT'), 'WEIGHT': pdata.get('WEIGHT')}
        p_attributes.append(attributes)
        time.sleep(random.uniform(0.8, 1.5)) # randomized pause between requests
    except Exception as e:
        # Still stop at the first failure, but say which id failed and why so
        # that a truncated result is not mistaken for a complete one.
        print(f"Stopping at player {pid}: {type(e).__name__}: {e}")
        time.sleep(5)
        break

1719
1802
1724
2084
43
1088
2042
2032
77
1023
1761
1751
979
2030
446
990
1531
2033
2034
983
2050
1727
1740
1888
1728
1513
1536
1905
2217
2036
1894
2035
2046
1904
1901
2204
1944
2059
2240
2045
1956
1521
2054
2224
2037
2260
2202
1902
2207
1886
1908
1563
1924
2072
1737
2200
2047
2210
1729
1934
1763
2203
2049
2031
1928
2225
2239
1731
2209
2405
2430
2406
2402
2245
2051
2400
1921
2452
2048
2215
2398
1741
2454
2198
2124
1938
2060
2229
1960
2403
1542
1747
2446
2501
2206
1742
2211
2199
2428
1735
2397
2250
2564
2073
2306
2546
2418
2547
1544
2549
2404
1800
2052
2548
1863
2248
2201
2041
2436
2063
2407
2553
2044
2412
2038
2422
2572
2413
2586
2550
2544
2571
1548
2043
2246
2399
1952
2223
2098
2551
2419
2617
2216
2738
2804
2732
2243
2365
2768
2424
2429
2443
2730
2731
2410
2222
2747
2749
2078
2735
2746
2416
2693
2594
2557
2736
2605
2408
2590
2569
2420
2056
2624
1594
2733
101106
101112
2294
101111
101108
101122
2561
2753
101114
2484
2734
1922
2592
101127
101181
2755
2039
101129
2556
101107
2588
2554
101

In [7]:
# Turn the collected attribute dicts into a table (one row per dict) and
# write it to q2.csv without the row index.
pd_attributes2 = pd.DataFrame(data = p_attributes)
pd_attributes2.to_csv(path_or_buf = "q2.csv", index = False)

In [6]:
# Fetch position/height/weight for every player id in the third quarter.
# p_attributes is shared by all the quarter cells; reset it here so that the
# rows appended by an earlier cell in the same kernel session do not also end
# up in this quarter's output.
p_attributes = []
for pid in third_quarter:
    try:
        print(f"{pid}") # progress marker: id currently being requested
        info = CommonPlayerInfo(player_id = pid,
                                timeout = 60).get_normalized_dict()
        pdata = info['CommonPlayerInfo'][0] # first record of the CommonPlayerInfo result set
        attributes = {'PLAYER_ID': pid, 'POSITION': pdata.get('POSITION'), 'HEIGHT': pdata.get('HEIGHT'), 'WEIGHT': pdata.get('WEIGHT')}
        p_attributes.append(attributes)
        time.sleep(random.uniform(0.8, 1.5)) # randomized pause between requests
    except Exception as e:
        # Still stop at the first failure, but say which id failed and why so
        # that a truncated result is not mistaken for a complete one.
        print(f"Stopping at player {pid}: {type(e).__name__}: {e}")
        time.sleep(5)
        break

2853
101141
201588
201611
201581
200755
201935
201162
201973
201938
201950
201977
201587
201802
201956
202130
201579
200769
201939
201959
201944
201951
201936
202083
101198
202087
101161
201948
201933
101138
201963
201967
201589
201599
201971
202326
202334
202339
202323
202390
201945
202328
201580
201949
201573
201975
202322
202348
201177
2743
202361
101162
202331
201583
201970
2756
201586
200761
201603
201962
201946
202325
202329
101187
202340
202386
202687
202688
201957
202718
202698
202810
202345
201980
201974
202324
202682
202388
202327
201609
202330
202349
202970
202697
202738
101189
201160
201952
202391
202406
202695
202689
202702
202338
202691
202681
201991
202705
201578
201593
202335
201937
201958
200749
202344
202684
201985
203144
203083
203076
203085
203078
203081
203079
202734
201947
202066
203084
202710
202685
201585
202713
202362
202336
202730
202693
203090
203077
202696
201600
200782
202347
201577
200779
201168
202699
203092
201961
202692
203463
203148
201960
203110
20268

In [7]:
# Turn the collected attribute dicts into a table (one row per dict) and
# write it to q3.csv without the row index.
pd_attributes3 = pd.DataFrame(data = p_attributes)
pd_attributes3.to_csv(path_or_buf = "q3.csv", index = False)

In [7]:
# Fetch position/height/weight for every player id in the fourth quarter.
# p_attributes is shared by all the quarter cells; reset it here so that the
# rows appended by an earlier cell in the same kernel session do not also end
# up in this quarter's output.
p_attributes = []
for pid in fourth_quarter:
    try:
        print(f"{pid}") # progress marker: id currently being requested
        info = CommonPlayerInfo(player_id = pid,
                                timeout = 60).get_normalized_dict()
        pdata = info['CommonPlayerInfo'][0] # first record of the CommonPlayerInfo result set
        attributes = {'PLAYER_ID': pid, 'POSITION': pdata.get('POSITION'), 'HEIGHT': pdata.get('HEIGHT'), 'WEIGHT': pdata.get('WEIGHT')}
        p_attributes.append(attributes)
        time.sleep(random.uniform(0.8, 1.5)) # randomized pause between requests
    except Exception as e:
        # Still stop at the first failure, but say which id failed and why so
        # that a truncated result is not mistaken for a complete one.
        print(f"Stopping at player {pid}: {type(e).__name__}: {e}")
        time.sleep(5)
        break
# Table built from whatever was collected above (one row per dict).
final_attributes = pd.DataFrame(p_attributes)

1626179
203107
203584
1627755
1627820
1628504
1627936
1629019
1628389
203998
1627854
1626224
1628990
1629012
1628422
1627863
202722
1627761
1629028
1628401
203960
1628973
1628991
1629203
1628371
1629006
1628443
1629026
1628989
1628995
1629013
1629029
1627736
1628365
1628963
1628467
1628969
1628970
1629011
1626149
1628420
203943
1626192
1626154
1629066
1626220
1627853
1628983
1628390
1628418
1628470
1629027
1626203
1626145
1627215
1628976
1628988
1629014
203382
1629634
1628971
1629629
1629661
1626174
1629632
1628464
1629636
1629631
1627884
1628984
1628978
1629130
1629672
1627788
1629018
1629630
1627774
1629633
203895
1629673
1629308
1629134
1629645
1629065
1629652
1628370
1628539
1626181
1629023
1627775
1629628
1629060
1629003
1629004
1626168
1628972
1629639
1628380
1629627
1630197
1630162
1629717
1628419
1626184
1628449
1629643
1630175
1629647
1629001
1630166
1630217
1628407
1629091
1630267
1630466
1628960
1628977
1630171
1629676
1630191
1627826
1630183
1630256
1627751
1630164
1630206


In [8]:
# Turn the collected attribute dicts into a table (one row per dict) and
# write it to q4.csv without the row index.
pd_attributes4 = pd.DataFrame(data = p_attributes)
pd_attributes4.to_csv(path_or_buf = "q4.csv", index = False)

Concatenate the four quarter files (q1, q2, q3, q4) into a single player-traits table.

In [12]:
# Read the four per-quarter CSVs back from disk and combine them into one
# player-traits table named `full` (used by the merge cell below).
quarter_files = ["q1.csv", "q2.csv", "q3.csv", "q4.csv"]
full_app = [pd.read_csv(file) for file in quarter_files]
full = (
    pd.concat(full_app, ignore_index=True)
    # A player must appear only once: `full` is joined on PLAYER_ID next, and
    # a repeated id would repeat every matching stat row.
    .drop_duplicates(subset="PLAYER_ID")
    .reset_index(drop=True)
)
# index=False matches the other CSVs written by this notebook and avoids a
# meaningless unnamed index column in the file.
full.to_csv("player_traits.csv", index=False)

Merge the player stats with the player traits on the player id, which is the only truly distinct key.

In [14]:
# Attach each player's traits to every one of their per-season stat rows.
playerstats = pd.read_csv("base_player_stats.csv")
# Guarantee one traits row per player so the join cannot multiply stat rows.
traits = full.drop_duplicates(subset = "PLAYER_ID")
# Default inner join: stat rows whose PLAYER_ID has no traits row are dropped.
# validate asserts the many-stat-rows-to-one-traits-row shape.
final = pd.merge(playerstats, traits, on = "PLAYER_ID", validate = "many_to_one")
# NOTE(review): the file name says 1996-2025 while the download loop starts at
# 1980 -- confirm which range is intended before renaming anything.
# index=False avoids writing a meaningless unnamed index column.
final.to_csv("all_player_stats_1996-2025.csv", index = False)