In [1]:
import pandas as pd
import numpy as np

In [60]:
df_21= pd.read_csv('data21.csv')
df_19 = pd.read_csv('data19.csv')

In [3]:
list(df_21.columns)

['and_ones',
 'assist_percentage',
 'assists',
 'assists_per_poss',
 'block_percentage',
 'blocking_fouls',
 'blocks',
 'blocks_per_poss',
 'box_plus_minus',
 'center_percentage',
 'defensive_box_plus_minus',
 'defensive_rebound_percentage',
 'defensive_rebounds',
 'defensive_rebounds_per_poss',
 'defensive_win_shares',
 'dunks',
 'effective_field_goal_percentage',
 'field_goal_attempts',
 'field_goal_attempts_per_poss',
 'field_goal_perc_sixteen_foot_plus_two_pointers',
 'field_goal_perc_ten_to_sixteen_feet',
 'field_goal_perc_three_to_ten_feet',
 'field_goal_perc_zero_to_three_feet',
 'field_goal_percentage',
 'field_goals',
 'field_goals_per_poss',
 'free_throw_attempt_rate',
 'free_throw_attempts',
 'free_throw_attempts_per_poss',
 'free_throw_percentage',
 'free_throws',
 'free_throws_per_poss',
 'games_played',
 'games_started',
 'half_court_heaves',
 'half_court_heaves_made',
 'height',
 'lost_ball_turnovers',
 'minutes_played',
 'nationality',
 'net_plus_minus',
 'offensive_box

Notre objectif est d'entraîner un modèle de régression sur les données de la saison NBA 2018-2019, pour ensuite prédire les salaires de la saison 2020-2021 (nous n'utilisons pas la saison 2019-2020 qui a été perturbée par la crise du COVID). Pour ce faire, nous devons d'abord normaliser les salaires à cause de l'inflation.

In [63]:
df_19['salary']=(df_19['salary']-df_19['salary'].mean())/df_19['salary'].std()
df_19['salary']

0      2.170112
1      0.999685
2     -0.720051
3     -0.741359
4     -0.859140
         ...   
617         NaN
618    0.144318
619   -0.717232
620   -0.745048
621    1.509498
Name: salary, Length: 622, dtype: float64

# Machine Learning with Scikit Learn

### Creation of X (feature Matrix)

Nous avons sélectionné les quelques variables explicatives présentes dans nos données qui nous semblaient être les plus pertinentes pour juger la valeur salariale d'un joueur. De plus, nombre de variables sont très corrélées entre elles, ainsi les utiliser ensemble n'aurait que peu d'intérêt. Nous ne considérons également que les joueurs ayant comptabilisé plus de 20 matchs sur la saison en question.

In [64]:
x = df_19[df_19['games_played']>=20][['points_per_poss','total_rebounds_per_poss','assists_per_poss','win_shares','player_efficiency_rating']]
x.dropna()
x

Unnamed: 0,points_per_poss,total_rebounds_per_poss,assists_per_poss,win_shares,player_efficiency_rating
0,39.3,17.7,8.4,14.4,30.9
1,25.4,7.4,8.8,8.2,19.3
2,25.4,7.3,5.2,6.5,17.8
3,16.7,8.3,3.8,2.1,10.9
4,13.7,4.4,6.4,0.1,8.0
...,...,...,...,...,...
617,20.0,8.1,1.6,0.7,11.9
618,12.7,5.8,5.6,2.0,11.7
619,16.1,13.4,4.0,6.0,17.3
620,16.0,11.7,1.5,2.1,11.2


### Creation of Y (target)

In [65]:
y = df_19[df_19['games_played']>=20]['salary']
y

0      2.170112
1      0.999685
2     -0.720051
3     -0.741359
4     -0.859140
         ...   
617         NaN
618    0.144318
619   -0.717232
620   -0.745048
621    1.509498
Name: salary, Length: 503, dtype: float64

Ici nous identifions puis retirons de y et de x les joueurs pour lesquels le salaire n'est pas disponible.

In [66]:
index_with_nan = y.index[y.isna()]
print(index_with_nan)

Int64Index([46, 61, 75, 121, 366, 392, 557, 617], dtype='int64')


In [67]:
y = y.dropna()
y

0      2.170112
1      0.999685
2     -0.720051
3     -0.741359
4     -0.859140
         ...   
616   -0.720051
618    0.144318
619   -0.717232
620   -0.745048
621    1.509498
Name: salary, Length: 495, dtype: float64

In [68]:
x = x.drop(index_with_nan)
x

Unnamed: 0,points_per_poss,total_rebounds_per_poss,assists_per_poss,win_shares,player_efficiency_rating
0,39.3,17.7,8.4,14.4,30.9
1,25.4,7.4,8.8,8.2,19.3
2,25.4,7.3,5.2,6.5,17.8
3,16.7,8.3,3.8,2.1,10.9
4,13.7,4.4,6.4,0.1,8.0
...,...,...,...,...,...
616,15.1,8.7,3.2,1.5,15.3
618,12.7,5.8,5.6,2.0,11.7
619,16.1,13.4,4.0,6.0,17.3
620,16.0,11.7,1.5,2.1,11.2


## Machine Learning Regressors

Nous essayons ici différents modèles de Machine Learning utilisant des méthodes de régression pour tenter de prédire les salaires des joueurs.

In [69]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [70]:
np.random.seed(53)

In [71]:
# linear regression model
from sklearn import svm
reg1 = svm.SVR(gamma = 'scale')
reg1 = reg1.fit(x_train, y_train)

In [72]:
y_preds = reg1.predict(x_test)
y_preds

array([-0.79374661, -0.06695978, -0.30596583, -0.20287717, -0.43127398,
       -0.42563064,  0.59440527, -0.45280074,  0.08039358,  0.59441701,
        1.22392872, -0.45522089,  0.89654255, -0.29321369, -0.57265412,
        0.95792633, -0.40289142, -0.38244941,  0.91358939, -0.59722507,
       -0.51244533, -0.38000802, -0.22731431, -0.48642761, -0.14782364,
        0.49329214, -0.38227735, -0.61964316,  0.12629546, -0.25282724,
       -0.41990532, -0.70592945,  0.21096961, -0.279416  , -0.00846889,
        0.41208657,  1.07989789, -0.33522867, -0.69133674,  0.50668294,
       -0.63338027,  0.23991599,  0.12044201,  1.15698286, -0.61601309,
       -0.15374841, -0.56262384, -0.61502956, -0.31404153,  1.84234973,
       -0.41288124, -0.08446618, -0.49541913,  0.32238328,  0.2345236 ,
        1.0768893 , -0.5572122 ,  0.23311162, -0.75797244, -0.48917637,
       -0.57366783, -0.2713796 , -0.53362339, -0.57542609,  1.526832  ,
       -0.55186689, -0.24551558, -0.43702478,  0.6015282 ,  0.17

In [73]:
y_test

616   -0.720051
63    -0.274995
499    1.021514
78    -0.699769
155   -0.558632
         ...   
150   -0.688090
367   -0.518100
7      0.310451
498    0.917126
536   -0.741359
Name: salary, Length: 99, dtype: float64

In [74]:
reg1.score(x_train,y_train)

0.3219006138234836

In [75]:
reg1.score(x_test,y_test)

0.2019198020127425

In [76]:
np.random.seed(37)

In [77]:
from sklearn.linear_model import LinearRegression
reg2 = LinearRegression()
reg2 = reg2.fit(x_train,y_train)

In [78]:
y_preds2  = reg2.predict(x_test)
y_preds2

array([-7.36079416e-01,  4.42793948e-01,  1.34354356e-02, -2.31266838e-02,
       -1.82603723e-01, -1.40682892e-01,  9.08715151e-01, -1.74142035e-01,
        1.32930337e-01,  5.89750309e-01,  1.15751097e+00, -9.00743953e-02,
        9.48362015e-01,  1.31828666e-02, -2.68267869e-01,  1.37282971e+00,
       -1.41722878e-01, -2.66223504e-02,  8.24406602e-01, -3.75567858e-01,
       -2.02540702e-01,  3.23756123e-02,  3.53360263e-03, -1.59797655e-02,
        7.99388285e-02,  7.82183573e-01, -1.58451275e-01, -4.78669535e-01,
        3.44131645e-01, -3.39088521e-02,  5.10801134e-02, -3.25308083e-01,
        4.97319557e-01, -4.03237846e-02,  3.61659787e-01,  7.07411218e-01,
        1.23063565e+00, -2.98060922e-01, -4.99901333e-01,  6.10701487e-01,
       -3.61090939e-01, -3.07510567e-01,  6.39745131e-01,  6.01576576e-01,
       -4.54038080e-01,  2.86666642e-01, -2.52547457e-01, -3.76729972e-01,
        8.12854863e-03,  3.26122977e+00, -1.80485536e-01,  1.81701603e-01,
       -1.36693349e-01,  

In [79]:
reg2.score(x_train, y_train)

0.33649566840970735

In [80]:
reg2.score(x_test, y_test)

0.3341025062116448

In [21]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, accuracy_score, mean_absolute_error

In [81]:
np.random.seed(42)

In [82]:
from sklearn.ensemble import RandomForestRegressor
reg3 = RandomForestRegressor(n_estimators = 100)
reg3 = reg3.fit(x_train, y_train)

In [83]:
y_preds3 = reg3.predict(x_test)
y_preds3

array([-0.37016471,  0.45152803, -0.16327044, -0.37452549, -0.42290806,
       -0.32487172,  0.38084648, -0.32288832, -0.35133775,  0.49318885,
        1.16109653, -0.23192439,  1.2182052 ,  0.54281893, -0.34378496,
        1.08403404, -0.28548109,  1.26938703,  0.2884105 ,  0.11964951,
        0.11013772, -0.05879727,  0.47773242, -0.27928681,  0.26665701,
        0.84191268, -0.39815624,  0.23424148,  0.18490081,  0.79737903,
       -0.15194402, -0.76278331,  0.39634472, -0.37214369,  0.19636021,
        0.82385731,  1.04624278,  0.04771954, -0.50590793,  0.84644407,
       -0.44652581,  0.35517764,  0.37895462,  1.06571445,  0.30325282,
        0.34531481, -0.33695067, -0.45715541,  0.19547122,  2.66045803,
       -0.42844962,  1.28346585,  0.42002077,  0.5083175 ,  0.18823335,
        1.58997631, -0.23614884,  1.02753596, -0.44649062, -0.53213626,
       -0.36811155, -0.2850623 ,  0.32914663, -0.15445212,  1.47155898,
       -0.59266042, -0.17366671, -0.28695188,  0.93153651,  0.33

In [84]:
reg3.score(x_train, y_train)

0.905213301603017

In [85]:
reg3.score(x_test, y_test)

0.47353552889806605

Après ces premiers tests, nous nous sommes dit qu'il était très commun au basket de comparer les performances des joueurs en utilisant trois variables : les points par match, les passes par match et les rebonds par match. Nous avons alors créé ces trois nouvelles variables qui n'étaient initialement pas dans notre base de données, et les avons utilisés pour nos modèles.

### Nouvelles variables

In [86]:
x = df_19[df_19['games_played']>=20][['points_per_game','reb_per_game','ast_per_game','win_shares','player_efficiency_rating']]
x.dropna()
x

Unnamed: 0,points_per_game,reb_per_game,ast_per_game,win_shares,player_efficiency_rating
0,27.694444,12.472222,5.888889,14.4,30.9
1,15.910256,4.641026,5.512821,8.2,19.3
2,15.640625,4.500000,3.203125,6.5,17.8
3,6.413793,3.172414,1.448276,2.1,10.9
4,6.033333,1.933333,2.800000,0.1,8.0
...,...,...,...,...,...
617,4.937500,2.000000,0.406250,0.7,11.9
618,4.031250,1.828125,1.781250,2.0,11.7
619,6.250000,5.212500,1.537500,6.0,17.3
620,4.680556,3.430556,0.430556,2.1,11.2


In [87]:
x = x.drop(index_with_nan)
x

Unnamed: 0,points_per_game,reb_per_game,ast_per_game,win_shares,player_efficiency_rating
0,27.694444,12.472222,5.888889,14.4,30.9
1,15.910256,4.641026,5.512821,8.2,19.3
2,15.640625,4.500000,3.203125,6.5,17.8
3,6.413793,3.172414,1.448276,2.1,10.9
4,6.033333,1.933333,2.800000,0.1,8.0
...,...,...,...,...,...
616,5.416667,3.125000,1.166667,1.5,15.3
618,4.031250,1.828125,1.781250,2.0,11.7
619,6.250000,5.212500,1.537500,6.0,17.3
620,4.680556,3.430556,0.430556,2.1,11.2


In [88]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [89]:
np.random.seed(53)

In [90]:
# linear regression model
from sklearn import svm
reg4 = svm.SVR(gamma = 'scale')
reg4 = reg4.fit(x_train, y_train)

In [91]:
y_preds4 = reg4.predict(x_test)
y_preds4

array([ 0.86605792, -0.43828623,  0.24078132, -0.5588458 , -0.6242413 ,
       -0.62939962,  0.45519019,  2.54697478,  1.31140999, -0.61277026,
       -0.73423322,  1.70533277, -0.10215934,  1.2513915 ,  2.55056214,
        1.83839657,  0.05026471, -0.37660383,  0.91734462, -0.04065861,
       -0.61116472, -0.52464791,  0.20722269, -0.38826234, -0.05690722,
        0.05535103, -0.61116472, -0.66992603, -0.2079259 ,  0.06220281,
       -0.05823962, -0.62939962, -0.53177327, -0.5588458 ,  0.73310368,
       -0.54773408,  1.45549152, -0.62404428,  1.17727585, -0.24816068,
       -0.66476559, -0.48255817,  1.60879344, -0.0831194 , -0.27581436,
        0.03656317,  2.63845016,  0.19340421, -0.1980737 , -0.10270066,
        0.27043611,  0.41906397, -0.42214354, -0.72080569, -0.61233236,
        1.28767949,  0.40416234, -0.61406759,  0.92946403,  2.39762815,
       -0.52094052, -0.52270583,  0.78898974,  2.61790416, -0.69901445,
       -0.68145473, -0.54780686, -0.50444992, -0.46984813,  0.54

In [92]:
y_test

571    2.165219
431    0.048753
219    0.680151
103   -0.695121
418   -0.669249
         ...   
313   -0.741359
90     0.557005
407   -0.348875
270    0.616245
320   -0.576562
Name: salary, Length: 99, dtype: float64

In [93]:
reg4.score(x_train,y_train)

0.41038957687610855

In [94]:
reg4.score(x_test,y_test)

0.3164058670677353

In [95]:
np.random.seed(37)

In [96]:
from sklearn.linear_model import LinearRegression
reg5 = LinearRegression()
reg5 = reg5.fit(x_train,y_train)

In [97]:
y_preds5  = reg5.predict(x_test)
y_preds5

array([ 0.87594713, -0.24262913,  0.51643024, -0.56391517, -0.21295532,
       -0.79675902,  0.63484189,  2.07376865,  1.14593553, -0.41492328,
       -0.53345633,  1.44410765, -0.01614269,  0.9852987 ,  1.48845421,
        1.40504195,  0.27824873, -0.1607963 ,  0.82391126,  0.21160346,
       -0.25743871, -0.14315586,  0.44687444, -0.20002305,  0.4197308 ,
        0.14657513, -0.25743871, -0.49789881,  0.26270763,  0.35443156,
        0.18120163, -0.79675902, -0.24669699, -0.56391517,  0.99838043,
       -0.50154589,  1.15689148, -0.6015806 ,  1.07558223,  0.09259075,
       -0.64023335, -0.35204013,  1.17346892,  0.27060754, -0.23408407,
        0.10926051,  1.94942065,  0.41413726,  0.04377163,  0.0834005 ,
        0.46462606,  0.65727089, -0.08769279, -0.71453862, -0.5970341 ,
        1.19022553,  0.47289396, -0.2994222 ,  1.00597948,  1.92172879,
       -0.41562689, -0.23342013,  1.02567722,  1.54721528, -0.47941908,
       -0.65711673, -0.36898233, -0.46677237, -0.17169702,  0.75

In [98]:
reg5.score(x_train, y_train)

0.4127916273728065

In [99]:
reg5.score(x_test, y_test)

0.39460121698365325

In [100]:
np.random.seed(42)

In [101]:
from sklearn.ensemble import RandomForestRegressor
reg6 = RandomForestRegressor(n_estimators = 100)
reg6 = reg6.fit(x_train, y_train)

In [102]:
y_preds6 = reg6.predict(x_test)
y_preds6

array([ 1.05258302, -0.22121308,  0.45045943,  0.69873726, -0.17019739,
       -0.57552079,  0.75185503,  1.7946455 ,  0.7685569 , -0.59832194,
       -0.71960248,  0.9234782 ,  0.42165877,  0.96834411,  2.46765344,
        0.8630398 ,  0.26890094,  0.41050236,  0.21302572,  0.33206723,
       -0.20729486,  0.91405596,  0.80600913, -0.32216758,  0.05926532,
        1.02090254, -0.20729486, -0.55020071,  0.21615157,  0.02135368,
        0.01848022, -0.57552079, -0.45214655,  0.69873726,  0.38517766,
       -0.06434219,  1.60646408, -0.5981153 ,  0.54389139, -0.00567492,
       -0.37715939, -0.14385813,  0.5797212 ,  0.68513808, -0.20402413,
       -0.44564164,  2.75231123,  0.26713019, -0.04774118,  0.10888796,
        0.52446404,  0.61073351, -0.25438406, -0.72981923,  0.76940983,
        1.84603997,  0.12808569, -0.14742566,  0.57474463,  2.02399131,
        0.56388   , -0.46647188,  1.71565642,  2.75759274, -0.61182753,
       -0.74063633, -0.56464386,  0.07224971, -0.39601032,  0.89

In [103]:
reg6.score(x_train, y_train)

0.9201488064846495

In [104]:
reg6.score(x_test, y_test)

0.4494517842637

Après avoir essayer trois modèles, le plus efficace (lorsque l'on se base sur le R-squared de la régression) semble être le LinearRegression. On va alors maintenant, en utilisant ce modèle, faire varier certains de ses paramètres pour optimiser son utilisation.

## Improving our model

In [56]:
reg5.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

On peut essayer de faire varier le paramètre "n_estimators".

In [105]:
L = []
for i in range (10, 210, 10):
    np.random.seed(25)
    reg = RandomForestRegressor(n_estimators = i)
    reg = reg.fit(x_train, y_train)
    r2 = reg.score(x_test,y_test)
    print(i, "R squared =", r2)
    L.append(r2)
print("n_opti =",(L.index(max(L))+1)*10)

10 R squared = 0.3875544937323129
20 R squared = 0.451044607052271
30 R squared = 0.4150744208495334
40 R squared = 0.42135518453156684
50 R squared = 0.4258983354364321
60 R squared = 0.4296370519182392
70 R squared = 0.4280290728440568
80 R squared = 0.42493737614028293
90 R squared = 0.4388163839106439
100 R squared = 0.435989178126314
110 R squared = 0.4384026837523035
120 R squared = 0.4404406849113326
130 R squared = 0.43914688678346714
140 R squared = 0.4446712926614821
150 R squared = 0.4453450360216614
160 R squared = 0.4463253118654794
170 R squared = 0.44443673920506044
180 R squared = 0.4449627916009522
190 R squared = 0.4479098990663627
200 R squared = 0.4491777769286567
n_opti = 20


Suite à cet essai, on constate que les résultats du modèle varient très peu entre des valeurs différentes de n_estimators.

In [129]:
df_new = df_21[df_21['games_played']>=20]

In [130]:
mn = df_new['salary'].mean()
st = df_new['salary'].std()

In [131]:
df_new['predicted salary'] = mn + st*reg6.predict(df_21[df_21['games_played']>=20][['points_per_game','reb_per_game','ast_per_game','win_shares','player_efficiency_rating']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [132]:
df_new

Unnamed: 0,and_ones,assist_percentage,assists,assists_per_poss,block_percentage,blocking_fouls,blocks,blocks_per_poss,box_plus_minus,center_percentage,...,value_over_replacement_player,weight,win_shares,win_shares_per_48_minutes,name,team,points_per_game,reb_per_game,ast_per_game,predicted salary
1,,28.7,357.0,8.3,3.2,,73.0,1.7,9.0,0,...,5.6,242,10.2,0.244,Giannis Antetokounmpo,MIL,28.147541,11.000000,5.852459,3.404077e+07
2,,10.3,46.0,3.9,1.6,,10.0,0.9,-3.5,0,...,-0.2,219,0.7,0.057,Thanasis Antetokounmpo,MIL,2.912281,2.175439,0.807018,4.013782e+06
3,,22.2,188.0,7.8,0.1,,1.0,0.0,-2.1,0,...,0.0,183,2.1,0.088,D.J. Augustin,MIL,7.684211,1.649123,3.298246,6.930539e+06
5,,6.3,81.0,2.4,1.3,,23.0,0.7,0.0,0,...,0.8,209,3.5,0.108,Pat Connaughton,MIL,6.782609,4.811594,1.173913,4.433764e+06
6,,7.9,49.0,3.0,2.9,,26.0,1.6,0.4,0,...,0.5,221,2.2,0.133,Torrey Craig,MIL,5.480000,3.940000,0.980000,3.914598e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,,15.4,99.0,4.7,0.8,,9.0,0.4,-0.5,0,...,0.4,194,1.9,0.094,Jordan Poole,GSW,12.000000,1.803922,1.941176,1.022029e+07
622,,18.1,150.0,6.4,2.1,,26.0,1.1,0.8,0,...,0.8,209,3.0,0.130,Juan Toscano-Anderson,GSW,5.679245,4.415094,2.830189,9.795332e+06
623,,22.5,174.0,7.9,1.0,,11.0,0.5,-4.0,0,...,-0.5,210,0.9,0.039,Brad Wanamaker,GSW,5.508197,1.721311,2.852459,2.513736e+06
624,,10.9,167.0,3.3,2.6,,70.0,1.4,-0.4,0,...,1.0,197,3.9,0.080,Andrew Wiggins,GSW,18.591549,4.887324,2.352113,1.604607e+07


In [133]:
df_new['gap salaries']= df_new['salary'] - df_new['predicted salary']
df_new['error ratio']=df_new['gap salaries']/df_new['salary']
df_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,and_ones,assist_percentage,assists,assists_per_poss,block_percentage,blocking_fouls,blocks,blocks_per_poss,box_plus_minus,center_percentage,...,win_shares,win_shares_per_48_minutes,name,team,points_per_game,reb_per_game,ast_per_game,predicted salary,gap salaries,error ratio
1,,28.7,357.0,8.3,3.2,,73.0,1.7,9.0,0,...,10.2,0.244,Giannis Antetokounmpo,MIL,28.147541,11.000000,5.852459,3.404077e+07,-6.512680e+06,-0.236583
2,,10.3,46.0,3.9,1.6,,10.0,0.9,-3.5,0,...,0.7,0.057,Thanasis Antetokounmpo,MIL,2.912281,2.175439,0.807018,4.013782e+06,-2.312189e+06,-1.358838
3,,22.2,188.0,7.8,0.1,,1.0,0.0,-2.1,0,...,2.1,0.088,D.J. Augustin,MIL,7.684211,1.649123,3.298246,6.930539e+06,-2.638721e+05,-0.039581
5,,6.3,81.0,2.4,1.3,,23.0,0.7,0.0,0,...,3.5,0.108,Pat Connaughton,MIL,6.782609,4.811594,1.173913,4.433764e+06,5.045087e+05,0.102163
6,,7.9,49.0,3.0,2.9,,26.0,1.6,0.4,0,...,2.2,0.133,Torrey Craig,MIL,5.480000,3.940000,0.980000,3.914598e+06,-2.235744e+06,-1.331708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,,15.4,99.0,4.7,0.8,,9.0,0.4,-0.5,0,...,1.9,0.094,Jordan Poole,GSW,12.000000,1.803922,1.941176,1.022029e+07,-8.157012e+06,-3.953420
622,,18.1,150.0,6.4,2.1,,26.0,1.1,0.8,0,...,3.0,0.130,Juan Toscano-Anderson,GSW,5.679245,4.415094,2.830189,9.795332e+06,-9.437037e+06,-26.338736
623,,22.5,174.0,7.9,1.0,,11.0,0.5,-4.0,0,...,0.9,0.039,Brad Wanamaker,GSW,5.508197,1.721311,2.852459,2.513736e+06,-2.637356e+05,-0.117216
624,,10.9,167.0,3.3,2.6,,70.0,1.4,-0.4,0,...,3.9,0.080,Andrew Wiggins,GSW,18.591549,4.887324,2.352113,1.604607e+07,1.349594e+07,0.456839


In [135]:
df_new.groupby('team')['gap salaries'].sum()

team
ATL   -3.739311e+07
BOS   -2.634409e+07
BRK    1.803186e+07
CHI   -2.016370e+07
CHO   -3.571098e+07
CLE   -2.353342e+07
DAL   -1.787834e+07
DEN   -1.657767e+07
DET   -3.309845e+07
GSW   -1.135810e+07
HOU   -1.800856e+07
IND   -1.036042e+07
LAC    5.542773e+06
LAL    4.483596e+06
MEM   -3.550914e+07
MIA   -2.247075e+07
MIL   -1.797330e+07
MIN    8.050525e+06
NOP   -2.127725e+07
NYK   -4.829240e+07
OKC   -8.637077e+07
ORL   -1.350037e+07
PHI   -1.243709e+07
PHO   -9.267352e+06
POR   -8.757811e+06
SAC   -2.011918e+07
SAS    1.175127e+05
TOR   -2.698284e+07
UTA    4.778194e+06
WAS    5.125928e+05
Name: gap salaries, dtype: float64