In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from statsmodels.iolib.smpickle import load_pickle

### Dataset / preliminaries

In [2]:
# Load the per-league player tables and tag each with a league indicator
# (gender: 0 = NBA file, 1 = WNBA file) before stacking them into one frame.
# NOTE(review): the preview output shows an 'Unnamed: 0' column, so the CSVs
# appear to carry a saved index that is read back as data - confirm whether
# index_col=0 is wanted.
nba_full = pd.read_csv('DoGR-master/Data/nba_trimmed.csv').assign(gender=0)
wnba_full = pd.read_csv('DoGR-master/Data/wnba_trimmed.csv').assign(gender=1)

# Pooled table with a fresh 0..n-1 index
full = pd.concat([nba_full, wnba_full], ignore_index=True)

# Predictor columns: per-league models use `features`; pooled models also
# include the league indicator as the first column.
features = ['age', 'height', 'pts', 'ast', 'foreign']
full_features = ['gender', *features]

In [3]:
# Export for DoGR (currently disabled).
# Uncommenting the lines below selects the listed columns from each frame and
# writes them as CSVs without the index under DoGR-master/Data/.
# Note that the exported column list includes `usg_p`, which is not part of
# `features` / `full_features`.
# nba_sub = nba_full[['salary', 'age', 'height', 'pts', 'ast', 'usg_p', 'foreign']]
# wnba_sub = wnba_full[['salary', 'age', 'height', 'pts', 'ast', 'usg_p', 'foreign']]
# full_sub = full[['salary', 'gender', 'age', 'height', 'pts', 'ast', 'usg_p', 'foreign']]

# nba_sub.to_csv('DoGR-master/Data/nba.csv', index=False)
# wnba_sub.to_csv('DoGR-master/Data/wnba.csv', index=False)
# full_sub.to_csv('DoGR-master/Data/full.csv', index=False)

In [4]:
# Preview the first five NBA rows to check the column layout
nba_full.head()

Unnamed: 0,salary,age,height,weight,gp,pts,reb,ast,net_rtg,oreb_p,dreb_p,usg_p,ts_p,ast_p,foreign,gender
0,45780966,33,74,185,47,26.0,5.5,6.3,13.6,0.018,0.127,0.301,0.586,0.284,0,0
1,44211146,33,75,200,52,18.7,7.9,7.8,-3.0,0.037,0.173,0.267,0.511,0.343,0,0
2,43848000,32,77,220,44,22.5,8.0,10.2,0.6,0.026,0.179,0.277,0.576,0.399,0,0
3,41180544,37,81,250,36,29.1,7.7,6.3,-0.3,0.03,0.172,0.306,0.619,0.301,0,0
4,40918900,33,82,240,36,29.3,7.4,5.8,4.5,0.016,0.168,0.31,0.626,0.266,0,0


### Full OLS

In [5]:
# Preliminary OLS on the pooled (NBA + WNBA) table, used to judge which
# features matter. The model is fitted twice - first without and then with the
# `foreign` indicator - and both summaries are printed, separated by blank lines.
pooled_column_sets = (
    ['gender', 'age', 'height', 'pts', 'ast'],  # baseline predictors
    full_features,                              # baseline + foreign
)

for fit_no, columns in enumerate(pooled_column_sets):
    if fit_no > 0:
        print('\n\n')

    # 70/30 train/test split with a fixed seed; only the training part is fitted
    x = full[columns]
    y = full['salary']
    all_x_train, all_x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.30, random_state=24
    )

    # statsmodels OLS needs an explicit intercept column; its summary reports
    # per-feature p-values
    x2_train = sm.add_constant(all_x_train)
    est = sm.OLS(y_train, x2_train)
    est2 = est.fit()
    print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.565
Model:                            OLS   Adj. R-squared:                  0.559
Method:                 Least Squares   F-statistic:                     95.55
Date:                Mon, 21 Mar 2022   Prob (F-statistic):           2.58e-64
Time:                        10:56:51   Log-Likelihood:                -6370.7
No. Observations:                 374   AIC:                         1.275e+04
Df Residuals:                     368   BIC:                         1.278e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -3.622e+07   8.65e+06     -4.188      0.0

### NBA OLS

In [6]:
# Preliminary OLS on the NBA table only, used to judge which features matter.
# The model is fitted twice - first without and then with the `foreign`
# indicator - and both summaries are printed, separated by blank lines.
nba_column_sets = (
    ['age', 'height', 'pts', 'ast'],  # baseline predictors
    features,                         # baseline + foreign
)

for fit_no, columns in enumerate(nba_column_sets):
    if fit_no > 0:
        print('\n\n')

    # 70/30 train/test split with a fixed seed; only the training part is fitted
    x = nba_full[columns]
    y = nba_full['salary']
    all_x_train, all_x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.30, random_state=24
    )

    # statsmodels OLS needs an explicit intercept column; its summary reports
    # per-feature p-values
    x2_train = sm.add_constant(all_x_train)
    est = sm.OLS(y_train, x2_train)
    est2 = est.fit()
    print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.652
Model:                            OLS   Adj. R-squared:                  0.647
Method:                 Least Squares   F-statistic:                     122.3
Date:                Mon, 21 Mar 2022   Prob (F-statistic):           1.25e-58
Time:                        10:56:51   Log-Likelihood:                -4518.7
No. Observations:                 266   AIC:                             9047.
Df Residuals:                     261   BIC:                             9065.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -5.004e+07   9.88e+06     -5.065      0.0

### WNBA OLS

In [7]:
# Preliminary OLS on the WNBA table only, used to judge which features matter.
# The model is fitted twice - first without and then with the `foreign`
# indicator - and both summaries are printed, separated by blank lines.
wnba_column_sets = (
    ['age', 'height', 'pts', 'ast'],  # baseline predictors
    features,                         # baseline + foreign
)

for fit_no, columns in enumerate(wnba_column_sets):
    if fit_no > 0:
        print('\n\n')

    # 70/30 train/test split with a fixed seed; only the training part is fitted
    x = wnba_full[columns]
    y = wnba_full['salary']
    all_x_train, all_x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.30, random_state=24
    )

    # statsmodels OLS needs an explicit intercept column; its summary reports
    # per-feature p-values
    x2_train = sm.add_constant(all_x_train)
    est = sm.OLS(y_train, x2_train)
    est2 = est.fit()
    print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.609
Model:                            OLS   Adj. R-squared:                  0.594
Method:                 Least Squares   F-statistic:                     40.15
Date:                Mon, 21 Mar 2022   Prob (F-statistic):           3.10e-20
Time:                        10:56:51   Log-Likelihood:                -1293.2
No. Observations:                 108   AIC:                             2596.
Df Residuals:                     103   BIC:                             2610.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.086e+05   9.42e+04     -2.214      0.0

### Mixed-effects models

In [8]:
# Model comparison: NULL
# Baseline mixed model on the pooled table: salary on age, height, pts and ast
# as fixed effects, with observations grouped by the `gender` column.
null_md = smf.mixedlm(
    formula='salary ~ age + height + pts + ast',
    data=full,
    groups=full['gender'],
)
null_mdf = null_md.fit()
print(null_mdf.summary())

                      Mixed Linear Model Regression Results
Model:                  MixedLM      Dependent Variable:      salary             
No. Observations:       535          Method:                  REML               
No. Groups:             2            Scale:                   35719187953327.2031
Min. group size:        155          Log-Likelihood:          -9044.4712         
Max. group size:        380          Converged:               Yes                
Mean group size:        267.5                                                    
---------------------------------------------------------------------------------
                Coef.          Std.Err.    z    P>|z|     [0.025        0.975]   
---------------------------------------------------------------------------------
Intercept      -34068090.425 7330240.423 -4.648 0.000 -48435097.652 -19701083.198
age               370049.073   60433.695  6.123 0.000    251601.208    488496.938
height            281971.925   87305.2

In [9]:
# Model comparison: FOREIGN
# Same specification as the NULL model plus the `foreign` indicator as an
# additional fixed effect, again grouped by the `gender` column.
frgn_formula = 'salary ~ age + height + pts + ast + foreign'
frgn_md = smf.mixedlm(frgn_formula, full, groups=full['gender'])
frgn_mdf = frgn_md.fit()  # statsmodels default estimation is REML
print(frgn_mdf.summary())

# NOTE: the log-likelihoods printed in the summaries come from REML fits, and
# REML likelihoods are NOT comparable between models whose fixed effects
# differ (NULL vs. FOREIGN). For a valid comparison both specifications are
# refitted here with maximum likelihood (reml=False); among these ML values,
# less negative is better.
ml_formulas = (
    ('null', 'salary ~ age + height + pts + ast'),
    ('foreign', frgn_formula),
)
ml_llf = {
    label: smf.mixedlm(formula, full, groups=full['gender']).fit(reml=False).llf
    for label, formula in ml_formulas
}
for label, llf in ml_llf.items():
    print(f'{label:>8} ML log-likelihood: {llf:.4f}')

                      Mixed Linear Model Regression Results
Model:                  MixedLM      Dependent Variable:      salary             
No. Observations:       535          Method:                  REML               
No. Groups:             2            Scale:                   35719271420105.8984
Min. group size:        155          Log-Likelihood:          -9029.6484         
Max. group size:        380          Converged:               Yes                
Mean group size:        267.5                                                    
---------------------------------------------------------------------------------
                Coef.          Std.Err.    z    P>|z|     [0.025        0.975]   
---------------------------------------------------------------------------------
Intercept      -32340607.360 7541787.253 -4.288 0.000 -47122238.754 -17558975.965
age               372508.352   60484.396  6.159 0.000    253961.114    491055.590
height            256347.185   91039.2