In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt

#statistical testing tools
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

#regression models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn import linear_model
from sklearn import kernel_ridge
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.preprocessing import PolynomialFeatures

In [3]:
# Load the prepared dataset. The path is relative, so the workbook must sit
# in the notebook's working directory.
df = pd.read_excel('data_ready.xlsx')

In [5]:
# Predictor columns (all prefixed COL_) fed to the regressions below.
sr_inputs = [
    'COL_GP',
    'COL_PTS_PERG',
    'COL_REB_PERG',
    'COL_AST_PERG',
    'COL_FG_PCT',
    'COL_FG3_PCT',
    'COL_FT_PCT',
]

# Candidate target columns (all prefixed PRO_).
model_outputs = [
    'PRO_MIN_PERG',
    'PRO_PTS_PERG',
    'PRO_AST_PERG',
    'PRO_REB_PERG',
    'PRO_FG_PCT',
]

In [7]:
# Baseline OLS: regress PRO_AST_PERG on every column listed in sr_inputs.
# NOTE(review): no constant column is added to X, so the model is fit
# without an intercept (the summary reports "uncentered" R-squared) —
# confirm this is intended; otherwise wrap X in sm.add_constant(...).
y = df['PRO_AST_PERG']
X = df[sr_inputs]
model = sm.OLS(endog=y, exog=X)
lm = model.fit()
# Keep the summary as the cell's last expression so it renders as a table.
lm.summary()

0,1,2,3
Dep. Variable:,PRO_AST_PERG,R-squared (uncentered):,0.624
Model:,OLS,Adj. R-squared (uncentered):,0.619
Method:,Least Squares,F-statistic:,118.7
Date:,"Wed, 07 Aug 2024",Prob (F-statistic):,4.9499999999999996e-102
Time:,17:23:35,Log-Likelihood:,-741.3
No. Observations:,507,AIC:,1497.0
Df Residuals:,500,BIC:,1526.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
COL_GP,-0.0067,0.011,-0.615,0.539,-0.028,0.015
COL_PTS_PERG,0.0071,0.015,0.493,0.623,-0.021,0.036
COL_REB_PERG,-0.0472,0.028,-1.695,0.091,-0.102,0.008
COL_AST_PERG,0.3567,0.033,10.877,0.000,0.292,0.421
COL_FG_PCT,1.9222,0.887,2.168,0.031,0.180,3.664
COL_FG3_PCT,-0.2791,0.785,-0.356,0.722,-1.822,1.263
COL_FT_PCT,-0.2107,0.542,-0.389,0.698,-1.276,0.855

0,1,2,3
Omnibus:,206.373,Durbin-Watson:,2.116
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1018.75
Skew:,1.75,Prob(JB):,6.04e-222
Kurtosis:,8.998,Cond. No.,762.0


In [9]:
# Ridge regression with alpha tuned by manual trial and error.
y = df['PRO_AST_PERG']
X = df[['COL_FG_PCT', 'COL_AST_PERG']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
model = linear_model.Ridge(alpha=0.4)
# 10-fold cross-validation on the training split only; the held-out test
# split is not touched in this cell.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
print(f'mean:{scores.mean()}\nstd:{scores.std()}')

mean:0.20236226830782894
std:0.1799034307028945


In [11]:
def get_rmse(y_test, pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_test: Observed target values.
        pred: Predicted values, aligned with ``y_test``.

    Returns:
        The square root of ``mean_squared_error(y_test, pred)`` as a float.
    """
    return math.sqrt(mean_squared_error(y_test, pred))

# Brute-force comparison of regression models.
def get_scores(X, y):
    """Run every model evaluator on the same features and target.

    Each evaluator prints its own report; they are invoked in a fixed order
    so the printed output is always laid out the same way.

    Args:
        X: Feature table passed unchanged to every evaluator.
        y: Target values passed unchanged to every evaluator.

    Returns:
        None.
    """
    evaluators = (
        get_linear_regression,
        get_binomial_regression,
        get_decision_tree_regressor,
        get_random_forest_regressor,
        get_ridge_regression,
        get_lasso_regression,
        get_orthogonal_mp,
        get_kernel_ridge_regression,
        get_svr_regression,
        get_nusvr_regression,
        get_linear_svr_regression,
    )
    for evaluate in evaluators:
        evaluate(X, y)
    
def get_linear_regression(X, y):
    """Score a ``LinearRegression`` model on a 70/30 hold-out split.

    The training split is scored with 10-fold cross-validation (the
    estimator's default scorer), then the model is fit on the whole training
    split and its RMSE on the held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    model_type = 'Linear Regression'
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    regressor = LinearRegression()

    # Cross-validate first; the final fit below is only used for the test RMSE.
    score = cross_val_score(regressor, train_X, train_y, cv=10)

    rmse = get_rmse(test_y, regressor.fit(train_X, train_y).predict(test_X))
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_binomial_regression(X, y):
    """Score a degree-2 polynomial regression on a 70/30 hold-out split.

    The features are expanded with ``PolynomialFeatures(degree=2)`` and a
    ``LinearRegression`` is trained on the expanded columns. The training
    split is scored with 10-fold cross-validation (the estimator's default
    scorer), then the model is fit on the whole training split and its RMSE
    on the held-out test split is computed.

    The printed label reads 'Binomial Regression'; it is kept as-is so the
    report format does not change, but the model is a polynomial regression.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model = LinearRegression()
    model_type = 'Binomial Regression'

    poly_features = PolynomialFeatures(degree=2)
    X_train_poly = poly_features.fit_transform(X_train)
    # Apply the expansion fitted on the training split; a transformer must
    # not be refit on held-out data.
    X_test_poly = poly_features.transform(X_test)

    score = cross_val_score(model, X_train_poly, y_train, cv=10)

    model.fit(X_train_poly, y_train)
    pred = model.predict(X_test_poly)
    rmse = get_rmse(y_test, pred)
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_decision_tree_regressor(X, y):
    """Score a ``DecisionTreeRegressor`` on a 70/30 hold-out split.

    The training split is scored with 10-fold cross-validation (the
    estimator's default scorer), then the model is fit on the whole training
    split and its RMSE on the held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Seed the tree (same seed as the split) so reruns print the same scores.
    model = DecisionTreeRegressor(random_state=42)
    model_type = 'Decision Tree Regressor'
    score = cross_val_score(model, X_train, y_train, cv=10)

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    rmse = get_rmse(y_test, pred)
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_random_forest_regressor(X, y):
    """Score a ``RandomForestRegressor`` on a 70/30 hold-out split.

    The training split is scored with 10-fold cross-validation (the
    estimator's default scorer), then the model is fit on the whole training
    split and its RMSE on the held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Seed the forest (same seed as the split); an unseeded forest prints
    # different scores on every run, which makes model comparison unreliable.
    model = RandomForestRegressor(random_state=42)
    model_type = 'Random Forest Regressor'
    score = cross_val_score(model, X_train, y_train, cv=10)

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    rmse = get_rmse(y_test, pred)
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_ridge_regression(X, y):
    """Score a ``Ridge`` model (``alpha=1``) on a 70/30 hold-out split.

    The training split is scored with 10-fold cross-validation (the
    estimator's default scorer), then the model is fit on the whole training
    split and its RMSE on the held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    model_type = 'Ridge Regression'
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    regressor = linear_model.Ridge(alpha=1)

    # Cross-validate first; the final fit below is only used for the test RMSE.
    score = cross_val_score(regressor, train_X, train_y, cv=10)

    rmse = get_rmse(test_y, regressor.fit(train_X, train_y).predict(test_X))
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_lasso_regression(X, y):
    """Score a ``Lasso`` model (``alpha=1``) on a 70/30 hold-out split.

    The training split is scored with 10-fold cross-validation (the
    estimator's default scorer), then the model is fit on the whole training
    split and its RMSE on the held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    model_type = 'Lasso Regression'
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    regressor = linear_model.Lasso(alpha=1)

    # Cross-validate first; the final fit below is only used for the test RMSE.
    score = cross_val_score(regressor, train_X, train_y, cv=10)

    rmse = get_rmse(test_y, regressor.fit(train_X, train_y).predict(test_X))
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_orthogonal_mp(X, y):
    """Score an ``OrthogonalMatchingPursuit`` model on a 70/30 hold-out split.

    The estimator is built with its default arguments. The training split is
    scored with 10-fold cross-validation (the estimator's default scorer),
    then the model is fit on the whole training split and its RMSE on the
    held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    model_type = 'Orthogonal MP'
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    regressor = linear_model.OrthogonalMatchingPursuit()

    # Cross-validate first; the final fit below is only used for the test RMSE.
    score = cross_val_score(regressor, train_X, train_y, cv=10)

    rmse = get_rmse(test_y, regressor.fit(train_X, train_y).predict(test_X))
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_kernel_ridge_regression(X, y):
    """Score a ``KernelRidge`` model on a 70/30 hold-out split.

    The estimator is built with its default arguments. The training split is
    scored with 10-fold cross-validation (the estimator's default scorer),
    then the model is fit on the whole training split and its RMSE on the
    held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    model_type = 'Kernel Ridge Regression'
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    regressor = kernel_ridge.KernelRidge()

    # Cross-validate first; the final fit below is only used for the test RMSE.
    score = cross_val_score(regressor, train_X, train_y, cv=10)

    rmse = get_rmse(test_y, regressor.fit(train_X, train_y).predict(test_X))
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_svr_regression(X, y):
    """Score an ``SVR`` model on a 70/30 hold-out split.

    The estimator is built with its default arguments. The training split is
    scored with 10-fold cross-validation (the estimator's default scorer),
    then the model is fit on the whole training split and its RMSE on the
    held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    model_type = 'SVR Regression'
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    regressor = SVR()

    # Cross-validate first; the final fit below is only used for the test RMSE.
    score = cross_val_score(regressor, train_X, train_y, cv=10)

    rmse = get_rmse(test_y, regressor.fit(train_X, train_y).predict(test_X))
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_nusvr_regression(X, y):
    """Score a ``NuSVR`` model on a 70/30 hold-out split.

    The estimator is built with its default arguments. The training split is
    scored with 10-fold cross-validation (the estimator's default scorer),
    then the model is fit on the whole training split and its RMSE on the
    held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    model_type = 'NuSVR Regression'
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    regressor = NuSVR()

    # Cross-validate first; the final fit below is only used for the test RMSE.
    score = cross_val_score(regressor, train_X, train_y, cv=10)

    rmse = get_rmse(test_y, regressor.fit(train_X, train_y).predict(test_X))
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

def get_linear_svr_regression(X, y):
    """Score a ``LinearSVR`` model on a 70/30 hold-out split.

    The training split is scored with 10-fold cross-validation (the
    estimator's default scorer), then the model is fit on the whole training
    split and its RMSE on the held-out test split is computed.

    Args:
        X: Feature table accepted by scikit-learn.
        y: Target values aligned with ``X``.

    Returns:
        None. The RMSE and the mean/std of the fold scores are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Seed the solver (same seed as the split) so reruns print the same scores.
    model = LinearSVR(random_state=42)
    model_type = 'Linear SVR Regression'
    score = cross_val_score(model, X_train, y_train, cv=10)

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    rmse = get_rmse(y_test, pred)
    print(f'{model_type}:\nrmse:{rmse}\nmean:{score.mean()}\nstd:{score.std()}\n')

In [13]:
# Compare every model on one choice of independent variables (the two
# COL_ columns) and dependent variable (PRO_AST_PERG); swap the columns to
# try other combinations.
get_scores(df[['COL_FG_PCT', 'COL_AST_PERG']], df['PRO_AST_PERG'])

Linear Regression:
rmse:0.9977595377988783
mean:0.1995816620720817
std:0.18214123848784944

Binomial Regression:
rmse:1.0009485446305044
mean:0.23435519706014668
std:0.20778373256767613

Decision Tree Regressor:
rmse:1.436957367886808
mean:-1.0037600281862313
std:0.9421353946281784

Random Forest Regressor:
rmse:1.061116172372558
mean:-0.05090464266225857
std:0.2744836456660774

Ridge Regression:
rmse:0.9931801531307967
mean:0.20334045945672163
std:0.17855678972600655

Lasso Regression:
rmse:0.9890183310762248
mean:0.022967984861749537
std:0.04017102664858251

Orthogonal MP:
rmse:0.9915232342511312
mean:0.2025672733822897
std:0.17695440891182126

Kernel Ridge Regression:
rmse:0.9890995724088012
mean:0.20622932340079903
std:0.17623865839814135

SVR Regression:
rmse:1.0104022182615573
mean:0.1565580964440485
std:0.14934340718174102

NuSVR Regression:
rmse:0.9876606074534141
mean:0.19213002160554218
std:0.1471340486669456

Linear SVR Regression:
rmse:1.000554379579581
mean:0.1761592956727

