## Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report

# Load datasets into dataframes
kenpom = pd.read_csv('data/KenPom Barttorvik.csv')
preseason = pd.read_csv('data/Preseason Votes.csv')
team = pd.read_csv('data/Team Results.csv')
matchups = pd.read_csv('data/Tournament Matchups.csv')
_538_ratings = pd.read_csv('data/538 Ratings.csv')
Bartowick_Away_Neutral = pd.read_csv('data/Barttorvik Away-Neutral.csv')
Bartowick_Away = pd.read_csv('data/Barttorvik Away.csv')
Bartowick_Home = pd.read_csv('data/Barttorvik Home.csv')
Bartowick_Neutral = pd.read_csv('data/Barttorvik Neutral.csv')
Coach_Result = pd.read_csv('data/Coach Results.csv')
Conference_Result = pd.read_csv('data/Conference Results.csv')
Conference_Stats_Away_Neutral = pd.read_csv('data/Conference Stats Away Neutral.csv')
Conference_Stats_Away = pd.read_csv('data/Conference Stats Away.csv')   
Conference_Stats_Home = pd.read_csv('data/Conference Stats Home.csv')
Conference_Stats_Neutral = pd.read_csv('data/Conference Stats Neutral.csv')
Conference_Stats = pd.read_csv('data/Conference Stats.csv')
Heat_Check_Tournament_Indes = pd.read_csv('data/Heat Check Tournament Index.csv')
Public_Picks = pd.read_csv('data/Public Picks.csv')
Resumes = pd.read_csv('data/Resumes.csv')
Seed_Results = pd.read_csv('data/Seed Results.csv')
Shooting_Splits = pd.read_csv('data/Shooting Splits.csv')
Tournament_Locations = pd.read_csv('data/Tournament Locations.csv')
Tournament_Simulation = pd.read_csv('data/Tournament Simulation.csv')
Upset_Count = pd.read_csv('data/Upset Count.csv')
Upset_Seed_Info = pd.read_csv('data/Upset Seed Info.csv')



# Merge datasets
first_merged_df = pd.merge(kenpom, preseason, on=['TEAM', 'YEAR'], how='left')
second_merged_df = pd.merge(first_merged_df, team, on='TEAM', how='left')
third_merged_df = pd.merge(second_merged_df, matchups, on=['TEAM', 'YEAR'], how='right')

fourth_merged_df = pd.merge(third_merged_df, _538_ratings, on=['TEAM', 'YEAR'], how='left', suffixes=('_kenpom', '_538'))
fifth_merged_df = pd.merge(fourth_merged_df, Bartowick_Away_Neutral, on=['TEAM', 'YEAR'], how='left', suffixes=('_538', '_away_neutral'))
sixth_merged_df = pd.merge(fifth_merged_df, Bartowick_Away, on=['TEAM', 'YEAR'], how='left', suffixes=('_away_neutral', '_away'))
seventh_merged_df = pd.merge(sixth_merged_df, Bartowick_Home, on=['TEAM', 'YEAR'], how='left', suffixes=('_away', '_home'))
eighth_merged_df = pd.merge(seventh_merged_df, Bartowick_Neutral, on=['TEAM', 'YEAR'], how='left', suffixes=('_home', '_neutral'))
ninth_merged_df = eighth_merged_df
tenth_merged_df = eighth_merged_df
eleventh_merged_df = eighth_merged_df 
twelfth_merged_df = eighth_merged_df
thirteenth_merged_df = eighth_merged_df
fourteenth_merged_df = eighth_merged_df
fifteenth_merged_df = eighth_merged_df
sixteenth_merged_df = pd.merge(fifteenth_merged_df, Heat_Check_Tournament_Indes, on=['TEAM', 'YEAR'], how='left')
seventeenth_merged_df = pd.merge(sixteenth_merged_df, Public_Picks, on=['TEAM', 'YEAR'], how='left', suffixes=('_TEAM_NO_x','_'))
eighteenth_merged_df = pd.merge(seventeenth_merged_df, Resumes, on=['TEAM', 'YEAR'], how='left', suffixes=('ROUND_x', 'SEED_y'))
nineteenth_merged_df = eighteenth_merged_df
twentieth_merged_df = pd.merge(nineteenth_merged_df, Shooting_Splits, on=['TEAM', 'YEAR'], how='left', suffixes=('TEAM NO_x', '_'))
twentyfirst_merged_df = twentieth_merged_df
final_merged_df = twentieth_merged_df

# Extract features and target variable from the merged dataframe
X = final_merged_df.drop(columns=['TEAM', 'YEAR', 'WIN%_y'])  # Features
y = final_merged_df['WIN%_y']  # Target variable

# Replace missing values with 0
X.fillna(0, inplace=True)
y.fillna(0, inplace=True)

# Convert categorical variables into dummy/indicator variables
X = pd.get_dummies(X)


# Adding a constant column for the intercept term in the regression model
X = sm.add_constant(X)

# Convert data type to numeric
X = X.apply(pd.to_numeric, errors='coerce')

# Keep only numeric columns in the feature set
X = X.select_dtypes(include=[np.number])

# Adding a constant column again after dropping non-numeric columns
X = sm.add_constant(X)

# Perform stepwise regression

model = sm.OLS(y, X)
result = model.fit()
selected_features = result.summary().tables[1]

significant_variables = []
for row in selected_features.data[1:]:
    # Extract the p-value from the row
    p_value = float(row[-1])
    
    
    if p_value < 0.05:
        significant_variables.append(row[0])

print("Variables with p-value less than 0.05:", significant_variables)
print(selected_features)

# Create a DataFrame for selected features
new_features = pd.DataFrame(selected_features.data[1:], columns=selected_features.data[0])

# Select the updated features based on significant features
X_updated = X[significant_variables]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_updated, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Variables with p-value less than 0.05: ['CONF ID', 'QUAD NO', 'TEAM NO_x', 'TEAM ID_x', 'SEED_x', 'ROUND_x', 'K TEMPO', 'K TEMPO RANK', 'KADJ T', 'KADJ T RANK', 'K OFF', 'KO RANK', 'KADJ O RANK', 'K DEF', 'KD RANK', 'KADJ D RANK', 'KADJ EM RANK', 'BADJ EM_538', 'BADJ O_538', 'GAMES_x', 'W_x', 'L_x', 'WIN%_x', 'FTR_538', 'FTRD_538', 'OREB%_538', 'DREB%_538', 'OP OREB%_538', 'OP DREB%_538', '2PT%_538', '2PT%D_538', '3PT%_538', '3PT%D_538', 'BLK%_538', 'AST%_538', 'OP AST%_538', '2PTR_538', '3PTR_538', '2PTRD_538', '3PTRD_538', 'BADJ T_538', 'AVG HGT_538', 'EFF HGT_538', 'EXP_538', 'FT%_538', 'OP FT%_538', 'ELITE SOS_538', 'WAB_538', 'BADJ EM RANK_538', 'BADJ O RANK_538', 'BADJ D RANK_538', 'BARTHAG RANK_538', 'EFG% RANK_538', 'EFGD% RANK_538', 'FTR RANK_538', 'FTRD RANK_538', 'TOV% RANK_538', 'TOV%D RANK_538', 'OREB% RANK_538', 'DREB% RANK_538', 'OP OREB% RANK_538', 'OP DREB% RANK_538', 'RAW T RANK_538', '2PT% RANK_538', '2PT%D RANK_538', '3PT% RANK_538', '3PT%D RANK_538', 'BLK% RANK_538

In [13]:
final_merged_df.shape

(2036, 541)

## SVM For Regression

In [2]:
from sklearn.svm import SVR
#from cuml import SVR as cuSVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Perform grid search with cross-validation to find the best hyperparameters
param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
grid_search = GridSearchCV(SVR(), param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Get the best SVR model from grid search
best_svr = grid_search.best_estimator_
best_params = grid_search.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.5s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.6s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.5s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.4s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.7s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.8s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.9s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   0.2s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, k

In [6]:
from sklearn.metrics import mean_absolute_error

#Train the SVR
best_svr.fit(X_train_scaled, y_train)

# Predictions on the scaled test set
y_pred_train = best_svr.predict(X_train_scaled)
y_pred_test = best_svr.predict(X_test_scaled)

print("Best Params: ", best_params)

# Evaluate model
print("Train MSE: ", mean_squared_error(y_pred_train, y_train))
print("Test MSE: ", mean_squared_error(y_pred_test, y_test))
print("Train R^2: ", r2_score(y_pred_train, y_train))
print("Test R^2: ", r2_score(y_pred_test, y_test))
print("Train MAE: ", mean_absolute_error(y_pred_train, y_train))
print("Test MAE: ", mean_absolute_error(y_pred_test, y_test))

Best Params:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Train MSE:  0.005071586635460595
Test MSE:  0.007923909728489692
Train R^2:  0.8759123857741552
Test R^2:  0.8033526510106256
Train MAE:  0.0623489868371097
Test MAE:  0.06996411135115455


## Random Forest For Regression

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X, y)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Get the best SVR model from grid search
best_svr = grid_search.best_estimator_

best_svr.fit(X_train, y_train)

# Predictions on the test set
y_pred = best_svr.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Fitting 5 folds for each of 81 candidates, totalling 405 fits


[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  43.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  43.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  43.8s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=  42.1s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=  42.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  45.8s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=  42.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  43.2s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=  43.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_esti

In [11]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_depth = 20, min_samples_leaf = 1, min_samples_split = 5, n_estimators = 200)

rf.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred_train = rf.predict(X_train_scaled)
y_pred_test = rf.predict(X_test_scaled)

# Evaluate model
print("Train MSE: ", mean_squared_error(y_pred_train, y_train))
print("Test MSE: ", mean_squared_error(y_pred_test, y_test))
print("Train R^2: ", r2_score(y_pred_train, y_train))
print("Test R^2: ", r2_score(y_pred_test, y_test))
print("Train MAE: ", mean_absolute_error(y_pred_train, y_train))
print("Test MAE: ", mean_absolute_error(y_pred_test, y_test))


Train MSE:  1.0108503613181397e-05
Test MSE:  5.605191853239837e-05
Train R^2:  0.999815141844911
Test R^2:  0.9989718253667119
Train MAE:  0.0012113874138681877
Test MAE:  0.0028673426765913903
