## This code takes the college basketball data and models it. It goes through the process of optimizing each model and then creates a function that outputs predicted basketball scores for a game between two schools.

In [311]:
#import packages

import pandas as pd
import numpy as np
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.metrics import roc_auc_score
from bs4 import BeautifulSoup
import requests

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from datetime import datetime

from joblib import Parallel, delayed
from multiprocessing import cpu_count

In [312]:
# Load the per-game team data from CSV into a DataFrame (path is relative to the
# working directory).
full_data = pd.read_csv("College Data/All Teams Data.csv")

In [313]:
# Show the column labels of the loaded frame.
full_data.columns

Index(['G', 'Date', 'Opp', 'W/L', 'Tm', 'Opp.1', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'FG.1', 'FGA.1', 'FG%.1', '3P.1', '3PA.1', '3P%.1', 'FT.1',
       'FTA.1', 'FT%.1', 'ORB.1', 'TRB.1', 'AST.1', 'STL.1', 'BLK.1', 'TOV.1',
       'PF.1', 'Location', 'Team'],
      dtype='object')

In [314]:
# Convert the 'Date' column to datetime64[ns] so dates can be compared
# chronologically; all other columns keep their dtypes.
full_data = full_data.astype({'Date': 'datetime64[ns]'})

In [315]:
# Display the frame after the date conversion.
full_data

Unnamed: 0,G,Date,Opp,W/L,Tm,Opp.1,FG,FGA,FG%,3P,...,FT%.1,ORB.1,TRB.1,AST.1,STL.1,BLK.1,TOV.1,PF.1,Location,Team
0,1,2022-11-07,jackson-state,W,65,56,23,57,0.404,8,...,0.714,7,40,9,6,1,21,21,H,abilene-christian
1,2,2022-11-11,texas-am,L,58,77,20,52,0.385,8,...,0.714,10,33,7,11,3,19,17,A,abilene-christian
2,3,2022-11-15,mcmurry,W,104,46,41,68,0.603,5,...,0.667,4,17,7,9,2,27,18,H,abilene-christian
3,4,2022-11-21,wright-state,L,61,77,25,58,0.431,7,...,0.632,1,24,18,12,5,18,14,N,abilene-christian
4,5,2022-11-22,weber-state,L,67,77,26,53,0.491,9,...,0.920,8,29,10,4,0,18,13,N,abilene-christian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11924,30,2023-02-21,robert-morris,L,64,83,23,47,0.489,4,...,0.778,4,24,14,6,7,12,17,A,youngstown-state
11925,31,2023-02-25,iupui,W,93,79,34,59,0.576,8,...,0.636,8,25,15,4,4,12,18,A,youngstown-state
11926,32,2023-03-02,detroit-mercy,W,71,66,24,55,0.436,5,...,0.857,13,36,11,5,3,13,21,H,youngstown-state
11927,33,2023-03-06,northern-kentucky,L,63,75,24,53,0.453,4,...,0.737,16,32,6,6,2,2,18,N,youngstown-state


In [316]:
# Exact-match renames applied to the 'Opp' column. Opponent names are later
# looked up against the 'Team' column, so these presumably align the two
# spellings -- verify against the 'Team' values when adding entries.
opp_name_fixes = {
    'lsu': 'louisiana-state',
    'ole-miss': 'mississippi',
    'ut-southern': 'texas-southern',
    'usc': 'southern-california',
    'pitt': 'pittsburgh',
    'umass': 'massachusetts',
    'umbc': 'maryland-baltimore-county',
    'vcu': 'virginia-commonwealth',
    'smu': 'southern-methodist',
    'siu-edwardsville': 'southern-illinois-edwardsville',
    'penn': 'pennsylvania',
    'byu': 'brigham-young',
    'sam-houston-state-state': 'sam-houston-state',
}

# Whole-value replacement (not substring); no replacement value is itself a
# key, so applying the mapping in one pass matches applying the renames in turn.
full_data['Opp'] = full_data['Opp'].replace(opp_name_fixes)

In [317]:
# Number of rows in the frame.
len(full_data)

11929

In [318]:
# Derived per-game metrics. Each metric is computed twice: once from the
# un-suffixed box-score columns and once from the '.1' columns (presumably the
# opponent's side of the same game -- 'Tm' / 'Opp.1' are the two scores).

# Effective field-goal rate: made field goals with each made three counted
# as 1.5, per field-goal attempt.
full_data['eFG'] = (full_data["FG"] + (0.5 * full_data['3P'])) / full_data['FGA']
full_data['eFG.1'] = (full_data["FG.1"] + (0.5 * full_data['3P.1'])) / full_data['FGA.1']

# Turnovers per 100 of (field-goal attempts + 0.44 * free-throw attempts + turnovers).
full_data['TOV%'] = (full_data['TOV'] / (full_data['FGA'] + (0.44 * full_data['FTA']) + full_data['TOV'])) * 100
full_data['TOV%.1'] = (full_data['TOV.1'] / (full_data['FGA.1'] + (0.44 * full_data['FTA.1']) + full_data['TOV.1'])) * 100

# Offensive-rebound share: own offensive rebounds over own offensive rebounds
# plus the other side's defensive rebounds (total minus offensive).
full_data['ORB%'] = (full_data['ORB'])/((full_data['ORB']) + (full_data['TRB.1'] - full_data['ORB.1']))
full_data['ORB%.1'] = (full_data['ORB.1'])/((full_data['ORB.1']) + (full_data['TRB'] - full_data['ORB']))

# Made free throws per field-goal attempt.
full_data['FT/FGA'] = full_data['FT']/full_data['FGA']
full_data['FT/FGA.1'] = full_data['FT.1']/full_data['FGA.1']

# Defensive-rebound share: own defensive rebounds over own defensive rebounds
# plus the other side's offensive rebounds.
full_data['DRB%'] = (full_data['TRB'] - full_data['ORB'])/((full_data['TRB'] - full_data['ORB']) + (full_data['ORB.1']))
# The '.1' version must mirror the formula with the sides swapped; it was
# previously a verbatim copy of 'DRB%' and therefore held the wrong side's value.
full_data['DRB%.1'] = (full_data['TRB.1'] - full_data['ORB.1'])/((full_data['TRB.1'] - full_data['ORB.1']) + (full_data['ORB']))

# Assists per made field goal.
full_data['AST%'] = full_data['AST'] / full_data['FG']
full_data['AST%.1'] = full_data['AST.1'] / full_data['FG.1']

# Points per two shooting attempts, where attempts = FGA + 0.44 * FTA.
full_data['TS%'] = 0.5 * (full_data['Tm'])/((full_data['FGA']) + 0.44 * (full_data['FTA']))
full_data['TS%.1'] = 0.5 * (full_data['Opp.1'])/((full_data['FGA.1']) + 0.44 * (full_data['FTA.1']))

# Share of field-goal attempts that were three-point attempts.
full_data['3PAr'] = full_data['3PA'] / full_data['FGA']
full_data['3PAr.1'] = full_data['3PA.1'] / full_data['FGA.1']

# Defensive rebounds = total rebounds minus offensive rebounds.
full_data['DRB'] = full_data['TRB'] - full_data['ORB']
full_data['DRB.1'] = full_data['TRB.1'] - full_data['ORB.1']


# Estimated possessions: FGA - ORB + TOV + 0.44 * FTA.
full_data['POSS'] = (full_data['FGA'] - full_data['ORB']) + full_data['TOV'] + (0.44 * full_data['FTA'])
full_data['POSS.1'] = (full_data['FGA.1'] - full_data['ORB.1']) + full_data['TOV.1'] + (0.44 * full_data['FTA.1'])

# Points scored per 100 of the scoring side's estimated possessions.
full_data['OffRt'] = (full_data['Tm'] / full_data['POSS']) * 100
full_data['OffRt.1'] = (full_data['Opp.1'] / full_data['POSS.1']) * 100

# Points allowed per 100 possessions. Note the denominator is the conceding
# side's own possession estimate, not the scoring side's.
full_data['DefRt'] = (full_data['Opp.1'] / full_data['POSS']) * 100
full_data['DefRt.1'] = (full_data['Tm'] / full_data['POSS.1']) * 100

In [321]:
def _weighted_prior_average(team, date, stat_vars):
    """Return a one-row frame of recency-weighted averages of a team's earlier games.

    Rows of the module-level ``full_data`` whose 'Team' equals ``team`` and
    whose 'Date' is on or before ``date`` are selected. The last selected row
    is taken to be the game being described and is left out of the average;
    this relies on each team's rows appearing in chronological order.

    With n earlier games, the k-th (1-based, in row order) gets weight k/n, so
    more recent games count more. The result has index ``[0]`` and one column
    per entry of ``stat_vars``.

    Sentinels (kept exactly as the downstream cleaning step expects them):
      * no rows for ``team`` at all      -> every stat is NaN
      * only the current game is present -> every stat is 0.0

    NOTE: ``sum()`` skips NaN values while the divisor is the sum of all
    weights, so a stat that is NaN in some earlier game is averaged as if that
    game contributed zero.
    """
    selected = (full_data['Team'] == team) & (full_data['Date'] <= date)
    games = full_data.loc[selected, stat_vars]

    if games.empty:
        return pd.DataFrame(np.nan, index=[0], columns=stat_vars)
    if len(games) == 1:
        return pd.DataFrame(0.0, index=[0], columns=stat_vars)

    # Drop the current game so only strictly earlier rows feed the average.
    prior = games.iloc[:-1].astype(float)
    weights = np.arange(1, len(prior) + 1) / len(prior)
    weighted_total = prior.mul(weights, axis=0).sum()
    return weighted_total.div(sum(weights)).to_frame().T


def make_mean_data(i: int) -> pd.DataFrame:
    """Build the modelling row for the game stored at label ``i`` of ``full_data``.

    The row consists of the game's identifying columns (date, game number,
    opponent, result, both scores, team, location), followed by the
    recency-weighted averages of the team's earlier games, followed by the same
    averages for the opponent with '.1' appended to each column name.

    Parameters
    ----------
    i : label of the game in ``full_data`` (looked up with ``.loc``).

    Returns
    -------
    A one-row DataFrame indexed by ``i``.
    """
    stat_vars = ['FG','FGA','FG%','3P','3PA','3P%','FT','FTA','FT%','ORB','TRB','AST','STL','BLK','TOV','PF',
                'eFG', 'TOV%', 'ORB%', 'FT/FGA', 'DRB%', 'AST%', 'TS%', '3PAr', 'DRB', 'POSS', 'OffRt', 'DefRt']
    # Columns copied through unchanged to identify the game and carry the target.
    meta_cols = ['Date','G','Opp','W/L','Opp.1','Tm','Team','Location']

    game = full_data.loc[i]
    game_date = game['Date']

    team_avg = _weighted_prior_average(game['Team'], game_date, stat_vars)
    opp_avg = _weighted_prior_average(game['Opp'], game_date, stat_vars)
    # The opponent's averages are its own stats, suffixed to keep names unique.
    opp_avg.columns = opp_avg.columns + '.1'

    game_info = game[meta_cols].to_frame().T.reset_index(drop = True)

    return pd.concat([game_info, team_avg, opp_avg], axis = 1).set_index(pd.Index([i]))

In [322]:
# One worker per CPU reported by multiprocessing.
cores = cpu_count()

# Run make_mean_data for every row position 0..len(full_data)-1 across the
# workers; each call yields a one-row frame, collected here into a list.
worker_pool = Parallel(n_jobs=cores)
mean_data = worker_pool(
    delayed(make_mean_data)(row_label) for row_label in range(len(full_data))
)


In [323]:
# Stack the one-row frames produced per game into a single frame, then display it.
full_mean_data = pd.concat(mean_data)
full_mean_data

Unnamed: 0,Date,G,Opp,W/L,Opp.1,Tm,Team,Location,FG,FGA,...,ORB%.1,FT/FGA.1,DRB%.1,AST%.1,TS%.1,3PAr.1,DRB.1,POSS.1,OffRt.1,DefRt.1
0,2022-11-07,1,jackson-state,W,56,65,abilene-christian,H,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2022-11-11,2,texas-am,L,77,58,abilene-christian,A,23.000000,57.000000,...,0.264706,0.174603,0.813953,0.656250,0.613367,0.412698,35.000000,75.920000,114.594310,71.127503
2,2022-11-15,3,mcmurry,W,46,104,abilene-christian,H,21.000000,53.666667,...,,,,,,,,,,
3,2022-11-21,4,wright-state,L,77,61,abilene-christian,N,31.000000,60.833333,...,0.291657,0.214480,0.806891,0.425512,0.633018,0.185174,27.700000,75.756000,117.233591,89.383734
4,2022-11-22,5,weber-state,L,77,67,abilene-christian,N,28.600000,59.700000,...,0.139543,0.301987,0.795111,0.436316,0.530863,0.405271,23.300000,66.932000,94.437727,107.999632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11924,2023-02-21,30,robert-morris,L,83,64,youngstown-state,A,30.526437,62.758621,...,0.309317,0.201260,0.760079,0.558863,0.538051,0.384905,23.774713,67.045241,102.092467,98.528979
11925,2023-02-25,31,iupui,W,79,93,youngstown-state,A,30.040860,61.741935,...,0.279955,0.228755,0.694507,0.472317,0.550830,0.292846,20.954839,68.477763,98.466561,114.058345
11926,2023-03-02,32,detroit-mercy,W,66,71,youngstown-state,H,30.288306,61.570565,...,0.346580,0.221654,0.741790,0.478361,0.565752,0.429986,22.196970,68.776894,113.245762,109.067868
11927,2023-03-06,33,northern-kentucky,L,75,63,youngstown-state,N,29.907197,61.172348,...,0.295794,0.181619,0.727313,0.581073,0.555964,0.442696,20.634470,63.995076,108.019245,98.663027


In [324]:
# List, alphabetically, the opponents of rows whose 'FT.1' average is NaN.
full_mean_data.loc[full_mean_data['FT.1'].isna(), 'Opp'].sort_values()

8464        academy-of-art
2856       albany-state-ga
8026       albertus-magnus
666            alice-lloyd
6148           alice-lloyd
               ...        
9078         william-woods
6685      williams-baptist
2192         wilmington-de
916     wisconsin-parkside
6806       wisconsin-stout
Name: Opp, Length: 974, dtype: object

In [325]:
# Discard rows where 'FG' and 'FG.1' are both exactly zero, then discard any
# row still containing a NaN, and renumber the remaining rows from 0.
# NOTE(review): a row with only ONE of the two sides zeroed is kept, so it
# still carries zero-valued averages for that side -- confirm this is intended.
full_mean_data_clean = (
    full_mean_data
    .loc[~((full_mean_data['FG'] == 0) & (full_mean_data['FG.1'] == 0))]
    .dropna(axis=0)
    .reset_index(drop=True)
)

In [326]:
# Display the cleaned frame.
full_mean_data_clean

Unnamed: 0,Date,G,Opp,W/L,Opp.1,Tm,Team,Location,FG,FGA,...,ORB%.1,FT/FGA.1,DRB%.1,AST%.1,TS%.1,3PAr.1,DRB.1,POSS.1,OffRt.1,DefRt.1
0,2022-11-11,2,texas-am,L,77,58,abilene-christian,A,23.000000,57.000000,...,0.264706,0.174603,0.813953,0.656250,0.613367,0.412698,35.000000,75.920000,114.594310,71.127503
1,2022-11-21,4,wright-state,L,77,61,abilene-christian,N,31.000000,60.833333,...,0.291657,0.214480,0.806891,0.425512,0.633018,0.185174,27.700000,75.756000,117.233591,89.383734
2,2022-11-22,5,weber-state,L,77,67,abilene-christian,N,28.600000,59.700000,...,0.139543,0.301987,0.795111,0.436316,0.530863,0.405271,23.300000,66.932000,94.437727,107.999632
3,2022-11-23,6,california-riverside,L,76,65,abilene-christian,N,27.733333,57.466667,...,0.249515,0.118953,0.856104,0.486961,0.566101,0.417359,28.380952,70.076190,107.410494,89.856350
4,2022-11-27,7,northern-arizona,W,82,92,abilene-christian,A,26.095238,58.190476,...,0.281522,0.178352,0.713698,0.581618,0.511688,0.412427,21.535714,72.134286,97.167696,98.975669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10688,2023-02-21,30,robert-morris,L,83,64,youngstown-state,A,30.526437,62.758621,...,0.309317,0.201260,0.760079,0.558863,0.538051,0.384905,23.774713,67.045241,102.092467,98.528979
10689,2023-02-25,31,iupui,W,79,93,youngstown-state,A,30.040860,61.741935,...,0.279955,0.228755,0.694507,0.472317,0.550830,0.292846,20.954839,68.477763,98.466561,114.058345
10690,2023-03-02,32,detroit-mercy,W,66,71,youngstown-state,H,30.288306,61.570565,...,0.346580,0.221654,0.741790,0.478361,0.565752,0.429986,22.196970,68.776894,113.245762,109.067868
10691,2023-03-06,33,northern-kentucky,L,75,63,youngstown-state,N,29.907197,61.172348,...,0.295794,0.181619,0.727313,0.581073,0.555964,0.442696,20.634470,63.995076,108.019245,98.663027


In [327]:
# (rows, columns) of the cleaned frame.
full_mean_data_clean.shape

(10693, 64)

In [328]:
# Write the cleaned frame to CSV without the row index (path is relative to the
# working directory).
full_mean_data_clean.to_csv("College Data/Weighted Mean Converted Data.csv", index = False)