In [1]:
import pandas as pd
import os
import fnmatch

Get all dataframes from root folder

In [2]:
all_df = {}
for root, dirs, filenames in os.walk("all_schedules"):
    for filename in fnmatch.filter(filenames, "*.csv"):
        try:
            all_df[filename] = pd.read_csv(os.path.join(root, filename), delimiter=",")
        except pd.errors.EmptyDataError as e:
            print(filename)

In [3]:
list(all_df.items())[0][1].columns

Index(['date', 'opponent/venue', 'result', 'sets', 'kills', 'errors',
       'total_attacks', 'hit_pct', 'assists', 'aces', 'serr', 'digs', 'retatt',
       'rerr', 'b_solo', 'b_assist', 'b_error', 'pts', 'bhe', 'opponent',
       'kills/set', 'errors/set', 'total_attacks/set', 'assists/set',
       'aces/set', 'serr/set', 'digs/set', 'rerr/set', 'b_solo/set',
       'b_assist/set', 'b_error/set', 'pts/set', 'bhe/set', 'sets_from_result',
       'rolling_kills/set_3', 'rolling_kills/set_total',
       'rolling_errors/set_3', 'rolling_errors/set_total',
       'rolling_total_attacks/set_3', 'rolling_total_attacks/set_total',
       'rolling_hit_pct_3', 'rolling_hit_pct_total', 'rolling_assists/set_3',
       'rolling_assists/set_total', 'rolling_aces/set_3',
       'rolling_aces/set_total', 'rolling_serr/set_3',
       'rolling_serr/set_total', 'rolling_digs/set_3',
       'rolling_digs/set_total', 'rolling_b_solo/set_3',
       'rolling_b_solo/set_total', 'rolling_b_assist/set_3',
    

In [4]:
# Columns of end feature set
# t1 will always end up being team first alphabetically
# rolling_*_total is total cumulative average
# rolling_*_3 is 3 game cumulative - first three games of season for every team are NA
# team names are either not included or converted to categorical codes
# result is 0 for t1, 1 for t2

master_cols = {
    "date": [],
    "t1": [],
    "t1_rolling_kills/set_total": [],
    "t1_rolling_kills/set_3": [],
    "t1_rolling_errors/set_total": [],
    "t1_rolling_errors/set_3": [],
    "t1_rolling_total_attacks/set_total": [],
    "t1_rolling_total_attacks/set_3": [],
    "t1_rolling_hit_pct_total": [],
    "t1_rolling_hit_pct_3": [],
    "t1_rolling_assists/set_total": [],
    "t1_rolling_assists/set_3": [],
    "t1_rolling_aces/set_total": [],
    "t1_rolling_aces/set_3": [],
    "t1_rolling_serr/set_total": [],
    "t1_rolling_serr/set_3": [],
    "t1_rolling_digs/set_total": [],
    "t1_rolling_digs/set_3": [],
    "t1_rolling_b_solo/set_total": [],
    "t1_rolling_b_solo/set_3": [],
    "t1_rolling_b_assist/set_total": [],
    "t1_rolling_b_assist/set_3": [],
    "t1_rolling_b_error/set_total": [],
    "t1_rolling_b_error/set_3": [],
    "t1_rolling_pts/set_total": [],
    "t1_rolling_pts/set_3": [],
    "t2": [],
    "t2_rolling_kills/set_total": [],
    "t2_rolling_kills/set_3": [],
    "t2_rolling_errors/set_total": [],
    "t2_rolling_errors/set_3": [],
    "t2_rolling_total_attacks/set_total": [],
    "t2_rolling_total_attacks/set_3": [],
    "t2_rolling_hit_pct_total": [],
    "t2_rolling_hit_pct_3": [],
    "t2_rolling_assists/set_total": [],
    "t2_rolling_assists/set_3": [],
    "t2_rolling_aces/set_total": [],
    "t2_rolling_aces/set_3": [],
    "t2_rolling_serr/set_total": [],
    "t2_rolling_serr/set_3": [],
    "t2_rolling_digs/set_total": [],
    "t2_rolling_digs/set_3": [],
    "t2_rolling_b_solo/set_total": [],
    "t2_rolling_b_solo/set_3": [],
    "t2_rolling_b_assist/set_total": [],
    "t2_rolling_b_assist/set_3": [],
    "t2_rolling_b_error/set_total": [],
    "t2_rolling_b_error/set_3": [],
    "t2_rolling_pts/set_total": [],
    "t2_rolling_pts/set_3": [],
    "result": [] #0 for t1, 1 for t2
}

In [5]:
# Collect one-sided game stats into matching games - "{date}~{team_name1}~{team_name2}"

games = {}

for key, df in all_df.items():
    team_name1 = key.split("-schedule")[0].strip()
    for i, row in df.iterrows():
        team_name2 = row["opponent"].strip()
        date = row["date"]
        res = f"{date}~{team_name1}~{team_name2}" if team_name1 > team_name2 else f"{date}~{team_name2}~{team_name1}"
        
        if res not in games:
            games[res] = []
        games[res].append((team_name1, row))

In [6]:
# Create dataframe from matched up games

for name, item in games.items():
    if len(item) != 2: continue # Don't know why there are games that don't have two teams stats
    date, t1, t2 = name.split("~")
    row1 = item[0][1] if item[0][0] == t1 else item[1][1]
    row2 = item[0][1] if item[0][0] == t2 else item[1][1]
    winner = 0 if int(row1["result"].split("-")[0].strip()) == 3 else 1
    
    # add row to dataframe - yes it is ugly
    master_cols["result"].append(winner)
    master_cols["date"].append(date)
    master_cols["t1"].append(t1)
    master_cols["t1_rolling_kills/set_total"].append(row1["rolling_kills/set_total"])
    master_cols["t1_rolling_kills/set_3"].append(row1["rolling_kills/set_3"])
    master_cols["t1_rolling_errors/set_total"].append(row1["rolling_errors/set_total"])
    master_cols["t1_rolling_errors/set_3"].append(row1["rolling_errors/set_3"])
    master_cols["t1_rolling_total_attacks/set_total"].append(row1["rolling_total_attacks/set_total"])
    master_cols["t1_rolling_total_attacks/set_3"].append(row1["rolling_total_attacks/set_3"])
    master_cols["t1_rolling_hit_pct_total"].append(row1["rolling_hit_pct_total"])
    master_cols["t1_rolling_hit_pct_3"].append(row1["rolling_hit_pct_3"])
    master_cols["t1_rolling_assists/set_total"].append(row1["rolling_assists/set_total"])
    master_cols["t1_rolling_assists/set_3"].append(row1["rolling_assists/set_3"])
    master_cols["t1_rolling_aces/set_total"].append(row1["rolling_aces/set_total"])
    master_cols["t1_rolling_aces/set_3"].append(row1["rolling_aces/set_3"])
    master_cols["t1_rolling_serr/set_total"].append(row1["rolling_serr/set_total"])
    master_cols["t1_rolling_serr/set_3"].append(row1["rolling_serr/set_3"])
    master_cols["t1_rolling_digs/set_total"].append(row1["rolling_digs/set_total"])
    master_cols["t1_rolling_digs/set_3"].append(row1["rolling_digs/set_3"])
    master_cols["t1_rolling_b_solo/set_total"].append(row1["rolling_b_solo/set_total"])
    master_cols["t1_rolling_b_solo/set_3"].append(row1["rolling_b_solo/set_3"])
    master_cols["t1_rolling_b_assist/set_total"].append(row1["rolling_b_assist/set_total"])
    master_cols["t1_rolling_b_assist/set_3"].append(row1["rolling_b_assist/set_3"])
    master_cols["t1_rolling_b_error/set_total"].append(row1["rolling_b_error/set_total"])
    master_cols["t1_rolling_b_error/set_3"].append(row1["rolling_b_error/set_3"])
    master_cols["t1_rolling_pts/set_total"].append(row1["rolling_pts/set_total"])
    master_cols["t1_rolling_pts/set_3"].append(row1["rolling_pts/set_3"])
    master_cols["t2"].append(t2)
    master_cols["t2_rolling_kills/set_total"].append(row2["rolling_kills/set_total"])
    master_cols["t2_rolling_kills/set_3"].append(row2["rolling_kills/set_3"])
    master_cols["t2_rolling_errors/set_total"].append(row2["rolling_errors/set_total"])
    master_cols["t2_rolling_errors/set_3"].append(row2["rolling_errors/set_3"])
    master_cols["t2_rolling_total_attacks/set_total"].append(row2["rolling_total_attacks/set_total"])
    master_cols["t2_rolling_total_attacks/set_3"].append(row2["rolling_total_attacks/set_3"])
    master_cols["t2_rolling_hit_pct_total"].append(row2["rolling_hit_pct_total"])
    master_cols["t2_rolling_hit_pct_3"].append(row2["rolling_hit_pct_3"])
    master_cols["t2_rolling_assists/set_total"].append(row2["rolling_assists/set_total"])
    master_cols["t2_rolling_assists/set_3"].append(row2["rolling_assists/set_3"])
    master_cols["t2_rolling_aces/set_total"].append(row2["rolling_aces/set_total"])
    master_cols["t2_rolling_aces/set_3"].append(row2["rolling_aces/set_3"])
    master_cols["t2_rolling_serr/set_total"].append(row2["rolling_serr/set_total"])
    master_cols["t2_rolling_serr/set_3"].append(row2["rolling_serr/set_3"])
    master_cols["t2_rolling_digs/set_total"].append(row2["rolling_digs/set_total"])
    master_cols["t2_rolling_digs/set_3"].append(row2["rolling_digs/set_3"])
    master_cols["t2_rolling_b_solo/set_total"].append(row2["rolling_b_solo/set_total"])
    master_cols["t2_rolling_b_solo/set_3"].append(row2["rolling_b_solo/set_3"])
    master_cols["t2_rolling_b_assist/set_total"].append(row2["rolling_b_assist/set_total"])
    master_cols["t2_rolling_b_assist/set_3"].append(row2["rolling_b_assist/set_3"])
    master_cols["t2_rolling_b_error/set_total"].append(row2["rolling_b_error/set_total"])
    master_cols["t2_rolling_b_error/set_3"].append(row2["rolling_b_error/set_3"])
    master_cols["t2_rolling_pts/set_total"].append(row2["rolling_pts/set_total"])
    master_cols["t2_rolling_pts/set_3"].append(row2["rolling_pts/set_3"])

In [7]:
master_df = pd.DataFrame(master_cols)
master_df = master_df.dropna() #lose about 7 thousand matches by doing this, could consider only using season averages
master_df["date"] = pd.to_datetime(master_df["date"])
master_df["t1_code"] = master_df["t1"].astype("category").cat.codes
master_df["t2_code"] = master_df["t2"].astype("category").cat.codes

In [8]:
master_df

Unnamed: 0,date,t1,t1_rolling_kills/set_total,t1_rolling_kills/set_3,t1_rolling_errors/set_total,t1_rolling_errors/set_3,t1_rolling_total_attacks/set_total,t1_rolling_total_attacks/set_3,t1_rolling_hit_pct_total,t1_rolling_hit_pct_3,...,t2_rolling_b_solo/set_3,t2_rolling_b_assist/set_total,t2_rolling_b_assist/set_3,t2_rolling_b_error/set_total,t2_rolling_b_error/set_3,t2_rolling_pts/set_total,t2_rolling_pts/set_3,result,t1_code,t2_code
4,2023-08-30,UNC Asheville,8.173333,10.400000,3.413333,4.466667,21.013333,26.688889,0.180200,0.220333,...,0.333333,1.466667,1.955556,0.083333,0.111111,10.866667,14.488889,1,292,88
5,2023-09-01,UNC Asheville,8.088889,9.888889,3.788889,4.622222,22.344444,27.822222,0.161667,0.191333,...,0.000000,1.973333,2.222222,0.066667,0.111111,12.426667,15.777778,1,292,88
6,2023-09-05,Western Caro.,9.580952,12.955556,4.980952,6.600000,30.038095,38.377778,0.131143,0.171000,...,1.333333,2.419048,2.222222,0.352381,0.555556,11.742857,13.333333,0,316,295
7,2023-09-08,UNC Asheville,8.483333,9.000000,4.758333,7.000000,25.383333,32.666667,0.135500,0.061000,...,0.577778,1.600000,2.177778,0.125000,0.222222,15.989583,17.422222,1,292,275
8,2023-09-09,UNC Asheville,8.577778,9.555556,4.933333,7.222222,26.044444,33.444444,0.131111,0.070000,...,0.333333,2.560606,2.500000,0.346970,0.916667,14.706061,17.444444,0,292,149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60564,2021-10-01,Winthrop,12.487179,14.444444,5.461538,6.333333,37.987179,44.222222,0.171769,0.183000,...,0.500000,2.953846,3.166667,0.089744,0.083333,14.406410,15.000000,0,324,216
60565,2021-11-06,Winthrop,12.852083,15.444444,5.126389,3.638889,38.192361,37.416667,0.198875,0.316333,...,1.044444,3.087500,3.022222,0.135417,0.222222,14.804861,17.088889,0,324,216
60566,2021-11-19,Winthrop,13.127778,15.333333,5.025926,4.222222,38.275926,38.944444,0.208333,0.284000,...,0.244444,2.908642,1.477778,0.149383,0.261111,14.410494,11.255556,0,324,216
60567,2021-09-28,LMU,12.814103,14.277778,4.743590,4.277778,28.839744,31.583333,0.261846,0.317333,...,0.305556,2.033333,2.277778,0.434615,1.111111,13.312821,15.888889,0,129,97


In [9]:
games_2023 = master_df[master_df["date"] > "2023-01-01"]
games_2023

Unnamed: 0,date,t1,t1_rolling_kills/set_total,t1_rolling_kills/set_3,t1_rolling_errors/set_total,t1_rolling_errors/set_3,t1_rolling_total_attacks/set_total,t1_rolling_total_attacks/set_3,t1_rolling_hit_pct_total,t1_rolling_hit_pct_3,...,t2_rolling_b_solo/set_3,t2_rolling_b_assist/set_total,t2_rolling_b_assist/set_3,t2_rolling_b_error/set_total,t2_rolling_b_error/set_3,t2_rolling_pts/set_total,t2_rolling_pts/set_3,result,t1_code,t2_code
4,2023-08-30,UNC Asheville,8.173333,10.400000,3.413333,4.466667,21.013333,26.688889,0.180200,0.220333,...,0.333333,1.466667,1.955556,0.083333,0.111111,10.866667,14.488889,1,292,88
5,2023-09-01,UNC Asheville,8.088889,9.888889,3.788889,4.622222,22.344444,27.822222,0.161667,0.191333,...,0.000000,1.973333,2.222222,0.066667,0.111111,12.426667,15.777778,1,292,88
6,2023-09-05,Western Caro.,9.580952,12.955556,4.980952,6.600000,30.038095,38.377778,0.131143,0.171000,...,1.333333,2.419048,2.222222,0.352381,0.555556,11.742857,13.333333,0,316,295
7,2023-09-08,UNC Asheville,8.483333,9.000000,4.758333,7.000000,25.383333,32.666667,0.135500,0.061000,...,0.577778,1.600000,2.177778,0.125000,0.222222,15.989583,17.422222,1,292,275
8,2023-09-09,UNC Asheville,8.577778,9.555556,4.933333,7.222222,26.044444,33.444444,0.131111,0.070000,...,0.333333,2.560606,2.500000,0.346970,0.916667,14.706061,17.444444,0,292,149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4725,2023-11-17,Troy,12.508929,12.377778,5.281548,5.977778,32.245833,34.288889,0.220286,0.189000,...,0.683333,2.723810,3.466667,0.555357,0.250000,16.695238,17.027778,1,275,124
4726,2023-10-30,Ark.-Pine Bluff,8.347619,9.805556,5.023214,4.805556,30.787500,34.638889,0.099786,0.142667,...,0.472222,1.874359,3.000000,0.508333,1.194444,14.555128,17.805556,1,8,6
4728,2023-10-01,UMBC,12.308974,12.388889,5.891026,6.166667,32.720513,32.083333,0.183615,0.189333,...,0.250000,3.233333,3.277778,0.316667,0.277778,14.493333,14.638889,0,290,178
4729,2023-10-29,UMBC,12.899074,15.555556,5.819444,5.555556,34.356481,40.666667,0.195444,0.247000,...,0.000000,3.380952,4.500000,0.321429,0.250000,15.023016,16.083333,1,290,178


In [24]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

cat = CatBoostClassifier()
gbc = GradientBoostingClassifier()
qda = QuadraticDiscriminantAnalysis()
gnb = GaussianNB()
nn = MLPClassifier(alpha=1, max_iter=1000)
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
dc = DecisionTreeClassifier()
logi = LogisticRegression(max_iter=100000)
svc1 = SVC(kernel="linear", C=0.025)
svc2 = SVC(gamma=2, C=1)
ada = AdaBoostClassifier()
xgb = XGBClassifier()
gpc = GaussianProcessClassifier()

classifiers = [
    ("ADA", ada),
    # ("GPC", gpc),
    ("QDA", qda),
    ("GNB", gnb),
    ("KNN", knn),
    ("NN", nn),
    ("DT", dc),
    ("RF", rf),
    ("LogReg", logi),
    ("SVC1", svc1),
    # ("SVC2", svc2),
    ("XGB", xgb)
]

vc = VotingClassifier([
    ("ADA", ada),
    ("GNB", gnb),
    ("KNN", knn),
    ("NN", nn),
    ("RF", rf),
    ("LogReg", logi),
    ("SVC1", svc1),
    ("XGB", xgb), 
    ("GBC", gbc),
    ("CAT", cat)
])

In [28]:
features = [
       't1_rolling_kills/set_total', 't1_rolling_kills/set_3',
       't1_rolling_errors/set_total', 't1_rolling_errors/set_3',
       't1_rolling_total_attacks/set_total', 't1_rolling_total_attacks/set_3',
       't1_rolling_hit_pct_total', 't1_rolling_hit_pct_3',
       't1_rolling_assists/set_total', 't1_rolling_assists/set_3',
       't1_rolling_aces/set_total', 't1_rolling_aces/set_3',
       't1_rolling_serr/set_total', 't1_rolling_serr/set_3',
       't1_rolling_digs/set_total', 't1_rolling_digs/set_3',
       't1_rolling_b_solo/set_total', 't1_rolling_b_solo/set_3',
       't1_rolling_b_assist/set_total', 't1_rolling_b_assist/set_3',
       't1_rolling_b_error/set_total', 't1_rolling_b_error/set_3',
       't1_rolling_pts/set_total', 't1_rolling_pts/set_3',
       't2_rolling_kills/set_total', 't2_rolling_kills/set_3',
       't2_rolling_errors/set_total', 't2_rolling_errors/set_3',
       't2_rolling_total_attacks/set_total', 't2_rolling_total_attacks/set_3',
       't2_rolling_hit_pct_total', 't2_rolling_hit_pct_3',
       't2_rolling_assists/set_total', 't2_rolling_assists/set_3',
       't2_rolling_aces/set_total', 't2_rolling_aces/set_3',
       't2_rolling_serr/set_total', 't2_rolling_serr/set_3',
       't2_rolling_digs/set_total', 't2_rolling_digs/set_3',
       't2_rolling_b_solo/set_total', 't2_rolling_b_solo/set_3',
       't2_rolling_b_assist/set_total', 't2_rolling_b_assist/set_3',
       't2_rolling_b_error/set_total', 't2_rolling_b_error/set_3',
       't2_rolling_pts/set_total', 't2_rolling_pts/set_3'#, "t1_code", "t2_code"
]

train = master_df[master_df["date"] < "2023-11-01"]
test = master_df[master_df["date"] >= "2023-11-01"]

In [29]:
from sklearn.metrics import accuracy_score, precision_score, r2_score, log_loss, f1_score, roc_auc_score
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
import tensorflow as tf
from keras import layers, Sequential
import keras_tuner as kt

In [30]:
scaler = StandardScaler()
train_scaled_x = scaler.fit_transform(train[features].values)
test_scaled_x = scaler.fit_transform(test[features].values)

In [31]:
train_scaled_x.shape

(52754, 48)

Testing basic classifiers

In [20]:
for name, clf in classifiers:
    if name != "LogReg": continue
    clf.fit(train_scaled_x, train["result"].values)
    trpred = clf.predict(train_scaled_x)
    tracc = accuracy_score(train["result"], trpred)
    preds = clf.predict(test_scaled_x)
    acc = accuracy_score(test["result"], preds)
    prec = precision_score(test["result"], preds)
    r2 = r2_score(test["result"], preds)
    ll = log_loss(test["result"], preds)
    f1 = f1_score(test["result"], preds)
    roc = roc_auc_score(test["result"], preds)
    
    print(f"Model Name: {name}\nAccuracy: {acc} Training Accuracy: {tracc}, Precision: {prec}, R2: {r2}, Log Loss: {ll}, f1: {f1}, ROC: {roc}")

Model Name: ADA
Accuracy: 0.7193828351012537 Training Accuracy: 0.729499184895932, Precision: 0.7165354330708661, R2: -0.12270356342964295, Log Loss: 10.114467826647147, f1: 0.7144259077526988, ROC: 0.7192822542840773
Model Name: QDA
Accuracy: 0.5120540019286403 Training Accuracy: 0.721803086021913, Precision: 0.5170068027210885, R2: -0.9521924504996542, Log Loss: 17.587356427090914, f1: 0.23100303951367782, ROC: 0.5068734978756334
Model Name: GNB
Accuracy: 0.7010607521697203 Training Accuracy: 0.7141828107821208, Precision: 0.6974459724950884, R2: -0.196007232519551, Log Loss: 10.77486263319799, f1: 0.6960784313725491, ROC: 0.7009702886311043
Model Name: KNN
Accuracy: 0.7078109932497589 Training Accuracy: 0.7948781135079804, Precision: 0.7015503875968992, R2: -0.16900061759169005, Log Loss: 10.531559283416101, f1: 0.7049659201557935, ROC: 0.7078196036995974
Model Name: NN
Accuracy: 0.7309546769527483 Training Accuracy: 0.7345035447549001, Precision: 0.7348178137651822, R2: -0.07640650

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Name: LogReg
Accuracy: 0.7319189971070396 Training Accuracy: 0.7348637070174774, Precision: 0.7344064386317908, R2: -0.07254842142075857, Log Loss: 9.662618748480782, f1: 0.7242063492063491, ROC: 0.7316675719717546
Model Name: SVC1
Accuracy: 0.7299903567984571 Training Accuracy: 0.7349395306517041, Precision: 0.7314629258517034, R2: -0.08026459711443312, Log Loss: 9.732133991275605, f1: 0.7227722772277226, ROC: 0.7297664312873439
Model Name: XGB
Accuracy: 0.712632594021215 Training Accuracy: 0.8449217120976609, Precision: 0.6975881261595547, R2: -0.1497101783575039, Log Loss: 10.357771176429038, f1: 0.716190476190476, ROC: 0.712963100756736


In [24]:
for name, clf in classifiers:
    if name != "LogReg": continue
    clf.fit(train_scaled_x, train["result"].values)
    trpred = clf.predict(train_scaled_x)
    tracc = accuracy_score(train["result"], trpred)
    preds = clf.predict(test_scaled_x)
    acc = accuracy_score(test["result"], preds)
    prec = precision_score(test["result"], preds)
    r2 = r2_score(test["result"], preds)
    ll = log_loss(test["result"], preds)
    f1 = f1_score(test["result"], preds)
    roc = roc_auc_score(test["result"], preds)
    
    print(f"Model Name: {name}\nAccuracy: {acc} Training Accuracy: {tracc}, Precision: {prec}, R2: {r2}, Log Loss: {ll}, f1: {f1}, ROC: {roc}")

Model Name: LogReg
Accuracy: 0.7328833172613307 Training Accuracy: 0.7348447511089207, Precision: 0.7349397590361446, R2: -0.06869033357392129, Log Loss: 9.627861127083365, f1: 0.7254707631318138, ROC: 0.7326460455529679


In [22]:
gbc.fit(train_scaled_x, train["result"].values)
trpred = gbc.predict(train_scaled_x)
tracc = accuracy_score(train["result"], trpred)
preds = gbc.predict(test_scaled_x)
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
ll = log_loss(test["result"], preds)
f1 = f1_score(test["result"], preds)
roc = roc_auc_score(test["result"], preds)

print(f"Model Name: {name}\nAccuracy: {acc} Training Accuracy: {tracc}, Precision: {prec}, R2: {r2}, Log Loss: {ll}, f1: {f1}, ROC: {roc}")

Model Name: 08/31/2021~St. Francis Brooklyn~Hofstra
Accuracy: 0.742526518804243 Training Accuracy: 0.7420100845433522, Precision: 0.7392156862745098, R2: -0.030109455105548744, Log Loss: 9.280284913109238, f1: 0.7384916748285996, ROC: 0.7424586846041088


In [23]:
from catboost import CatBoostClassifier
cat = CatBoostClassifier()
cat.fit(train_scaled_x, train["result"].values)
trpred = cat.predict(train_scaled_x)
tracc = accuracy_score(train["result"], trpred)
preds = cat.predict(test_scaled_x)
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
ll = log_loss(test["result"], preds)
f1 = f1_score(test["result"], preds)
roc = roc_auc_score(test["result"], preds)

print(f"Model Name: {name}\nAccuracy: {acc} Training Accuracy: {tracc}, Precision: {prec}, R2: {r2}, Log Loss: {ll}, f1: {f1}, ROC: {roc}")

Learning rate set to 0.056019
0:	learn: 0.6804600	total: 68.7ms	remaining: 1m 8s
1:	learn: 0.6690382	total: 79ms	remaining: 39.4s
2:	learn: 0.6584877	total: 87.5ms	remaining: 29.1s
3:	learn: 0.6489586	total: 95.9ms	remaining: 23.9s
4:	learn: 0.6401934	total: 103ms	remaining: 20.6s
5:	learn: 0.6328871	total: 111ms	remaining: 18.4s
6:	learn: 0.6255634	total: 118ms	remaining: 16.8s
7:	learn: 0.6191960	total: 125ms	remaining: 15.5s
8:	learn: 0.6135438	total: 133ms	remaining: 14.6s
9:	learn: 0.6079149	total: 141ms	remaining: 14s
10:	learn: 0.6032206	total: 149ms	remaining: 13.4s
11:	learn: 0.5986377	total: 156ms	remaining: 12.9s
12:	learn: 0.5939983	total: 164ms	remaining: 12.5s
13:	learn: 0.5897104	total: 173ms	remaining: 12.2s
14:	learn: 0.5860587	total: 182ms	remaining: 11.9s
15:	learn: 0.5827020	total: 190ms	remaining: 11.7s
16:	learn: 0.5793860	total: 199ms	remaining: 11.5s
17:	learn: 0.5763326	total: 207ms	remaining: 11.3s
18:	learn: 0.5737443	total: 215ms	remaining: 11.1s
19:	learn: 

Voting Classifier

In [32]:
vc.fit(train_scaled_x, train["result"])

Learning rate set to 0.056019
0:	learn: 0.6813908	total: 10.5ms	remaining: 10.5s
1:	learn: 0.6705785	total: 18.2ms	remaining: 9.08s
2:	learn: 0.6606505	total: 26.2ms	remaining: 8.72s
3:	learn: 0.6511105	total: 34.3ms	remaining: 8.54s
4:	learn: 0.6419521	total: 42.2ms	remaining: 8.4s
5:	learn: 0.6339560	total: 49.2ms	remaining: 8.15s
6:	learn: 0.6271446	total: 56.3ms	remaining: 7.98s
7:	learn: 0.6202044	total: 63ms	remaining: 7.81s
8:	learn: 0.6147805	total: 70.6ms	remaining: 7.78s
9:	learn: 0.6092567	total: 78.7ms	remaining: 7.79s
10:	learn: 0.6039901	total: 86.7ms	remaining: 7.79s
11:	learn: 0.5991845	total: 94.4ms	remaining: 7.77s
12:	learn: 0.5949411	total: 102ms	remaining: 7.78s
13:	learn: 0.5906406	total: 110ms	remaining: 7.73s
14:	learn: 0.5875273	total: 118ms	remaining: 7.76s
15:	learn: 0.5839105	total: 127ms	remaining: 7.83s
16:	learn: 0.5806820	total: 136ms	remaining: 7.86s
17:	learn: 0.5775759	total: 143ms	remaining: 7.83s
18:	learn: 0.5749070	total: 151ms	remaining: 7.78s
19

In [33]:
preds = vc.predict(test_scaled_x)
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
acc, prec, r2

(0.7357762777242044, 0.7393939393939394, -0.05711607003340946)

XGboost classifier

In [107]:
from xgboost import XGBClassifier
xgb = XGBClassifier(objective="binary:logistic")
xgb.fit(train[features], train["result"])
preds = xgb.predict(test[features])
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
acc, prec, r2

(0.7242044358727098, 0.7110694183864915, -0.10341312419545656)

Random forest classifier

In [105]:
rf.fit(train[features], train["result"])
preds = rf.predict(test[features])
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
acc, prec, r2

(0.7270973963355835, 0.7244094488188977, -0.09183886065494495)

Confusion matrix for random forest

In [106]:
pd.crosstab(index=preds, columns=test["result"])

result,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,386,143
1,140,368


In [3]:
team_id = {}
with open("team_id.csv", "r") as f:
    for line in f.readlines()[1:]:
        team, id = line.split(",")
        team_id[int(id.strip())] = team

In [20]:
#Function to add rolling average columns to dataframe

def rolling_avgs(df, cols, new_cols):
    for col, new_col in zip(cols, new_cols):
        df[new_col + "_3"] = df[col].rolling(3, closed="left").mean()
        df["cumsum"] = df[col].cumsum()
        df["index_val"] = range(1, len(df) + 1)
        df[new_col + "_total"] = df["cumsum"].shift(1).fillna(0) / df["index_val"]
        df.drop(["cumsum", "index_val"], axis=1, inplace=True)

In [21]:
cols = ["kills/set", "errors/set", "total_attacks/set", "hit_pct", "assists/set", "aces/set", "serr/set", "digs/set", "b_solo/set", "b_assist/set", "b_error/set", "pts/set"]
new_cols = [f"rolling_{c}" for c in cols]

for root, dirs, filenames in os.walk("all_schedules"):
    for filename in fnmatch.filter(filenames, "*.csv"):
        df = pd.read_csv(os.path.join(root, filename), delimiter=",")
        rolling_avgs(df, cols, new_cols)
        df.to_csv(os.path.join(root, filename), index=False)
        

In [44]:
bad = []
for key, l in games.items():
    if len(l) != 2:
        bad.append(key)

In [38]:
# Function to separate opponent and venue

def seperate_opp_venue(x):
    opp_ven = x["opponent/venue"]
    res = ""
    if "@" not in opp_ven:
        res = opp_ven.strip()
    elif opp_ven.startswith("@"):
        res =  opp_ven.split("@")[-1].strip()
    else:
        res = opp_ven.split("@")[0].strip()
    return res.split("(")[0]

In [None]:
# Some set data was impossible/incorrect, extract sets from result seems reliable
def sets_from_result(x):
    return sum([int(val.strip()) for val in x["result"].split("-")])

In [19]:
cols = ["kills", "errors", "total_attacks", "assists", "aces", "serr", "digs", "b_solo", "b_assist", "b_error", "pts"]
new_cols = [f"{c}/set" for c in cols]


for root, dirs, filenames in os.walk("all_schedules"):
    for filename in fnmatch.filter(filenames, "*.csv"):
        df = pd.read_csv(os.path.join(root, filename), delimiter=",")
        df["sets_from_result"] = df.apply(lambda x: sum([int(val.strip()) for val in x["result"].split("-")]), 1)
        for col, new_col in zip(cols, new_cols):
            df[new_col] = df.apply(lambda x: x[col] / x["sets_from_result"], 1)
        df.to_csv(os.path.join(root, filename), index=False)