Imports

In [7]:
import pandas as pd
import os
import fnmatch
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import tensorflow as tf
from keras import layers, Sequential
from sklearn.metrics import accuracy_score, precision_score, r2_score, log_loss, f1_score, roc_auc_score
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

Get Dataset

In [2]:
# Columns of end feature set
# t1 will always end up being team first alphabetically
# rolling_*_total is total cumulative average
# rolling_*_3 is 3 game cumulative - first three games of season for every team are NA
# team names are either not included or converted to categorical codes
# result is 0 for t1, 1 for t2


#Currently returns all_df, master_df
def get_h2h_dataset_items():
    master_cols = {
        "date": [],
        "t1": [],
        "t1_rolling_kills/set_total": [],
        "t1_rolling_kills/set_3": [],
        "t1_rolling_errors/set_total": [],
        "t1_rolling_errors/set_3": [],
        "t1_rolling_total_attacks/set_total": [],
        "t1_rolling_total_attacks/set_3": [],
        "t1_rolling_hit_pct_total": [],
        "t1_rolling_hit_pct_3": [],
        "t1_rolling_assists/set_total": [],
        "t1_rolling_assists/set_3": [],
        "t1_rolling_aces/set_total": [],
        "t1_rolling_aces/set_3": [],
        "t1_rolling_serr/set_total": [],
        "t1_rolling_serr/set_3": [],
        "t1_rolling_digs/set_total": [],
        "t1_rolling_digs/set_3": [],
        "t1_rolling_b_solo/set_total": [],
        "t1_rolling_b_solo/set_3": [],
        "t1_rolling_b_assist/set_total": [],
        "t1_rolling_b_assist/set_3": [],
        "t1_rolling_b_error/set_total": [],
        "t1_rolling_b_error/set_3": [],
        "t1_rolling_pts/set_total": [],
        "t1_rolling_pts/set_3": [],
        "t2": [],
        "t2_rolling_kills/set_total": [],
        "t2_rolling_kills/set_3": [],
        "t2_rolling_errors/set_total": [],
        "t2_rolling_errors/set_3": [],
        "t2_rolling_total_attacks/set_total": [],
        "t2_rolling_total_attacks/set_3": [],
        "t2_rolling_hit_pct_total": [],
        "t2_rolling_hit_pct_3": [],
        "t2_rolling_assists/set_total": [],
        "t2_rolling_assists/set_3": [],
        "t2_rolling_aces/set_total": [],
        "t2_rolling_aces/set_3": [],
        "t2_rolling_serr/set_total": [],
        "t2_rolling_serr/set_3": [],
        "t2_rolling_digs/set_total": [],
        "t2_rolling_digs/set_3": [],
        "t2_rolling_b_solo/set_total": [],
        "t2_rolling_b_solo/set_3": [],
        "t2_rolling_b_assist/set_total": [],
        "t2_rolling_b_assist/set_3": [],
        "t2_rolling_b_error/set_total": [],
        "t2_rolling_b_error/set_3": [],
        "t2_rolling_pts/set_total": [],
        "t2_rolling_pts/set_3": [],
        "result": [] #0 for t1, 1 for t2
    }
    
    all_df = {}
    for root, dirs, filenames in os.walk("all_schedules"):
        for filename in fnmatch.filter(filenames, "*.csv"):
            try:
                all_df[filename] = pd.read_csv(os.path.join(root, filename), delimiter=",")
            except pd.errors.EmptyDataError as e:
                print(filename)
                
                
    games = {}

    for key, df in all_df.items():
        team_name1 = key.split("-schedule")[0].strip()
        for i, row in df.iterrows():
            team_name2 = row["opponent"].strip()
            date = row["date"]
            res = f"{date}~{team_name1}~{team_name2}" if team_name1 > team_name2 else f"{date}~{team_name2}~{team_name1}"
            
            if res not in games:
                games[res] = []
            games[res].append((team_name1, row))
    
    
    # Create dataframe from matched up games

    for name, item in games.items():
        if len(item) != 2: continue # Don't know why there are games that don't have two teams stats
        date, t1, t2 = name.split("~")
        row1 = item[0][1] if item[0][0] == t1 else item[1][1]
        row2 = item[0][1] if item[0][0] == t2 else item[1][1]
        winner = 0 if int(row1["result"].split("-")[0].strip()) == 3 else 1
        
        # add row to dataframe - yes it is ugly
        master_cols["result"].append(winner)
        master_cols["date"].append(date)
        master_cols["t1"].append(t1)
        master_cols["t1_rolling_kills/set_total"].append(row1["rolling_kills/set_total"])
        master_cols["t1_rolling_kills/set_3"].append(row1["rolling_kills/set_3"])
        master_cols["t1_rolling_errors/set_total"].append(row1["rolling_errors/set_total"])
        master_cols["t1_rolling_errors/set_3"].append(row1["rolling_errors/set_3"])
        master_cols["t1_rolling_total_attacks/set_total"].append(row1["rolling_total_attacks/set_total"])
        master_cols["t1_rolling_total_attacks/set_3"].append(row1["rolling_total_attacks/set_3"])
        master_cols["t1_rolling_hit_pct_total"].append(row1["rolling_hit_pct_total"])
        master_cols["t1_rolling_hit_pct_3"].append(row1["rolling_hit_pct_3"])
        master_cols["t1_rolling_assists/set_total"].append(row1["rolling_assists/set_total"])
        master_cols["t1_rolling_assists/set_3"].append(row1["rolling_assists/set_3"])
        master_cols["t1_rolling_aces/set_total"].append(row1["rolling_aces/set_total"])
        master_cols["t1_rolling_aces/set_3"].append(row1["rolling_aces/set_3"])
        master_cols["t1_rolling_serr/set_total"].append(row1["rolling_serr/set_total"])
        master_cols["t1_rolling_serr/set_3"].append(row1["rolling_serr/set_3"])
        master_cols["t1_rolling_digs/set_total"].append(row1["rolling_digs/set_total"])
        master_cols["t1_rolling_digs/set_3"].append(row1["rolling_digs/set_3"])
        master_cols["t1_rolling_b_solo/set_total"].append(row1["rolling_b_solo/set_total"])
        master_cols["t1_rolling_b_solo/set_3"].append(row1["rolling_b_solo/set_3"])
        master_cols["t1_rolling_b_assist/set_total"].append(row1["rolling_b_assist/set_total"])
        master_cols["t1_rolling_b_assist/set_3"].append(row1["rolling_b_assist/set_3"])
        master_cols["t1_rolling_b_error/set_total"].append(row1["rolling_b_error/set_total"])
        master_cols["t1_rolling_b_error/set_3"].append(row1["rolling_b_error/set_3"])
        master_cols["t1_rolling_pts/set_total"].append(row1["rolling_pts/set_total"])
        master_cols["t1_rolling_pts/set_3"].append(row1["rolling_pts/set_3"])
        master_cols["t2"].append(t2)
        master_cols["t2_rolling_kills/set_total"].append(row2["rolling_kills/set_total"])
        master_cols["t2_rolling_kills/set_3"].append(row2["rolling_kills/set_3"])
        master_cols["t2_rolling_errors/set_total"].append(row2["rolling_errors/set_total"])
        master_cols["t2_rolling_errors/set_3"].append(row2["rolling_errors/set_3"])
        master_cols["t2_rolling_total_attacks/set_total"].append(row2["rolling_total_attacks/set_total"])
        master_cols["t2_rolling_total_attacks/set_3"].append(row2["rolling_total_attacks/set_3"])
        master_cols["t2_rolling_hit_pct_total"].append(row2["rolling_hit_pct_total"])
        master_cols["t2_rolling_hit_pct_3"].append(row2["rolling_hit_pct_3"])
        master_cols["t2_rolling_assists/set_total"].append(row2["rolling_assists/set_total"])
        master_cols["t2_rolling_assists/set_3"].append(row2["rolling_assists/set_3"])
        master_cols["t2_rolling_aces/set_total"].append(row2["rolling_aces/set_total"])
        master_cols["t2_rolling_aces/set_3"].append(row2["rolling_aces/set_3"])
        master_cols["t2_rolling_serr/set_total"].append(row2["rolling_serr/set_total"])
        master_cols["t2_rolling_serr/set_3"].append(row2["rolling_serr/set_3"])
        master_cols["t2_rolling_digs/set_total"].append(row2["rolling_digs/set_total"])
        master_cols["t2_rolling_digs/set_3"].append(row2["rolling_digs/set_3"])
        master_cols["t2_rolling_b_solo/set_total"].append(row2["rolling_b_solo/set_total"])
        master_cols["t2_rolling_b_solo/set_3"].append(row2["rolling_b_solo/set_3"])
        master_cols["t2_rolling_b_assist/set_total"].append(row2["rolling_b_assist/set_total"])
        master_cols["t2_rolling_b_assist/set_3"].append(row2["rolling_b_assist/set_3"])
        master_cols["t2_rolling_b_error/set_total"].append(row2["rolling_b_error/set_total"])
        master_cols["t2_rolling_b_error/set_3"].append(row2["rolling_b_error/set_3"])
        master_cols["t2_rolling_pts/set_total"].append(row2["rolling_pts/set_total"])
        master_cols["t2_rolling_pts/set_3"].append(row2["rolling_pts/set_3"])
    
    master_df = pd.DataFrame(master_cols)
    # master_df = master_df.dropna() #lose about 7 thousand matches by doing this, could consider only using season averages
    master_df["date"] = pd.to_datetime(master_df["date"])
    master_df["t1_code"] = master_df["t1"].astype("category").cat.codes
    master_df["t2_code"] = master_df["t2"].astype("category").cat.codes
    
    return all_df, master_df

In [5]:
all_df, master_df = get_h2h_dataset_items()
master_df_dropped = master_df.dropna()
master_df_filled = master_df.fillna(0)

Train and Test splits

In [12]:
features = [
       't1_rolling_kills/set_total', 't1_rolling_kills/set_3',
       't1_rolling_errors/set_total', 't1_rolling_errors/set_3',
       't1_rolling_total_attacks/set_total', 't1_rolling_total_attacks/set_3',
       't1_rolling_hit_pct_total', 't1_rolling_hit_pct_3',
       't1_rolling_assists/set_total', 't1_rolling_assists/set_3',
       't1_rolling_aces/set_total', 't1_rolling_aces/set_3',
       't1_rolling_serr/set_total', 't1_rolling_serr/set_3',
       't1_rolling_digs/set_total', 't1_rolling_digs/set_3',
       't1_rolling_b_solo/set_total', 't1_rolling_b_solo/set_3',
       't1_rolling_b_assist/set_total', 't1_rolling_b_assist/set_3',
       't1_rolling_b_error/set_total', 't1_rolling_b_error/set_3',
       't1_rolling_pts/set_total', 't1_rolling_pts/set_3',
       't2_rolling_kills/set_total', 't2_rolling_kills/set_3',
       't2_rolling_errors/set_total', 't2_rolling_errors/set_3',
       't2_rolling_total_attacks/set_total', 't2_rolling_total_attacks/set_3',
       't2_rolling_hit_pct_total', 't2_rolling_hit_pct_3',
       't2_rolling_assists/set_total', 't2_rolling_assists/set_3',
       't2_rolling_aces/set_total', 't2_rolling_aces/set_3',
       't2_rolling_serr/set_total', 't2_rolling_serr/set_3',
       't2_rolling_digs/set_total', 't2_rolling_digs/set_3',
       't2_rolling_b_solo/set_total', 't2_rolling_b_solo/set_3',
       't2_rolling_b_assist/set_total', 't2_rolling_b_assist/set_3',
       't2_rolling_b_error/set_total', 't2_rolling_b_error/set_3',
       't2_rolling_pts/set_total', 't2_rolling_pts/set_3'#, "t1_code", "t2_code"
]

train = master_df_dropped[master_df_dropped["date"] < "2023-11-01"]
test = master_df_dropped[master_df_dropped["date"] >= "2023-11-01"] # Test items come after train items

In [13]:
scaler = StandardScaler()
train_scaled_x = scaler.fit_transform(train[features].values)
test_scaled_x = scaler.fit_transform(test[features].values)

Create/Train Models

In [14]:
cat = CatBoostClassifier()
gbc = GradientBoostingClassifier()
qda = QuadraticDiscriminantAnalysis()
gnb = GaussianNB()
nn = MLPClassifier(alpha=1, max_iter=1000)
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
dc = DecisionTreeClassifier()
logi = LogisticRegression(max_iter=100000)
svc1 = SVC(kernel="linear", C=0.025)
svc2 = SVC(gamma=2, C=1)
ada = AdaBoostClassifier()
xgb = XGBClassifier()
gpc = GaussianProcessClassifier()

classifiers = [
    ("ADA", ada),
    ("QDA", qda),
    ("GNB", gnb),
    ("KNN", knn),
    ("NN", nn),
    ("DT", dc),
    ("RF", rf),
    ("LogReg", logi),
    ("SVC1", svc1),
    ("XGB", xgb),
    ("GBC", gbc),
    ("CAT", cat)
]

vc = VotingClassifier([
    ("ADA", ada),
    ("GNB", gnb),
    ("KNN", knn),
    ("NN", nn),
    ("RF", rf),
    ("LogReg", logi),
    ("SVC1", svc1),
    ("XGB", xgb), 
    ("GBC", gbc),
    ("CAT", cat)
])

In [15]:
for name, clf in classifiers:
    clf.fit(train_scaled_x, train["result"].values)
    trpred = clf.predict(train_scaled_x)
    tracc = accuracy_score(train["result"], trpred)
    preds = clf.predict(test_scaled_x)
    acc = accuracy_score(test["result"], preds)
    prec = precision_score(test["result"], preds)
    r2 = r2_score(test["result"], preds)
    ll = log_loss(test["result"], preds)
    f1 = f1_score(test["result"], preds)
    roc = roc_auc_score(test["result"], preds)
    
    print(f"Model Name: {name}\nAccuracy: {acc} Training Accuracy: {tracc}, Precision: {prec}, R2: {r2}, Log Loss: {ll}, f1: {f1}, ROC: {roc}")

Model Name: ADA
Accuracy: 0.7193828351012537 Training Accuracy: 0.729499184895932, Precision: 0.7165354330708661, R2: -0.12270356342964295, Log Loss: 10.114467826647147, f1: 0.7144259077526988, ROC: 0.7192822542840773
Model Name: QDA
Accuracy: 0.5120540019286403 Training Accuracy: 0.7210638055882018, Precision: 0.5170068027210885, R2: -0.9521924504996542, Log Loss: 17.587356427090914, f1: 0.23100303951367782, ROC: 0.5068734978756334
Model Name: GNB
Accuracy: 0.7029893924783028 Training Accuracy: 0.7139932516965538, Precision: 0.7001972386587771, R2: -0.18829105682587643, Log Loss: 10.705347390403166, f1: 0.6974459724950884, ROC: 0.7028714293155148
Model Name: KNN
Accuracy: 0.6933461909353905 Training Accuracy: 0.7937407589945786, Precision: 0.6895874263261297, R2: -0.22687193529424898, Log Loss: 11.052923604377295, f1: 0.6882352941176471, ROC: 0.6932541129374299
Model Name: NN
Accuracy: 0.7367405978784957 Training Accuracy: 0.7344277211206733, Precision: 0.7389558232931727, R2: -0.0532

In [16]:
for i in range(1, 36):
    knn = KNeighborsClassifier(i)
    knn.fit(train_scaled_x, train["result"].values)
    trpred = knn.predict(train_scaled_x)
    tracc = accuracy_score(train["result"], trpred)
    preds = knn.predict(test_scaled_x)
    acc = accuracy_score(test["result"], preds)
    prec = precision_score(test["result"], preds)
    print(f"Neighbors: {i} - Acc: {acc}, Train Acc: {tracc}, Precision: {prec}")
    

Neighbors: 1 - Acc: 0.6460945033751205, Train Acc: 0.99994313227433, Precision: 0.6395348837209303
Neighbors: 2 - Acc: 0.656702025072324, Train Acc: 0.8245820222163248, Precision: 0.739938080495356
Neighbors: 3 - Acc: 0.686595949855352, Train Acc: 0.8299275884293135, Precision: 0.6837944664031621
Neighbors: 4 - Acc: 0.7020250723240116, Train Acc: 0.7888690904955075, Precision: 0.7616580310880829
Neighbors: 5 - Acc: 0.6933461909353905, Train Acc: 0.7937407589945786, Precision: 0.6895874263261297
Neighbors: 6 - Acc: 0.7010607521697203, Train Acc: 0.7748606740721082, Precision: 0.7410071942446043
Neighbors: 7 - Acc: 0.7029893924783028, Train Acc: 0.777192250824582, Precision: 0.700990099009901
Neighbors: 8 - Acc: 0.7184185149469624, Train Acc: 0.765989308867574, Precision: 0.7494305239179955
Neighbors: 9 - Acc: 0.7135969141755063, Train Acc: 0.766425294764378, Precision: 0.7057692307692308
Neighbors: 10 - Acc: 0.7155255544840887, Train Acc: 0.7589566667930394, Precision: 0.737885462555066

In [17]:
for i in range(36, 50):
    knn = KNeighborsClassifier(i)
    knn.fit(train_scaled_x, train["result"].values)
    trpred = knn.predict(train_scaled_x)
    tracc = accuracy_score(train["result"], trpred)
    preds = knn.predict(test_scaled_x)
    acc = accuracy_score(test["result"], preds)
    prec = precision_score(test["result"], preds)
    print(f"Neighbors: {i} - Acc: {acc}, Train Acc: {tracc}, Precision: {prec}")
    

Neighbors: 36 - Acc: 0.7328833172613307, Train Acc: 0.7384084619175797, Precision: 0.7387755102040816
Neighbors: 37 - Acc: 0.7290260366441659, Train Acc: 0.7382378587405695, Precision: 0.724609375
Neighbors: 38 - Acc: 0.7280617164898746, Train Acc: 0.7382757705576828, Precision: 0.7331975560081466
Neighbors: 39 - Acc: 0.7367405978784957, Train Acc: 0.7375933578496418, Precision: 0.7333333333333333
Neighbors: 40 - Acc: 0.7328833172613307, Train Acc: 0.7372900633127346, Precision: 0.7358870967741935
Neighbors: 41 - Acc: 0.7328833172613307, Train Acc: 0.7376502255753118, Precision: 0.7294117647058823
Neighbors: 42 - Acc: 0.7357762777242044, Train Acc: 0.7371952837699511, Precision: 0.7403651115618661
Neighbors: 43 - Acc: 0.733847637415622, Train Acc: 0.7372900633127346, Precision: 0.7290448343079922
Neighbors: 44 - Acc: 0.7348119575699132, Train Acc: 0.7371952837699511, Precision: 0.7418032786885246
Neighbors: 45 - Acc: 0.7328833172613307, Train Acc: 0.7370057246843841, Precision: 0.73122

In [18]:
for i in range(51, 100):
    knn = KNeighborsClassifier(i)
    knn.fit(train_scaled_x, train["result"].values)
    trpred = knn.predict(train_scaled_x)
    tracc = accuracy_score(train["result"], trpred)
    preds = knn.predict(test_scaled_x)
    acc = accuracy_score(test["result"], preds)
    prec = precision_score(test["result"], preds)
    print(f"Neighbors: {i} - Acc: {acc}, Train Acc: {tracc}, Precision: {prec}")
    

Neighbors: 51 - Acc: 0.7242044358727098, Train Acc: 0.7359631497137658, Precision: 0.7227722772277227
Neighbors: 52 - Acc: 0.7213114754098361, Train Acc: 0.7360579292565492, Precision: 0.7228915662650602
Neighbors: 53 - Acc: 0.7261330761812922, Train Acc: 0.7351859574629412, Precision: 0.7247524752475247
Neighbors: 54 - Acc: 0.7242044358727098, Train Acc: 0.734882662926034, Precision: 0.7254509018036072
Neighbors: 55 - Acc: 0.7213114754098361, Train Acc: 0.7353376047313948, Precision: 0.716796875
Neighbors: 56 - Acc: 0.7232401157184185, Train Acc: 0.7342760738522197, Precision: 0.724
Neighbors: 57 - Acc: 0.7203471552555448, Train Acc: 0.7351480456458278, Precision: 0.7170923379174853
Neighbors: 58 - Acc: 0.7222757955641272, Train Acc: 0.7336505288698487, Precision: 0.7216699801192843
Neighbors: 59 - Acc: 0.7251687560270009, Train Acc: 0.73444667702923, Precision: 0.720703125
Neighbors: 60 - Acc: 0.7261330761812922, Train Acc: 0.733574705235622, Precision: 0.7229862475442044
Neighbors: 

In [25]:
pred = knn.predict_proba(test_scaled_x)
acc = accuracy_score(test["result"], preds)
acc

0.7386692381870781

In [24]:
[item.shape for item in nn.coefs_] # 48 -> 100 -> 1

[(48, 100), (100, 1)]

In [26]:
list(zip(pred, test["result"]))

[(array([0.81818182, 0.18181818]), 0),
 (array([0.23232323, 0.76767677]), 1),
 (array([0.7979798, 0.2020202]), 1),
 (array([0.45454545, 0.54545455]), 1),
 (array([0.12121212, 0.87878788]), 1),
 (array([0.43434343, 0.56565657]), 1),
 (array([0.63636364, 0.36363636]), 0),
 (array([0.68686869, 0.31313131]), 0),
 (array([0.19191919, 0.80808081]), 1),
 (array([0.17171717, 0.82828283]), 1),
 (array([0.37373737, 0.62626263]), 0),
 (array([0.63636364, 0.36363636]), 0),
 (array([0.55555556, 0.44444444]), 0),
 (array([0.55555556, 0.44444444]), 0),
 (array([0.60606061, 0.39393939]), 0),
 (array([0.36363636, 0.63636364]), 1),
 (array([0.33333333, 0.66666667]), 1),
 (array([0.87878788, 0.12121212]), 0),
 (array([0.41414141, 0.58585859]), 1),
 (array([0.90909091, 0.09090909]), 0),
 (array([0.25252525, 0.74747475]), 1),
 (array([0.84848485, 0.15151515]), 0),
 (array([0.82828283, 0.17171717]), 1),
 (array([0.18181818, 0.81818182]), 1),
 (array([0.67676768, 0.32323232]), 0),
 (array([0.54545455, 0.4545