In [1]:
import pandas as pd
import os
import fnmatch

Get all dataframes from root folder

In [2]:
all_df = {}
for root, dirs, filenames in os.walk("all_schedules"):
    for filename in fnmatch.filter(filenames, "*.csv"):
        try:
            all_df[filename] = pd.read_csv(os.path.join(root, filename), delimiter=",")
        except pd.errors.EmptyDataError as e:
            print(filename)

In [4]:
list(all_df.items())[0][1].columns

Index(['date', 'opponent/venue', 'result', 'sets', 'mp', 'kills', 'errors',
       'total_attacks', 'hit_pct', 'assists', 'aces', 'serr', 'digs', 'rerr',
       'b_solo', 'b_assist', 'b_error', 'pts', 'bhe', 'opponent',
       'sets_from_result', 'kills/set', 'errors/set', 'total_attacks/set',
       'assists/set', 'aces/set', 'serr/set', 'digs/set', 'b_solo/set',
       'b_assist/set', 'b_error/set', 'pts/set', 'rolling_kills/set_3',
       'rolling_kills/set_total', 'rolling_errors/set_3',
       'rolling_errors/set_total', 'rolling_total_attacks/set_3',
       'rolling_total_attacks/set_total', 'rolling_hit_pct_3',
       'rolling_hit_pct_total', 'rolling_assists/set_3',
       'rolling_assists/set_total', 'rolling_aces/set_3',
       'rolling_aces/set_total', 'rolling_serr/set_3',
       'rolling_serr/set_total', 'rolling_digs/set_3',
       'rolling_digs/set_total', 'rolling_b_solo/set_3',
       'rolling_b_solo/set_total', 'rolling_b_assist/set_3',
       'rolling_b_assist/set_to

In [3]:
# Columns of end feature set
# t1 will always end up being team first alphabetically
# rolling_*_total is total cumulative average
# rolling_*_3 is 3 game cumulative - first three games of season for every team are NA
# team names are either not included or converted to categorical codes
# result is 0 for t1, 1 for t2

master_cols = {
    "date": [],
    "t1": [],
    "t1_rolling_kills/set_total": [],
    "t1_rolling_kills/set_3": [],
    "t1_rolling_errors/set_total": [],
    "t1_rolling_errors/set_3": [],
    "t1_rolling_total_attacks/set_total": [],
    "t1_rolling_total_attacks/set_3": [],
    "t1_rolling_hit_pct_total": [],
    "t1_rolling_hit_pct_3": [],
    "t1_rolling_assists/set_total": [],
    "t1_rolling_assists/set_3": [],
    "t1_rolling_aces/set_total": [],
    "t1_rolling_aces/set_3": [],
    "t1_rolling_serr/set_total": [],
    "t1_rolling_serr/set_3": [],
    "t1_rolling_digs/set_total": [],
    "t1_rolling_digs/set_3": [],
    "t1_rolling_b_solo/set_total": [],
    "t1_rolling_b_solo/set_3": [],
    "t1_rolling_b_assist/set_total": [],
    "t1_rolling_b_assist/set_3": [],
    "t1_rolling_b_error/set_total": [],
    "t1_rolling_b_error/set_3": [],
    "t1_rolling_pts/set_total": [],
    "t1_rolling_pts/set_3": [],
    "t2": [],
    "t2_rolling_kills/set_total": [],
    "t2_rolling_kills/set_3": [],
    "t2_rolling_errors/set_total": [],
    "t2_rolling_errors/set_3": [],
    "t2_rolling_total_attacks/set_total": [],
    "t2_rolling_total_attacks/set_3": [],
    "t2_rolling_hit_pct_total": [],
    "t2_rolling_hit_pct_3": [],
    "t2_rolling_assists/set_total": [],
    "t2_rolling_assists/set_3": [],
    "t2_rolling_aces/set_total": [],
    "t2_rolling_aces/set_3": [],
    "t2_rolling_serr/set_total": [],
    "t2_rolling_serr/set_3": [],
    "t2_rolling_digs/set_total": [],
    "t2_rolling_digs/set_3": [],
    "t2_rolling_b_solo/set_total": [],
    "t2_rolling_b_solo/set_3": [],
    "t2_rolling_b_assist/set_total": [],
    "t2_rolling_b_assist/set_3": [],
    "t2_rolling_b_error/set_total": [],
    "t2_rolling_b_error/set_3": [],
    "t2_rolling_pts/set_total": [],
    "t2_rolling_pts/set_3": [],
    "result": [] #0 for t1, 1 for t2
}

In [4]:
# Collect one-sided game stats into matching games - "{date}~{team_name1}~{team_name2}"

games = {}

for key, df in all_df.items():
    team_name1 = key.split("-schedule")[0].strip()
    for i, row in df.iterrows():
        team_name2 = row["opponent"].strip()
        date = row["date"]
        res = f"{date}~{team_name1}~{team_name2}" if team_name1 > team_name2 else f"{date}~{team_name2}~{team_name1}"
        
        if res not in games:
            games[res] = []
        games[res].append((team_name1, row))

In [7]:
def game_site(game_info):
    if '@' in game_info:
        if "" in (game_info.split("@")):
            return "away"
        else:
            return "neutral"
    else:
        return "home"
    
# neutral sites, first number is sets won for that team


In [5]:
# Create dataframe from matched up games

for name, item in games.items():
    if len(item) != 2: continue # Don't know why there are games that don't have two teams stats
    date, t1, t2 = name.split("~")
    row1 = item[0][1] if item[0][0] == t1 else item[1][1]
    row2 = item[0][1] if item[0][0] == t2 else item[1][1]
    winner = 0 if int(row1["result"].split("-")[0].strip()) == 3 else 1
    # add row to dataframe - yes it is ugly
    master_cols["result"].append(winner)
    master_cols["date"].append(date)
    master_cols["t1"].append(t1)
    master_cols["t1_rolling_kills/set_total"].append(row1["rolling_kills/set_total"])
    master_cols["t1_rolling_kills/set_3"].append(row1["rolling_kills/set_3"])
    master_cols["t1_rolling_errors/set_total"].append(row1["rolling_errors/set_total"])
    master_cols["t1_rolling_errors/set_3"].append(row1["rolling_errors/set_3"])
    master_cols["t1_rolling_total_attacks/set_total"].append(row1["rolling_total_attacks/set_total"])
    master_cols["t1_rolling_total_attacks/set_3"].append(row1["rolling_total_attacks/set_3"])
    master_cols["t1_rolling_hit_pct_total"].append(row1["rolling_hit_pct_total"])
    master_cols["t1_rolling_hit_pct_3"].append(row1["rolling_hit_pct_3"])
    master_cols["t1_rolling_assists/set_total"].append(row1["rolling_assists/set_total"])
    master_cols["t1_rolling_assists/set_3"].append(row1["rolling_assists/set_3"])
    master_cols["t1_rolling_aces/set_total"].append(row1["rolling_aces/set_total"])
    master_cols["t1_rolling_aces/set_3"].append(row1["rolling_aces/set_3"])
    master_cols["t1_rolling_serr/set_total"].append(row1["rolling_serr/set_total"])
    master_cols["t1_rolling_serr/set_3"].append(row1["rolling_serr/set_3"])
    master_cols["t1_rolling_digs/set_total"].append(row1["rolling_digs/set_total"])
    master_cols["t1_rolling_digs/set_3"].append(row1["rolling_digs/set_3"])
    master_cols["t1_rolling_b_solo/set_total"].append(row1["rolling_b_solo/set_total"])
    master_cols["t1_rolling_b_solo/set_3"].append(row1["rolling_b_solo/set_3"])
    master_cols["t1_rolling_b_assist/set_total"].append(row1["rolling_b_assist/set_total"])
    master_cols["t1_rolling_b_assist/set_3"].append(row1["rolling_b_assist/set_3"])
    master_cols["t1_rolling_b_error/set_total"].append(row1["rolling_b_error/set_total"])
    master_cols["t1_rolling_b_error/set_3"].append(row1["rolling_b_error/set_3"])
    master_cols["t1_rolling_pts/set_total"].append(row1["rolling_pts/set_total"])
    master_cols["t1_rolling_pts/set_3"].append(row1["rolling_pts/set_3"])
    master_cols["t2"].append(t2)
    master_cols["t2_rolling_kills/set_total"].append(row2["rolling_kills/set_total"])
    master_cols["t2_rolling_kills/set_3"].append(row2["rolling_kills/set_3"])
    master_cols["t2_rolling_errors/set_total"].append(row2["rolling_errors/set_total"])
    master_cols["t2_rolling_errors/set_3"].append(row2["rolling_errors/set_3"])
    master_cols["t2_rolling_total_attacks/set_total"].append(row2["rolling_total_attacks/set_total"])
    master_cols["t2_rolling_total_attacks/set_3"].append(row2["rolling_total_attacks/set_3"])
    master_cols["t2_rolling_hit_pct_total"].append(row2["rolling_hit_pct_total"])
    master_cols["t2_rolling_hit_pct_3"].append(row2["rolling_hit_pct_3"])
    master_cols["t2_rolling_assists/set_total"].append(row2["rolling_assists/set_total"])
    master_cols["t2_rolling_assists/set_3"].append(row2["rolling_assists/set_3"])
    master_cols["t2_rolling_aces/set_total"].append(row2["rolling_aces/set_total"])
    master_cols["t2_rolling_aces/set_3"].append(row2["rolling_aces/set_3"])
    master_cols["t2_rolling_serr/set_total"].append(row2["rolling_serr/set_total"])
    master_cols["t2_rolling_serr/set_3"].append(row2["rolling_serr/set_3"])
    master_cols["t2_rolling_digs/set_total"].append(row2["rolling_digs/set_total"])
    master_cols["t2_rolling_digs/set_3"].append(row2["rolling_digs/set_3"])
    master_cols["t2_rolling_b_solo/set_total"].append(row2["rolling_b_solo/set_total"])
    master_cols["t2_rolling_b_solo/set_3"].append(row2["rolling_b_solo/set_3"])
    master_cols["t2_rolling_b_assist/set_total"].append(row2["rolling_b_assist/set_total"])
    master_cols["t2_rolling_b_assist/set_3"].append(row2["rolling_b_assist/set_3"])
    master_cols["t2_rolling_b_error/set_total"].append(row2["rolling_b_error/set_total"])
    master_cols["t2_rolling_b_error/set_3"].append(row2["rolling_b_error/set_3"])
    master_cols["t2_rolling_pts/set_total"].append(row2["rolling_pts/set_total"])
    master_cols["t2_rolling_pts/set_3"].append(row2["rolling_pts/set_3"])

In [6]:
master_df = pd.DataFrame(master_cols)
master_df = master_df.dropna() #lose about 7 thousand matches by doing this, could consider only using season averages
master_df["date"] = pd.to_datetime(master_df["date"])
master_df["t1_code"] = master_df["t1"].astype("category").cat.codes
master_df["t2_code"] = master_df["t2"].astype("category").cat.codes

In [29]:
master_df

Unnamed: 0,date,t1,t1_rolling_kills/set_total,t1_rolling_kills/set_3,t1_rolling_errors/set_total,t1_rolling_errors/set_3,t1_rolling_total_attacks/set_total,t1_rolling_total_attacks/set_3,t1_rolling_hit_pct_total,t1_rolling_hit_pct_3,...,t2_rolling_b_solo/set_3,t2_rolling_b_assist/set_total,t2_rolling_b_assist/set_3,t2_rolling_b_error/set_total,t2_rolling_b_error/set_3,t2_rolling_pts/set_total,t2_rolling_pts/set_3,result,t1_code,t2_code
3,2016-09-02,SMU,10.291667,13.722222,4.520833,6.027778,27.041667,36.055556,0.167250,0.223000,...,0.694444,2.775000,3.700000,0.275000,0.366667,11.487500,15.316667,0,227,10
4,2016-09-02,Arizona,10.296667,13.244444,4.486667,4.977778,30.523333,39.288889,0.153400,0.215000,...,0.444444,1.946667,2.711111,0.120000,0.133333,9.960000,12.266667,1,6,6
5,2016-09-03,Pepperdine,10.750000,13.500000,4.097222,4.911111,29.741667,35.783333,0.189667,0.245333,...,0.555556,2.294444,2.088889,0.238889,0.311111,13.825000,17.400000,0,208,10
6,2016-09-09,Sacramento St.,12.735185,14.466667,4.731481,4.505556,34.303704,38.738889,0.214444,0.256333,...,0.666667,2.919048,3.111111,0.204762,0.111111,14.135714,17.666667,0,228,10
7,2016-09-09,Texas A&M,12.556250,14.555556,4.800000,5.666667,29.306250,34.222222,0.232000,0.258333,...,1.222222,2.887500,3.333333,0.220833,0.222222,14.785417,18.000000,1,268,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121132,2022-12-08,Wisconsin,12.968817,12.055556,4.086022,2.638889,32.076882,31.666667,0.276290,0.311667,...,0.850000,4.178922,4.644444,0.539706,0.644444,16.764951,17.244444,0,325,210
121133,2022-09-01,Pacific,10.000000,13.333333,4.800000,6.400000,26.700000,35.600000,0.144500,0.192667,...,1.555556,1.125000,1.500000,0.312500,0.416667,11.104167,14.805556,0,205,47
121135,2022-10-22,San Francisco,11.865079,13.566667,5.018254,4.472222,32.993651,36.283333,0.197714,0.250333,...,0.244444,1.590909,2.122222,0.287879,0.333333,15.353788,14.277778,0,238,208
121136,2022-11-22,San Francisco,11.646552,11.444444,5.029310,5.388889,32.620115,31.916667,0.196103,0.188000,...,0.505556,1.480460,0.955556,0.287356,0.433333,15.221839,15.111111,1,238,208


In [7]:
games_2023 = master_df[master_df["date"] > "2023-01-01"]

Voting Classifier Model

In [10]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
rf = RandomForestClassifier()
dc = DecisionTreeClassifier()
logi = LogisticRegression()
svc = SVC()
vc = VotingClassifier([("DTC", dc), ("LR", logi), ("SVC", svc)])

In [12]:
features = [
       't1_rolling_kills/set_total', 't1_rolling_kills/set_3',
       't1_rolling_errors/set_total', 't1_rolling_errors/set_3',
       't1_rolling_total_attacks/set_total', 't1_rolling_total_attacks/set_3',
       't1_rolling_hit_pct_total', 't1_rolling_hit_pct_3',
       't1_rolling_assists/set_total', 't1_rolling_assists/set_3',
       't1_rolling_aces/set_total', 't1_rolling_aces/set_3',
       't1_rolling_serr/set_total', 't1_rolling_serr/set_3',
       't1_rolling_digs/set_total', 't1_rolling_digs/set_3',
       't1_rolling_b_solo/set_total', 't1_rolling_b_solo/set_3',
       't1_rolling_b_assist/set_total', 't1_rolling_b_assist/set_3',
       't1_rolling_b_error/set_total', 't1_rolling_b_error/set_3',
       't1_rolling_pts/set_total', 't1_rolling_pts/set_3',
       't2_rolling_kills/set_total', 't2_rolling_kills/set_3',
       't2_rolling_errors/set_total', 't2_rolling_errors/set_3',
       't2_rolling_total_attacks/set_total', 't2_rolling_total_attacks/set_3',
       't2_rolling_hit_pct_total', 't2_rolling_hit_pct_3',
       't2_rolling_assists/set_total', 't2_rolling_assists/set_3',
       't2_rolling_aces/set_total', 't2_rolling_aces/set_3',
       't2_rolling_serr/set_total', 't2_rolling_serr/set_3',
       't2_rolling_digs/set_total', 't2_rolling_digs/set_3',
       't2_rolling_b_solo/set_total', 't2_rolling_b_solo/set_3',
       't2_rolling_b_assist/set_total', 't2_rolling_b_assist/set_3',
       't2_rolling_b_error/set_total', 't2_rolling_b_error/set_3',
       't2_rolling_pts/set_total', 't2_rolling_pts/set_3', "t1_code", "t2_code"
]

train = master_df[master_df["date"] < "2023-11-01"]
test = master_df[master_df["date"] >= "2023-11-01"]

In [12]:
from sklearn.metrics import accuracy_score, precision_score, r2_score

Voting Classifier

In [13]:
vc.fit(train[features], train["result"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
preds = vc.predict(test[features])
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
acc, prec, r2

(0.7444551591128254, 0.7450199203187251)

XGboost classifier

In [30]:
from xgboost import XGBClassifier
xgb = XGBClassifier(objective="binary:logistic")
xgb.fit(train[features], train["result"])
preds = xgb.predict(test[features])
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
acc, prec, r2

(0.7386692381870781, 0.7272727272727273, -0.04554180649289785)

Random forest classifier

In [31]:
rf.fit(train[features], train["result"])
preds = rf.predict(test[features])
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
acc, prec, r2

(0.742526518804243, 0.741106719367589, -0.030109455105548744)

Confusion matrix for random forest

In [106]:
pd.crosstab(index=preds, columns=test["result"])

result,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,386,143
1,140,368


#Neural Network

In [8]:
import numpy as np
from sklearn.preprocessing import StandardScaler

In [9]:
train = master_df[master_df["date"] < "2023-11-01"]
test = master_df[master_df["date"] >= "2023-11-01"]
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Align columns of train and test sets
train, test = train.align(test, join='inner', axis=1)

In [10]:
scaler = StandardScaler()

# Select only numeric columns for scaling
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Exclude the target variable 'result' and other non-numeric columns
numeric_cols = [col for col in numeric_cols if col != 'result']

# Apply scaling
train[numeric_cols] = scaler.fit_transform(train[numeric_cols])
test[numeric_cols] = scaler.transform(test[numeric_cols])


In [11]:
X_train = train.drop('result', axis=1)
y_train = train['result']
X_test = test.drop('result', axis=1)
y_test = test['result']

In [12]:
X_train['year'] = X_train['date'].dt.year
X_train['month'] = X_train['date'].dt.month
X_train['day'] = X_train['date'].dt.day

X_test['year'] = X_test['date'].dt.year
X_test['month'] = X_test['date'].dt.month
X_test['day'] = X_test['date'].dt.day

X_train = X_train.drop(['date'], axis=1)  # drop the original date column
X_test = X_test.drop(['date'], axis=1)

X_train = X_train.fillna(0)
y_train = y_train.fillna(0)

# Handle infinite values
X_train.replace([np.inf, -np.inf], 0, inplace=True)
y_train.replace([np.inf, -np.inf], 0, inplace=True)

X_train = X_train.astype('float32')
y_train = y_train.astype('float32')

X_test = X_test.fillna(0)
y_test = y_test.fillna(0)

# Handle infinite values
X_test.replace([np.inf, -np.inf], 0, inplace=True)
y_test.replace([np.inf, -np.inf], 0, inplace=True)

X_test = X_test.astype('float32')
y_test = y_test.astype('float32')

In [13]:
import tensorflow as tf
from tensorflow.keras import layers
import random

random.seed(10)
np.random.seed(10)
tf.random.set_seed(10)

# Define the model
model = tf.keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.20)
#model.save('neural_network.h5')

2023-12-23 21:29:27.527668: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-23 21:29:28.424744: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-23 21:29:28.424917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-23 21:29:28.621321: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-23 21:29:29.040380: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-23 21:29:29.045875: I tensorflow/core/platform/cpu_feature_guard.cc:1

Epoch 1/10


2023-12-23 21:29:33.776761: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 100274328 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")


 1/33 [..............................] - ETA: 0s - loss: 0.4606 - accuracy: 0.7500

Test Accuracy: 0.7434908151626587


In [15]:
probabilities = model.predict(X_test).flatten()

predictions = (probabilities >= 0.5).astype(int)



In [20]:
def t1_to_moneyline(percent):
    percent = float(percent)
    t1_percent = 1-percent
    if t1_percent == 0.5:
        return "+100"
    elif t1_percent > 0.5:
        line = round((t1_percent / (1 - t1_percent)) * 100)
        return f"-{line}"
    else:
        line = round(((1 - t1_percent) / t1_percent) * 100)
        return f"+{line}"
    
def t2_to_moneyline(percent):
    percent = float(percent)
    if percent == 0.5:
        return "+100"
    elif percent > 0.5:
        line = round((percent / (1 - percent)) * 100)
        return f"-{line}"
    else:
        line = round(((1 - percent) / percent) * 100)
        return f"+{line}"

In [21]:
# 0 means t1 win, 1 means t2 win
results = X_test.join(games_2023[['date', 't1', 't2']], how='left')
results["probs"] = probabilities
results["predictions"] = predictions
results["actual"] = y_test.values
results["t1_ML"] = results["probs"].apply(t1_to_moneyline)
results["t2_ML"] = results["probs"].apply(t2_to_moneyline)
results[["date","t1","t2","probs","predictions","actual", "t1_ML", "t2_ML"]]

Unnamed: 0,date,t1,t2,probs,predictions,actual,t1_ML,t2_ML
4933,2023-11-04,Hofstra,Col. of Charleston,0.400265,0,1.0,-150,+150
4934,2023-11-05,Hofstra,Col. of Charleston,0.422552,0,1.0,-137,+137
4935,2023-11-11,Stony Brook,Hofstra,0.445844,0,0.0,-124,+124
4936,2023-11-12,Stony Brook,Hofstra,0.522887,1,0.0,+110,-110
4958,2023-11-01,Oklahoma,Baylor,0.421214,0,1.0,-137,+137
...,...,...,...,...,...,...,...,...
9625,2023-11-11,The Citadel,Furman,0.518836,1,0.0,+108,-108
9632,2023-11-19,Washington St.,Utah,0.167502,0,0.0,-497,+497
9635,2023-11-10,Washington St.,UCLA,0.199388,0,1.0,-402,+402
9639,2023-11-03,Morgan St.,Howard,0.785346,1,1.0,+366,-366


In [22]:
eval_df = pd.DataFrame({"probs": probabilities, "predictions": predictions, "actual": y_test.values})
eval_df

Unnamed: 0,probs,predictions,actual
0,0.400265,0,1.0
1,0.422552,0,1.0
2,0.445844,0,0.0
3,0.522887,1,0.0
4,0.421214,0,1.0
...,...,...,...
1032,0.518836,1,0.0
1033,0.167502,0,0.0
1034,0.199388,0,1.0
1035,0.785346,1,1.0


In [23]:
above_75 = eval_df[eval_df["probs"] > 0.75]
below_25 = eval_df[eval_df["probs"] < 0.25]
# accuracy when model is pretty certain t2 is going to win
sum(above_75["predictions"] == above_75["actual"]) / len(above_75)

0.9659863945578231

In [24]:
# accuracy when model is pretty certain  t1 is going to win
sum(below_25["predictions"] == below_25["actual"]) / len(below_25)

0.8214285714285714

In [25]:
# accuracy when model is not too certain, 0.25 to 0.75
less_than_75 = eval_df[eval_df["probs"] < 0.75]
middle = less_than_75[less_than_75["probs"] > 0.25]
sum(middle["predictions"] == middle["actual"]) / len(middle)

0.6540983606557377