In [1]:
import pandas as pd
import os
import fnmatch

Get all dataframes from root folder

In [2]:
all_df = {}
for root, dirs, filenames in os.walk("all_schedules"):
    for filename in fnmatch.filter(filenames, "*.csv"):
        try:
            all_df[filename] = pd.read_csv(os.path.join(root, filename), delimiter=",")
        except pd.errors.EmptyDataError as e:
            print(filename)

In [3]:
list(all_df.items())[0][1].columns

Index(['date', 'opponent/venue', 'result', 'sets', 'mp', 'kills', 'errors',
       'total_attacks', 'hit_pct', 'assists', 'aces', 'serr', 'digs', 'rerr',
       'b_solo', 'b_assist', 'b_error', 'pts', 'bhe', 'opponent',
       'sets_from_result', 'kills/set', 'errors/set', 'total_attacks/set',
       'assists/set', 'aces/set', 'serr/set', 'digs/set', 'b_solo/set',
       'b_assist/set', 'b_error/set', 'pts/set', 'rolling_kills/set_3',
       'rolling_kills/set_total', 'rolling_errors/set_3',
       'rolling_errors/set_total', 'rolling_total_attacks/set_3',
       'rolling_total_attacks/set_total', 'rolling_hit_pct_3',
       'rolling_hit_pct_total', 'rolling_assists/set_3',
       'rolling_assists/set_total', 'rolling_aces/set_3',
       'rolling_aces/set_total', 'rolling_serr/set_3',
       'rolling_serr/set_total', 'rolling_digs/set_3',
       'rolling_digs/set_total', 'rolling_b_solo/set_3',
       'rolling_b_solo/set_total', 'rolling_b_assist/set_3',
       'rolling_b_assist/set_to

In [4]:
# Columns of end feature set
# t1 will always end up being team first alphabetically
# rolling_*_total is total cumulative average
# rolling_*_3 is 3 game cumulative - first three games of season for every team are NA
# team names are either not included or converted to categorical codes
# result is 0 for t1, 1 for t2

master_cols = {
    "date": [],
    "t1": [],
    "t1_rolling_kills/set_total": [],
    "t1_rolling_kills/set_3": [],
    "t1_rolling_errors/set_total": [],
    "t1_rolling_errors/set_3": [],
    "t1_rolling_total_attacks/set_total": [],
    "t1_rolling_total_attacks/set_3": [],
    "t1_rolling_hit_pct_total": [],
    "t1_rolling_hit_pct_3": [],
    "t1_rolling_assists/set_total": [],
    "t1_rolling_assists/set_3": [],
    "t1_rolling_aces/set_total": [],
    "t1_rolling_aces/set_3": [],
    "t1_rolling_serr/set_total": [],
    "t1_rolling_serr/set_3": [],
    "t1_rolling_digs/set_total": [],
    "t1_rolling_digs/set_3": [],
    "t1_rolling_b_solo/set_total": [],
    "t1_rolling_b_solo/set_3": [],
    "t1_rolling_b_assist/set_total": [],
    "t1_rolling_b_assist/set_3": [],
    "t1_rolling_b_error/set_total": [],
    "t1_rolling_b_error/set_3": [],
    "t1_rolling_pts/set_total": [],
    "t1_rolling_pts/set_3": [],
    "t2": [],
    "t2_rolling_kills/set_total": [],
    "t2_rolling_kills/set_3": [],
    "t2_rolling_errors/set_total": [],
    "t2_rolling_errors/set_3": [],
    "t2_rolling_total_attacks/set_total": [],
    "t2_rolling_total_attacks/set_3": [],
    "t2_rolling_hit_pct_total": [],
    "t2_rolling_hit_pct_3": [],
    "t2_rolling_assists/set_total": [],
    "t2_rolling_assists/set_3": [],
    "t2_rolling_aces/set_total": [],
    "t2_rolling_aces/set_3": [],
    "t2_rolling_serr/set_total": [],
    "t2_rolling_serr/set_3": [],
    "t2_rolling_digs/set_total": [],
    "t2_rolling_digs/set_3": [],
    "t2_rolling_b_solo/set_total": [],
    "t2_rolling_b_solo/set_3": [],
    "t2_rolling_b_assist/set_total": [],
    "t2_rolling_b_assist/set_3": [],
    "t2_rolling_b_error/set_total": [],
    "t2_rolling_b_error/set_3": [],
    "t2_rolling_pts/set_total": [],
    "t2_rolling_pts/set_3": [],
    "result": [] #0 for t1, 1 for t2
}

In [5]:
# Collect one-sided game stats into matching games - "{date}~{team_name1}~{team_name2}"

games = {}

for key, df in all_df.items():
    team_name1 = key.split("-schedule")[0].strip()
    for i, row in df.iterrows():
        team_name2 = row["opponent"].strip()
        date = row["date"]
        res = f"{date}~{team_name1}~{team_name2}" if team_name1 > team_name2 else f"{date}~{team_name2}~{team_name1}"
        
        if res not in games:
            games[res] = []
        games[res].append((team_name1, row))

In [6]:
# Create dataframe from matched up games

for name, item in games.items():
    if len(item) != 2: continue # Don't know why there are games that don't have two teams stats
    date, t1, t2 = name.split("~")
    row1 = item[0][1] if item[0][0] == t1 else item[1][1]
    row2 = item[0][1] if item[0][0] == t2 else item[1][1]
    winner = 0 if int(row1["result"].split("-")[0].strip()) == 3 else 1
    
    # add row to dataframe - yes it is ugly
    master_cols["result"].append(winner)
    master_cols["date"].append(date)
    master_cols["t1"].append(t1)
    master_cols["t1_rolling_kills/set_total"].append(row1["rolling_kills/set_total"])
    master_cols["t1_rolling_kills/set_3"].append(row1["rolling_kills/set_3"])
    master_cols["t1_rolling_errors/set_total"].append(row1["rolling_errors/set_total"])
    master_cols["t1_rolling_errors/set_3"].append(row1["rolling_errors/set_3"])
    master_cols["t1_rolling_total_attacks/set_total"].append(row1["rolling_total_attacks/set_total"])
    master_cols["t1_rolling_total_attacks/set_3"].append(row1["rolling_total_attacks/set_3"])
    master_cols["t1_rolling_hit_pct_total"].append(row1["rolling_hit_pct_total"])
    master_cols["t1_rolling_hit_pct_3"].append(row1["rolling_hit_pct_3"])
    master_cols["t1_rolling_assists/set_total"].append(row1["rolling_assists/set_total"])
    master_cols["t1_rolling_assists/set_3"].append(row1["rolling_assists/set_3"])
    master_cols["t1_rolling_aces/set_total"].append(row1["rolling_aces/set_total"])
    master_cols["t1_rolling_aces/set_3"].append(row1["rolling_aces/set_3"])
    master_cols["t1_rolling_serr/set_total"].append(row1["rolling_serr/set_total"])
    master_cols["t1_rolling_serr/set_3"].append(row1["rolling_serr/set_3"])
    master_cols["t1_rolling_digs/set_total"].append(row1["rolling_digs/set_total"])
    master_cols["t1_rolling_digs/set_3"].append(row1["rolling_digs/set_3"])
    master_cols["t1_rolling_b_solo/set_total"].append(row1["rolling_b_solo/set_total"])
    master_cols["t1_rolling_b_solo/set_3"].append(row1["rolling_b_solo/set_3"])
    master_cols["t1_rolling_b_assist/set_total"].append(row1["rolling_b_assist/set_total"])
    master_cols["t1_rolling_b_assist/set_3"].append(row1["rolling_b_assist/set_3"])
    master_cols["t1_rolling_b_error/set_total"].append(row1["rolling_b_error/set_total"])
    master_cols["t1_rolling_b_error/set_3"].append(row1["rolling_b_error/set_3"])
    master_cols["t1_rolling_pts/set_total"].append(row1["rolling_pts/set_total"])
    master_cols["t1_rolling_pts/set_3"].append(row1["rolling_pts/set_3"])
    master_cols["t2"].append(t2)
    master_cols["t2_rolling_kills/set_total"].append(row2["rolling_kills/set_total"])
    master_cols["t2_rolling_kills/set_3"].append(row2["rolling_kills/set_3"])
    master_cols["t2_rolling_errors/set_total"].append(row2["rolling_errors/set_total"])
    master_cols["t2_rolling_errors/set_3"].append(row2["rolling_errors/set_3"])
    master_cols["t2_rolling_total_attacks/set_total"].append(row2["rolling_total_attacks/set_total"])
    master_cols["t2_rolling_total_attacks/set_3"].append(row2["rolling_total_attacks/set_3"])
    master_cols["t2_rolling_hit_pct_total"].append(row2["rolling_hit_pct_total"])
    master_cols["t2_rolling_hit_pct_3"].append(row2["rolling_hit_pct_3"])
    master_cols["t2_rolling_assists/set_total"].append(row2["rolling_assists/set_total"])
    master_cols["t2_rolling_assists/set_3"].append(row2["rolling_assists/set_3"])
    master_cols["t2_rolling_aces/set_total"].append(row2["rolling_aces/set_total"])
    master_cols["t2_rolling_aces/set_3"].append(row2["rolling_aces/set_3"])
    master_cols["t2_rolling_serr/set_total"].append(row2["rolling_serr/set_total"])
    master_cols["t2_rolling_serr/set_3"].append(row2["rolling_serr/set_3"])
    master_cols["t2_rolling_digs/set_total"].append(row2["rolling_digs/set_total"])
    master_cols["t2_rolling_digs/set_3"].append(row2["rolling_digs/set_3"])
    master_cols["t2_rolling_b_solo/set_total"].append(row2["rolling_b_solo/set_total"])
    master_cols["t2_rolling_b_solo/set_3"].append(row2["rolling_b_solo/set_3"])
    master_cols["t2_rolling_b_assist/set_total"].append(row2["rolling_b_assist/set_total"])
    master_cols["t2_rolling_b_assist/set_3"].append(row2["rolling_b_assist/set_3"])
    master_cols["t2_rolling_b_error/set_total"].append(row2["rolling_b_error/set_total"])
    master_cols["t2_rolling_b_error/set_3"].append(row2["rolling_b_error/set_3"])
    master_cols["t2_rolling_pts/set_total"].append(row2["rolling_pts/set_total"])
    master_cols["t2_rolling_pts/set_3"].append(row2["rolling_pts/set_3"])

In [7]:
master_df = pd.DataFrame(master_cols)
master_df = master_df.dropna() #lose about 7 thousand matches by doing this, could consider only using season averages
master_df["date"] = pd.to_datetime(master_df["date"])
master_df["t1_code"] = master_df["t1"].astype("category").cat.codes
master_df["t2_code"] = master_df["t2"].astype("category").cat.codes

In [8]:
master_df

Unnamed: 0,date,t1,t1_rolling_kills/set_total,t1_rolling_kills/set_3,t1_rolling_errors/set_total,t1_rolling_errors/set_3,t1_rolling_total_attacks/set_total,t1_rolling_total_attacks/set_3,t1_rolling_hit_pct_total,t1_rolling_hit_pct_3,...,t2_rolling_b_solo/set_3,t2_rolling_b_assist/set_total,t2_rolling_b_assist/set_3,t2_rolling_b_error/set_total,t2_rolling_b_error/set_3,t2_rolling_pts/set_total,t2_rolling_pts/set_3,result,t1_code,t2_code
3,2016-09-02,SMU,10.291667,13.722222,4.520833,6.027778,27.041667,36.055556,0.167250,0.223000,...,0.694444,2.775000,3.700000,0.275000,0.366667,11.487500,15.316667,1,227,10
4,2016-09-02,Arizona,10.296667,13.244444,4.486667,4.977778,30.523333,39.288889,0.153400,0.215000,...,0.444444,1.946667,2.711111,0.120000,0.133333,9.960000,12.266667,0,6,6
5,2016-09-03,Pepperdine,10.750000,13.500000,4.097222,4.911111,29.741667,35.783333,0.189667,0.245333,...,0.555556,2.294444,2.088889,0.238889,0.311111,13.825000,17.400000,1,208,10
6,2016-09-09,Sacramento St.,12.735185,14.466667,4.731481,4.505556,34.303704,38.738889,0.214444,0.256333,...,0.666667,2.919048,3.111111,0.204762,0.111111,14.135714,17.666667,1,228,10
7,2016-09-09,Texas A&M,12.556250,14.555556,4.800000,5.666667,29.306250,34.222222,0.232000,0.258333,...,1.222222,2.887500,3.333333,0.220833,0.222222,14.785417,18.000000,0,268,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60563,2022-12-08,Wisconsin,12.968817,12.055556,4.086022,2.638889,32.076882,31.666667,0.276290,0.311667,...,0.850000,4.178922,4.644444,0.539706,0.644444,16.764951,17.244444,0,325,210
60564,2022-09-01,Pacific,10.000000,13.333333,4.800000,6.400000,26.700000,35.600000,0.144500,0.192667,...,1.555556,1.125000,1.500000,0.312500,0.416667,11.104167,14.805556,0,205,47
60566,2022-10-22,San Francisco,11.865079,13.566667,5.018254,4.472222,32.993651,36.283333,0.197714,0.250333,...,0.244444,1.590909,2.122222,0.287879,0.333333,15.353788,14.277778,0,238,208
60567,2022-11-22,San Francisco,11.646552,11.444444,5.029310,5.388889,32.620115,31.916667,0.196103,0.188000,...,0.505556,1.480460,0.955556,0.287356,0.433333,15.221839,15.111111,1,238,208


In [9]:
games_2023 = master_df[master_df["date"] > "2023-01-01"]
games_2023

Unnamed: 0,date,t1,t1_rolling_kills/set_total,t1_rolling_kills/set_3,t1_rolling_errors/set_total,t1_rolling_errors/set_3,t1_rolling_total_attacks/set_total,t1_rolling_total_attacks/set_3,t1_rolling_hit_pct_total,t1_rolling_hit_pct_3,...,t2_rolling_b_solo/set_3,t2_rolling_b_assist/set_total,t2_rolling_b_assist/set_3,t2_rolling_b_error/set_total,t2_rolling_b_error/set_3,t2_rolling_pts/set_total,t2_rolling_pts/set_3,result,t1_code,t2_code
4913,2023-08-29,Iona,7.483333,9.977778,3.833333,5.111111,23.750000,31.666667,0.113000,0.150667,...,0.444444,2.541667,3.388889,0.187500,0.250000,13.583333,18.111111,1,115,106
4914,2023-09-01,UConn,8.450000,11.266667,4.266667,5.688889,24.900000,33.200000,0.121750,0.162333,...,0.111111,2.700000,3.611111,0.483333,0.805556,14.266667,18.000000,1,286,106
4915,2023-09-02,Lehigh,8.880000,10.133333,3.693333,3.688889,23.773333,26.888889,0.190000,0.259000,...,0.583333,2.309524,2.000000,0.345238,0.555556,15.500000,18.055556,0,133,106
4916,2023-09-05,Sacred Heart,8.400000,10.777778,4.866667,6.444444,25.600000,33.333333,0.118200,0.141333,...,0.538889,2.120833,1.155556,0.302083,0.000000,15.362500,17.188889,1,229,106
4917,2023-09-09,Seton Hall,11.964583,13.572222,5.570833,6.077778,33.752083,35.005556,0.165625,0.211000,...,0.288889,2.413333,2.655556,0.291667,0.166667,15.865000,16.716667,1,242,106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9636,2023-09-30,USC Upstate,11.802222,11.800000,4.627778,4.755556,33.152222,32.288889,0.204600,0.220000,...,1.050000,2.923810,4.000000,0.317857,0.000000,12.844048,15.900000,1,297,216
9637,2023-10-27,USC Upstate,11.600000,12.388889,4.909091,5.694444,34.778030,39.055556,0.186500,0.172333,...,1.366667,2.820635,2.877778,0.300794,0.083333,13.501587,14.688889,1,297,216
9638,2023-10-18,Morgan St.,9.534127,11.616667,5.919048,6.083333,29.050794,31.716667,0.119905,0.182333,...,0.416667,2.396667,1.944444,0.374167,0.000000,14.507500,15.416667,1,165,110
9639,2023-11-03,Morgan St.,9.480769,9.944444,5.917308,5.916667,29.310897,32.722222,0.117462,0.120000,...,0.366667,2.365385,2.077778,0.287821,0.000000,14.886538,14.944444,1,165,110


In [10]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
rf = RandomForestClassifier()
dc = DecisionTreeClassifier()
logi = LogisticRegression()
svc = SVC()
vc = VotingClassifier([("DTC", dc), ("LR", logi), ("SVC", svc)])

In [75]:
features = [
       't1_rolling_kills/set_total', 't1_rolling_kills/set_3',
       't1_rolling_errors/set_total', 't1_rolling_errors/set_3',
       't1_rolling_total_attacks/set_total', 't1_rolling_total_attacks/set_3',
       't1_rolling_hit_pct_total', 't1_rolling_hit_pct_3',
       't1_rolling_assists/set_total', 't1_rolling_assists/set_3',
       't1_rolling_aces/set_total', 't1_rolling_aces/set_3',
       't1_rolling_serr/set_total', 't1_rolling_serr/set_3',
       't1_rolling_digs/set_total', 't1_rolling_digs/set_3',
       't1_rolling_b_solo/set_total', 't1_rolling_b_solo/set_3',
       't1_rolling_b_assist/set_total', 't1_rolling_b_assist/set_3',
       't1_rolling_b_error/set_total', 't1_rolling_b_error/set_3',
       't1_rolling_pts/set_total', 't1_rolling_pts/set_3',
       't2_rolling_kills/set_total', 't2_rolling_kills/set_3',
       't2_rolling_errors/set_total', 't2_rolling_errors/set_3',
       't2_rolling_total_attacks/set_total', 't2_rolling_total_attacks/set_3',
       't2_rolling_hit_pct_total', 't2_rolling_hit_pct_3',
       't2_rolling_assists/set_total', 't2_rolling_assists/set_3',
       't2_rolling_aces/set_total', 't2_rolling_aces/set_3',
       't2_rolling_serr/set_total', 't2_rolling_serr/set_3',
       't2_rolling_digs/set_total', 't2_rolling_digs/set_3',
       't2_rolling_b_solo/set_total', 't2_rolling_b_solo/set_3',
       't2_rolling_b_assist/set_total', 't2_rolling_b_assist/set_3',
       't2_rolling_b_error/set_total', 't2_rolling_b_error/set_3',
       't2_rolling_pts/set_total', 't2_rolling_pts/set_3', "t1_code", "t2_code"
]

train = master_df[master_df["date"] < "2023-11-01"]
test = master_df[master_df["date"] >= "2023-11-01"]

In [12]:
from sklearn.metrics import accuracy_score, precision_score, r2_score

Voting Classifier

In [13]:
vc.fit(train[features], train["result"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
preds = vc.predict(test[features])
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
acc, prec, r2

(0.7444551591128254, 0.7450199203187251)

XGboost classifier

In [30]:
from xgboost import XGBClassifier
xgb = XGBClassifier(objective="binary:logistic")
xgb.fit(train[features], train["result"])
preds = xgb.predict(test[features])
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
acc, prec, r2

(0.7386692381870781, 0.7272727272727273, -0.04554180649289785)

Random forest classifier

In [31]:
rf.fit(train[features], train["result"])
preds = rf.predict(test[features])
acc = accuracy_score(test["result"], preds)
prec = precision_score(test["result"], preds)
r2 = r2_score(test["result"], preds)
acc, prec, r2

(0.742526518804243, 0.741106719367589, -0.030109455105548744)

Confusion matrix for random forest

In [106]:
pd.crosstab(index=preds, columns=test["result"])

result,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,386,143
1,140,368


In [3]:
team_id = {}
with open("team_id.csv", "r") as f:
    for line in f.readlines()[1:]:
        team, id = line.split(",")
        team_id[int(id.strip())] = team

In [17]:
#Function to add rolling average columns to dataframe

def rolling_avgs(df, cols, new_cols):
    for col, new_col in zip(cols, new_cols):
        df[new_col + "_3"] = df[col].rolling(3, closed="left").mean()
        df["cumsum"] = df[col].cumsum()
        df["index_val"] = range(1, len(df) + 1)
        df[new_col + "_total"] = df["cumsum"].shift(1).fillna(0) / df["index_val"]
        df.drop(["cumsum", "index_val"], axis=1, inplace=True)

In [18]:
cols = ["kills/set", "errors/set", "total_attacks/set", "hit_pct", "assists/set", "aces/set", "serr/set", "digs/set", "b_solo/set", "b_assist/set", "b_error/set", "pts/set"]
new_cols = [f"rolling_{c}" for c in cols]

for root, dirs, filenames in os.walk("all_schedules"):
    for filename in fnmatch.filter(filenames, "*.csv"):
        df = pd.read_csv(os.path.join(root, filename), delimiter=",")
        rolling_avgs(df, cols, new_cols)
        df.to_csv(os.path.join(root, filename), index=False)
        

In [19]:
bad = []
for key, l in games.items():
    if len(l) != 2:
        bad.append(key)

In [20]:
# Function to separate opponent and venue

def seperate_opp_venue(x):
    opp_ven = x["opponent/venue"]
    res = ""
    if "@" not in opp_ven:
        res = opp_ven.strip()
    elif opp_ven.startswith("@"):
        res =  opp_ven.split("@")[-1].strip()
    else:
        res = opp_ven.split("@")[0].strip()
    return res.split("(")[0]

In [21]:
# Some set data was impossible/incorrect, extract sets from result seems reliable
def sets_from_result(x):
    return sum([int(val.strip()) for val in x["result"].split("-")])

In [22]:
cols = ["kills", "errors", "total_attacks", "assists", "aces", "serr", "digs", "b_solo", "b_assist", "b_error", "pts"]
new_cols = [f"{c}/set" for c in cols]


for root, dirs, filenames in os.walk("all_schedules"):
    for filename in fnmatch.filter(filenames, "*.csv"):
        df = pd.read_csv(os.path.join(root, filename), delimiter=",")
        df["sets_from_result"] = df.apply(lambda x: sum([int(val.strip()) for val in x["result"].split("-")]), 1)
        for col, new_col in zip(cols, new_cols):
            df[new_col] = df.apply(lambda x: x[col] / x["sets_from_result"], 1)
        df.to_csv(os.path.join(root, filename), index=False)

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

In [76]:
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Align columns of train and test sets
train, test = train.align(test, join='inner', axis=1)

In [77]:
scaler = StandardScaler()

# Select only numeric columns for scaling
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Exclude the target variable 'result' and other non-numeric columns
numeric_cols = [col for col in numeric_cols if col != 'result']

# Apply scaling
train[numeric_cols] = scaler.fit_transform(train[numeric_cols])
test[numeric_cols] = scaler.transform(test[numeric_cols])


In [78]:
X_train = train.drop('result', axis=1)
y_train = train['result']
X_test = test.drop('result', axis=1)
y_test = test['result']

In [80]:
X_train['year'] = X_train['date'].dt.year
X_train['month'] = X_train['date'].dt.month


X_test['year'] = X_test['date'].dt.year
X_test['month'] = X_test['date'].dt.month


X_train = X_train.drop(['date'], axis=1)  # drop the original date column
X_test = X_test.drop(['date'], axis=1)

X_train = X_train.fillna(0)
y_train = y_train.fillna(0)

# Handle infinite values
X_train.replace([np.inf, -np.inf], 0, inplace=True)
y_train.replace([np.inf, -np.inf], 0, inplace=True)

X_train = X_train.astype('float32')
y_train = y_train.astype('float32')

X_test = X_test.fillna(0)
y_test = y_test.fillna(0)

# Handle infinite values
X_test.replace([np.inf, -np.inf], 0, inplace=True)
y_test.replace([np.inf, -np.inf], 0, inplace=True)

X_test = X_test.astype('float32')
y_test = y_test.astype('float32')

In [98]:
import tensorflow as tf
from tensorflow.keras import layers

# Define the model
model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [99]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [97]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.746383786201477
