## Tune 2022 Model for Improved Accuracy
### - Baseline = 25/32 (78%)
### - Best of 1000 = 26/32 (81%)
### - Best of 10000 = 27/32 (84%)

In [40]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.metrics import accuracy_score
warnings.filterwarnings("ignore", 'This pattern has match groups')
warnings.filterwarnings("ignore", 'This pattern is interpreted')
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.display.max_columns = 100000

In [34]:
tourney_teams_2023 = pd.read_csv('data/2022_team_results.csv')
tourney_teams_2023['Adj_OtoD_Margin'] = tourney_teams_2023['adjoe']-tourney_teams_2023['adjde']
teams_2023_final = pd.read_csv('data/2022_fffinal.csv')
teams_2023_final = teams_2023_final.rename(columns={"TeamName": "team"})
tourney_teams_2023_join = tourney_teams_2023.merge(teams_2023_final, on="team")
championship_candidates = tourney_teams_2023_join[(tourney_teams_2023_join["Adj_OtoD_Margin"] > 25) & (tourney_teams_2023_join["TO%"] < 18)]
championship_candidates.sort_values("Adj_OtoD_Margin", ascending=False)

Unnamed: 0,rank,team,conf,record,adjoe,oe Rank,adjde,de Rank,barthag,rank.1,proj. W,Proj. L,Pro Con W,Pro Con L,Con Rec.,sos,ncsos,consos,Proj. SOS,Proj. Noncon SOS,Proj. Con SOS,elite SOS,elite noncon SOS,Opp OE,Opp DE,Opp Proj. OE,Opp Proj DE,Con Adj OE,Con Adj DE,Qual O,Qual D,Qual Barthag,Qual Games,FUN,ConPF,ConPA,ConPoss,ConOE,ConDE,ConSOSRemain,Conf Win%,WAB,WAB Rk,"Fun Rk, adjt",Fun adjt,Adj_OtoD_Margin,eFG%,eFG% Def,FTR,FTR Def,OR%,DR%,TO%,TO% Def.,3P%,3pD%,2p%,2p%D,ft%,ft%D,3P rate,3P rate D,arate,arateD
0,1,Gonzaga,WCC,26-3,121.393077,3,88.462846,4,0.974399,1,26.965166,3.034834,13,1,13-1,0.583557,0.492788,0.623477,0.583395,0.492788,0.623477,0.752212,0.777635,104.362214,99.575644,104.243616,99.525107,120.872315,88.183632,138.914015,98.487598,0.981204,12,-0.007154,1271,942,1063.55,1.195054,0.885713,0,0.928571,7.017371,8,192,72.798387,32.930231,59.4,43.2,29.7,22.2,29.0,23.0,15.9,17.0,37.9,30.7,60.9,41.6,73.1,69.5,35.4,33.9,55.0,40.6
1,2,Houston,Amer,29-5,116.970661,10,89.185771,7,0.957662,2,29.846047,5.153953,15,3,15-3,0.643936,0.56209,0.678483,0.648525,0.56209,0.678483,0.771671,0.819246,104.797261,98.680693,104.989664,98.678953,116.467094,91.355916,114.370176,97.26482,0.86565,8,-0.032147,1320,1099,1188.0375,1.111076,0.925055,0,0.833333,6.451982,12,244,64.027806,27.784889,53.6,43.5,28.7,35.1,37.8,27.3,17.1,21.7,34.1,29.0,55.1,43.5,66.9,70.5,39.1,42.9,59.5,55.4
2,3,Kansas,B12,28-6,119.854423,5,93.094649,24,0.948123,3,28.956748,6.043252,14,4,14-4,0.769667,0.628368,0.857039,0.760603,0.628368,0.857039,0.657761,0.793914,107.34916,94.803196,107.049472,94.908286,119.445183,93.929611,123.063636,97.046808,0.938851,19,0.05323,1352,1244,1254.3,1.077892,0.991788,0,0.777778,10.286754,1,65,69.120781,26.759774,54.1,46.9,32.8,27.8,33.4,28.9,17.8,18.4,35.5,30.1,54.5,47.9,72.4,70.7,33.8,34.2,53.9,47.5
7,8,Kentucky,SEC,26-7,119.339957,6,94.313452,37,0.937412,8,26.90221,7.09779,14,4,14-4,0.673246,0.47811,0.792595,0.671645,0.47811,0.792595,0.687466,0.797577,106.312806,97.448141,106.071971,97.34896,120.369675,95.318171,124.731772,97.150597,0.94654,16,0.030388,1395,1246,1248.8875,1.116994,0.997688,0,0.777778,6.954816,9,110,67.708941,25.026506,53.3,46.5,27.2,25.1,37.8,24.9,16.6,17.4,34.9,30.3,53.7,47.2,73.0,71.5,28.1,36.0,53.0,46.2


In [38]:
# Functions for running tournament simulator\n",
# Needs: High ADJOE and Low ADJDE (take care of this with margin), Low TOR\n",
# Wants: Low FTRD, High EFG, 3P_D, 2P_O\n",
# Other Notes...\n",
# Needs: ADJOE-ADJDE > 30, TOR < 16.5\n",
# Nice to have: FTRD < 31\n",
# Marg(0.61)up, TOR(0.14)*1.2down, Low FTRD(0.11)*.85down, Low 3P_D(0.08)down, High 2P_O(0.06)*0.5up\n",

def stat_scaler(df, stat, inverse_stat=False):
    stat_min = df[stat].min()
    stat_max = df[stat].max()
    if inverse_stat == False:
        scaled_data = (df[stat]-stat_min)/(stat_max-stat_min) # Normalized stat (0-1 Min-Max Scaler)
        return scaled_data
    else: 
        scaled_data = 1-((df[stat]-stat_min)/(stat_max-stat_min))  # Inversed Normalized stat (0-1 Max Min Scaler)
        return scaled_data


def multi_stat_scaler(df, stat_weights, inverse_stat_flags):
    for stat in stat_weights.columns:
        inverse_flag = inverse_stat_flags[stat][0]
        if inverse_flag == False:
            df[stat] = stat_scaler(df, stat)
        else:
            df[stat] = stat_scaler(df, stat, inverse_stat=True)
    return df
    

def team_power_rating_generator(scaled_df, team, stat_weights):
    team_df = scaled_df[scaled_df["team"].str.contains(team)]
    team_pr = 0
    for stat in stat_weights:
        team_pr += team_df[stat].iloc[0] * stat_weights[stat].iloc[0]
    return team_pr


def game_predictor(scaled_df, stat_weights, team1, team2):
    # Team 1 Power Rating
    team1_pr = team_power_rating_generator(scaled_df, team1, stat_weights)

    # Team 2 Power Rating
    team2_pr = team_power_rating_generator(scaled_df, team2, stat_weights)

    result = team1_pr > team2_pr
    margin_of_pred = abs(team1_pr - team2_pr)
    if result == True:
        #print("Winner: " + str(team1))
        return team1, margin_of_pred
    else:
        #print("Winner: " + str(team2))
        return team2, margin_of_pred

def play_round(teams_remaining, stats_df, stat_weights):
    num_games = len(teams_remaining)//2
    #print(num_games)
    winners_list = []
    margin_of_pred_list = []
    for game in range(0,num_games):
        team1_index = game*2
        team2_index = game*2+1
        team1_name = teams_remaining[team1_index]
        #print("")
        #print(team1_name)
        team2_name = teams_remaining[team2_index]
        #print(team2_name)
        winner, margin_of_pred = game_predictor(stats_df,stat_weights,team1_name,team2_name)
        winners_list.append(winner)
        margin_of_pred_list.append(margin_of_pred)
    return winners_list, margin_of_pred_list

In [51]:
# Initialize Year
year = 2023

# User
username = "tuning"

# Load Dataset
tourney_teams_2023 = pd.read_csv('data/2022_team_results.csv')
tourney_teams_2023['Adj_OtoD_Margin'] = tourney_teams_2023['adjoe']-tourney_teams_2023['adjde']
teams_2023_final = pd.read_csv('data/2022_fffinal.csv')
teams_2023_final = teams_2023_final.rename(columns={"TeamName": "team"})
tourney_teams_2023_join = tourney_teams_2023.merge(teams_2023_final, on="team")
championship_candidates = tourney_teams_2023_join[(tourney_teams_2023_join["Adj_OtoD_Margin"] > 25) & (tourney_teams_2023_join["TO%"] < 18)]
championship_candidates.sort_values("Adj_OtoD_Margin", ascending=False)

# Scale Tournament Data
stat_weights = pd.read_csv('model_weights/'+username+'_model_weights.csv')
inverse_stat_flags = pd.read_csv('model_weights/inverse_stat_flags.csv')

# Reduce df down to 5 main stats:
stat_weights = stat_weights[['Adj_OtoD_Margin', 'TO%', '3pD%', '2p%D', 'ft%D']]

best_accuracy_weights = None
best_accuracy_score = 0.84
high_accuracy_weights = []
high_accuracy_scores = []

for i in range(0, 10000):
    # Set randomly tuned weights
    adj_otod_marg = np.random.uniform(35, 55)
    to = np.random.uniform(18, 32)
    threep = np.random.uniform(2, 12)
    twopd = np.random.uniform(5, 20)
    ftd = np.random.uniform(1, 10)
    random_tuned_weights = [adj_otod_marg,to,threep,twopd,ftd]
    #random_tuned_weights = np.random.dirichlet(np.ones(5),size=1)*100
    for index, stat in enumerate(stat_weights.columns):
        #stat_weights[stat] = random_tuned_weights[0][index] # use with direchlet randomized method
        stat_weights[stat] = random_tuned_weights[index]
    scaled_stat_df = multi_stat_scaler(tourney_teams_2023_join, stat_weights, inverse_stat_flags)


    # Initialize Teams
    ## 2022
    df_round64 = ['Gonzaga','Georgia St.','Boise St.','Memphis','Connecticut','New Mexico St.','Arkansas','Vermont','Alabama','Notre Dame','Texas Tech','Montana St.','Michigan St.','Davidson','Duke','Cal St. Fullerton','Baylor','Norfolk St.','North Carolina','Marquette',"Saint Mary's",'Indiana','UCLA','Akron','Texas','Virginia Tech','Purdue','Yale','Murray St.','San Francisco','Kentucky',"Saint Peter's",'Arizona','Wright St.','Seton Hall','TCU','Houston','UAB','Illinois','Chattanooga','Colorado St.','Michigan','Tennessee','Longwood','Ohio St.','Loyola Chicago','Villanova','Delaware','Kansas','Texas Southern','San Diego St.','Creighton','Iowa','Richmond','Providence','South Dakota St.','LSU','Iowa St.','Wisconsin','Colgate','USC','Miami (FL)','Auburn','Jacksonville St.']
    df_round32 = []
    df_round16 = []
    df_round8 = []
    df_round4 = []
    df_round2 = []
    df_round1 = []
    margins64 = ["None here b/c no games played yet, go to next round"]
    margins32 = []
    margins16 = []
    margins8 = []
    margins4 = []
    margins2 = []
    margins1 = []

    """
    # 2023
    df_round68 = ["Texas A&M Corpus Chris","Southeastern Louisiana","Texas Southern","Fairleigh Dickinson","Mississippi St.","Pittsburgh","Arizona St.","Nevada"]
    df_round64 = ['Alabama','Texas A&M Corpus Chris','Maryland','West Virginia','San Diego St.','College of Charleston','Virginia','Furman',
                'Creighton','North Carolina St.','Baylor','UC Santa Barbara','Missouri','Utah St.','Arizona','Princeton',
                'Purdue','Texas Southern','Memphis','Florida Atlantic',"Duke",'Oral Roberts','Tennessee','Louisiana Lafayette','Kentucky','Providence',
                'Kansas St.','Montana St.','Michigan St.','USC','Marquette','Vermont',
                'Houston','Northern Kentucky','Iowa','Auburn','Miami FL','Drake','Indiana','Kent St.','Iowa St.','Mississippi St.',
                'Xavier','Kennesaw St.','Texas A&M','Penn St.','Texas','Colgate',
                'Kansas','Howard','Arkansas','Illinois',"Saint Mary's",'VCU','Connecticut','Iona','TCU','Nevada',
                'Gonzaga','Grand Canyon','Northwestern','Boise St.','UCLA','UNC Asheville']
    df_round32 = []
    df_round16 = []
    df_round8 = []
    df_round4 = []
    df_round2 = []
    df_round1 = []
    margins64 = ["None here b/c no games played yet, go to next round"]
    margins32 = []
    margins16 = []
    margins8 = []
    margins4 = []
    margins2 = []
    margins1 = []
    """

    # Combine list of rounds
    list_of_rounds = [df_round64,df_round32,df_round16,df_round8,df_round4,df_round2,df_round1]
    pred_win_margins = [margins64,margins32,margins16,margins8,margins4,margins2,margins1]

    # Play the tournament
    #print("This is March! Let the tournament begin...")
    inv_round_counter = 5
    for round in range(0,6):
        remaining_teams = 2**inv_round_counter
        winners, margins = play_round(list_of_rounds[round],scaled_stat_df, stat_weights)
        list_of_rounds[round+1] = winners
        winners_df = pd.DataFrame(winners)
        #winners_df = winners_df.rename(index={0: "Remaining Teams"})
        win_csv_name = "sim_output/"+str(year)+"/"+str(username)+"/roundof"+str(remaining_teams)+"_winners.csv"
        winners_df.to_csv(win_csv_name)
        pred_win_margins[round] = margins
        margins_df = pd.DataFrame(margins)
        #margins_df = margins_df.rename(index={0: "Predictor Score Margin of Victory"})
        margin_csv_name = "sim_output/"+str(year)+"/"+str(username)+"/roundof"+str(remaining_teams)+"_margins.csv"
        margins_df.to_csv(margin_csv_name)
        inv_round_counter -= 1
    champion = list_of_rounds[6][0]
    #print("")
    #print("")
    #print("***Congrats, your predicted champion is "+str(champion)+"!***")
    #print("For the results of all the rounds, check your file directory for each game's winner, or use the two cells below for quick access.")

    first_round_actuals = pd.read_csv('2022_first_round_actuals.csv')
    y_actuals = first_round_actuals['2022_actuals'].values
    y_preds = list_of_rounds[1]
    accuracy = accuracy_score(y_pred=y_preds, y_true=y_actuals)
    if accuracy > best_accuracy_score:
        high_accuracy_scores.append(accuracy)
        high_accuracy_weights.append(random_tuned_weights)
        best_accuracy_score = accuracy
        best_accuracy_weights = random_tuned_weights
    #print("")
    #print("")
    #print("Accuracy Score: "+str(accuracy))


print("Best accuracy Score: "+str(best_accuracy_score))
print("Best weights: "+str(best_accuracy_weights))

# Make simulator for weights
# Do a run for each stat being 100 once
# Then spread them out
# Maybe squeeze down to 5 stats at first Adj_OtoD_Margin, TOR, FTRD, 3P_D, 2P_O
# and cycle through these with randomized weights adding up to 100
# Also make function that assigns those random weights equalling 100

Best accuracy Score: 0.84375
Best weights: [52.21413744238761, 30.07249739129223, 4.834800559547173, 13.167447400912831, 7.757857851120745]


In [46]:
high_accuracy_scores

[]

In [47]:
high_accuracy_weights

[]

In [6]:
first_round_actuals = pd.read_csv('2022_first_round_actuals.csv')

In [11]:
first_round_actuals['2022_actuals'].values

array(['Gonzaga', 'Memphis', 'New Mexico St.', 'Arkansas', 'Notre Dame',
       'Texas Tech', 'Michigan St.', 'Duke', 'Baylor', 'North Carolina',
       "Saint Mary's", 'UCLA', 'Texas', 'Purdue', 'Murray St.',
       "Saint Peter's", 'Arizona', 'TCU', 'Houston', 'Illinois',
       'Michigan', 'Tennessee', 'Ohio St.', 'Villanova', 'Kansas',
       'Creighton', 'Richmond', 'Providence', 'Iowa St.', 'Wisconsin',
       'Miami (FL)', 'Auburn'], dtype=object)

In [43]:
y_preds

['Gonzaga',
 'Memphis',
 'Connecticut',
 'Vermont',
 'Notre Dame',
 'Texas Tech',
 'Michigan St.',
 'Duke',
 'Baylor',
 'Marquette',
 'Indiana',
 'UCLA',
 'Texas',
 'Purdue',
 'San Francisco',
 'Kentucky',
 'Arizona',
 'Seton Hall',
 'Houston',
 'Illinois',
 'Colorado St.',
 'Tennessee',
 'Ohio St.',
 'Villanova',
 'Kansas',
 'San Diego St.',
 'Iowa',
 'Providence',
 'LSU',
 'Wisconsin',
 'USC',
 'Auburn']

### Randomize weights to find upsets

In [91]:
def find_upset_weights(team1, team2):
    stat_weights = pd.read_csv('model_weights/'+username+'_model_weights.csv')
    inverse_stat_flags = pd.read_csv('model_weights/inverse_stat_flags.csv')
    # Reduce df down to 5 main stats:
    stat_weights = stat_weights[['adjoe','adjde', 'TO%', '3pD%', '2p%D', 'ft%D']]

    # Empty List for weights that predict correct upset
    upset_weights = []

    for i in range(0, 1000):
        random_tuned_weights = np.random.dirichlet(np.ones(6),size=1)*100
        for index, stat in enumerate(stat_weights.columns):
            stat_weights[stat] = random_tuned_weights[0][index] # use with direchlet randomized method
        scaled_stat_df = multi_stat_scaler(tourney_teams_2023_join, stat_weights, inverse_stat_flags)
        winner, margin_of_pred = game_predictor(scaled_stat_df, stat_weights, team1, team2)
        if winner == team2:
            upset_weights.append(random_tuned_weights)
    
    output = []
    for row in upset_weights:
        for value in row:
            output.append(value)

    upset_weight_df = pd.DataFrame(output, columns = ['adjoe','adjde','TOR','3p%D','2p%','ft%D'])
    return upset_weight_df


In [95]:
saint_peters_weights = find_upset_weights("Kentucky","Saint Peter's")

In [96]:
saint_peters_r2_weights = find_upset_weights("Murray St.","Saint Peter's")

In [97]:
saint_peters_r3_weights = find_upset_weights("Purdue","Saint Peter's")

In [98]:
nmst_weights = find_upset_weights("Connecticut", "New Mexico St.")

In [99]:
richmond_weights = find_upset_weights("Iowa", "Richmond")

In [100]:
unc_r2_weights = find_upset_weights("Baylor","North Carolina")

In [101]:
unc_r3_weights = find_upset_weights("UCLA","North Carolina")

In [102]:
unc_r5_weights = find_upset_weights("Duke","North Carolina")

In [107]:
print(saint_peters_weights.mean())
print(saint_peters_r2_weights.mean())
print(unc_r3_weights.mean())
print(unc_r5_weights.mean())


adjoe     6.042278
adjde    16.613419
TOR      24.600537
3p%D     15.685860
2p%      16.455558
ft%D     20.602349
dtype: float64
adjoe    11.253813
adjde    15.947109
TOR      17.858774
3p%D     16.745331
2p%      17.400477
ft%D     20.794496
dtype: float64
adjoe    16.594652
adjde    16.501438
TOR      16.373442
3p%D     16.811842
2p%      16.829514
ft%D     16.889113
dtype: float64
adjoe    14.934379
adjde    16.425319
TOR      16.531777
3p%D     17.437843
2p%      17.393210
ft%D     17.277472
dtype: float64


In [108]:
print(richmond_weights.mean())
print(nmst_weights.mean())

adjoe     8.583033
adjde    19.417075
TOR      16.971283
3p%D     18.250585
2p%      16.524693
ft%D     20.253331
dtype: float64
adjoe    14.233881
adjde    16.406859
TOR      17.614053
3p%D     15.736182
2p%      17.955238
ft%D     18.053788
dtype: float64


In [110]:
nmst_weights

Unnamed: 0,adjoe,adjde,TOR,3p%D,2p%,ft%D
0,16.137148,0.579506,12.106464,7.813640,30.302490,33.060752
1,16.971633,1.527421,38.681160,0.065482,30.619965,12.134339
2,18.846569,18.513178,17.921411,0.444664,6.103660,38.170518
3,0.996953,18.579981,9.915338,42.692235,21.151110,6.664383
4,5.810341,6.586283,62.350203,2.182525,8.189452,14.881196
...,...,...,...,...,...,...
436,21.780097,4.507539,4.846627,44.868776,20.376149,3.620812
437,3.124330,2.061615,1.452018,48.452146,29.445297,15.464594
438,5.048208,18.782506,1.725411,30.641423,20.417074,23.385378
439,8.773114,14.862547,11.732455,27.750078,16.264786,20.617019
