In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, log_loss



In [2]:
test_years = [
    '201617', '201718','201819'
]
train_years = [
    '200001','200102', '200203', '200304', '200405', '200506', '200607', '200708', '200809', '200910',
    '201011', '201112', '201213','201314', '201415','201516'
]
selected = 1

# private method used in sortAllGamesBySeed
def swap(row):   
    d = {
        'season': [row.pop('season')],
        'teamAName': [row.pop('teamBName')],
        'teamASeed': [row.pop('teamBSeed')],
        'teamBName': [row.pop('teamAName')],
        'teamBSeed': [row.pop('teamASeed')],
    }
    updated_row = pd.DataFrame(data=d)
    return updated_row
    
# left = higherSeed, right = lowerSeed
def sortAllGamesBySeed(year):
    playoff_data = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")

    ## Swap columns so higher seed (lower value) on the left, lower seed (higher value) on the right
    updated_data = playoff_data.copy()
    updated_data.columns = ['season', 'teamAName', 'teamASeed', 'teamBName', 'teamBSeed']
    for index, row in updated_data.iterrows():
        if row['teamASeed'] > row['teamBSeed']:
            updated_data.loc[index,['teamAName', 'teamBName']] = updated_data.loc[index,['teamBName', 'teamAName']].values
            updated_data.loc[index,['teamASeed', 'teamBSeed']] = updated_data.loc[index,['teamBSeed', 'teamASeed']].values
    return updated_data


# break down the bracket into rounds
def breakdownBracket(updated_data):
    first_round = None
    second_round = None
    third_round = None
    fourth_round = None
    semi_final = updated_data.iloc[60 : 62]
    final = updated_data.iloc[62 : 63]
    for region_index in range(0, 60, 15): 
        first = updated_data.iloc[region_index : region_index + 8]
        first_round = first if first_round is None else pd.concat([first_round, first])

        second = updated_data.iloc[region_index + 8 : region_index + 12, :]  
        second_round = second if second_round is None else pd.concat([second_round, second])

        third = updated_data.iloc[region_index + 12 : region_index + 14]
        third_round = third if third_round is None else pd.concat([third_round, third])

        fourth = updated_data.iloc[region_index + 14 : region_index + 15]
        fourth_round = fourth if fourth_round is None else pd.concat([fourth_round, fourth])
    return first_round, second_round, third_round, fourth_round, semi_final, final


def swapGames(playoff_data):
#     playoff_data = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")

    ## Swap columns so higher seed (lower value) on the left, lower seed (higher value) on the right
    updated_data = playoff_data.copy()
    updated_data.columns = ['season', 'teamAName', 'teamASeed', 'teamBName', 'teamBSeed']
    for index, row in updated_data.iterrows():
        if row['teamASeed'] > row['teamBSeed']:
            updated_data.loc[index,['teamAName', 'teamBName']] = updated_data.loc[index,['teamBName', 'teamAName']].values
            updated_data.loc[index,['teamASeed', 'teamBSeed']] = updated_data.loc[index,['teamBSeed', 'teamASeed']].values
    return updated_data

def calculateLabels(year, games):
    playoff_results = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")
    winner_names = playoff_results['winnerName']
    labels = winner_names.copy()
    for index, row in games.iterrows():
        if row['teamAName'] == winner_names.loc[index]:
            labels.loc[index] = 1
        else:
            labels.loc[index] = 0
    return labels.to_frame()


def compareTeams(team_a_info, team_b_info):
    if len(team_a_info.columns) == len(team_b_info.columns):
        diff = team_a_info.copy()
        for col in team_a_info.columns:
            diff[col] = team_a_info[col].values - team_b_info[col].values
        return diff
    else: 
        print("Team A column length: ", len(team_A.columns))
        print("Team B column length: ", len(team_B.columns))
        print("Something is wrong in the data")
        return None

    # find difference between two teams' stats
def compareTeamsByName(teamAName, teamBName, year):
    team_A = getTeamDataByYear(year, teamAName)
    team_B = getTeamDataByYear(year, teamBName)
    if len(team_A.columns) == len(team_B.columns):
        diff = team_A.copy()
        for col in team_A.columns:
            diff[col] = team_A[col].values[0] - team_B[col].values[0]
        return diff
    else: 
        print("Team A column length: ", len(team_A.columns))
        print("Team B column length: ", len(team_B.columns))
        print("Something is wrong in the data")
        return None

def getTeamInfos(teamA, teamB, year):
    data = pd.read_excel(io="./data/formatted/basic/basicdata" + year + ".xlsx")
    team_basic_data = data.copy()
    
    team_a = teamA.copy()
    team_b = teamB.copy()

    team_a = team_a.merge(team_basic_data, left_on='teamAName', right_on='School', how='left')
    team_a = team_a.rename(columns={"teamASeed": "seed"})
    team_a.drop('teamAName', axis=1, inplace=True)
#     team_a.drop('teamASeed', axis=1, inplace=True)
    team_a.drop('School', axis=1, inplace=True)
    team_a.drop('ID', axis=1, inplace=True)
    team_a.drop('PersonalFouls', axis=1, inplace=True)

    team_b = team_b.merge(team_basic_data, left_on='teamBName', right_on='School', how='left')
    team_b = team_b.rename(columns={"teamBSeed": "seed"})
    team_b.drop('teamBName', axis=1, inplace=True)
#     team_b.drop('teamBSeed', axis=1, inplace=True)
    team_b.drop('School', axis=1, inplace=True)
    team_b.drop('ID', axis=1, inplace=True)
    team_b.drop('PersonalFouls', axis=1, inplace=True)
        
    return team_a, team_b
    
def getAllGames(years):
    all_games = None
    all_labels = None
    for year in years:
        data = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")
        playoff_data = swapGames(data.copy())
        labels = calculateLabels(year, playoff_data)
        
        team_a = playoff_data.iloc[:, 1:3]
        team_b = playoff_data.iloc[:, 3:5]
        
        team_a, team_b = getTeamInfos(team_a, team_b, year)
        
        diff_games = compareTeams(team_a, team_b)
        
        all_labels = labels if all_labels is None else pd.concat([all_labels, labels])
        all_games = diff_games if all_games is None else pd.concat([all_games, diff_games])
    
    return all_games, all_labels


def normalizeData(data): 
    updated_data = data.copy()
    for col in data.columns:
        rows = data[col].values
        min_val = min(rows)
        rows = (rows + abs(min_val))
        updated_data.update(pd.DataFrame({col: rows}))
    return updated_data


def execute(model, x_train, y_train, x_test, y_test, epochs):
    mse = list()
    logloss = list()
    score = list()
    for i in range(epochs):
        model.fit(x_train, y_train['winnerName'].tolist())
        pred = model.predict(x_test)
        pred_proba = model.predict_proba(x_test)
        mse.append(mean_squared_error(y_test, pred))
        logloss.append(log_loss(y_test, pred_proba))
        score.append(model.score(x_test, y_test))
#     print("MSE: ", mse)
#     print("Log Loss for Proba: ", log_loss(y_test, pred_proba))
#     print("Score:", model.score(x_test, y_test))
#     print('----------------------------------------')
    return model, pred, mse, logloss, score


def findWinners(predicted_labels, games):
    team_a_names = games['teamAName'].values
    team_b_names = games['teamBName'].values
    team_a_seeds = games['teamASeed'].values
    team_b_seeds = games['teamBSeed'].values
    winners = list()
    seeds = list()
    for i in range(len(predicted_labels)):
        winner = predicted_labels[i]
        if winner == 1: 
            winners.append(team_a_names[i])
            seeds.append(team_a_seeds[i])
        elif winner == 0: 
            winners.append(team_b_names[i])
            seeds.append(team_b_seeds[i])
    return winners, seeds

def makeGames(players, seeds):
    if len(players) == 1: 
        pring("Champion is", players[0])
    elif len(players) % 2 != 0: 
        print("MISSING PLAYERS")
    else:
        team_a = list()
        team_a_seed = list()
        team_b = list()
        team_b_seed = list()
        for i in range(len(players)): 
            if i % 2 == 0:
                team_a.append(players[i])
                team_a_seed.append(seeds[i])
            else: 
                team_b.append(players[i])
                team_b_seed.append(seeds[i])
        games = pd.DataFrame({
            'teamAName': team_a,
            'teamASeed': team_a_seed, 
            'teamBName': team_b, 
            'teamBSeed': team_b_seed
        })
        return games

In [3]:
x_train, y_train = getAllGames(train_years)
x_test, y_test = getAllGames([test_years[selected]])

In [4]:
# norm_x_train = normalizeData(x_train)
# norm_x_test = normalizeData(x_test)
# print(norm_x_train)

In [5]:
# gbr = GradientBoostingRegressor(n_estimators=100, max_depth=5)
rfc = RandomForestClassifier(n_estimators=64)

In [15]:
epochs = 1000
trained_model, pred_labels, mse, logloss, score = execute(rfc, x_train, y_train, x_test, y_test, epochs)

In [16]:
pd.DataFrame({
    'Name': ["max", "mean", 'median', 'min'],
    'score': [max(score), np.mean(score), np.median(score), min(score)],
    'logloss':[max(logloss), np.mean(logloss), np.median(logloss), min(logloss)],
    'mse': [max(mse), np.mean(mse), np.median(mse), min(mse)]
})

Unnamed: 0,Name,score,logloss,mse
0,max,0.873016,0.928606,0.253968
1,mean,0.812905,0.411215,0.187095
2,median,0.809524,0.409309,0.190476
3,min,0.746032,0.367234,0.126984


In [23]:
# MSE:  0.19047619047619047
# Log Loss for Proba:  0.4015518423510818
# Score: 0.8095238095238095
# ----------------------------------------
# MSE:  0.1746031746031746
# Log Loss for Proba:  0.40287693665755603
# Score: 0.8253968253968254
# print(trained_model.feature_importances_)
# print(x_train.columns.values

plt.figure()
plt.plot()

[0.02878578 0.12369566 0.01861908 0.05863103 0.07755478 0.02115154
 0.01894971 0.02395399 0.01575075 0.01843879 0.0158915  0.09873662
 0.03463489 0.07204687 0.0725971  0.02733489 0.02563662 0.02930533
 0.02722867 0.03430031 0.04987157 0.02724822 0.02646684 0.02674631
 0.02642315]
['seed' 'OverallWin' 'OverallLoss' 'SimpleRanking' 'ScheduleStrength'
 'ConferenceWin' 'ConferenceLoss' 'HomeWin' 'HomeLoss' 'AwayWin'
 'AwayLoss' 'Points' 'PointsAgainst' 'FieldGoals' 'FieldGoalsAtt' '3P'
 '3PAtt' 'FreeThrows' 'FreeThrowsAtt' 'OffensiveReb' 'TotalReb' 'Assists'
 'Steals' 'Blocks' 'TurnoverPct']


In [17]:
data = pd.read_csv("./data/playoff/playoffdata" + test_years[selected] + ".csv")
playoff_data = swapGames(data.copy())
games = breakdownBracket(playoff_data)[0]
winners = pd.DataFrame({'winnerName': data['winnerName'].tolist(), 'seed': data['winnerSeed'].tolist()})
actual_winners = breakdownBracket(winners)[0]['winnerName'].tolist()
actual_seeds = breakdownBracket(winners)[0]['seed'].tolist()
winner_names, seeds = findWinners(pred_labels[:len(games)], games)

In [10]:
win_above_seed = 0 
higher_seed_won = 0
prefer_higher_seed=0
missed = 0
for i in range(len(winner_names)):
    if winner_names[i] == actual_winners[i]: 
        if actual_seeds[i] > 8:
#             print("Win above seed!")
            win_above_seed += 1
        else:
#             print("Higher seed won")
            higher_seed_won += 1
    elif winner_names[i] != actual_winners[i]:
        if actual_seeds[i] > 8:
#             print("uh oh, model prefers higher seed")
            prefer_higher_seed +=1
        else:
#             print("Guessed: " + winner_names[i] + " Actual: " + actual_winners[i])
            missed += 1
    else: 
        print("what is this condition???")
        
print("Right: Win above seed!", win_above_seed)
print("Right: Higher seed won", higher_seed_won)
print("Wrong: uh oh, model prefers higher seed", prefer_higher_seed)
print("Wrong: missed but choose higher seed", missed)
print(len(winner_names))

Right: Win above seed! 3
Right: Higher seed won 20
Wrong: uh oh, model prefers higher seed 6
Wrong: missed but choose higher seed 3
32


In [18]:
print("PREDICT")
print(winner_names)
print('------------------------------------')
while (len(winner_names) > 1):
    games = makeGames(winner_names, seeds)
    team_a, team_b = getTeamInfos(games.iloc[:, :2], games.iloc[:, 2:], test_years[selected])
    diff = compareTeams(team_a, team_b)
    predicted_labels = trained_model.predict(diff)
    winner_names, seeds = findWinners(predicted_labels, games)
    print(winner_names)
    print('------------------------------------')


PREDICT
['Villanova', 'Alabama', 'West Virginia', 'Marshall', 'St. Bonaventure', 'Texas Tech', 'Arkansas', 'Purdue', 'Kansas', 'Seton Hall', 'Clemson', 'Auburn', 'Texas Christian', 'Michigan State', 'Rhode Island', 'Duke', 'Xavier', 'Missouri', 'Ohio State', 'North Carolina-Greensboro', 'Houston', 'Michigan', 'Texas A&M', 'North Carolina', 'Maryland-Baltimore County', 'Creighton', 'Kentucky', 'Arizona', 'Miami (FL)', 'Tennessee', 'Nevada', 'Georgia State']
------------------------------------
['Villanova', 'West Virginia', 'Texas Tech', 'Purdue', 'Kansas', 'Auburn', 'Michigan State', 'Duke', 'Xavier', 'Ohio State', 'Michigan', 'North Carolina', 'Creighton', 'Kentucky', 'Tennessee', 'Nevada']
------------------------------------
['Villanova', 'Purdue', 'Kansas', 'Duke', 'Xavier', 'Michigan', 'Kentucky', 'Nevada']
------------------------------------
['Villanova', 'Kansas', 'Michigan', 'Nevada']
------------------------------------
['Villanova', 'Michigan']
------------------------------

In [19]:
print("ACTUAL RESULT")
results = pd.read_csv("./data/playoff/playoffdata" + test_years[selected] + ".csv")
first, second, third, fourth, semi, final = breakdownBracket(results)
print(first['winnerName'].values)
print('----------')
print(second['winnerName'].values)
print('----------')
print(third['winnerName'].values)
print('----------')
print(fourth['winnerName'].values)
print('----------')
print(semi['winnerName'].values)
print('----------')
print(final['winnerName'].values)
print('----------')


ACTUAL RESULT
['Villanova' 'Alabama' 'West Virginia' 'Marshall' 'Florida' 'Texas Tech'
 'Butler' 'Purdue' 'Kansas' 'Seton Hall' 'Clemson' 'Auburn' 'Syracuse'
 'Michigan State' 'Rhode Island' 'Duke' 'Xavier' 'Florida State'
 'Ohio State' 'Gonzaga' 'Houston' 'Michigan' 'Texas A&M' 'North Carolina'
 'Maryland-Baltimore County' 'Kansas State' 'Kentucky' 'Buffalo'
 'Loyola (IL)' 'Tennessee' 'Nevada' 'Cincinnati']
----------
['Villanova' 'West Virginia' 'Texas Tech' 'Purdue' 'Kansas' 'Clemson'
 'Syracuse' 'Duke' 'Florida State' 'Gonzaga' 'Michigan' 'Texas A&M'
 'Kansas State' 'Kentucky' 'Loyola (IL)' 'Nevada']
----------
['Villanova' 'Texas Tech' 'Kansas' 'Duke' 'Florida State' 'Michigan'
 'Kansas State' 'Loyola (IL)']
----------
['Villanova' 'Kansas' 'Michigan' 'Loyola (IL)']
----------
['Villanova' 'Michigan']
----------
['Villanova']
----------
