In [61]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor

In [50]:
# Look up one team's basic season statistics
def getTeamDataByYear(year, team):
    """Return the basic-data row(s) for `team` in the given season.

    Reads ./data/formatted/basic/basicdata<year>.xlsx, keeps the rows whose
    'School' value equals `team`, and removes the 'ID' and 'School' columns
    so that only the remaining (stat) columns are returned.

    Args:
        year: season identifier appended to the workbook file name.
        team: value to match against the 'School' column.

    Returns:
        A new DataFrame with the matching row(s); it has zero rows when no
        'School' entry equals `team`.
    """
    workbook_path = "./data/formatted/basic/basicdata" + str(year) + ".xlsx"
    season_stats = pd.read_excel(io=workbook_path)
    is_requested_team = season_stats['School'] == team
    # .drop returns a new frame, so the loaded sheet itself is never modified
    return season_stats.loc[is_requested_team].drop(columns=['ID', 'School'])

def calculateLabels(year, games):
    """Build 0/1 labels telling whether the team in the A slot won each game.

    Reads the 'winnerName' column of ./data/playoff/playoffdata<year>.csv and
    compares it, row label by row label, with games['teamAName'].

    Args:
        year: season identifier appended to the playoff CSV file name.
        games: DataFrame with a 'teamAName' column whose index labels refer
            to rows of the playoff CSV (e.g. the output of
            sortAllGamesBySeed for the same year).

    Returns:
        A single-column DataFrame named 'winnerName', indexed like `games`,
        holding integer 1 where teamAName equals the recorded winner and 0
        otherwise.

    Raises:
        KeyError: if `games` contains an index label that is not a row of
            the playoff CSV.
    """
    playoff_results = pd.read_csv("./data/playoff/playoffdata" + str(year) + ".csv")
    winner_names = playoff_results['winnerName']

    # Pick the recorded winner for exactly the games we were given, so the
    # result never carries leftover team-name strings for other rows.
    winners = winner_names.loc[games.index]

    # Build a real integer column instead of overwriting a string Series
    # with 0/1 one element at a time.
    team_a_won = games['teamAName'].eq(winners)
    labels = team_a_won.astype(int).rename('winnerName')
    return labels.to_frame()
    
# helper meant for sortAllGamesBySeed (which, as written, swaps in place and does not call it)
def swap(row):
    """Return a one-row DataFrame with the A and B team fields exchanged.

    The five fields are removed from `row` with .pop(), so the caller's
    object no longer contains them afterwards.

    Args:
        row: mapping-like object supporting .pop(key) with the keys
            'season', 'teamAName', 'teamASeed', 'teamBName', 'teamBSeed'.

    Returns:
        DataFrame with columns season, teamAName, teamASeed, teamBName,
        teamBSeed, where the A columns hold the former B values and
        vice versa.
    """
    # Pop in the same order in which the output columns are filled.
    season = row.pop('season')
    new_a_name = row.pop('teamBName')
    new_a_seed = row.pop('teamBSeed')
    new_b_name = row.pop('teamAName')
    new_b_seed = row.pop('teamASeed')

    columns = ['season', 'teamAName', 'teamASeed', 'teamBName', 'teamBSeed']
    values = [season, new_a_name, new_a_seed, new_b_name, new_b_seed]
    return pd.DataFrame(data={column: [value] for column, value in zip(columns, values)})
    
# A slot = higher seed (smaller number), B slot = lower seed (larger number)
def sortAllGamesBySeed(year):
    """Load a season's playoff games with the better-seeded team in the A slot.

    Reads ./data/playoff/playoffdata<year>.csv, relabels its five columns as
    season / teamAName / teamASeed / teamBName / teamBSeed, and for every
    game where teamASeed is greater than teamBSeed exchanges the two teams'
    names and seeds. Games with equal seeds are left as they are.

    Args:
        year: season identifier appended to the playoff CSV file name.

    Returns:
        DataFrame of games, in file order, with teamASeed <= teamBSeed
        wherever the two seeds are comparable.
    """
    games = pd.read_csv("./data/playoff/playoffdata" + str(year) + ".csv")
    games.columns = ['season', 'teamAName', 'teamASeed', 'teamBName', 'teamBSeed']

    # Rows whose A slot currently holds the numerically larger seed.
    needs_swap = games['teamASeed'] > games['teamBSeed']
    if needs_swap.any():
        for left, right in (('teamAName', 'teamBName'), ('teamASeed', 'teamBSeed')):
            # .values drops the column labels so the assignment is positional;
            # otherwise pandas would align on names and undo the exchange.
            games.loc[needs_swap, [left, right]] = games.loc[needs_swap, [right, left]].values
    return games

# split the bracket rows into one frame per round
def breakdownBracket(updated_data):
    """Slice a bracket-ordered frame into its six rounds by row position.

    Rows 0-59 are read as four consecutive blocks of 15 rows. Within each
    block the first 8 rows go to the first round, the next 4 to the second,
    the next 2 to the third and the last one to the fourth. Rows 60-61 form
    the semi-final and row 62 the final. Slicing is purely positional
    (iloc), so the input must already be in that order.

    Args:
        updated_data: DataFrame with one row per game (or per label).

    Returns:
        Tuple (first_round, second_round, third_round, fourth_round,
        semi_final, final) of DataFrames that keep the original index labels.
    """
    region_starts = range(0, 60, 15)
    # (start, stop) offsets of each round inside a 15-row block
    round_offsets = [(0, 8), (8, 12), (12, 14), (14, 15)]

    first_round, second_round, third_round, fourth_round = [
        pd.concat([updated_data.iloc[start + lo : start + hi] for start in region_starts])
        for lo, hi in round_offsets
    ]

    semi_final = updated_data.iloc[60 : 62]
    final = updated_data.iloc[62 : 63]
    return first_round, second_round, third_round, fourth_round, semi_final, final
 
# find difference between two teams' stats
def compareTwoTeams(teamAName, teamBName, year):
    """Return team A's stats minus team B's stats for one season.

    Both teams are looked up with getTeamDataByYear. The result is a copy of
    team A's frame in which every column is set to
    (first team-A value) - (first team-B value) for that column.

    Args:
        teamAName: name of the team whose stats are the minuend.
        teamBName: name of the team whose stats are subtracted.
        year: season identifier passed through to getTeamDataByYear.

    Returns:
        DataFrame of per-column differences, indexed like team A's lookup
        result, or None (after printing a diagnostic) when the two lookups
        return a different number of columns.

    Raises:
        IndexError: if either team has no row in that season's data. The
            message names the team, so a spelling mismatch between data
            files is easy to spot.
    """
    team_A = getTeamDataByYear(year, teamAName)
    team_B = getTeamDataByYear(year, teamBName)

    if len(team_A.columns) != len(team_B.columns):
        print("Team A column length: ", len(team_A.columns))
        print("Team B column length: ", len(team_B.columns))
        print("Something is wrong in the data")
        return None

    # A name with no match yields a zero-row frame; without this check the
    # subtraction below fails with an opaque "index 0 is out of bounds".
    for name, stats in ((teamAName, team_A), (teamBName, team_B)):
        if len(stats) == 0:
            raise IndexError(
                "No basic data found for team {!r} in {}".format(name, year)
            )

    diff = team_A.copy()
    for col in team_A.columns:
        diff[col] = team_A[col].values[0] - team_B[col].values[0]
    return diff

# compareTwoTeams('Duke', 'Arizona', 200001)
def extractData(data, year):
    """Build the feature matrix for a set of games.

    Calls compareTwoTeams on every row's 'teamAName' / 'teamBName' and stacks
    the resulting one-game difference frames in game order.

    Args:
        data: DataFrame of games with 'teamAName' and 'teamBName' columns.
        year: season identifier passed through to compareTwoTeams.

    Returns:
        DataFrame with the stacked differences, or None when `data` has no
        rows or no game produced a comparison.
    """
    diffs = []
    for _, game in data.iterrows():
        diff = compareTwoTeams(game['teamAName'], game['teamBName'], year)
        # compareTwoTeams may return None; such games are left out.
        # NOTE: the result can then have fewer rows than `data`, so callers
        # pairing it with labels should compare lengths.
        if diff is not None:
            diffs.append(diff)

    if not diffs:
        return None
    # Concatenate once instead of re-copying the growing frame per game.
    return pd.concat(diffs)

In [64]:
# Train on the first-round games of one tournament and predict its second round.
target_year = 200001
train_data = sortAllGamesBySeed(target_year)
first, second, third, fourth, semi, final = breakdownBracket(train_data)

# Features: per-game stat differences (team A minus team B).
x_train = extractData(first, target_year)
x_test = extractData(second, target_year)

# Labels: 1 when the team in the A slot won, split into the same rounds.
train_labels = calculateLabels(target_year, train_data)
y_first, y_second, y_third, y_fourth, y_semi, y_final = breakdownBracket(train_labels)

# extractData can leave games out; fail here with a clear message rather
# than inside fit() if features and labels no longer line up.
assert x_train is not None and len(x_train) == len(y_first), \
    "first-round features and labels have different lengths"

accuracy = []
# random_state pins tie-breaking between equally good splits so that a
# Restart & Run All reproduces the same predictions.
model = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=0)
# fit() expects a 1-D target; passing the one-column frame y_first makes
# scikit-learn emit a DataConversionWarning (column_or_1d).
results = model.fit(x_train, y_first['winnerName'])

# Threshold the regression output at 0.5 to get a 0./1. win prediction.
preds = (model.predict(x_test) >= .5).astype(float)

print(preds)


[1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1.]


  y = column_or_1d(y, warn=True)


In [68]:
# Actual second-round outcomes (1 = the team in the A slot won) as a plain list.
y_test = y_second['winnerName'].tolist()
# Share of second-round games where the thresholded prediction matches the outcome.
accuracy.append((preds == y_test).mean())
print ("The accuracy is", sum(accuracy) / len(accuracy))

The accuracy is 0.875
