In [145]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, log_loss

In [80]:
# To get basic data of a team in a tournament
def getTeamDataByYear(year, team):
    basic_data = pd.read_excel(io="./data/formatted/basic/basicdata" + year + ".xlsx")
    row = basic_data.loc[basic_data['School'] == team]
    team_data = row.copy() #prevent modifying basic_data
    team_data.drop('ID', axis=1, inplace=True)
    team_data.drop('School', axis=1, inplace=True)
    team_data['teamSeed'] = getTeamSeedByYear(year, team)
    return team_data
    
def getTeamSeedByYear(year, team):
    games = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")
    first = breakdownBracket(games)[0]
    for index, row in first.iterrows():
        if (row['winnerName'] == team):
            return row['winnerSeed']
        elif (row['loserName'] == team):
            return row['loserSeed']
    print("cannot find team %s seed", team)
    return -1

def calculateLabels(year, games):
    playoff_results = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")
    winner_names = playoff_results['winnerName']
    labels = winner_names.copy()
    for index, row in games.iterrows():
        if row['teamAName'] == winner_names.loc[index]:
            labels.loc[index] = 1
        else:
            labels.loc[index] = 0
    return labels.to_frame()
    
# private method used in sortAllGamesBySeed
def swap(row):   
    d = {
        'season': [row.pop('season')],
        'teamAName': [row.pop('teamBName')],
        'teamASeed': [row.pop('teamBSeed')],
        'teamBName': [row.pop('teamAName')],
        'teamBSeed': [row.pop('teamASeed')],
    }
    updated_row = pd.DataFrame(data=d)
    return updated_row
    
# left = higherSeed, right = lowerSeed
def sortAllGamesBySeed(year):
    playoff_data = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")

    ## Swap columns so higher seed (lower value) on the left, lower seed (higher value) on the right
    updated_data = playoff_data.copy()
    updated_data.columns = ['season', 'teamAName', 'teamASeed', 'teamBName', 'teamBSeed']
    for index, row in updated_data.iterrows():
        if row['teamASeed'] > row['teamBSeed']:
            updated_data.loc[index,['teamAName', 'teamBName']] = updated_data.loc[index,['teamBName', 'teamAName']].values
            updated_data.loc[index,['teamASeed', 'teamBSeed']] = updated_data.loc[index,['teamBSeed', 'teamASeed']].values
    return updated_data

# break down the bracket into rounds
def breakdownBracket(updated_data):
    first_round = None
    second_round = None
    third_round = None
    fourth_round = None
    semi_final = updated_data.iloc[60 : 62]
    final = updated_data.iloc[62 : 63]
    for region_index in range(0, 60, 15): 
        first = updated_data.iloc[region_index : region_index + 8]
        first_round = first if first_round is None else pd.concat([first_round, first])

        second = updated_data.iloc[region_index + 8 : region_index + 12]  
        second_round = second if second_round is None else pd.concat([second_round, second])

        third = updated_data.iloc[region_index + 12 : region_index + 14]
        third_round = third if third_round is None else pd.concat([third_round, third])

        fourth = updated_data.iloc[region_index + 14 : region_index + 15]
        fourth_round = fourth if fourth_round is None else pd.concat([fourth_round, fourth])
    return first_round, second_round, third_round, fourth_round, semi_final, final
 
# find difference between two teams' stats
def compareTwoTeams(teamAName, teamBName, year):
    team_A = getTeamDataByYear(year, teamAName)
    team_B = getTeamDataByYear(year, teamBName)
    if len(team_A.columns) == len(team_B.columns):
        diff = team_A.copy()
        for col in team_A.columns:
            diff[col] = team_A[col].values[0] - team_B[col].values[0]
        return diff
    else: 
        print("Team A column length: ", len(team_A.columns))
        print("Team B column length: ", len(team_B.columns))
        print("Something is wrong in the data")
        return None

# compareTwoTeams('Duke', 'Arizona', 200001)
def extractData(data, year):
    x_train = None
    for index, game in data.iterrows():
        diff = compareTwoTeams(game['teamAName'], game['teamBName'], year)
        x_train = diff if x_train is None else pd.concat([x_train, diff])
    return x_train


In [78]:
## Train and Test with one year
target_year = '200001'
train_data = sortAllGamesBySeed(target_year)
first, second, third, fourth, semi, final = breakdownBracket(train_data)

x_train = extractData(first, target_year)
x_test = extractData(second, target_year)
    
train_labels = calculateLabels(target_year, train_data)
y_first, y_second, y_third, y_fourth, y_semi, y_final = breakdownBracket(train_labels)

accuracy = []
model = GradientBoostingRegressor(n_estimators=100, max_depth=5)
results = model.fit(x_train, y_first)
preds = model.predict(x_test)
preds[preds < .5] = 0
preds[preds >= .5] = 1

print(preds)


[9.99770907e-01 9.99770907e-01 9.99770907e-01 9.99770907e-01
 9.99770907e-01 9.99770907e-01 4.61335499e-03 3.74239163e-04
 3.74239163e-04 9.99770907e-01 3.90361294e-01 3.03136910e-01
 9.99770907e-01 9.99770907e-01 9.99770907e-01 9.00854487e-01]


  y = column_or_1d(y, warn=True)


In [77]:
y_test = y_second['winnerName'].values.tolist()
accuracy.append(np.mean(preds == y_test))
# print ("Finished iteration:", i)
print ("The accuracy is", sum(accuracy)/len(accuracy))

The accuracy is 0.875


In [165]:
#### DIFFERENT FROM ABOVE
#### USE ALL DATA TO TRAIN


def swapGames(playoff_data):
#     playoff_data = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")

    ## Swap columns so higher seed (lower value) on the left, lower seed (higher value) on the right
    updated_data = playoff_data.copy()
    updated_data.columns = ['season', 'teamAName', 'teamASeed', 'teamBName', 'teamBSeed']
    for index, row in updated_data.iterrows():
        if row['teamASeed'] > row['teamBSeed']:
            updated_data.loc[index,['teamAName', 'teamBName']] = updated_data.loc[index,['teamBName', 'teamAName']].values
            updated_data.loc[index,['teamASeed', 'teamBSeed']] = updated_data.loc[index,['teamBSeed', 'teamASeed']].values
    return updated_data

def calculateLabels(year, games):
    playoff_results = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")
    winner_names = playoff_results['winnerName']
    labels = winner_names.copy()
    for index, row in games.iterrows():
        if row['teamAName'] == winner_names.loc[index]:
            labels.loc[index] = 1
        else:
            labels.loc[index] = 0
    return labels.to_frame()


def compareTeams(team_a_info, team_b_info):
    if len(team_a_info.columns) == len(team_b_info.columns):
        diff = team_a_info.copy()
        for col in team_a_info.columns:
            diff[col] = team_a_info[col].values - team_b_info[col].values
        return diff
    else: 
        print("Team A column length: ", len(team_A.columns))
        print("Team B column length: ", len(team_B.columns))
        print("Something is wrong in the data")
        return None
    
test_years = [
    '201617', '201718', '201819'
]
train_years = [
    '200001','200102', '200203', '200304', '200405', '200506', '200607', '200708', '200809', '200910',
    '201011', '201112', '201213','201314', '201415','201516'
]

def getAllGames(years):
    all_games = None
    all_labels = None
    for year in years:
        data = pd.read_csv("./data/playoff/playoffdata" + year + ".csv")
        playoff_data = swapGames(data.copy())
        labels = calculateLabels(year, playoff_data)
    
        data = pd.read_excel(io="./data/formatted/basic/basicdata" + year + ".xlsx")
        team_basic_data = data.copy()
        
        team_a = playoff_data.iloc[:, 1:3]
        team_b = playoff_data.iloc[:, 3:5]
        
        team_a = team_a.merge(team_basic_data, left_on='teamAName', right_on='School', how='left')
        team_a = team_a.rename(columns={"teamASeed": "seed"})
        team_a.drop('teamAName', axis=1, inplace=True)
        team_a.drop('School', axis=1, inplace=True)
        team_a.drop('ID', axis=1, inplace=True)
        team_a.drop('PersonalFouls', axis=1, inplace=True)
        
        team_b = team_b.merge(team_basic_data, left_on='teamBName', right_on='School', how='left')
        team_b = team_b.rename(columns={"teamBSeed": "seed"})
        team_b.drop('teamBName', axis=1, inplace=True)
        team_b.drop('School', axis=1, inplace=True)
        team_b.drop('ID', axis=1, inplace=True)
        team_b.drop('PersonalFouls', axis=1, inplace=True)
        
        diff_games = compareTeams(team_a, team_b)
        
        all_labels = labels if all_labels is None else pd.concat([all_labels, labels])
        all_games = diff_games if all_games is None else pd.concat([all_games, diff_games])
    
    return all_games, all_labels


In [107]:
x_train, y_train = getAllGames(train_years)
x_test, y_test = getAllGames(test_years)

In [109]:
## Check to see if any value is NAN
# x_train.isna().sum()

In [None]:
model = GradientBoostingRegressor(n_estimators=100, max_depth=5)
# model = RandomForestClassifier(n_estimators=64)

In [162]:
for i in range(1000):
    results = model.fit(x_train, y_train['winnerName'].tolist())
    
preds = model.predict(x_test)
mse = mean_squared_error(y_test, preds)
print("MSE: ", mse)
print("Log Loss:", log_loss(y_test, preds))

MSE:  0.13639371952622203
Log Loss: 0.45248701856557083


In [164]:
preds[preds < .5] = 0
preds[preds >= .5] = 1
print("accurracy:", sum(preds)/len(preds))

accurracy: 0.7671957671957672
