# Predicting March Madness Matchup Scores
### Using past NCAA team data to predict how a school fares in the tournament

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import random
import math
import heapq

## Creating Training Data

### Load in past tournament matchup data
**Initial Columns:** year, team1, score1, team2, score2, winner (team1/team2 with higher score)
**New Columns:** wins (number of times the winner won in the tournament), losses (number of times the winner lost in the tournament, either 0 or 1)

In [64]:
tournament = pd.read_csv('ncaa_matchups.csv')
wins = tournament.groupby(['year','winner']).count()[['team1']].rename(columns={'team1':'wins'})
wins['losses'] = np.where(wins['wins'] < 6, 1, 0)

losses1 = tournament.loc[~tournament['team1'].isin(set(wins.index.get_level_values(1)))][['year',\
                                                        'team1']].rename(columns={'team1':'team'})
losses2 = tournament.loc[~tournament['team2'].isin(set(wins.index.get_level_values(1)))][['year',\
                                                        'team2']].rename(columns={'team2':'team'})

losses = losses1.append(losses2)
losses['losses_y'] = 1
losses.head()

Unnamed: 0,year,team,losses_y
1,2018,Virginia Tech,1
19,2018,Texas Christian,1
31,2018,Missouri,1
67,2017,Southern Methodist,1
95,2017,Minnesota,1


In [65]:
test = pd.merge(tournament, wins, how='left', on=['year','winner'])
test.head()

Unnamed: 0,year,team1,score1,team2,score2,winner,wins,losses
0,2018,Villanova,87,Radford,61,Villanova,6,0
1,2018,Virginia Tech,83,Alabama,86,Alabama,1,1
2,2018,West Virginia,85,Murray State,68,West Virginia,2,1
3,2018,Wichita State,75,Marshall,81,Marshall,1,1
4,2018,Florida,77,St. Bonaventure,62,Florida,1,1


In [67]:
for yr in range(2014,2018):
    # Load data
    dfa = pd.read_csv('tm'+str(yr)+'.csv')
    dfb = pd.read_csv('opp'+str(yr)+'.csv')
    dfc = pd.read_csv('tma'+str(yr)+'.csv')
    dfd = pd.read_csv('oppa'+str(yr)+'.csv')
    df = pd.merge(pd.merge(pd.merge(dfa, dfb, how='inner', on=['school']), \
                  dfc, how='inner', on=['school']), dfd, how='inner', on=['school'])

    # Manipulate data
    cols = ['tm_pts','opp_pts','mp','fga','tpa','fta','orb','trb','ast','stl','blk','to','pf',\
          'opp_mp','opp_fga','opp_tpa','opp_fta','opp_orb','opp_trb','opp_ast','opp_stl','opp_blk','opp_to','opp_pf']
    # Normalize stats to reflect per-game averages
    df[cols] = df.apply(lambda row: row[cols]/row['gp'], axis=1)
    # Create column to store year
    df['year'] = yr
    # Subtract out tournament games when calculating each school's record
    df = pd.merge(df, wins, how='left', left_on=['year','school'], right_on=['year','winner'])
    cols = ['gp','w','aw']
    df[cols] = df.fillna(0).apply(lambda row: row[cols]-row['wins'], axis=1)
    cols = ['l','al']
    df[cols] = df.fillna(0).apply(lambda row: row[cols]-row['losses'], axis=1)
    df = pd.merge(df, losses, how='left', left_on=['year','school'], right_on=['year','team'])
    df[cols] = df.fillna(0).apply(lambda row: row[cols]-row['losses_y'], axis=1)
    # Calculate total and away records for each school
    df['rec'] = df.apply(lambda row: row['w']/(row['w']+row['l']), axis=1)
    df['arec'] = df.apply(lambda row: row['aw']/(row['aw']+row['al']), axis=1)
    # Drop unnecessary columns
    cols = ['gp','w','l','aw','al','wins','losses','losses_y','team']
    df = df.drop(cols, axis=1)

    # Store yearly dataframe
    df.to_csv('df'+str(yr)+'.csv', index=False)

# Combine yearly dataframes into a single dataframe
df = pd.read_csv('df2014.csv')
for yr in range(2015,2018):
    df_new = pd.read_csv('df'+str(yr)+'.csv')
    df = df.append(df_new)
    
df.to_csv('df.csv')
df.head()

Unnamed: 0,school,srs,sos,tm_pts,opp_pts,mp,fga,fg_pct,tpa,tp_pct,...,opp_ast_pct,opp_stl_pct,opp_blk_pct,opp_efg_pct,opp_to_pct,opp_orb_pct,opp_ft_fga,year,rec,arec
0,Abilene Christian,-19.6,-4.12,71.419355,71.870968,40.483871,53.677419,0.443,19.322581,0.402,...,53.6,9.7,12.0,0.518,17.8,30.2,0.323,2014,0.354839,0.0
1,Air Force,-4.08,1.71,66.0,69.133333,40.333333,52.033333,0.435,21.766667,0.331,...,58.6,10.5,14.1,0.499,15.6,29.4,0.302,2014,0.4,0.272727
2,Akron,1.16,-0.48,68.588235,66.941176,40.441176,54.558824,0.438,20.294118,0.346,...,46.7,9.9,9.1,0.479,16.7,32.5,0.259,2014,0.617647,0.428571
3,Alabama A&M,-13.86,-10.58,64.533333,67.0,40.5,53.366667,0.41,18.7,0.324,...,45.3,9.8,8.9,0.452,17.3,34.3,0.365,2014,0.466667,0.3125
4,Alabama-Birmingham,0.78,-0.77,73.096774,70.322581,40.483871,61.903226,0.417,16.935484,0.301,...,50.1,9.9,9.1,0.465,13.5,29.8,0.201,2014,0.580645,0.5


In [5]:
ncaa_a = pd.merge(pd.merge(tournament, df, how='left', left_on=['year','team1'], right_on=['year','school']), \
                  df, how='inner', left_on=['year','team2'], right_on=['year','school'])
cols = ['team2','score2','school_x','school_y','winner']
ncaa_a = ncaa_a.drop(cols,axis=1).rename(columns={'team1':'team','score1':'score'})

ncaa_b = pd.merge(pd.merge(tournament, df, how='left', left_on=['year','team2'], right_on=['year','school']), \
                  df, how='inner', left_on=['year','team1'], right_on=['year','school'])
cols = ['team1','score1','school_x','school_y','winner']
ncaa_b = ncaa_b.drop(cols,axis=1).rename(columns={'team2':'team','score2':'score'})

ncaa = ncaa_a.append(ncaa_b)
ncaa.to_csv('ncaa.csv')
ncaa.head()

Unnamed: 0,year,team,score,srs_x,sos_x,tm_pts_x,opp_pts_x,mp_x,fga_x,fg_pct_x,...,opp_trb_pct_y,opp_ast_pct_y,opp_stl_pct_y,opp_blk_pct_y,opp_efg_pct_y,opp_to_pct_y,opp_orb_pct_y,opp_ft_fga_y,rec_y,arec_y
0,2017,Villanova,76,23.8,9.28,77.194444,62.666667,40.0,54.111111,0.495,...,55.5,43.9,9.3,7.9,0.499,18.5,34.6,0.215,0.555556,0.421053
1,2017,Wisconsin,84,19.4,9.4,72.378378,62.378378,40.540541,57.702703,0.455,...,51.8,49.9,7.7,11.8,0.51,14.3,30.2,0.196,0.666667,0.4
2,2017,Virginia,76,20.63,10.93,66.058824,56.352941,40.588235,53.617647,0.46,...,48.4,46.2,6.8,7.4,0.526,17.8,27.6,0.283,0.828571,0.785714
3,2017,Florida,80,22.4,11.01,77.888889,66.5,40.416667,58.833333,0.45,...,46.9,47.3,9.6,9.9,0.487,19.2,27.2,0.268,0.771429,0.615385
4,2017,Southern Methodist,65,18.71,4.45,74.257143,60.0,40.0,55.257143,0.473,...,49.7,53.6,7.9,8.5,0.501,15.3,31.2,0.172,0.735294,0.555556


In [6]:
S = len(ncaa.columns)
fs = np.random.randint(2, size=S-3)
features = []
for i in range(len(fs)):
    if fs[i]: features += [i]

X = ncaa.iloc[:,3:].iloc[:,features].values
y = ncaa['score'].values

lm = LinearRegression().fit(X, y)
lm.score(X,y)
lm.predict(X)

def sum_sq_err(y_obs, y_pred):
    """
    inputs: y_obs, array of observed target values
            y_pred, array of predicted target values
    output: sse, sum of squared errors
    """
    return sum((y_obs-y_pred)**2)

def aic(y_obs, y_pred, k):
    """
    inputs: y_obs, array of observed target values
            y_pred, array of predicted target values
            k, number of features in model
    output: AIC (Akaike Information Criterion) for OLS, measure that rewards simple models
    """
    sse = sum_sq_err(y_obs, y_pred)
    n = len(y_pred)
    return 2*k + n*math.log(sse/n)

print(S)
print(sum_sq_err(y, lm.predict(X)))
print(aic(y, lm.predict(X), len(features)))

123
41532.25
2331.4712193735236


In [7]:
def selection(df_x, df_y, parents, threshold):
    """
    inputs: df_x, dataframe of dependent variable observations
            df_y, dataframe of independent variable observations
            parents, list of 'parent' vectors which determine features being used
            threshold, float on [0,1] which determines what portion of the population 'survives'
    output: portion of parents which are deemed most fit (based on AIC)
    """
    fitness = {}
    th = int(len(parents)*threshold)
    for s in parents:
        p = []
        for i in range(len(s)):
            if s[i]:
                p += [i]
        X = df_x.iloc[:,p].values
        y = df_y.values
        lm = LinearRegression().fit(X, y)
        fitness[aic(y, lm.predict(X), len(p))] = s
    best = heapq.nsmallest(th, fitness.keys())
    return [fitness[x] for x in best]

DFx = ncaa.iloc[:,3:]
DFy = ncaa['score']

N = 10
P = len(DFx.columns)
ps = np.random.randint(2, size=(N,P))
t = 0.2

#print(selection(DFx, DFy, ps, t))

def crossover(parents, O):
    """
    inputs: parents, list of 'parent' vectors which determine features being used
            O, number of offspring generated
    output: offspring, list of 'child' vectors generated from parents
    """
    if len(parents) < 2:
        return
    l = len(parents[0])
    offspring = []
    for i in range(O):
        inds = np.random.choice(len(parents), 2, replace=False)
        cutoff = np.random.randint(1,l-1)
        offspring += [np.append(parents[inds[0]][:cutoff],parents[inds[1]][cutoff:])]
    return offspring

group = selection(DFx, DFy, ps, t)
#print(crossover(group, int(N-N*t)))

def mutation(offspring, r, P):
    """
    inputs: offspring, new generation of vectors which determine features being used
            r, float on [0,1] - rate at which mutation occurs
            P, number of parameters to choose from
    output: new list of features which has some random removals and additions
    """
    if len(offspring) < 1 or r > 1 or r < 0:
        return
    mutated = []
    for c in offspring:
        m = np.copy(c)
        for i in range(P):
            # Mutate with rate, r
            if np.random.choice([True,False], 1, p=[r,1-r]):
                m[i] = 1-c[i]
        mutated += [m]
    return mutated

children = crossover(group, int(N-N*t))
print(mutation([children[0]], 1/P, 94))

[array([1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0])]


In [8]:
def evolution(df_x, df_y, G):
    """
    inputs: df_x, dataframe of dependent variable observations
            df_y, dataframe of independent variable observations
            G, number of generations
    output: most fit feature vector after G generations
    """
    N = 500
    t = 0.2
    P = len(df_x.columns)
    r = 1/P
    
    # Initialize G0 of parents (randomly select features for N parents)
    ps = np.random.randint(2, size=(N,P))
    
    # Go through evolutionary process (selection, crossover, mutation) for G generations
    gen = ps
    num_children = int(N-N*t)
    for g in range(G):
        survivors = selection(df_x, df_y, gen, t)
        children = mutation(crossover(survivors, num_children), r, P)
        gen = survivors + children
        
    best = selection(df_x, df_y, gen, 1/N)[0]
    return best

DFx = ncaa.iloc[:,3:]
DFy = ncaa['score']

fs = DFx.columns
use = evolution(DFx, DFy, 10)

features = []
for i in range(len(fs)):
    if use[i]:
        features += [fs[i]]
print(features)

with open('features2G10.txt', 'w') as f:
    for item in features:
        f.write("%s\n" % item)

['srs_x', 'tm_pts_x', 'fga_x', 'tpa_x', 'ast_x', 'opp_mp_x', 'opp_fga_x', 'opp_fta_x', 'opp_ft_pct_x', 'ortg_x', 'ts_pct_x', 'ast_pct_x', 'stl_pct_x', 'opp_ortg_x', 'opp_ftr_x', 'opp_stl_pct_x', 'rec_x', 'arec_x', 'sos_y', 'tm_pts_y', 'opp_pts_y', 'mp_y', 'trb_y', 'stl_y', 'opp_fg_pct_y', 'opp_tpa_y', 'opp_fta_y', 'opp_ft_pct_y', 'opp_orb_y', 'opp_trb_y', 'opp_ast_y', 'opp_to_y', 'pace_y', 'ortg_y', 'ftr_y', 'stl_pct_y', 'opp_pace_y', 'opp_ortg_y', 'opp_ts_pct_y']


In [9]:
features = pd.read_csv('featuresG10000.txt', header=None)

X = ncaa.iloc[:,3:][features[0]].values
y = ncaa['score'].values

lm = LinearRegression().fit(X, y)
print(lm.score(X,y))
print(len(features))
print(lm.coef_)
print(lm.intercept_)
fl = features.rename(columns={0:'X'})
fl['coef'] = lm.coef_
fl.to_csv('feature_list.csv', index=False)
fl.head()

0.4295703919581584
14
[ 2.51476437e-01  3.14554371e+00  4.40362200e+00  2.16853089e+01
 -1.07258280e+00  2.21898172e+01 -6.31847824e+00 -4.87701773e-01
  5.05480272e-01  3.29007274e-01 -1.80337840e+03 -3.03044089e+02
 -3.36340366e+02  1.83094382e+03]
-62.43409297577972


Unnamed: 0,X,coef
0,sos_x,0.251476
1,ast_x,3.145544
2,opp_mp_x,4.403622
3,tpar_x,21.685309
4,ast_pct_x,-1.072583


In [10]:
ncaa_a = pd.merge(pd.merge(tournament, df, how='left', left_on=['year','team1'], right_on=['year','school']), \
                  df, how='inner', left_on=['year','team2'], right_on=['year','school'])
cols = ['school_x','school_y']
ncaa_a = ncaa_a.drop(cols,axis=1)

fs1 = ['sos_x','ast_x','opp_mp_x','tpar_x','ast_pct_x','rec_x','arec_x', \
      'srs_y', 'opp_pts_y', 'fga_y', 'fg_pct_y', 'tp_pct_y', 'tpar_y', 'efg_pct_y']
fs2 = ['sos_y','ast_y','opp_mp_y','tpar_y','ast_pct_y','rec_y','arec_y', \
      'srs_x', 'opp_pts_x', 'fga_x', 'fg_pct_x', 'tp_pct_x', 'tpar_x', 'efg_pct_x']
# fs1 = ['srs_x','mp_x','fg_pct_x','fta_x','ast_x','opp_fga_x','opp_fg_pct_x', \
#       'opp_fta_x','opp_ast_x','opp_to_x','ftr_x','ast_pct_x','opp_ftr_x', \
#       'opp_ast_pct_x','opp_to_pct_x','rec_x','arec_x','sos_y','tm_pts_y','opp_pts_y', \
#       'fta_y','opp_fga_y','opp_orb_y','ortg_y','opp_ortg_y','opp_ftr_y','opp_to_pct_y']
# fs2 = ['srs_y','mp_y','fg_pct_y','fta_y','ast_y','opp_fga_y','opp_fg_pct_y', \
#       'opp_fta_y','opp_ast_y','opp_to_y','ftr_y','ast_pct_y','opp_ftr_y', \
#       'opp_ast_pct_y','opp_to_pct_y','rec_y','arec_y','sos_x','tm_pts_x','opp_pts_x', \
#       'fta_x','opp_fga_x','opp_orb_x','ortg_x','opp_ortg_x','opp_ftr_x','opp_to_pct_x']
coef = fl['coef'].tolist()
ncaa_a['pscore1'] = ncaa_a[fs1].mul(coef).sum(axis=1).add(lm.intercept_)
ncaa_a['pscore2'] = ncaa_a[fs2].mul(coef).sum(axis=1).add(lm.intercept_)
ncaa_a['pwinner'] = ncaa_a.apply(lambda row: row['team1'] if row['pscore1'] > row['pscore2'] else row['team2'], axis=1)
ncaa_a['correct'] = ncaa_a.apply(lambda row: 1 if row['winner'] == row['pwinner'] else 0, axis=1)

cols = ['year','team1','team2','score1','score2','pscore1','pscore2','winner','pwinner','correct']

ncaa_a[cols].to_csv('training_results.csv', index=False)
ncaa_a[cols].head()

Unnamed: 0,year,team1,team2,score1,score2,pscore1,pscore2,winner,pwinner,correct
0,2017,Villanova,Mount St. Mary's,76,56,83.458511,57.516818,Villanova,Villanova,1
1,2017,Wisconsin,Virginia Tech,84,74,81.151817,66.873004,Wisconsin,Wisconsin,1
2,2017,Virginia,North Carolina-Wilmington,76,71,77.276474,68.959442,Virginia,Virginia,1
3,2017,Florida,East Tennessee State,80,65,79.74384,68.302464,Florida,Florida,1
4,2017,Southern Methodist,Southern California,65,66,73.174194,66.349767,Southern California,Southern Methodist,0


In [11]:
# Load data
dfa = pd.read_csv('tm2018.csv')
dfb = pd.read_csv('opp2018.csv')
dfc = pd.read_csv('tma2018.csv')
dfd = pd.read_csv('oppa2018.csv')
df_test = pd.merge(pd.merge(pd.merge(dfa, dfb, how='inner', on=['school']), \
                   dfc, how='inner', on=['school']), dfd, how='inner', on=['school'])

# Manipulate data
cols = ['tm_pts','opp_pts','mp','fga','tpa','fta','orb','trb','ast','stl','blk','to','pf',\
      'opp_mp','opp_fga','opp_tpa','opp_fta','opp_orb','opp_trb','opp_ast','opp_stl','opp_blk','opp_to','opp_pf']
df_test[cols] = df_test.apply(lambda row: row[cols]/row['gp'], axis=1)
df_test['year'] = 2018
df_test = pd.merge(df_test, wins, how='left', left_on=['year','school'], right_on=['year','winner'])
cols = ['gp','w','aw']
df_test[cols] = df_test.fillna(0).apply(lambda row: row[cols]-row['wins'], axis=1)
cols = ['l','al']
df_test[cols] = df_test.fillna(0).apply(lambda row: row[cols]-row['losses'], axis=1)
df_test['rec'] = df_test.apply(lambda row: row['w']/(row['w']+row['l']), axis=1)
df_test['arec'] = df_test.apply(lambda row: row['aw']/(row['aw']+row['al']), axis=1)
cols = ['gp','w','l','aw','al','wins','losses']
df_test = df_test.drop(cols, axis=1)

df_test.to_csv('df2018.csv', index=False)

# Predict with data
ncaa_a = pd.merge(pd.merge(tournament, df_test, how='inner', left_on=['year','team1'], right_on=['year','school']), \
                  df_test, how='inner', left_on=['year','team2'], right_on=['year','school'])
cols = ['school_x','school_y']
ncaa_a = ncaa_a.drop(cols,axis=1)
ncaa_a.to_csv('test_matchups.csv')

fs1 = ['sos_x','ast_x','opp_mp_x','tpar_x','ast_pct_x','rec_x','arec_x', \
      'srs_y', 'opp_pts_y', 'fga_y', 'fg_pct_y', 'tp_pct_y', 'tpar_y', 'efg_pct_y']
fs2 = ['sos_y','ast_y','opp_mp_y','tpar_y','ast_pct_y','rec_y','arec_y', \
      'srs_x', 'opp_pts_x', 'fga_x', 'fg_pct_x', 'tp_pct_x', 'tpar_x', 'efg_pct_x']
# fs1 = ['srs_x','mp_x','fg_pct_x','fta_x','ast_x','opp_fga_x','opp_fg_pct_x', \
#       'opp_fta_x','opp_ast_x','opp_to_x','ftr_x','ast_pct_x','opp_ftr_x', \
#       'opp_ast_pct_x','opp_to_pct_x','rec_x','arec_x','sos_y','tm_pts_y','opp_pts_y', \
#       'fta_y','opp_fga_y','opp_orb_y','ortg_y','opp_ortg_y','opp_ftr_y','opp_to_pct_y']
# fs2 = ['srs_y','mp_y','fg_pct_y','fta_y','ast_y','opp_fga_y','opp_fg_pct_y', \
#       'opp_fta_y','opp_ast_y','opp_to_y','ftr_y','ast_pct_y','opp_ftr_y', \
#       'opp_ast_pct_y','opp_to_pct_y','rec_y','arec_y','sos_x','tm_pts_x','opp_pts_x', \
#       'fta_x','opp_fga_x','opp_orb_x','ortg_x','opp_ortg_x','opp_ftr_x','opp_to_pct_x']
coef = fl['coef'].tolist()
ncaa_a['pscore1'] = ncaa_a[fs1].mul(coef).sum(axis=1).add(lm.intercept_)
ncaa_a['pscore2'] = ncaa_a[fs2].mul(coef).sum(axis=1).add(lm.intercept_)
ncaa_a['pwinner'] = ncaa_a.apply(lambda row: row['team1'] if row['pscore1'] > row['pscore2'] else row['team2'], axis=1)
ncaa_a['correct'] = ncaa_a.apply(lambda row: 1 if row['winner'] == row['pwinner'] else 0, axis=1)

cols = ['year','team1','team2','score1','score2','pscore1','pscore2','winner','pwinner','correct']

ncaa_a[cols].to_csv('test_results.csv', index=False)
ncaa_a[cols].head()

Unnamed: 0,year,team1,team2,score1,score2,pscore1,pscore2,winner,pwinner,correct
0,2018,Villanova,Radford,87,61,89.680429,67.762149,Villanova,Villanova,1
1,2018,Villanova,Alabama,81,58,87.015605,70.751719,Villanova,Villanova,1
2,2018,Virginia Tech,Alabama,83,86,74.415223,74.632264,Alabama,Alabama,1
3,2018,Villanova,West Virginia,90,78,85.554818,76.490751,Villanova,Villanova,1
4,2018,Villanova,Texas Tech,71,59,81.571331,74.741819,Villanova,Villanova,1


In [12]:
teams = pd.read_csv('team_list.csv')
tl = teams['team'].tolist()

while len(tl) > 1:
    remove_list = []
    for i in range(0,len(tl),2):
        team1 = tl[i]
        team2 = tl[i+1]
        dfa = df_test.loc[df_test['school'] == team1].reset_index()
        dfb = df_test.loc[df_test['school'] == team2].reset_index()
        ncaa = pd.concat([dfa, dfb], axis=1)
        fs1 = []
        fs2 = []
        #pscore1 = ncaa[fs1].mul(coef).sum(axis=1).add(lm.intercept_)
        #pscore2 = ncaa[fs2].mul(coef).sum(axis=1).add(lm.intercept_)
        remove_list += [team1]
    for remove_elt in remove_list:
        tl.remove(remove_elt)
        
ncaa.head()

Unnamed: 0,index,school,srs,sos,tm_pts,opp_pts,mp,fga,fg_pct,tpa,...,opp_ast_pct,opp_stl_pct,opp_blk_pct,opp_efg_pct,opp_to_pct,opp_orb_pct,opp_ft_fga,year,rec,arec
0,130,Iona,0.76,-2.19,79.470588,76.529412,40.735294,61.058824,0.461,24.882353,...,59.3,7.2,9.8,0.471,17.8,32.1,0.258,2018,0.685714,0.6


In [14]:
new_tournament = pd.read_csv('matchups2019-2.csv')
dfa = pd.read_csv('tm2019.csv')
dfb = pd.read_csv('opp2019.csv')
dfc = pd.read_csv('tma2019.csv')
df = pd.merge(pd.merge(dfa, dfb, how='inner', on=['school']), \
              dfc, how='inner', on=['school'])

# Manipulate data
cols = ['tm_pts','opp_pts','mp','fga','tpa','fta','orb','trb','ast','stl','blk','to','pf',\
      'opp_mp','opp_fga','opp_tpa','opp_fta','opp_orb','opp_trb','opp_ast','opp_stl','opp_blk','opp_to','opp_pf']
df[cols] = df.apply(lambda row: row[cols]/row['gp'], axis=1)
df['year'] = 2019
df['rec'] = df.apply(lambda row: row['w']/(row['w']+row['l']), axis=1)
df['arec'] = df.apply(lambda row: row['aw']/(row['aw']+row['al']), axis=1)
cols = ['gp','w','l','aw','al']
df = df.drop(cols, axis=1)

df.to_csv('df2019.csv', index=False)

ncaa = pd.merge(pd.merge(new_tournament, df, how='left', left_on=['year','team1'], right_on=['year','school']), \
                  df, how='inner', left_on=['year','team2'], right_on=['year','school'])
cols = ['team2','school_x','school_y']
ncaa.to_csv('ncaa2019-1.csv')

# Predict with data
fs1 = ['sos_x','ast_x','opp_mp_x','tpar_x','ast_pct_x','rec_x','arec_x', \
      'srs_y', 'opp_pts_y', 'fga_y', 'fg_pct_y', 'tp_pct_y', 'tpar_y', 'efg_pct_y']
fs2 = ['sos_y','ast_y','opp_mp_y','tpar_y','ast_pct_y','rec_y','arec_y', \
      'srs_x', 'opp_pts_x', 'fga_x', 'fg_pct_x', 'tp_pct_x', 'tpar_x', 'efg_pct_x']
coef = fl['coef'].tolist()
ncaa['pscore1'] = ncaa[fs1].mul(coef).sum(axis=1).add(lm.intercept_)
ncaa['pscore2'] = ncaa[fs2].mul(coef).sum(axis=1).add(lm.intercept_)
ncaa['pwinner'] = ncaa.apply(lambda row: row['team1'] if row['pscore1'] > row['pscore2'] else row['team2'], axis=1)

cols = ['year','team1','team2','pscore1','pscore2','pwinner']

ncaa[cols].to_csv('predictions-2.csv', index=False)
ncaa[cols].head()


Unnamed: 0,year,team1,team2,pscore1,pscore2,pwinner
0,2019,Duke,Central Florida,78.895822,61.696945,Duke
1,2019,Liberty,Virginia Tech,66.072146,73.44437,Virginia Tech
2,2019,Maryland,Louisiana State,70.550721,75.477265,Louisiana State
3,2019,Minnesota,Michigan State,60.23975,76.40566,Michigan State
4,2019,Gonzaga,Baylor,79.93021,62.376221,Gonzaga
