# March Madness 2017 Prediction Pipeline - Smaller Feature Set

This notebook walks through the full prediction pipeline: preprocessing the tournament data, creating and evaluating features, building the training datasets, training the predictive models, and producing win-probability predictions for every possible game in a season's March Madness tournament. This version uses a reduced set of the available features, selected with methods such as Recursive Feature Elimination, RELIEF, and Correlation Attribute Evaluation.

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier


import warnings
warnings.filterwarnings("ignore")

Loading all of the Datasets from their respective CSV Files

In [2]:
# All inputs are read from CSV files located in the notebook's working directory.
pom = pd.read_csv('PomeroyOnlyWinrateRank.csv')
reg = pd.read_csv('RegularSeasonCompactResults.csv')
# `det` is loaded once, from the 2017 edition of the detailed results. The notebook
# previously also read 'RegularSeasonDetailedResults.csv' into `det` and then
# immediately overwrote it with this file, so that first read had no effect.
det = pd.read_csv('RegularSeasonDetailedResults2017.csv')
seasons = pd.read_csv('Seasons.csv')
teams = pd.read_csv('Teams.csv')
tcomp = pd.read_csv('TourneyCompactResults.csv')
tdet = pd.read_csv('TourneyDetailedResults.csv')
seeds = pd.read_csv('TourneySeeds.csv')
slots = pd.read_csv('TourneySlots.csv')
sub = pd.read_csv('SampleSubmission.csv')

In [3]:
# Preview the first rows of the detailed regular-season results.
det.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [4]:
# List the columns available in the detailed regular-season results.
det.columns

Index([u'Season', u'Daynum', u'Wteam', u'Wscore', u'Lteam', u'Lscore', u'Wloc',
       u'Numot', u'Wfgm', u'Wfga', u'Wfgm3', u'Wfga3', u'Wftm', u'Wfta',
       u'Wor', u'Wdr', u'Wast', u'Wto', u'Wstl', u'Wblk', u'Wpf', u'Lfgm',
       u'Lfga', u'Lfgm3', u'Lfga3', u'Lftm', u'Lfta', u'Lor', u'Ldr', u'Last',
       u'Lto', u'Lstl', u'Lblk', u'Lpf'],
      dtype='object')

Creating a seed attribute for every team. Teams that did not receive a seed for the NCAA tournament in a given season are assigned a seed of 17.

In [5]:
# This block creates a new feature called seed for each team in each season
def seed_to_int(seed):
    '''
    str -> int

    Parses the characters at positions 1 and 2 of the seed label (the two
    characters that follow the leading one) and returns them as an integer.
    '''
    digits = seed[1:3]
    return int(digits)

# Seed given to a team that has no tournament seed in a given season.
NON_TOURNEY_SEED = 17

# Numeric seeds for the 2003+ seasons. The 'Seed' guard keeps this cell re-runnable:
# on a second run the string column has already been replaced by `n_seed`.
if 'Seed' in seeds.columns:
    seeds = seeds.assign(n_seed=seeds['Seed'].apply(seed_to_int)).drop('Seed', axis=1)
past_2003 = seeds['Season'] >= 2003
seeds = seeds[past_2003].reset_index(drop=True)

stats = ['Season', 'Wteam', 'Lteam', 'Wscore', 'Lscore', 'Wfgm', 'Lfgm', 'Wfga3', 'Lfga3', 'Wfgm3', 'Lfgm3',
                   'Wftm', 'Lftm', 'Wto', 'Lto', 'Wblk', 'Lblk', 'Wpf', 'Lpf', 'Wdr', 'Ldr']

# .copy() so that adding the seed columns never writes into a slice of `det`.
det_stats = det[stats].copy()

# One row per (Season, Team); keeping the first occurrence mirrors a first-match lookup.
seed_lookup = seeds[['Season', 'Team', 'n_seed']].drop_duplicates(subset=['Season', 'Team'])

# The seeds are attached with a left merge instead of writing into the rows yielded by
# iterrows(): those rows may be copies, in which case the writes are silently lost and
# every team keeps the default seed.
for team_col, seed_col in [('Wteam', 'Wseed'), ('Lteam', 'Lseed')]:
    matched = det_stats[['Season', team_col]].merge(
        seed_lookup.rename(columns={'Team': team_col, 'n_seed': seed_col}),
        how='left', on=['Season', team_col])
    # A left merge keeps every game in its original order; teams without a seed come
    # back as NaN and receive the default. `.values` sidesteps index alignment because
    # the merge result carries a fresh 0..n-1 index.
    det_stats[seed_col] = matched[seed_col].fillna(NON_TOURNEY_SEED).astype(int).values

det['Wseed'] = det_stats['Wseed']
det['Lseed'] = det_stats['Lseed']


The following block builds the dataset used to train the model. The `range` call selects which seasons of the detailed regular-season statistics (`det`) go into the training set; its second argument is exclusive. To test a model on an earlier year's tournament, lower the second argument of `range`.

In [6]:
# Feature columns, in the order the classifier is trained on.
FEATURE_NAMES = ["score", "fgm", "fga3", "fgm3", "ftm", "to", "blk", "pf", "dr", "seed", "winrate", "rank"]
# Box-score statistics that are averaged over a team's season; the seed is taken as-is.
AVERAGED_STATS = FEATURE_NAMES[:9]
# Fixed seed so the shuffles (and therefore the Win/Lose split) are reproducible.
SHUFFLE_SEED = 42


def _team_season_vector(season_games, pom_season, team):
    """Return the 12-value feature vector of `team` for one season.

    season_games: one season of `det` rows (W*/L* stat columns plus Wseed/Lseed).
    pom_season:   the rows of `pom` for the same season.

    The vector holds the team's per-game means of AVERAGED_STATS over all of its
    games (wins and losses), then its seed, then the first two columns of its
    `pom` row once 'Season' and 'team_id' are dropped (named winrate and rank in
    FEATURE_NAMES). A team with no `pom` row gets 0 for both.
    """
    per_game = AVERAGED_STATS + ['seed']
    won = season_games.loc[season_games['Wteam'] == team, ['W' + stat for stat in per_game]]
    won.columns = per_game
    lost = season_games.loc[season_games['Lteam'] == team, ['L' + stat for stat in per_game]]
    lost.columns = per_game
    team_games = pd.concat([won, lost], axis=0)

    vector = [team_games[stat].mean() for stat in AVERAGED_STATS]
    vector.append(team_games['seed'].iloc[0])

    ratings = pom_season[pom_season['team_id'] == team].drop(['Season', 'team_id'], axis=1)
    if len(ratings) > 0:
        vector.append(ratings.values[0][0])
        vector.append(ratings.values[0][1])
    else:
        vector.append(0)
        vector.append(0)
    return np.array(vector)


def _matchup_differences(games, season_games, pom_season, first_col, second_col, cache):
    """Return one row per game: vector(team in first_col) - vector(team in second_col).

    `cache` maps team id -> feature vector, so each team's season averages are
    computed once per season instead of once per game.
    """
    differences = np.zeros([len(games), len(FEATURE_NAMES)])
    for pos, (first, second) in enumerate(zip(games[first_col], games[second_col])):
        for team in (first, second):
            if team not in cache:
                cache[team] = _team_season_vector(season_games, pom_season, team)
        differences[pos, :] = np.subtract(cache[first], cache[second])
    return differences


important_stats = ['Wteam', 'Lteam', 'Wscore', 'Lscore', 'Wfgm', 'Lfgm', 'Wfga3', 'Lfga3', 'Wfgm3', 'Lfgm3',
                   'Wftm', 'Lftm', 'Wto', 'Lto', 'Wblk', 'Lblk', 'Wpf', 'Lpf', 'Wdr', 'Ldr', 'Wseed', 'Lseed']

shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
season_frames = []
total_rows = 0
for i in range(2003, 2018):
    # Shuffle the season's games, then use the first half as 'Win' examples
    # (winner minus loser) and the second half as 'Lose' examples (loser minus winner).
    data_year = det.loc[det['Season'] == i, important_stats]
    data_year = data_year.sample(frac=1, random_state=shuffle_rng).reset_index(drop=True)
    split_index = len(data_year) // 2
    winners_year = data_year[:split_index]
    losers_year = data_year[split_index:]

    pom_year = pom[pom['Season'] == i]
    team_vectors = {}

    X_winners = pd.DataFrame(
        _matchup_differences(winners_year, data_year, pom_year, 'Wteam', 'Lteam', team_vectors),
        columns=FEATURE_NAMES)
    X_winners['Label'] = 'Win'
    X_losers = pd.DataFrame(
        _matchup_differences(losers_year, data_year, pom_year, 'Lteam', 'Wteam', team_vectors),
        columns=FEATURE_NAMES)
    X_losers['Label'] = 'Lose'

    # Combine and shuffle the Win and Lose examples of the current season.
    X_year = pd.concat([X_winners, X_losers], axis=0)
    X_year = X_year.sample(frac=1, random_state=shuffle_rng).reset_index(drop=True)
    print(X_year.shape)
    season_frames.append(X_year)
    total_rows += len(X_year)
    # Running shape of the accumulated dataset (rows so far, columns).
    print((total_rows, X_year.shape[1]))

# Concatenate once at the end instead of growing a DataFrame inside the loop.
dataset = pd.concat(season_frames, axis=0)

(4616, 13)
(4616, 13)
(4571, 13)
(9187, 13)
(4675, 13)
(13862, 13)
(4757, 13)
(18619, 13)
(5043, 13)
(23662, 13)
(5163, 13)
(28825, 13)
(5249, 13)
(34074, 13)
(5263, 13)
(39337, 13)
(5246, 13)
(44583, 13)
(5253, 13)
(49836, 13)
(5320, 13)
(55156, 13)
(5362, 13)
(60518, 13)
(5354, 13)
(65872, 13)
(5369, 13)
(71241, 13)
(5395, 13)
(76636, 13)


Trains a logistic regression classification model on the dataset.

In [7]:
def train_classifier(dataset):
    '''
    DataFrame -> fitted GridSearchCV

    Fits a logistic regression on every column of `dataset` except 'Label',
    using the 'Label' column as the target. The regularisation strength C is
    chosen by grid search over nine log-spaced values from 1e-5 to 1e3, scored
    by negative log loss. With refit=True the returned search object has been
    refitted with the best C and can be used directly for prediction.
    '''
    # `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
    X_train = dataset.drop('Label', axis=1)
    Y_train = dataset['Label']

    params = {'C': np.logspace(start=-5, stop=3, num=9)}
    search = GridSearchCV(LogisticRegression(), params, scoring='neg_log_loss', refit=True)
    return search.fit(X_train, Y_train)

# Fit the grid-searched logistic regression on the full training dataset.
clf = train_classifier(dataset)

This function grabs the team IDs and year from the submission file to apply the classifier on.

In [8]:
def get_year_t1_t2(id):
    """Split an underscore-separated ID and return its fields as a list of ints.

    For an ID of the form `year_team1_team2` the result is `[year, team1, team2]`.
    """
    fields = id.split('_')
    return list(map(int, fields))

# Parse every submission Id once, then split the parsed fields into three
# parallel lists (year, first team, second team).
parsed_ids = [get_year_t1_t2(game_id) for game_id in sub['Id']]
year = [fields[0] for fields in parsed_ids]
team_one = [fields[1] for fields in parsed_ids]
team_two = [fields[2] for fields in parsed_ids]

# One row per matchup listed in the submission file.
prediction_df = pd.DataFrame({'Year': year, 'TeamOne': team_one, 'TeamTwo': team_two})

Function that creates a test set for a given year to evaluate the model on.

In [15]:
def make_testset(season):
    '''
    int -> numpy.ndarray

    Builds the test matrix for one season from the module-level frames
    `prediction_df`, `det` and `pom`.

    The result has one row per matchup of `prediction_df` whose 'Year' equals
    `season`, in the same order, and 12 columns: the TeamOne-minus-TeamTwo
    difference of score, fgm, fga3, fgm3, ftm, to, blk, pf, dr (season means
    over all of a team's games in `det`), seed, and the first two `pom` columns
    left after dropping 'Season' and 'team_id'.

    A team without a `pom` row for the season gets 0 for both `pom` values,
    which is the same fallback the training set uses. A season without
    matchups yields an array of shape (0, 12).
    '''
    box_stats = ['score', 'fgm', 'fga3', 'fgm3', 'ftm', 'to', 'blk', 'pf', 'dr']
    per_game = box_stats + ['seed']

    # Loop-invariant inputs: filtered once rather than once per matchup.
    season_games = det[det['Season'] == season]
    season_pom = pom[pom['Season'] == season]
    matchups = prediction_df.loc[prediction_df['Year'] == season, ['TeamOne', 'TeamTwo']]

    team_vectors = {}

    def team_vector(team):
        """Return the cached 12-value season vector of `team`, computing it on first use."""
        if team not in team_vectors:
            won = season_games.loc[season_games['Wteam'] == team, ['W' + stat for stat in per_game]]
            won.columns = per_game
            lost = season_games.loc[season_games['Lteam'] == team, ['L' + stat for stat in per_game]]
            lost.columns = per_game
            games = pd.concat([won, lost], axis=0)

            vector = [games[stat].mean() for stat in box_stats]
            vector.append(games['seed'].iloc[0])

            ratings = season_pom[season_pom['team_id'] == team].drop(['Season', 'team_id'], axis=1)
            if len(ratings) > 0:
                vector.append(ratings.values[0][0])
                vector.append(ratings.values[0][1])
            else:
                vector.append(0)
                vector.append(0)
            team_vectors[team] = np.array(vector)
        return team_vectors[team]

    X_test = np.zeros([matchups.shape[0], 12])
    for pos, (team_one, team_two) in enumerate(zip(matchups['TeamOne'], matchups['TeamTwo'])):
        # Same orientation as the training examples: first team minus second team.
        X_test[pos, :] = np.subtract(team_vector(team_one), team_vector(team_two))
    return X_test

In [10]:
# Column 1 of predict_proba is taken as the probability that the first team wins.
# NOTE(review): this relies on the classifier ordering its classes as ['Lose', 'Win'];
# confirm with clf.classes_.
preds_2017 = clf.predict_proba(make_testset(2017))[:, 1]
# Item assignment always targets the 'Pred' column; attribute assignment (sub.Pred = ...)
# only does so when the column already exists.
sub['Pred'] = preds_2017
# index=False keeps the file to the submission's own columns, with no unnamed index column.
sub.to_csv('2017LogReg.csv', index=False)

Feature Selection and Importance Analysis

In [11]:
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
X_train = dataset.drop('Label', axis=1)
Y_train = dataset['Label']

model = LogisticRegression()

# Eliminating down to a single feature makes ranking_ a complete ordering of all
# features (1 = last one left). n_features_to_select is passed by keyword because
# current scikit-learn releases no longer accept it positionally.
rfe = RFE(model, n_features_to_select=1)
rfe = rfe.fit(X_train, Y_train)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
# Print the names from the training frame itself so they cannot drift out of
# sync with the order of support_ and ranking_.
print(list(X_train.columns))

[False False False False False False False False False False  True False]
[10 11  6  5  9 12  2  4  7  3  1  8]
['score', 'fgm', 'fga3', 'fgm3', 'ftm', 'to', 'blk', 'pf', 'dr', 'seed', 'winrate', 'rank']


In [12]:
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
X_train = dataset.drop('Label', axis=1)
Y_train = dataset['Label']
# Tree-ensemble importances as a second opinion on the RFE ranking. The classifier is
# not seeded, so the exact importances vary from run to run.
model = ExtraTreesClassifier()
model.fit(X_train, Y_train)
#relative importance of each attribute, printed in training-column order
print(model.feature_importances_)
print(["score", "fgm", "fga3", "fgm3", "ftm", "to"])
print(["blk", "pf", "dr", "seed", "winrate", "rank"])

[ 0.08484895  0.07699812  0.05287596  0.0550626   0.05770146  0.06420229
  0.05870672  0.05780151  0.0603426   0.07335194  0.16372104  0.19438682]
['score', 'fgm', 'fga3', 'fgm3', 'ftm', 'to']
['blk', 'pf', 'dr', 'seed', 'winrate', 'rank']


Testing a model that uses a very small subset of the original features: 6 of the 12 (fgm, blk, pf, dr, seed, and winrate).

In [18]:
def make_testset_low(season):
    '''
    int -> numpy.ndarray

    Builds the reduced (6-feature) test matrix for one season from the
    module-level frames `prediction_df`, `det` and `pom`.

    The result has one row per matchup of `prediction_df` whose 'Year' equals
    `season`, in the same order, and 6 columns: the TeamOne-minus-TeamTwo
    difference of fgm, blk, pf, dr (season means over all of a team's games in
    `det`), seed, and the first `pom` column left after dropping 'Season' and
    'team_id'.

    A team without a `pom` row for the season gets 0 for the `pom` value
    instead of raising. A season without matchups yields an array of shape (0, 6).
    '''
    low_stats = ['fgm', 'blk', 'pf', 'dr']
    per_game = low_stats + ['seed']

    # Loop-invariant inputs: filtered once rather than once per matchup.
    season_games = det[det['Season'] == season]
    season_pom = pom[pom['Season'] == season]
    matchups = prediction_df.loc[prediction_df['Year'] == season, ['TeamOne', 'TeamTwo']]

    team_vectors = {}

    def team_vector(team):
        """Return the cached 6-value season vector of `team`, computing it on first use."""
        if team not in team_vectors:
            won = season_games.loc[season_games['Wteam'] == team, ['W' + stat for stat in per_game]]
            won.columns = per_game
            lost = season_games.loc[season_games['Lteam'] == team, ['L' + stat for stat in per_game]]
            lost.columns = per_game
            games = pd.concat([won, lost], axis=0)

            vector = [games[stat].mean() for stat in low_stats]
            vector.append(games['seed'].iloc[0])

            ratings = season_pom[season_pom['team_id'] == team].drop(['Season', 'team_id'], axis=1)
            if len(ratings) > 0:
                vector.append(ratings.values[0][0])
            else:
                vector.append(0)
            team_vectors[team] = np.array(vector)
        return team_vectors[team]

    X_test = np.zeros([matchups.shape[0], 6])
    for pos, (team_one, team_two) in enumerate(zip(matchups['TeamOne'], matchups['TeamTwo'])):
        # First team minus second team, matching the orientation of the full test set.
        X_test[pos, :] = np.subtract(team_vector(team_one), team_vector(team_two))
    return X_test

In [46]:
# The six columns produced by make_testset_low, in the same order.
LOW_FEATURES = ["fgm", "blk", "pf", "dr", "seed", "winrate"]

# NOTE(review): `clf_low` was not defined anywhere in the notebook as saved, so this
# cell failed on a fresh kernel. It is reconstructed here as the same grid-searched
# logistic regression, trained on only the six low-dimensional columns -- confirm that
# this matches the model originally used.
# NOTE(review): `dataset` is built from whatever seasons the training `range` covers;
# to evaluate on 2016 without training on later data, lower that range first.
clf_low = train_classifier(dataset[LOW_FEATURES + ['Label']])

# NOTE(review): assumes `sub` (and therefore `prediction_df`) holds the 2016 matchups,
# so the predictions line up row-for-row with the submission -- verify which
# sample-submission file is loaded.
preds_2016 = clf_low.predict_proba(make_testset_low(2016))[:, 1]
# Item assignment always targets the 'Pred' column; index=False avoids writing an
# unnamed index column into the output file.
sub['Pred'] = preds_2016
sub.to_csv('2016LogisticRegressionLowerDim.csv', index=False)