In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from sportsreference.ncaab.teams import Teams
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
# create csv file from dataset
# takes over 2 hours to download data
'''
dataset = pd.DataFrame()
teams = Teams()
for team in tqdm(teams):
    try:
        dataset = pd.concat([dataset, team.schedule.dataframe_extended])
    except:
        continue
dataset.to_csv('C:/Users/negre/Documents/GitHub/NCAA-Basketball-Predictions/data.csv',index=None)
'''

"\ndataset = pd.DataFrame()\nteams = Teams()\nfor team in tqdm(teams):\n    try:\n        dataset = pd.concat([dataset, team.schedule.dataframe_extended])\n    except:\n        continue\ndataset.to_csv('C:/Users/negre/Documents/GitHub/NCAA-Basketball-Predictions/data.csv',index=None)\n"

In [21]:
FIELDS_TO_DROP = ['date', 'location', 'away_defensive_rating', 'home_defensive_rating',
                  'away_defensive_rebound_percentage', 'home_defensive_rebound_percentage',
                  'losing_abbr', 'winner', 'winning_abbr',
                  'home_ranking', 'away_ranking']

FIELDS_TO_DROP2 = ['winning_name', 'losing_name', 'away_points', 'home_points']

# load dataset from csv file
dataset = pd.read_csv('C:/Users/negre/Documents/GitHub/NCAA-Basketball-Predictions/data.csv')

# make reduced dataset
X = dataset.drop(FIELDS_TO_DROP, 1).dropna().drop_duplicates()

# get labels from reduced dataset
y = X[['home_points', 'away_points']].values

# dataset of winning and losing team names
team_names = X[['winning_name', 'losing_name']]

# dataset of just features (labels removed)
x = X.drop(FIELDS_TO_DROP2, 1)

# check sizes of feastures and labels datasets
print(x.shape, y.shape)

# create training and test set
X_train, X_test, y_train, y_test, team_train, team_test = train_test_split(x, y, team_names)

# setup RandomForestRegressor model to use max num. of threads and verbose
parameters = {'verbose':1,
              'n_jobs':-1,
              'bootstrap': False,
              'min_samples_leaf': 3,
              'n_estimators': 50,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6}
model = RandomForestRegressor(**parameters)

# fit the model
model.fit(X_train, y_train)

(5761, 71) (5761, 2)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished


RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=1, warm_start=False)

In [68]:
# print predictions
predict_data = model.predict(X_test).astype(int)
p_dataset = pd.DataFrame({'Home Points': predict_data[:, 0], 'Away Points': predict_data[:, 1]})
display(p_dataset)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done  50 out of  50 | elapsed:    0.0s finished


Unnamed: 0,Home Points,Away Points
0,59,66
1,58,71
2,56,62
3,80,54
4,88,67
...,...,...
1436,58,67
1437,73,49
1438,88,74
1439,67,76


In [23]:
# print test data
test_data = np.concatenate((team_test, y_test), axis=1)
t_dataset = pd.DataFrame({'Winner': test_data[:, 0], 'Loser': test_data[:, 1], 'Home Points': test_data[:, 2], 'Away Points': test_data[:, 3]})
display(t_dataset)

Unnamed: 0,Winner,Loser,Home Points,Away Points
0,San Diego State,Fresno State,55,64
1,NJIT,Kennesaw State,55,76
2,North Dakota State,Texas A&M-Corpus Christi,45,57
3,Louisville,USC Upstate,76,50
4,Nicholls State,New Orleans,92,68
...,...,...,...,...
1436,Baylor,Oklahoma,54,65
1437,Virginia Tech,Coppin State,74,42
1438,Bucknell,Loyola (MD),98,83
1439,Oregon State,Wyoming,63,83


In [8]:
team_set = set()
for team in teams:
    team_set.add(team.name)

In [11]:
def inputs():
    home = None
    while home is None:
        home = input('Enter Home Team Name (Enter 1 for team list): ')
        if home == '1':
            print(sorted(team_set))
            home = None
        elif home not in team_set:
            print('Invalid Home Team Name Entered.')
            home = None
        else:
            break
            
    away = None
    while away is None:
        away = input('Enter Away Team Name (Enter 1 for team list): ')
        if away == '1':
            print(sorted(team_set))
            away = None
        elif away not in team_set:
            print('Invalid Away Team Name Entered.')
            away = None
        elif away == home:
            print('Enter a DIFFERENT team name.')
            away = None
        else:
            break
            
    return(home, away)

In [16]:
def teamObjs(home, away):
    teams = Teams()
    h_obj, a_obj = None, None
    for team in teams:
        if team.name == home:
            h_obj = team
        elif team.name == away:
            a_obj = team
    return h_obj, a_obj

In [55]:
def makePrediction(home, away):
    df = pd.DataFrame({'away_assist_percentage': away.assist_percentage, 
                       'away_assists': away.assists, 
                       'away_block_percentage': away.block_percentage, 
                       'away_blocks': away.blocks,
                       'away_defensive_rebounds': away.defensive_rebounds,
                       'away_effective_field_goal_percentage': away.effective_field_goal_percentage,
                       'away_field_goal_attempts': away.field_goal_attempts,
                       'away_field_goal_percentage': away.field_goal_percentage,
                       'away_field_goals': away.field_goals,
                       'away_free_throw_attempt_rate': away.free_throw_attempt_rate,
                       'away_free_throw_attempts': away.free_throw_attempts,
                       'away_free_throw_percentage': away.free_throw_percentage,
                       'away_free_throws': away.free_throws,
                       'away_losses': away.losses,
                       'away_minutes_played': away.minutes_played,
                       'away_offensive_rating': away.offensive_rating,
                       'away_offensive_rebound_percentage': away.offensive_rebound_percentage,
                       'away_offensive_rebounds': away.offensive_rebounds,
                       'away_personal_fouls': away.personal_fouls,
                       'away_steal_percentage': away.steal_percentage,
                       'away_steals': away.steals,
                       'away_three_point_attempt_rate': away.three_point_attempt_rate,
                       'away_three_point_field_goal_attempts': away.three_point_field_goal_attempts,
                       'away_three_point_field_goal_percentage': away.three_point_field_goal_percentage,
                       'away_three_point_field_goals': away.three_point_field_goals,
                       'away_total_rebound_percentage': away.total_rebound_percentage,
                       'away_total_rebounds': away.total_rebounds,
                       'away_true_shooting_percentage': away.true_shooting_percentage,
                       'away_turnover_percentage': away.turnover_percentage,
                       'away_turnovers': away.turnovers,
                       'away_two_point_field_goal_attempts': away.two_point_field_goal_attempts,
                       'away_two_point_field_goal_percentage': away.two_point_field_goal_percentage,
                       'away_two_point_field_goals': away.two_point_field_goals,
                       'away_win_percentage': away.win_percentage,
                       'away_wins': away.wins,
                       'home_assist_percentage': home.assist_percentage,
                       'home_assists': home.assists,
                       'home_block_percentage': home.block_percentage,
                       'home_blocks': home.blocks,
                       'home_defensive_rebounds': home.defensive_rebounds,
                       'home_effective_field_goal_percentage': home.effective_field_goal_percentage,
                       'home_field_goal_attempts': home.field_goal_attempts,
                       'home_field_goal_percentage': home.field_goal_percentage,
                       'home_field_goals': home.field_goals,
                       'home_free_throw_attempt_rate': home.free_throw_attempt_rate,
                       'home_free_throw_attempts': home.free_throw_attempts,
                       'home_free_throw_percentage': home.free_throw_percentage,
                       'home_free_throws': home.free_throws,
                       'home_losses': home.losses,
                       'home_minutes_played': home.minutes_played,
                       'home_offensive_rating': home.offensive_rating,
                       'home_offensive_rebound_percentage': home.offensive_rebound_percentage,
                       'home_offensive_rebounds': home.offensive_rebounds,
                       'home_personal_fouls': home.personal_fouls,
                       'home_steal_percentage': home.steal_percentage,
                       'home_steals': home.steals,
                       'home_three_point_attempt_rate': home.three_point_attempt_rate,
                       'home_three_point_field_goal_attempts': home.three_point_field_goal_attempts,
                       'home_three_point_field_goal_percentage': home.three_point_field_goal_percentage,
                       'home_three_point_field_goals': home.three_point_field_goals,
                       'home_total_rebound_percentage': home.total_rebound_percentage,
                       'home_total_rebounds': home.total_rebounds,
                       'home_true_shooting_percentage': home.true_shooting_percentage,
                       'home_turnover_percentage': home.turnover_percentage,
                       'home_turnovers': home.turnovers,
                       'home_two_point_field_goal_attempts': home.two_point_field_goal_attempts,
                       'home_two_point_field_goal_percentage': home.two_point_field_goal_percentage,
                       'home_two_point_field_goals': home.two_point_field_goals,
                       'home_win_percentage': home.win_percentage,
                       'home_wins': home.wins,
                       'pace': [(home.pace + away.pace) / 2]
                      })

    predict_data = np.concatenate((np.array([[home.name, away.name]]), model.predict(df).astype(int)), axis=1)
    dataset = pd.DataFrame({'Home Team': predict_data[:, 0], 'Away Team': predict_data[:, 1], 'Home Points': predict_data[:, 2], 'Away Points': predict_data[:, 3]})
    return dataset

In [56]:
home, away = inputs()
h_obj, a_obj = teamObjs(home, away)
display(makePrediction(h_obj, a_obj))

Enter Home Team Name (Enter 1 for team list): Connecticut
Enter Away Team Name (Enter 1 for team list): Duke


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done  50 out of  50 | elapsed:    0.0s finished


Unnamed: 0,Home Team,Away Team,Home Points,Away Points
0,Connecticut,Duke,92,92
