In [1]:
import sqlalchemy
import pandas as pd
from geopy.geocoders import MapBox
from geopy.distance import geodesic

In [2]:
engine = sqlalchemy.create_engine('sqlite:///db/ncaa_history.sqlite')
connection = engine.connect()

## Import SQL Database Tables

In [3]:
games_df = pd.read_sql_table('games_all', engine)
games_df.head()

Unnamed: 0,location,loser,loser_score,loser_seed,round,winner,winner_score,winner_seed,year
0,"Pittsburgh, PA",Radford,61,16,1,Villanova,87,1,2018
1,"Pittsburgh, PA",Virginia Tech,83,8,1,Alabama,86,9,2018
2,"San Diego, CA",Murray State,68,12,1,West Virginia,85,5,2018
3,"San Diego, CA",Wichita State,75,4,1,Marshall,81,13,2018
4,"Dallas, TX",St. Bonaventure,62,11,1,Florida,77,6,2018


In [4]:
games_df = games_df[['year', 'round', 'location', 'winner', 'winner_seed', 'winner_score', 'loser', 'loser_seed', 'loser_score']]
games_df.head()

Unnamed: 0,year,round,location,winner,winner_seed,winner_score,loser,loser_seed,loser_score
0,2018,1,"Pittsburgh, PA",Villanova,1,87,Radford,16,61
1,2018,1,"Pittsburgh, PA",Alabama,9,86,Virginia Tech,8,83
2,2018,1,"San Diego, CA",West Virginia,5,85,Murray State,12,68
3,2018,1,"San Diego, CA",Marshall,13,81,Wichita State,4,75
4,2018,1,"Dallas, TX",Florida,6,77,St. Bonaventure,11,62


In [5]:
schools_df = pd.read_sql_table('schools', engine)
schools_df.head()

Unnamed: 0,location,logo,name,url
0,"Abilene, Texas",https://d2p3bygnnzw9w3.cloudfront.net/req/2019...,Abilene Christian Wildcats,https://www.sports-reference.com/cbb/schools/a...
1,"USAF Academy, Colorado",https://d2p3bygnnzw9w3.cloudfront.net/req/2019...,Air Force,https://www.sports-reference.com/cbb/schools/a...
2,"Akron, Ohio",https://d2p3bygnnzw9w3.cloudfront.net/req/2019...,Akron,https://www.sports-reference.com/cbb/schools/a...
3,"Normal, Alabama",https://d2p3bygnnzw9w3.cloudfront.net/req/2019...,Alabama A&M Bulldogs,https://www.sports-reference.com/cbb/schools/a...
4,"Tuscaloosa, Alabama",https://d2p3bygnnzw9w3.cloudfront.net/req/2019...,Alabama,https://www.sports-reference.com/cbb/schools/a...


In [6]:
school_locations = schools_df.loc[:,['location','name']]
school_locations.head()

Unnamed: 0,location,name
0,"Abilene, Texas",Abilene Christian Wildcats
1,"USAF Academy, Colorado",Air Force
2,"Akron, Ohio",Akron
3,"Normal, Alabama",Alabama A&M Bulldogs
4,"Tuscaloosa, Alabama",Alabama


In [7]:
teams_df = pd.read_sql_table('teams_all', engine)
teams_df.head()

Unnamed: 0,name,opp_g,opp_mp,opp_opp_ast,opp_opp_blk,opp_opp_drb,opp_opp_fg,opp_opp_fg2,opp_opp_fg2_pct,opp_opp_fg2a,...,team_rank_pts,team_rank_pts_per_g,team_rank_stl,team_rank_tov,team_rank_trb,team_stl,team_tov,team_trb,url,year
0,Villanova,40,8075,532,105,928,1028,756,0.49,1543,...,1,1,28,182,8,259,426,1436,https://www.sports-reference.com/cbb/schools/v...,2018
1,Radford,36,7325,413,120,818,819,558,0.484,1152,...,169,312,97,223,93,226,441,1244,https://www.sports-reference.com/cbb/schools/r...,2018
2,Virginia Tech,33,6675,460,124,784,844,552,0.492,1123,...,75,46,213,92,262,191,395,1078,https://www.sports-reference.com/cbb/schools/v...,2018
3,Alabama,36,7200,427,134,863,877,615,0.469,1310,...,90,217,90,346,61,228,513,1290,https://www.sports-reference.com/cbb/schools/a...,2018
4,West Virginia,37,7425,462,148,893,835,547,0.454,1204,...,13,45,5,161,13,301,420,1401,https://www.sports-reference.com/cbb/schools/w...,2018


## School/Game Locations

In [8]:
# merge games table and school locations to start games_stats table
game_stats = games_df.merge(school_locations, how='left', left_on='winner', right_on='name', suffixes = ('','_winner'))
game_stats = game_stats.merge(school_locations, how='left', left_on='loser', right_on='name', suffixes = ('','_loser'))
del game_stats['name']
del game_stats['name_loser']
game_stats.head()

Unnamed: 0,year,round,location,winner,winner_seed,winner_score,loser,loser_seed,loser_score,location_winner,location_loser
0,2018,1,"Pittsburgh, PA",Villanova,1,87,Radford,16,61,"Villanova, Pennsylvania","Radford, Virginia"
1,2018,1,"Pittsburgh, PA",Alabama,9,86,Virginia Tech,8,83,"Tuscaloosa, Alabama","Blacksburg, Virginia"
2,2018,1,"San Diego, CA",West Virginia,5,85,Murray State,12,68,"Morgantown, West Virginia","Murray, Kentucky"
3,2018,1,"San Diego, CA",Marshall,13,81,Wichita State,4,75,"Huntington, West Virginia","Wichita, Kansas"
4,2018,1,"Dallas, TX",Florida,6,77,St. Bonaventure,11,62,"Gainesville, Florida","St. Bonaventure, New York"


In [9]:
geolocator = MapBox(api_key="pk.eyJ1Ijoib2JuaWNob2xzb24iLCJhIjoiY2pwcHBrbmIxMGdhMTN4cWZ2czR6NDVwcCJ9.T5lnDc1uaxKgp4S18rFyBw", timeout=None)

In [10]:
def get_latlons(column):
    counter = 0
    locations = []
    latlons = []

    for location in column:
        geolocation = geolocator.geocode(location)
        locations.append(geolocation)
        counter +=1
        if (counter % 100 == 0):
            print(counter)

    for location in locations:
        latlon = (location.latitude, location.longitude)
        latlons.append(latlon)
    
    return latlons

In [11]:
game_locations = get_latlons(game_stats['location'])
winner_locations = get_latlons(game_stats['location_winner'])
loser_locations = get_latlons(game_stats['location_loser'])

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800


In [12]:
def get_travel_distances(school_location_list):
    travel_distances = []
    
    for index, school_location in enumerate(school_location_list):
        game_location = game_locations[index]
        travel_distance = geodesic(school_location, game_location).miles
        travel_distances.append(travel_distance)
        
    return travel_distances

In [13]:
winner_travel_distances = get_travel_distances(winner_locations)
loser_travel_distances = get_travel_distances(loser_locations)

In [14]:
pd.options.mode.chained_assignment = None

game_stats['travel_miles_winner'] = winner_travel_distances
game_stats['travel_miles_loser'] = loser_travel_distances
game_stats.head()

Unnamed: 0,year,round,location,winner,winner_seed,winner_score,loser,loser_seed,loser_score,location_winner,location_loser,travel_miles_winner,travel_miles_loser
0,2018,1,"Pittsburgh, PA",Villanova,1,87,Radford,16,61,"Villanova, Pennsylvania","Radford, Virginia",246.946684,230.496683
1,2018,1,"Pittsburgh, PA",Alabama,9,86,Virginia Tech,8,83,"Tuscaloosa, Alabama","Blacksburg, Virginia",645.478758,222.738124
2,2018,1,"San Diego, CA",West Virginia,5,85,Murray State,12,68,"Morgantown, West Virginia","Murray, Kentucky",2117.874565,1658.405787
3,2018,1,"San Diego, CA",Marshall,13,81,Wichita State,4,75,"Huntington, West Virginia","Wichita, Kansas",1982.540076,1170.375993
4,2018,1,"Dallas, TX",Florida,6,77,St. Bonaventure,11,62,"Gainesville, Florida","St. Bonaventure, New York",882.710411,1190.429282


## Create Losing/Winning Records with Season Stats

In [15]:
winners_df = game_stats.rename(columns={'winner':'team_name', 'winner_seed':'team_seed', 'winner_score':'team_score', 
                                       'loser':'opponent_name', 'loser_seed':'opponent_seed', 'loser_score':'opponent_score', 
                                       'location_winner':'team_location', 'location_loser':'opponent_location', 
                                       'travel_miles_winner':'team_travel_miles', 'travel_miles_loser':'opponent_travel_miles'})
winners_df['outcome'] = 'WIN'
winners_df = winners_df[['year', 'round', 'location', 'team_name', 'team_seed', 'team_score', 'team_location', 
                         'team_travel_miles', 'opponent_name', 'opponent_seed', 'opponent_score', 'opponent_location', 
                         'opponent_travel_miles', 'outcome']]
winners_df.head()

Unnamed: 0,year,round,location,team_name,team_seed,team_score,team_location,team_travel_miles,opponent_name,opponent_seed,opponent_score,opponent_location,opponent_travel_miles,outcome
0,2018,1,"Pittsburgh, PA",Villanova,1,87,"Villanova, Pennsylvania",246.946684,Radford,16,61,"Radford, Virginia",230.496683,WIN
1,2018,1,"Pittsburgh, PA",Alabama,9,86,"Tuscaloosa, Alabama",645.478758,Virginia Tech,8,83,"Blacksburg, Virginia",222.738124,WIN
2,2018,1,"San Diego, CA",West Virginia,5,85,"Morgantown, West Virginia",2117.874565,Murray State,12,68,"Murray, Kentucky",1658.405787,WIN
3,2018,1,"San Diego, CA",Marshall,13,81,"Huntington, West Virginia",1982.540076,Wichita State,4,75,"Wichita, Kansas",1170.375993,WIN
4,2018,1,"Dallas, TX",Florida,6,77,"Gainesville, Florida",882.710411,St. Bonaventure,11,62,"St. Bonaventure, New York",1190.429282,WIN


In [16]:
losers_df = game_stats.rename(columns={'winner':'opponent_name', 'winner_seed':'opponent_seed', 'winner_score':'opponent_score', 
                                       'loser':'team_name', 'loser_seed':'team_seed', 'loser_score':'team_score', 
                                       'location_winner':'opponent_location', 'location_loser':'team_location', 
                                       'travel_miles_winner':'opponent_travel_miles', 'travel_miles_loser':'team_travel_miles'})
losers_df['outcome'] = 'LOSS'
losers_df = losers_df[['year', 'round', 'location', 'team_name', 'team_seed', 'team_score', 'team_location', 
                         'team_travel_miles', 'opponent_name', 'opponent_seed', 'opponent_score', 'opponent_location', 
                         'opponent_travel_miles', 'outcome']]
losers_df.head()

Unnamed: 0,year,round,location,team_name,team_seed,team_score,team_location,team_travel_miles,opponent_name,opponent_seed,opponent_score,opponent_location,opponent_travel_miles,outcome
0,2018,1,"Pittsburgh, PA",Radford,16,61,"Radford, Virginia",230.496683,Villanova,1,87,"Villanova, Pennsylvania",246.946684,LOSS
1,2018,1,"Pittsburgh, PA",Virginia Tech,8,83,"Blacksburg, Virginia",222.738124,Alabama,9,86,"Tuscaloosa, Alabama",645.478758,LOSS
2,2018,1,"San Diego, CA",Murray State,12,68,"Murray, Kentucky",1658.405787,West Virginia,5,85,"Morgantown, West Virginia",2117.874565,LOSS
3,2018,1,"San Diego, CA",Wichita State,4,75,"Wichita, Kansas",1170.375993,Marshall,13,81,"Huntington, West Virginia",1982.540076,LOSS
4,2018,1,"Dallas, TX",St. Bonaventure,11,62,"St. Bonaventure, New York",1190.429282,Florida,6,77,"Gainesville, Florida",882.710411,LOSS


In [17]:
dataframes = [winners_df, losers_df]
outcomes_df = pd.concat(dataframes, sort=False)
print(len(winners_df), len(losers_df), len(outcomes_df))

1872 1872 3744


In [18]:
outcomes_incl_team_stats = outcomes_df.merge(teams_df, how='left', left_on=['team_name','year'], right_on=['name','year'], suffixes=('','_stats'))
outcomes_full_stats = outcomes_incl_team_stats.merge(teams_df.add_prefix('opponent_'), how='left', left_on=['opponent_name','year'], right_on=['opponent_name','opponent_year'], suffixes=('','_stats'))
outcomes_full_stats.head()

Unnamed: 0,year,round,location,team_name,team_seed,team_score,team_location,team_travel_miles,opponent_name,opponent_seed,...,opponent_team_rank_pts,opponent_team_rank_pts_per_g,opponent_team_rank_stl,opponent_team_rank_tov,opponent_team_rank_trb,opponent_team_stl,opponent_team_tov,opponent_team_trb,opponent_url,opponent_year
0,2018,1,"Pittsburgh, PA",Villanova,1,87,"Villanova, Pennsylvania",246.946684,Radford,16,...,169,312,97,223,93,226,441,1244,https://www.sports-reference.com/cbb/schools/r...,2018
1,2018,1,"Pittsburgh, PA",Alabama,9,86,"Tuscaloosa, Alabama",645.478758,Virginia Tech,8,...,75,46,213,92,262,191,395,1078,https://www.sports-reference.com/cbb/schools/v...,2018
2,2018,1,"San Diego, CA",West Virginia,5,85,"Morgantown, West Virginia",2117.874565,Murray State,12,...,126,61,129,74,143,216,384,1186,https://www.sports-reference.com/cbb/schools/m...,2018
3,2018,1,"San Diego, CA",Marshall,13,81,"Huntington, West Virginia",1982.540076,Wichita State,4,...,46,16,317,76,33,154,385,1336,https://www.sports-reference.com/cbb/schools/w...,2018
4,2018,1,"Dallas, TX",Florida,6,77,"Gainesville, Florida",882.710411,St. Bonaventure,11,...,85,80,63,84,120,239,391,1208,https://www.sports-reference.com/cbb/schools/s...,2018


In [19]:
list(outcomes_full_stats.columns.values)

['year',
 'round',
 'location',
 'team_name',
 'team_seed',
 'team_score',
 'team_location',
 'team_travel_miles',
 'opponent_name',
 'opponent_seed',
 'opponent_score',
 'opponent_location',
 'opponent_travel_miles',
 'outcome',
 'name',
 'opp_g',
 'opp_mp',
 'opp_opp_ast',
 'opp_opp_blk',
 'opp_opp_drb',
 'opp_opp_fg',
 'opp_opp_fg2',
 'opp_opp_fg2_pct',
 'opp_opp_fg2a',
 'opp_opp_fg3',
 'opp_opp_fg3_pct',
 'opp_opp_fg3a',
 'opp_opp_fg_pct',
 'opp_opp_fga',
 'opp_opp_ft',
 'opp_opp_ft_pct',
 'opp_opp_fta',
 'opp_opp_orb',
 'opp_opp_pf',
 'opp_opp_pts',
 'opp_opp_pts_per_g',
 'opp_opp_stl',
 'opp_opp_tov',
 'opp_opp_trb',
 'opp_rank_g',
 'opp_rank_mp',
 'opp_rank_opp_ast',
 'opp_rank_opp_blk',
 'opp_rank_opp_drb',
 'opp_rank_opp_fg',
 'opp_rank_opp_fg2',
 'opp_rank_opp_fg2_pct',
 'opp_rank_opp_fg2a',
 'opp_rank_opp_fg3',
 'opp_rank_opp_fg3_pct',
 'opp_rank_opp_fg3a',
 'opp_rank_opp_fg_pct',
 'opp_rank_opp_fga',
 'opp_rank_opp_ft',
 'opp_rank_opp_ft_pct',
 'opp_rank_opp_fta',
 'opp_ran

In [20]:
model_input_df = outcomes_full_stats[['year', 'round', 'location', 'outcome', 'team_name', 'team_seed', 'team_score', 
                                     'team_location', 'team_travel_miles', 'opponent_name', 'opponent_seed', 
                                     'opponent_score', 'opponent_location', 'opponent_travel_miles', 'team_SOS', 
                                     'team_SRS', 'team_ast', 'team_blk', 'team_drb', 'team_fg', 'team_fg2',
                                      'team_fg2_pct', 'team_fg2a', 'team_fg3', 'team_fg3_pct', 'team_fg3a', 
                                      'team_fg_pct', 'team_fga', 'team_ft', 'team_ft_pct', 'team_fta', 'team_g', 
                                      'team_mp', 'team_orb', 'team_pf', 'team_pts', 'team_pts_per_g', 'team_rank_SOS', 
                                      'team_rank_SRS', 'team_rank_ast', 'team_rank_blk', 'team_rank_drb', 'team_rank_fg', 
                                      'team_rank_fg2', 'team_rank_fg2_pct', 'team_rank_fg2a', 'team_rank_fg3', 
                                      'team_rank_fg3_pct', 'team_rank_fg3a', 'team_rank_fg_pct', 'team_rank_fga', 
                                      'team_rank_ft', 'team_rank_ft_pct', 'team_rank_fta', 'team_rank_g', 'team_rank_mp', 
                                      'team_rank_orb', 'team_rank_pf', 'team_rank_pts', 'team_rank_pts_per_g', 
                                      'team_rank_stl', 'team_rank_tov', 'team_rank_trb', 'team_stl', 'team_tov', 
                                      'team_trb', 'opp_g', 'opp_mp', 'opp_opp_ast', 'opp_opp_blk', 'opp_opp_drb', 
                                      'opp_opp_fg', 'opp_opp_fg2', 'opp_opp_fg2_pct', 'opp_opp_fg2a', 'opp_opp_fg3', 
                                      'opp_opp_fg3_pct', 'opp_opp_fg3a', 'opp_opp_fg_pct', 'opp_opp_fga', 'opp_opp_ft', 
                                      'opp_opp_ft_pct', 'opp_opp_fta', 'opp_opp_orb', 'opp_opp_pf', 'opp_opp_pts',
                                      'opp_opp_pts_per_g', 'opp_opp_stl', 'opp_opp_tov', 'opp_opp_trb', 'opp_rank_g',
                                      'opp_rank_mp', 'opp_rank_opp_ast', 'opp_rank_opp_blk', 'opp_rank_opp_drb',
                                      'opp_rank_opp_fg', 'opp_rank_opp_fg2', 'opp_rank_opp_fg2_pct', 'opp_rank_opp_fg2a', 
                                      'opp_rank_opp_fg3', 'opp_rank_opp_fg3_pct', 'opp_rank_opp_fg3a','opp_rank_opp_fg_pct', 
                                      'opp_rank_opp_fga', 'opp_rank_opp_ft', 'opp_rank_opp_ft_pct', 'opp_rank_opp_fta',
                                      'opp_rank_opp_orb', 'opp_rank_opp_pf', 'opp_rank_opp_pts', 'opp_rank_opp_pts_per_g',
                                      'opp_rank_opp_stl', 'opp_rank_opp_tov', 'opp_rank_opp_trb',  'opponent_team_SOS', 
                                      'opponent_team_SRS', 'opponent_team_ast', 'opponent_team_blk', 'opponent_team_drb', 
                                      'opponent_team_fg', 'opponent_team_fg2', 'opponent_team_fg2_pct', 'opponent_team_fg2a', 
                                      'opponent_team_fg3', 'opponent_team_fg3_pct', 'opponent_team_fg3a', 'opponent_team_fg_pct', 
                                      'opponent_team_fga', 'opponent_team_ft', 'opponent_team_ft_pct', 'opponent_team_fta', 
                                      'opponent_team_g', 'opponent_team_mp', 'opponent_team_orb', 'opponent_team_pf', 
                                      'opponent_team_pts', 'opponent_team_pts_per_g', 'opponent_team_rank_SOS', 
                                      'opponent_team_rank_SRS', 'opponent_team_rank_ast', 'opponent_team_rank_blk', 
                                      'opponent_team_rank_drb', 'opponent_team_rank_fg', 'opponent_team_rank_fg2', 
                                      'opponent_team_rank_fg2_pct', 'opponent_team_rank_fg2a', 'opponent_team_rank_fg3', 
                                      'opponent_team_rank_fg3_pct', 'opponent_team_rank_fg3a', 'opponent_team_rank_fg_pct', 
                                      'opponent_team_rank_fga', 'opponent_team_rank_ft', 'opponent_team_rank_ft_pct', 
                                      'opponent_team_rank_fta', 'opponent_team_rank_g', 'opponent_team_rank_mp', 
                                      'opponent_team_rank_orb', 'opponent_team_rank_pf', 'opponent_team_rank_pts', 
                                      'opponent_team_rank_pts_per_g', 'opponent_team_rank_stl', 'opponent_team_rank_tov', 
                                      'opponent_team_rank_trb', 'opponent_team_stl', 'opponent_team_tov', 'opponent_team_trb', 
                                      'opponent_opp_g', 'opponent_opp_mp', 'opponent_opp_opp_ast', 'opponent_opp_opp_blk', 
                                      'opponent_opp_opp_drb', 'opponent_opp_opp_fg', 'opponent_opp_opp_fg2', 
                                      'opponent_opp_opp_fg2_pct', 'opponent_opp_opp_fg2a', 'opponent_opp_opp_fg3', 
                                      'opponent_opp_opp_fg3_pct', 'opponent_opp_opp_fg3a', 'opponent_opp_opp_fg_pct', 
                                      'opponent_opp_opp_fga', 'opponent_opp_opp_ft', 'opponent_opp_opp_ft_pct', 
                                      'opponent_opp_opp_fta', 'opponent_opp_opp_orb', 'opponent_opp_opp_pf', 
                                      'opponent_opp_opp_pts', 'opponent_opp_opp_pts_per_g', 'opponent_opp_opp_stl', 
                                      'opponent_opp_opp_tov', 'opponent_opp_opp_trb', 'opponent_opp_rank_g', 
                                      'opponent_opp_rank_mp', 'opponent_opp_rank_opp_ast', 'opponent_opp_rank_opp_blk', 
                                      'opponent_opp_rank_opp_drb', 'opponent_opp_rank_opp_fg', 'opponent_opp_rank_opp_fg2', 
                                      'opponent_opp_rank_opp_fg2_pct', 'opponent_opp_rank_opp_fg2a', 'opponent_opp_rank_opp_fg3', 
                                      'opponent_opp_rank_opp_fg3_pct', 'opponent_opp_rank_opp_fg3a', 'opponent_opp_rank_opp_fg_pct', 
                                      'opponent_opp_rank_opp_fga', 'opponent_opp_rank_opp_ft', 'opponent_opp_rank_opp_ft_pct', 
                                      'opponent_opp_rank_opp_fta', 'opponent_opp_rank_opp_orb', 'opponent_opp_rank_opp_pf', 
                                      'opponent_opp_rank_opp_pts', 'opponent_opp_rank_opp_pts_per_g', 'opponent_opp_rank_opp_stl', 
                                      'opponent_opp_rank_opp_tov', 'opponent_opp_rank_opp_trb']]

pd.set_option('display.max_columns', 500)
model_input_df.head()

Unnamed: 0,year,round,location,outcome,team_name,team_seed,team_score,team_location,team_travel_miles,opponent_name,opponent_seed,opponent_score,opponent_location,opponent_travel_miles,team_SOS,team_SRS,team_ast,team_blk,team_drb,team_fg,team_fg2,team_fg2_pct,team_fg2a,team_fg3,team_fg3_pct,team_fg3a,team_fg_pct,team_fga,team_ft,team_ft_pct,team_fta,team_g,team_mp,team_orb,team_pf,team_pts,team_pts_per_g,team_rank_SOS,team_rank_SRS,team_rank_ast,team_rank_blk,team_rank_drb,team_rank_fg,team_rank_fg2,team_rank_fg2_pct,team_rank_fg2a,team_rank_fg3,team_rank_fg3_pct,team_rank_fg3a,team_rank_fg_pct,team_rank_fga,team_rank_ft,team_rank_ft_pct,team_rank_fta,team_rank_g,team_rank_mp,team_rank_orb,team_rank_pf,team_rank_pts,team_rank_pts_per_g,team_rank_stl,team_rank_tov,team_rank_trb,team_stl,team_tov,team_trb,opp_g,opp_mp,opp_opp_ast,opp_opp_blk,opp_opp_drb,opp_opp_fg,opp_opp_fg2,opp_opp_fg2_pct,opp_opp_fg2a,opp_opp_fg3,opp_opp_fg3_pct,opp_opp_fg3a,opp_opp_fg_pct,opp_opp_fga,opp_opp_ft,opp_opp_ft_pct,opp_opp_fta,opp_opp_orb,opp_opp_pf,opp_opp_pts,opp_opp_pts_per_g,opp_opp_stl,opp_opp_tov,opp_opp_trb,opp_rank_g,opp_rank_mp,opp_rank_opp_ast,opp_rank_opp_blk,opp_rank_opp_drb,opp_rank_opp_fg,opp_rank_opp_fg2,opp_rank_opp_fg2_pct,opp_rank_opp_fg2a,opp_rank_opp_fg3,opp_rank_opp_fg3_pct,opp_rank_opp_fg3a,opp_rank_opp_fg_pct,opp_rank_opp_fga,opp_rank_opp_ft,opp_rank_opp_ft_pct,opp_rank_opp_fta,opp_rank_opp_orb,opp_rank_opp_pf,opp_rank_opp_pts,opp_rank_opp_pts_per_g,opp_rank_opp_stl,opp_rank_opp_tov,opp_rank_opp_trb,opponent_team_SOS,opponent_team_SRS,opponent_team_ast,opponent_team_blk,opponent_team_drb,opponent_team_fg,opponent_team_fg2,opponent_team_fg2_pct,opponent_team_fg2a,opponent_team_fg3,opponent_team_fg3_pct,opponent_team_fg3a,opponent_team_fg_pct,opponent_team_fga,opponent_team_ft,opponent_team_ft_pct,opponent_team_fta,opponent_team_g,opponent_team_mp,opponent_team_orb,opponent_team_pf,opponent_team_pts,opponent_team_pts_per_g,opponent_team_rank_SOS,opponent_team_rank_SRS,opponent_team_rank_ast,opponent_team_rank_blk,opponent_team_rank_drb,opponent_team_rank_fg,opponent_team_rank_fg2,opponent_team_rank_fg2_pct,opponent_team_rank_fg2a,opponent_team_rank_fg3,opponent_team_rank_fg3_pct,opponent_team_rank_fg3a,opponent_team_rank_fg_pct,opponent_team_rank_fga,opponent_team_rank_ft,opponent_team_rank_ft_pct,opponent_team_rank_fta,opponent_team_rank_g,opponent_team_rank_mp,opponent_team_rank_orb,opponent_team_rank_pf,opponent_team_rank_pts,opponent_team_rank_pts_per_g,opponent_team_rank_stl,opponent_team_rank_tov,opponent_team_rank_trb,opponent_team_stl,opponent_team_tov,opponent_team_trb,opponent_opp_g,opponent_opp_mp,opponent_opp_opp_ast,opponent_opp_opp_blk,opponent_opp_opp_drb,opponent_opp_opp_fg,opponent_opp_opp_fg2,opponent_opp_opp_fg2_pct,opponent_opp_opp_fg2a,opponent_opp_opp_fg3,opponent_opp_opp_fg3_pct,opponent_opp_opp_fg3a,opponent_opp_opp_fg_pct,opponent_opp_opp_fga,opponent_opp_opp_ft,opponent_opp_opp_ft_pct,opponent_opp_opp_fta,opponent_opp_opp_orb,opponent_opp_opp_pf,opponent_opp_opp_pts,opponent_opp_opp_pts_per_g,opponent_opp_opp_stl,opponent_opp_opp_tov,opponent_opp_opp_trb,opponent_opp_rank_g,opponent_opp_rank_mp,opponent_opp_rank_opp_ast,opponent_opp_rank_opp_blk,opponent_opp_rank_opp_drb,opponent_opp_rank_opp_fg,opponent_opp_rank_opp_fg2,opponent_opp_rank_opp_fg2_pct,opponent_opp_rank_opp_fg2a,opponent_opp_rank_opp_fg3,opponent_opp_rank_opp_fg3_pct,opponent_opp_rank_opp_fg3a,opponent_opp_rank_opp_fg_pct,opponent_opp_rank_opp_fga,opponent_opp_rank_opp_ft,opponent_opp_rank_opp_ft_pct,opponent_opp_rank_opp_fta,opponent_opp_rank_opp_orb,opponent_opp_rank_opp_pf,opponent_opp_rank_opp_pts,opponent_opp_rank_opp_pts_per_g,opponent_opp_rank_opp_stl,opponent_opp_rank_opp_tov,opponent_opp_rank_opp_trb
0,2018,1,"Pittsburgh, PA",WIN,Villanova,1,87,"Villanova, Pennsylvania",246.946684,Radford,16,61,"Radford, Virginia",230.496683,10.24,26.64,655,162,1056,1220,756,0.59,1282,464,0.401,1158,0.5,2440,559,0.779,718,40,8075,380,645,3463,86.6,10,1,3,31,4,1,16,4,96,1,13,2,5,1,32,10,73,,,80,273,1,1,28,182,8,259,426,1436,40,8075,532,105,928,1028,756,0.49,1543,272,0.317,858,0.428,2401,479,0.747,641,378,698,2807,70.2,193,512,1306,,,335,148,328,346,346,151,346,265,21,332,107,350,241,334,204,301,24,346,114,138,31,327,-4.08,-2.61,442,105,830,850,573,0.475,1207,277,0.349,793,0.425,2000,445,0.725,614,36,7325,414,596,2422,67.3,265,191,185,184,171,175,219,287,167,106,178,87,285,106,199,132,214,,,32,196,169,312,97,223,93,226,441,1244,36,7325,413,120,818,819,558,0.484,1152,261,0.35,745,0.432,1897,437,0.751,582,317,601,2336,64.9,198,469,1135,,,116,257,177,149,117,131,132,217,188,217,128,172,157,339,113,150,164,149,20,160,87,165
1,2018,1,"Pittsburgh, PA",WIN,Alabama,9,86,"Tuscaloosa, Alabama",645.478758,Virginia Tech,8,83,"Blacksburg, Virginia",222.738124,10.64,12.34,458,192,932,910,681,0.535,1273,229,0.326,702,0.461,1975,555,0.67,828,36,7200,358,676,2604,72.3,6,49,149,11,51,98,60,58,106,233,298,187,100,124,34,310,11,,,118,311,90,217,90,346,61,228,513,1290,36,7200,427,134,863,877,615,0.469,1310,262,0.328,799,0.416,2109,527,0.685,769,414,718,2543,70.6,237,473,1277,,,150,308,255,237,230,65,291,228,57,295,50,309,306,33,321,336,12,283,130,323,83,316,6.71,14.35,542,74,834,931,642,0.573,1120,289,0.388,745,0.499,1865,483,0.709,681,33,6675,244,523,2634,79.8,66,34,37,310,163,80,104,6,250,82,30,139,6,214,130,190,121,,,326,40,75,46,213,92,262,191,395,1078,33,6675,460,124,784,844,552,0.492,1123,292,0.347,842,0.43,1965,402,0.697,577,346,572,2382,72.2,174,433,1130,,,236,274,113,197,110,159,106,316,165,323,113,242,83,76,108,230,222,189,169,49,160,158
2,2018,1,"San Diego, CA",WIN,West Virginia,5,85,"Morgantown, West Virginia",2117.874565,Murray State,12,68,"Murray, Kentucky",1658.405787,9.36,19.88,561,192,888,1036,714,0.488,1463,322,0.353,911,0.436,2374,570,0.766,744,37,7425,513,798,2964,80.1,26,9,22,11,89,14,36,237,8,37,156,24,231,3,25,25,56,,,1,351,13,45,5,161,13,301,420,1401,37,7425,462,148,893,835,547,0.454,1204,288,0.38,758,0.426,1962,617,0.712,866,375,695,2575,69.6,204,612,1268,,,245,339,307,177,95,26,200,306,323,239,90,238,349,183,349,297,29,291,100,209,3,308,-3.53,6.87,471,104,846,870,599,0.553,1084,271,0.377,719,0.483,1803,504,0.734,687,32,6400,340,559,2515,78.6,241,91,128,190,141,150,178,25,276,119,52,162,25,255,95,101,114,,,154,108,126,61,129,74,143,216,384,1186,32,6400,322,105,691,761,562,0.471,1192,199,0.307,649,0.413,1841,396,0.724,547,314,579,2117,66.2,173,402,1005,,,6,148,7,47,125,75,188,15,7,82,40,124,70,249,60,141,211,21,34,46,241,26
3,2018,1,"San Diego, CA",WIN,Marshall,13,81,"Huntington, West Virginia",1982.540076,Wichita State,4,75,"Wichita, Kansas",1170.375993,0.43,4.21,615,209,999,1042,680,0.557,1220,362,0.358,1012,0.467,2232,571,0.764,747,36,7275,288,617,3017,83.8,134,112,6,5,17,13,62,22,150,10,132,5,69,16,24,26,52,,,273,235,10,10,41,308,64,252,476,1287,36,7275,539,95,1008,1059,794,0.481,1652,265,0.355,747,0.441,2399,466,0.681,684,458,675,2849,79.1,239,515,1466,,,340,93,347,349,348,110,349,241,216,222,196,349,220,22,251,351,47,348,325,329,27,350,5.46,16.58,610,127,932,957,657,0.538,1222,300,0.381,787,0.476,2009,517,0.739,700,33,6650,404,580,2731,82.8,74,20,8,101,51,49,84,52,148,59,45,92,34,98,73,80,96,,,40,161,46,16,317,76,33,154,385,1336,33,6650,356,111,734,809,527,0.46,1146,282,0.363,776,0.421,1922,464,0.726,639,271,658,2364,71.6,200,379,1005,,,22,199,34,122,57,38,126,288,266,271,67,199,216,264,199,30,71,176,152,175,280,26
4,2018,1,"Dallas, TX",WIN,Florida,6,77,"Gainesville, Florida",882.710411,St. Bonaventure,11,62,"St. Bonaventure, New York",1190.429282,10.08,16.66,445,164,832,893,585,0.474,1233,308,0.37,832,0.432,2065,483,0.718,673,34,6875,366,591,2577,75.8,15,19,179,27,165,118,201,288,137,46,80,58,251,59,130,153,135,,,107,184,102,116,69,8,130,237,335,1198,34,6875,394,105,911,832,603,0.465,1296,229,0.352,650,0.428,1946,460,0.731,629,362,607,2353,69.2,150,468,1273,,,65,148,316,172,205,52,284,92,201,85,103,221,210,284,184,277,152,162,94,7,90,312,2.61,8.87,472,118,864,884,629,0.485,1298,255,0.386,661,0.451,1959,597,0.752,794,34,6900,344,674,2620,77.1,99,74,127,135,117,128,119,255,84,153,34,236,153,139,10,50,22,,,148,308,85,80,63,84,120,239,391,1208,34,6900,463,133,833,807,541,0.497,1088,266,0.321,829,0.421,1917,527,0.715,737,353,663,2407,70.8,181,488,1186,,,248,305,199,118,79,180,79,245,30,317,69,192,306,202,300,250,67,204,135,78,61,231


## Create/Train/Test/Refine Model

In [50]:
X = model_input_df[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_SOS', 'team_SRS', 
                    'opponent_team_SOS', 'opponent_team_SRS']]
y = model_input_df['outcome']

print(X.shape, y.shape)

(3744, 8) (3744,)


In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (X, y, random_state=42, stratify=y)

In [23]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
classifier.score(X_test, y_test)

0.7403846153846154

In [25]:
predictions = classifier.predict(X_test)

In [26]:
probabilities = classifier.predict_proba(X_test)

In [27]:
loss_prob = []
win_prob = []

for prob in probabilities:
    loss_prob.append(prob[0])
    win_prob.append(prob[1])

In [28]:
upsets = model_input_df.loc[((model_input_df['team_seed'] > model_input_df['opponent_seed'])
                            & (model_input_df['outcome'] == 'WIN')) | ((model_input_df['team_seed'] < model_input_df['opponent_seed'])
                            & (model_input_df['outcome'] == 'LOSS')),:]
upsets

Unnamed: 0,year,round,location,outcome,team_name,team_seed,team_score,team_location,team_travel_miles,opponent_name,opponent_seed,opponent_score,opponent_location,opponent_travel_miles,team_SOS,team_SRS,team_ast,team_blk,team_drb,team_fg,team_fg2,team_fg2_pct,team_fg2a,team_fg3,team_fg3_pct,team_fg3a,team_fg_pct,team_fga,team_ft,team_ft_pct,team_fta,team_g,team_mp,team_orb,team_pf,team_pts,team_pts_per_g,team_rank_SOS,team_rank_SRS,team_rank_ast,team_rank_blk,team_rank_drb,team_rank_fg,team_rank_fg2,team_rank_fg2_pct,team_rank_fg2a,team_rank_fg3,team_rank_fg3_pct,team_rank_fg3a,team_rank_fg_pct,team_rank_fga,team_rank_ft,team_rank_ft_pct,team_rank_fta,team_rank_g,team_rank_mp,team_rank_orb,team_rank_pf,team_rank_pts,team_rank_pts_per_g,team_rank_stl,team_rank_tov,team_rank_trb,team_stl,team_tov,team_trb,opp_g,opp_mp,opp_opp_ast,opp_opp_blk,opp_opp_drb,opp_opp_fg,opp_opp_fg2,opp_opp_fg2_pct,opp_opp_fg2a,opp_opp_fg3,opp_opp_fg3_pct,opp_opp_fg3a,opp_opp_fg_pct,opp_opp_fga,opp_opp_ft,opp_opp_ft_pct,opp_opp_fta,opp_opp_orb,opp_opp_pf,opp_opp_pts,opp_opp_pts_per_g,opp_opp_stl,opp_opp_tov,opp_opp_trb,opp_rank_g,opp_rank_mp,opp_rank_opp_ast,opp_rank_opp_blk,opp_rank_opp_drb,opp_rank_opp_fg,opp_rank_opp_fg2,opp_rank_opp_fg2_pct,opp_rank_opp_fg2a,opp_rank_opp_fg3,opp_rank_opp_fg3_pct,opp_rank_opp_fg3a,opp_rank_opp_fg_pct,opp_rank_opp_fga,opp_rank_opp_ft,opp_rank_opp_ft_pct,opp_rank_opp_fta,opp_rank_opp_orb,opp_rank_opp_pf,opp_rank_opp_pts,opp_rank_opp_pts_per_g,opp_rank_opp_stl,opp_rank_opp_tov,opp_rank_opp_trb,opponent_team_SOS,opponent_team_SRS,opponent_team_ast,opponent_team_blk,opponent_team_drb,opponent_team_fg,opponent_team_fg2,opponent_team_fg2_pct,opponent_team_fg2a,opponent_team_fg3,opponent_team_fg3_pct,opponent_team_fg3a,opponent_team_fg_pct,opponent_team_fga,opponent_team_ft,opponent_team_ft_pct,opponent_team_fta,opponent_team_g,opponent_team_mp,opponent_team_orb,opponent_team_pf,opponent_team_pts,opponent_team_pts_per_g,opponent_team_rank_SOS,opponent_team_rank_SRS,opponent_team_rank_ast,opponent_team_rank_blk,opponent_team_rank_drb,opponent_team_rank_fg,opponent_team_rank_fg2,opponent_team_rank_fg2_pct,opponent_team_rank_fg2a,opponent_team_rank_fg3,opponent_team_rank_fg3_pct,opponent_team_rank_fg3a,opponent_team_rank_fg_pct,opponent_team_rank_fga,opponent_team_rank_ft,opponent_team_rank_ft_pct,opponent_team_rank_fta,opponent_team_rank_g,opponent_team_rank_mp,opponent_team_rank_orb,opponent_team_rank_pf,opponent_team_rank_pts,opponent_team_rank_pts_per_g,opponent_team_rank_stl,opponent_team_rank_tov,opponent_team_rank_trb,opponent_team_stl,opponent_team_tov,opponent_team_trb,opponent_opp_g,opponent_opp_mp,opponent_opp_opp_ast,opponent_opp_opp_blk,opponent_opp_opp_drb,opponent_opp_opp_fg,opponent_opp_opp_fg2,opponent_opp_opp_fg2_pct,opponent_opp_opp_fg2a,opponent_opp_opp_fg3,opponent_opp_opp_fg3_pct,opponent_opp_opp_fg3a,opponent_opp_opp_fg_pct,opponent_opp_opp_fga,opponent_opp_opp_ft,opponent_opp_opp_ft_pct,opponent_opp_opp_fta,opponent_opp_opp_orb,opponent_opp_opp_pf,opponent_opp_opp_pts,opponent_opp_opp_pts_per_g,opponent_opp_opp_stl,opponent_opp_opp_tov,opponent_opp_opp_trb,opponent_opp_rank_g,opponent_opp_rank_mp,opponent_opp_rank_opp_ast,opponent_opp_rank_opp_blk,opponent_opp_rank_opp_drb,opponent_opp_rank_opp_fg,opponent_opp_rank_opp_fg2,opponent_opp_rank_opp_fg2_pct,opponent_opp_rank_opp_fg2a,opponent_opp_rank_opp_fg3,opponent_opp_rank_opp_fg3_pct,opponent_opp_rank_opp_fg3a,opponent_opp_rank_opp_fg_pct,opponent_opp_rank_opp_fga,opponent_opp_rank_opp_ft,opponent_opp_rank_opp_ft_pct,opponent_opp_rank_opp_fta,opponent_opp_rank_opp_orb,opponent_opp_rank_opp_pf,opponent_opp_rank_opp_pts,opponent_opp_rank_opp_pts_per_g,opponent_opp_rank_opp_stl,opponent_opp_rank_opp_tov,opponent_opp_rank_opp_trb
1,2018,1,"Pittsburgh, PA",WIN,Alabama,9,86,"Tuscaloosa, Alabama",645.478758,Virginia Tech,8,83,"Blacksburg, Virginia",222.738124,10.64,12.34,458,192,932,910,681,0.535,1273,229,0.326,702,0.461,1975,555,0.670,828,36,7200,358,676,2604,72.3,6,49,149,11,51,98,60,58,106,233,298,187,100,124,34,310,11,,,118,311,90,217,90,346,61,228,513,1290,36,7200,427,134,863,877,615,0.469,1310,262,0.328,799,0.416,2109,527,0.685,769,414,718,2543,70.6,237,473,1277,,,150,308,255,237,230,65,291,228,57,295,50,309,306,33,321,336,12,283,130,323,83,316,6.71,14.35,542,74,834,931,642,0.573,1120,289,0.388,745,0.499,1865,483,0.709,681,33,6675,244,523,2634,79.8,66,34,37,310,163,80,104,6,250,82,30,139,6,214,130,190,121,,,326,40,75,46,213,92,262,191,395,1078,33,6675,460,124,784,844,552,0.492,1123,292,0.347,842,0.430,1965,402,0.697,577,346,572,2382,72.2,174,433,1130,,,236,274,113,197,110,159,106,316,165,323,113,242,83,76,108,230,222,189,169,49,160,158
3,2018,1,"San Diego, CA",WIN,Marshall,13,81,"Huntington, West Virginia",1982.540076,Wichita State,4,75,"Wichita, Kansas",1170.375993,0.43,4.21,615,209,999,1042,680,0.557,1220,362,0.358,1012,0.467,2232,571,0.764,747,36,7275,288,617,3017,83.8,134,112,6,5,17,13,62,22,150,10,132,5,69,16,24,26,52,,,273,235,10,10,41,308,64,252,476,1287,36,7275,539,95,1008,1059,794,0.481,1652,265,0.355,747,0.441,2399,466,0.681,684,458,675,2849,79.1,239,515,1466,,,340,93,347,349,348,110,349,241,216,222,196,349,220,22,251,351,47,348,325,329,27,350,5.46,16.58,610,127,932,957,657,0.538,1222,300,0.381,787,0.476,2009,517,0.739,700,33,6650,404,580,2731,82.8,74,20,8,101,51,49,84,52,148,59,45,92,34,98,73,80,96,,,40,161,46,16,317,76,33,154,385,1336,33,6650,356,111,734,809,527,0.46,1146,282,0.363,776,0.421,1922,464,0.726,639,271,658,2364,71.6,200,379,1005,,,22,199,34,122,57,38,126,288,266,271,67,199,216,264,199,30,71,176,152,175,280,26
6,2018,1,"Detroit, MI",WIN,Butler,10,79,"Indianapolis, Indiana",240.701156,Arkansas,7,62,"Fayetteville, Arkansas",735.862049,10.55,16.92,489,100,879,1020,733,0.543,1350,287,0.357,805,0.473,2155,436,0.776,562,35,7150,319,642,2763,78.9,9,18,101,209,100,18,25,41,56,92,144,75,45,24,216,14,277,,,206,269,38,58,79,84,130,232,391,1198,35,7150,440,120,859,877,603,0.495,1219,274,0.373,735,0.449,1954,512,0.743,689,278,586,2540,72.6,179,484,1137,,,177,257,248,237,205,175,219,269,302,202,232,229,290,322,259,44,202,282,177,65,66,168,9.85,14.76,501,163,858,1005,736,0.512,1438,269,0.395,681,0.474,2119,540,0.681,793,35,7075,350,700,2819,80.5,17,30,85,28,122,25,23,143,14,124,21,212,42,35,50,291,23,,,138,331,26,38,119,61,120,218,381,1208,35,7075,477,125,870,872,571,0.482,1184,301,0.357,844,0.430,2028,602,0.727,828,394,680,2647,75.6,167,490,1264,,,276,278,269,232,140,120,176,327,227,326,116,279,345,268,341,324,42,323,262,30,56,306
13,2018,3,"Boston, MA",WIN,Texas Tech,3,78,"Lubbock, Texas",1775.455163,Purdue,2,65,"West Lafayette, Indiana",833.087287,9.60,19.38,529,158,936,964,712,0.511,1394,252,0.359,702,0.460,2096,579,0.704,823,37,7450,396,669,2759,74.6,21,10,52,34,46,46,38,151,31,165,119,187,109,48,17,221,13,,,57,300,39,154,24,262,38,262,454,1332,37,7450,377,138,831,806,557,0.45,1238,249,0.323,770,0.401,2008,536,0.703,762,368,766,2397,64.8,222,554,1199,,,39,319,197,116,115,18,232,174,39,261,11,270,313,117,316,290,1,196,16,291,7,245,8.74,23.41,598,180,984,1033,680,0.541,1257,353,0.420,840,0.493,2097,555,0.743,747,37,7425,311,580,2974,80.4,38,3,10,16,22,16,62,44,115,16,2,54,11,47,34,63,52,,,227,161,12,42,146,97,57,211,399,1295,37,7425,443,94,828,907,656,0.455,1442,251,0.332,755,0.413,2197,366,0.689,531,376,688,2431,65.7,190,448,1204,,,189,87,190,284,295,27,332,181,90,235,37,331,25,49,40,299,32,223,24,127,124,252
22,2018,1,"Detroit, MI",WIN,Syracuse,11,57,"Syracuse, New York",354.974599,TCU,6,52,"Fort Worth, Texas",1023.318196,10.02,12.89,395,198,942,832,624,0.466,1340,208,0.318,655,0.417,1995,594,0.736,807,37,7500,433,611,2466,66.6,16,42,267,7,42,203,128,313,63,284,328,243,318,112,11,92,18,,,16,221,148,317,29,277,20,258,460,1375,37,7500,591,116,837,801,512,0.451,1136,289,0.318,910,0.391,2046,469,0.731,642,400,679,2360,63.8,228,469,1237,,,347,235,211,107,45,19,117,310,22,345,5,288,224,279,206,328,43,171,10,307,87,287,9.68,16.38,615,119,836,979,705,0.551,1279,274,0.395,694,0.496,1973,476,0.706,674,33,6725,365,553,2708,82.1,20,21,6,130,158,36,42,30,101,113,22,199,9,127,143,206,131,,,108,91,52,23,136,158,126,214,419,1201,33,6725,511,141,710,894,632,0.501,1261,262,0.376,697,0.457,1958,437,0.731,598,291,612,2487,75.4,206,410,1001,,,318,323,14,271,258,195,261,228,312,133,270,233,157,281,139,73,139,249,256,217,215,22
27,2018,2,"San Diego, CA",WIN,Clemson,5,84,"Clemson, South Carolina",1972.501861,Auburn,4,53,"Auburn, Alabama",1839.712443,9.25,17.17,463,169,946,899,611,0.516,1185,288,0.366,786,0.456,1971,498,0.755,660,35,7050,310,564,2584,73.8,27,17,142,25,40,112,158,125,193,88,92,94,129,129,107,44,153,,,230,122,96,170,196,146,80,198,414,1256,35,7050,401,126,840,826,558,0.439,1272,268,0.352,761,0.406,2033,387,0.701,552,332,617,2307,65.9,199,413,1172,,,78,283,216,163,117,9,267,252,200,246,20,282,56,99,69,197,132,125,29,166,204,215,7.29,15.97,481,180,887,906,582,0.487,1194,324,0.357,908,0.431,2102,647,0.774,836,34,6800,403,672,2783,81.9,54,24,117,16,91,101,205,241,182,32,139,25,258,45,4,18,8,,,43,305,36,27,41,134,61,252,411,1290,34,6800,439,111,866,838,565,0.477,1184,273,0.351,777,0.427,1961,539,0.722,747,341,700,2488,73.2,207,516,1207,,,175,199,262,182,136,96,176,268,196,273,101,237,314,233,310,219,22,250,194,221,26,257
28,2018,2,"Detroit, MI",WIN,Syracuse,11,55,"Syracuse, New York",354.974599,Michigan State,3,53,"East Lansing, Michigan",77.646366,10.02,12.89,395,198,942,832,624,0.466,1340,208,0.318,655,0.417,1995,594,0.736,807,37,7500,433,611,2466,66.6,16,42,267,7,42,203,128,313,63,284,328,243,318,112,11,92,18,,,16,221,148,317,29,277,20,258,460,1375,37,7500,591,116,837,801,512,0.451,1136,289,0.318,910,0.391,2046,469,0.731,642,400,679,2360,63.8,228,469,1237,,,347,235,211,107,45,19,117,310,22,345,5,288,224,279,206,328,43,171,10,307,87,287,7.10,22.41,670,251,1038,991,701,0.552,1271,290,0.400,725,0.496,1996,535,0.747,716,35,7025,401,640,2807,80.2,58,4,2,1,7,32,45,28,108,81,17,156,8,111,58,56,77,,,46,266,31,43,333,273,7,143,459,1439,35,7025,419,88,689,774,522,0.384,1360,252,0.337,748,0.367,2108,471,0.728,647,369,698,2271,64.9,207,349,1058,,,132,53,5,67,54,1,314,190,110,226,1,308,229,272,215,291,24,101,18,221,324,63
37,2018,1,"Nashville, TN",WIN,Florida State,9,67,"Tallahassee, Florida",420.000601,Missouri,8,54,"Columbia, Missouri",360.690149,8.50,15.79,523,186,929,994,721,0.536,1344,273,0.350,781,0.468,2125,543,0.691,786,35,7100,393,652,2806,80.2,40,26,60,13,54,31,31,54,61,116,177,100,63,32,45,262,31,,,60,288,32,44,58,267,42,241,456,1322,35,7100,432,104,845,887,602,0.454,1327,285,0.355,802,0.417,2129,492,0.714,689,383,658,2551,72.9,204,494,1228,,,163,139,229,259,204,22,303,296,220,298,53,315,259,195,259,312,71,287,184,209,52,278,9.19,13.81,443,140,893,794,488,0.505,966,306,0.383,799,0.450,1765,505,0.734,688,33,6650,332,594,2399,72.7,29,37,182,59,81,256,327,168,334,48,39,83,160,290,91,98,111,,,175,191,182,206,278,295,107,170,469,1225,33,6650,376,117,736,795,564,0.454,1243,231,0.327,706,0.408,1949,425,0.725,586,343,649,2246,68.1,217,365,1079,,,36,243,38,99,134,22,241,102,51,157,26,224,133,261,118,225,87,84,63,275,303,90
44,2018,2,"Nashville, TN",WIN,Florida State,9,75,"Tallahassee, Florida",420.000601,Xavier,1,70,"Cincinnati, Ohio",237.640045,8.50,15.79,523,186,929,994,721,0.536,1344,273,0.350,781,0.468,2125,543,0.691,786,35,7100,393,652,2806,80.2,40,26,60,13,54,31,31,54,61,116,177,100,63,32,45,262,31,,,60,288,32,44,58,267,42,241,456,1322,35,7100,432,104,845,887,602,0.454,1327,285,0.355,802,0.417,2129,492,0.714,689,383,658,2551,72.9,204,494,1228,,,163,139,229,259,204,22,303,296,220,298,53,315,259,195,259,312,71,287,184,209,52,278,9.45,19.08,579,109,1002,1011,737,0.559,1319,274,0.372,736,0.492,2055,657,0.779,843,35,7075,340,602,2953,84.4,24,11,17,170,16,22,21,19,75,113,70,146,14,68,1,8,7,,,154,206,15,8,173,223,31,205,441,1342,35,7075,492,127,799,944,643,0.498,1290,301,0.334,902,0.431,2192,427,0.76,562,296,731,2616,74.7,238,400,1095,,,299,287,140,317,274,183,281,327,97,343,120,330,141,348,82,86,8,316,239,326,246,113
47,2018,2,"Charlotte, NC",WIN,Texas A&M,7,86,"College Station, Texas",953.236125,UNC,2,65,"Chapel Hill, North Carolina",111.229961,10.62,15.30,538,209,1021,976,746,0.525,1421,230,0.329,700,0.460,2121,449,0.661,679,35,7000,429,574,2631,75.2,7,29,43,5,11,38,18,85,16,230,290,189,107,34,193,322,126,,,18,148,77,137,214,308,5,190,476,1450,35,7000,512,135,844,892,624,0.457,1365,268,0.326,821,0.408,2186,415,0.688,603,378,616,2467,70.5,227,379,1222,,,319,312,225,266,245,32,317,252,48,311,27,329,108,44,150,301,134,237,127,304,280,275,11.78,20.08,674,151,1059,1105,800,0.510,1568,305,0.359,849,0.457,2417,503,0.743,677,37,7425,511,587,3018,81.6,2,8,1,42,3,5,4,153,3,49,116,49,123,2,98,63,127,,,2,180,9,28,140,238,1,213,445,1570,37,7425,541,176,854,972,615,0.453,1357,357,0.38,940,0.423,2297,410,0.687,597,362,657,2711,73.3,228,433,1216,,,341,351,240,338,230,21,311,351,321,349,75,344,100,40,138,277,75,336,198,307,160,263


In [29]:
X_upsets = upsets[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_SOS', 'team_SRS', 
                    'opponent_team_SOS', 'opponent_team_SRS']]
y_upsets = upsets['outcome']

In [30]:
classifier.score(X_upsets, y_upsets)

0.3058350100603622

In [31]:
rd_1_upsets = model_input_df.loc[((model_input_df['team_seed'] > model_input_df['opponent_seed'])
                            & (model_input_df['outcome'] == 'WIN')) | ((model_input_df['team_seed'] < model_input_df['opponent_seed'])
                            & (model_input_df['outcome'] == 'LOSS')) & (model_input_df['round'] == 1),:]
rd_1_upsets

Unnamed: 0,year,round,location,outcome,team_name,team_seed,team_score,team_location,team_travel_miles,opponent_name,opponent_seed,opponent_score,opponent_location,opponent_travel_miles,team_SOS,team_SRS,team_ast,team_blk,team_drb,team_fg,team_fg2,team_fg2_pct,team_fg2a,team_fg3,team_fg3_pct,team_fg3a,team_fg_pct,team_fga,team_ft,team_ft_pct,team_fta,team_g,team_mp,team_orb,team_pf,team_pts,team_pts_per_g,team_rank_SOS,team_rank_SRS,team_rank_ast,team_rank_blk,team_rank_drb,team_rank_fg,team_rank_fg2,team_rank_fg2_pct,team_rank_fg2a,team_rank_fg3,team_rank_fg3_pct,team_rank_fg3a,team_rank_fg_pct,team_rank_fga,team_rank_ft,team_rank_ft_pct,team_rank_fta,team_rank_g,team_rank_mp,team_rank_orb,team_rank_pf,team_rank_pts,team_rank_pts_per_g,team_rank_stl,team_rank_tov,team_rank_trb,team_stl,team_tov,team_trb,opp_g,opp_mp,opp_opp_ast,opp_opp_blk,opp_opp_drb,opp_opp_fg,opp_opp_fg2,opp_opp_fg2_pct,opp_opp_fg2a,opp_opp_fg3,opp_opp_fg3_pct,opp_opp_fg3a,opp_opp_fg_pct,opp_opp_fga,opp_opp_ft,opp_opp_ft_pct,opp_opp_fta,opp_opp_orb,opp_opp_pf,opp_opp_pts,opp_opp_pts_per_g,opp_opp_stl,opp_opp_tov,opp_opp_trb,opp_rank_g,opp_rank_mp,opp_rank_opp_ast,opp_rank_opp_blk,opp_rank_opp_drb,opp_rank_opp_fg,opp_rank_opp_fg2,opp_rank_opp_fg2_pct,opp_rank_opp_fg2a,opp_rank_opp_fg3,opp_rank_opp_fg3_pct,opp_rank_opp_fg3a,opp_rank_opp_fg_pct,opp_rank_opp_fga,opp_rank_opp_ft,opp_rank_opp_ft_pct,opp_rank_opp_fta,opp_rank_opp_orb,opp_rank_opp_pf,opp_rank_opp_pts,opp_rank_opp_pts_per_g,opp_rank_opp_stl,opp_rank_opp_tov,opp_rank_opp_trb,opponent_team_SOS,opponent_team_SRS,opponent_team_ast,opponent_team_blk,opponent_team_drb,opponent_team_fg,opponent_team_fg2,opponent_team_fg2_pct,opponent_team_fg2a,opponent_team_fg3,opponent_team_fg3_pct,opponent_team_fg3a,opponent_team_fg_pct,opponent_team_fga,opponent_team_ft,opponent_team_ft_pct,opponent_team_fta,opponent_team_g,opponent_team_mp,opponent_team_orb,opponent_team_pf,opponent_team_pts,opponent_team_pts_per_g,opponent_team_rank_SOS,opponent_team_rank_SRS,opponent_team_rank_ast,opponent_team_rank_blk,opponent_team_rank_drb,opponent_team_rank_fg,opponent_team_rank_fg2,opponent_team_rank_fg2_pct,opponent_team_rank_fg2a,opponent_team_rank_fg3,opponent_team_rank_fg3_pct,opponent_team_rank_fg3a,opponent_team_rank_fg_pct,opponent_team_rank_fga,opponent_team_rank_ft,opponent_team_rank_ft_pct,opponent_team_rank_fta,opponent_team_rank_g,opponent_team_rank_mp,opponent_team_rank_orb,opponent_team_rank_pf,opponent_team_rank_pts,opponent_team_rank_pts_per_g,opponent_team_rank_stl,opponent_team_rank_tov,opponent_team_rank_trb,opponent_team_stl,opponent_team_tov,opponent_team_trb,opponent_opp_g,opponent_opp_mp,opponent_opp_opp_ast,opponent_opp_opp_blk,opponent_opp_opp_drb,opponent_opp_opp_fg,opponent_opp_opp_fg2,opponent_opp_opp_fg2_pct,opponent_opp_opp_fg2a,opponent_opp_opp_fg3,opponent_opp_opp_fg3_pct,opponent_opp_opp_fg3a,opponent_opp_opp_fg_pct,opponent_opp_opp_fga,opponent_opp_opp_ft,opponent_opp_opp_ft_pct,opponent_opp_opp_fta,opponent_opp_opp_orb,opponent_opp_opp_pf,opponent_opp_opp_pts,opponent_opp_opp_pts_per_g,opponent_opp_opp_stl,opponent_opp_opp_tov,opponent_opp_opp_trb,opponent_opp_rank_g,opponent_opp_rank_mp,opponent_opp_rank_opp_ast,opponent_opp_rank_opp_blk,opponent_opp_rank_opp_drb,opponent_opp_rank_opp_fg,opponent_opp_rank_opp_fg2,opponent_opp_rank_opp_fg2_pct,opponent_opp_rank_opp_fg2a,opponent_opp_rank_opp_fg3,opponent_opp_rank_opp_fg3_pct,opponent_opp_rank_opp_fg3a,opponent_opp_rank_opp_fg_pct,opponent_opp_rank_opp_fga,opponent_opp_rank_opp_ft,opponent_opp_rank_opp_ft_pct,opponent_opp_rank_opp_fta,opponent_opp_rank_opp_orb,opponent_opp_rank_opp_pf,opponent_opp_rank_opp_pts,opponent_opp_rank_opp_pts_per_g,opponent_opp_rank_opp_stl,opponent_opp_rank_opp_tov,opponent_opp_rank_opp_trb
1,2018,1,"Pittsburgh, PA",WIN,Alabama,9,86,"Tuscaloosa, Alabama",645.478758,Virginia Tech,8,83,"Blacksburg, Virginia",222.738124,10.64,12.34,458,192,932,910,681,0.535,1273,229,0.326,702,0.461,1975,555,0.670,828,36,7200,358,676,2604,72.3,6,49,149,11,51,98,60,58,106,233,298,187,100,124,34,310,11,,,118,311,90,217,90,346,61,228,513,1290,36,7200,427,134,863,877,615,0.469,1310,262,0.328,799,0.416,2109,527,0.685,769,414,718,2543,70.6,237,473,1277,,,150,308,255,237,230,65,291,228,57,295,50,309,306,33,321,336,12,283,130,323,83,316,6.71,14.35,542,74,834,931,642,0.573,1120,289,0.388,745,0.499,1865,483,0.709,681,33,6675,244,523,2634,79.8,66,34,37,310,163,80,104,6,250,82,30,139,6,214,130,190,121,,,326,40,75,46,213,92,262,191,395,1078,33,6675,460,124,784,844,552,0.492,1123,292,0.347,842,0.430,1965,402,0.697,577,346,572,2382,72.2,174,433,1130,,,236,274,113,197,110,159,106,316,165,323,113,242,83,76,108,230,222,189,169,49,160,158
3,2018,1,"San Diego, CA",WIN,Marshall,13,81,"Huntington, West Virginia",1982.540076,Wichita State,4,75,"Wichita, Kansas",1170.375993,0.43,4.21,615,209,999,1042,680,0.557,1220,362,0.358,1012,0.467,2232,571,0.764,747,36,7275,288,617,3017,83.8,134,112,6,5,17,13,62,22,150,10,132,5,69,16,24,26,52,,,273,235,10,10,41,308,64,252,476,1287,36,7275,539,95,1008,1059,794,0.481,1652,265,0.355,747,0.441,2399,466,0.681,684,458,675,2849,79.1,239,515,1466,,,340,93,347,349,348,110,349,241,216,222,196,349,220,22,251,351,47,348,325,329,27,350,5.46,16.58,610,127,932,957,657,0.538,1222,300,0.381,787,0.476,2009,517,0.739,700,33,6650,404,580,2731,82.8,74,20,8,101,51,49,84,52,148,59,45,92,34,98,73,80,96,,,40,161,46,16,317,76,33,154,385,1336,33,6650,356,111,734,809,527,0.46,1146,282,0.363,776,0.421,1922,464,0.726,639,271,658,2364,71.6,200,379,1005,,,22,199,34,122,57,38,126,288,266,271,67,199,216,264,199,30,71,176,152,175,280,26
6,2018,1,"Detroit, MI",WIN,Butler,10,79,"Indianapolis, Indiana",240.701156,Arkansas,7,62,"Fayetteville, Arkansas",735.862049,10.55,16.92,489,100,879,1020,733,0.543,1350,287,0.357,805,0.473,2155,436,0.776,562,35,7150,319,642,2763,78.9,9,18,101,209,100,18,25,41,56,92,144,75,45,24,216,14,277,,,206,269,38,58,79,84,130,232,391,1198,35,7150,440,120,859,877,603,0.495,1219,274,0.373,735,0.449,1954,512,0.743,689,278,586,2540,72.6,179,484,1137,,,177,257,248,237,205,175,219,269,302,202,232,229,290,322,259,44,202,282,177,65,66,168,9.85,14.76,501,163,858,1005,736,0.512,1438,269,0.395,681,0.474,2119,540,0.681,793,35,7075,350,700,2819,80.5,17,30,85,28,122,25,23,143,14,124,21,212,42,35,50,291,23,,,138,331,26,38,119,61,120,218,381,1208,35,7075,477,125,870,872,571,0.482,1184,301,0.357,844,0.430,2028,602,0.727,828,394,680,2647,75.6,167,490,1264,,,276,278,269,232,140,120,176,327,227,326,116,279,345,268,341,324,42,323,262,30,56,306
13,2018,3,"Boston, MA",WIN,Texas Tech,3,78,"Lubbock, Texas",1775.455163,Purdue,2,65,"West Lafayette, Indiana",833.087287,9.60,19.38,529,158,936,964,712,0.511,1394,252,0.359,702,0.460,2096,579,0.704,823,37,7450,396,669,2759,74.6,21,10,52,34,46,46,38,151,31,165,119,187,109,48,17,221,13,,,57,300,39,154,24,262,38,262,454,1332,37,7450,377,138,831,806,557,0.45,1238,249,0.323,770,0.401,2008,536,0.703,762,368,766,2397,64.8,222,554,1199,,,39,319,197,116,115,18,232,174,39,261,11,270,313,117,316,290,1,196,16,291,7,245,8.74,23.41,598,180,984,1033,680,0.541,1257,353,0.420,840,0.493,2097,555,0.743,747,37,7425,311,580,2974,80.4,38,3,10,16,22,16,62,44,115,16,2,54,11,47,34,63,52,,,227,161,12,42,146,97,57,211,399,1295,37,7425,443,94,828,907,656,0.455,1442,251,0.332,755,0.413,2197,366,0.689,531,376,688,2431,65.7,190,448,1204,,,189,87,190,284,295,27,332,181,90,235,37,331,25,49,40,299,32,223,24,127,124,252
22,2018,1,"Detroit, MI",WIN,Syracuse,11,57,"Syracuse, New York",354.974599,TCU,6,52,"Fort Worth, Texas",1023.318196,10.02,12.89,395,198,942,832,624,0.466,1340,208,0.318,655,0.417,1995,594,0.736,807,37,7500,433,611,2466,66.6,16,42,267,7,42,203,128,313,63,284,328,243,318,112,11,92,18,,,16,221,148,317,29,277,20,258,460,1375,37,7500,591,116,837,801,512,0.451,1136,289,0.318,910,0.391,2046,469,0.731,642,400,679,2360,63.8,228,469,1237,,,347,235,211,107,45,19,117,310,22,345,5,288,224,279,206,328,43,171,10,307,87,287,9.68,16.38,615,119,836,979,705,0.551,1279,274,0.395,694,0.496,1973,476,0.706,674,33,6725,365,553,2708,82.1,20,21,6,130,158,36,42,30,101,113,22,199,9,127,143,206,131,,,108,91,52,23,136,158,126,214,419,1201,33,6725,511,141,710,894,632,0.501,1261,262,0.376,697,0.457,1958,437,0.731,598,291,612,2487,75.4,206,410,1001,,,318,323,14,271,258,195,261,228,312,133,270,233,157,281,139,73,139,249,256,217,215,22
27,2018,2,"San Diego, CA",WIN,Clemson,5,84,"Clemson, South Carolina",1972.501861,Auburn,4,53,"Auburn, Alabama",1839.712443,9.25,17.17,463,169,946,899,611,0.516,1185,288,0.366,786,0.456,1971,498,0.755,660,35,7050,310,564,2584,73.8,27,17,142,25,40,112,158,125,193,88,92,94,129,129,107,44,153,,,230,122,96,170,196,146,80,198,414,1256,35,7050,401,126,840,826,558,0.439,1272,268,0.352,761,0.406,2033,387,0.701,552,332,617,2307,65.9,199,413,1172,,,78,283,216,163,117,9,267,252,200,246,20,282,56,99,69,197,132,125,29,166,204,215,7.29,15.97,481,180,887,906,582,0.487,1194,324,0.357,908,0.431,2102,647,0.774,836,34,6800,403,672,2783,81.9,54,24,117,16,91,101,205,241,182,32,139,25,258,45,4,18,8,,,43,305,36,27,41,134,61,252,411,1290,34,6800,439,111,866,838,565,0.477,1184,273,0.351,777,0.427,1961,539,0.722,747,341,700,2488,73.2,207,516,1207,,,175,199,262,182,136,96,176,268,196,273,101,237,314,233,310,219,22,250,194,221,26,257
28,2018,2,"Detroit, MI",WIN,Syracuse,11,55,"Syracuse, New York",354.974599,Michigan State,3,53,"East Lansing, Michigan",77.646366,10.02,12.89,395,198,942,832,624,0.466,1340,208,0.318,655,0.417,1995,594,0.736,807,37,7500,433,611,2466,66.6,16,42,267,7,42,203,128,313,63,284,328,243,318,112,11,92,18,,,16,221,148,317,29,277,20,258,460,1375,37,7500,591,116,837,801,512,0.451,1136,289,0.318,910,0.391,2046,469,0.731,642,400,679,2360,63.8,228,469,1237,,,347,235,211,107,45,19,117,310,22,345,5,288,224,279,206,328,43,171,10,307,87,287,7.10,22.41,670,251,1038,991,701,0.552,1271,290,0.400,725,0.496,1996,535,0.747,716,35,7025,401,640,2807,80.2,58,4,2,1,7,32,45,28,108,81,17,156,8,111,58,56,77,,,46,266,31,43,333,273,7,143,459,1439,35,7025,419,88,689,774,522,0.384,1360,252,0.337,748,0.367,2108,471,0.728,647,369,698,2271,64.9,207,349,1058,,,132,53,5,67,54,1,314,190,110,226,1,308,229,272,215,291,24,101,18,221,324,63
37,2018,1,"Nashville, TN",WIN,Florida State,9,67,"Tallahassee, Florida",420.000601,Missouri,8,54,"Columbia, Missouri",360.690149,8.50,15.79,523,186,929,994,721,0.536,1344,273,0.350,781,0.468,2125,543,0.691,786,35,7100,393,652,2806,80.2,40,26,60,13,54,31,31,54,61,116,177,100,63,32,45,262,31,,,60,288,32,44,58,267,42,241,456,1322,35,7100,432,104,845,887,602,0.454,1327,285,0.355,802,0.417,2129,492,0.714,689,383,658,2551,72.9,204,494,1228,,,163,139,229,259,204,22,303,296,220,298,53,315,259,195,259,312,71,287,184,209,52,278,9.19,13.81,443,140,893,794,488,0.505,966,306,0.383,799,0.450,1765,505,0.734,688,33,6650,332,594,2399,72.7,29,37,182,59,81,256,327,168,334,48,39,83,160,290,91,98,111,,,175,191,182,206,278,295,107,170,469,1225,33,6650,376,117,736,795,564,0.454,1243,231,0.327,706,0.408,1949,425,0.725,586,343,649,2246,68.1,217,365,1079,,,36,243,38,99,134,22,241,102,51,157,26,224,133,261,118,225,87,84,63,275,303,90
44,2018,2,"Nashville, TN",WIN,Florida State,9,75,"Tallahassee, Florida",420.000601,Xavier,1,70,"Cincinnati, Ohio",237.640045,8.50,15.79,523,186,929,994,721,0.536,1344,273,0.350,781,0.468,2125,543,0.691,786,35,7100,393,652,2806,80.2,40,26,60,13,54,31,31,54,61,116,177,100,63,32,45,262,31,,,60,288,32,44,58,267,42,241,456,1322,35,7100,432,104,845,887,602,0.454,1327,285,0.355,802,0.417,2129,492,0.714,689,383,658,2551,72.9,204,494,1228,,,163,139,229,259,204,22,303,296,220,298,53,315,259,195,259,312,71,287,184,209,52,278,9.45,19.08,579,109,1002,1011,737,0.559,1319,274,0.372,736,0.492,2055,657,0.779,843,35,7075,340,602,2953,84.4,24,11,17,170,16,22,21,19,75,113,70,146,14,68,1,8,7,,,154,206,15,8,173,223,31,205,441,1342,35,7075,492,127,799,944,643,0.498,1290,301,0.334,902,0.431,2192,427,0.76,562,296,731,2616,74.7,238,400,1095,,,299,287,140,317,274,183,281,327,97,343,120,330,141,348,82,86,8,316,239,326,246,113
47,2018,2,"Charlotte, NC",WIN,Texas A&M,7,86,"College Station, Texas",953.236125,UNC,2,65,"Chapel Hill, North Carolina",111.229961,10.62,15.30,538,209,1021,976,746,0.525,1421,230,0.329,700,0.460,2121,449,0.661,679,35,7000,429,574,2631,75.2,7,29,43,5,11,38,18,85,16,230,290,189,107,34,193,322,126,,,18,148,77,137,214,308,5,190,476,1450,35,7000,512,135,844,892,624,0.457,1365,268,0.326,821,0.408,2186,415,0.688,603,378,616,2467,70.5,227,379,1222,,,319,312,225,266,245,32,317,252,48,311,27,329,108,44,150,301,134,237,127,304,280,275,11.78,20.08,674,151,1059,1105,800,0.510,1568,305,0.359,849,0.457,2417,503,0.743,677,37,7425,511,587,3018,81.6,2,8,1,42,3,5,4,153,3,49,116,49,123,2,98,63,127,,,2,180,9,28,140,238,1,213,445,1570,37,7425,541,176,854,972,615,0.453,1357,357,0.38,940,0.423,2297,410,0.687,597,362,657,2711,73.3,228,433,1216,,,341,351,240,338,230,21,311,351,321,349,75,344,100,40,138,277,75,336,198,307,160,263


In [32]:
X_upsets_rd_1 = rd_1_upsets[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_SOS', 'team_SRS', 
                    'opponent_team_SOS', 'opponent_team_SRS']]
y_upsets_rd_1 = rd_1_upsets['outcome']

In [33]:
classifier.score(X_upsets_rd_1, y_upsets_rd_1)

0.3112676056338028

In [34]:
X_seed_only = model_input_df[['team_seed', 'opponent_seed']]
y_seed_only = model_input_df['outcome']

In [69]:
X_train_seed_only, X_test_seed_only, y_train_seed_only, y_test_seed_only = train_test_split(X_seed_only, y_seed_only, random_state=42, stratify=y_seed_only)

In [70]:
classifier_seed_only = LogisticRegression()
classifier_seed_only.fit(X_train_seed_only, y_train_seed_only)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [71]:
classifier_seed_only.score(X_test_seed_only, y_test_seed_only)

0.7104700854700855

In [72]:
X_upsets_seed_only = upsets[['team_seed', 'opponent_seed']]
y_upsets_seed_only = upsets['outcome']

In [73]:
classifier_seed_only.score(X_upsets_seed_only, y_upsets_seed_only)

0.0

In [40]:
model_input_df['year'].min()

'1993'

In [52]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

In [53]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [54]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [55]:
classifier.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [56]:
classifier.score(X_test_scaled, y_test)

0.7393162393162394

In [57]:
X_upsets = upsets[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_SOS', 'team_SRS', 
                    'opponent_team_SOS', 'opponent_team_SRS']]
y_upsets = upsets['outcome']

In [58]:
X_upsets_scaled = X_scaler.transform(X_upsets)

In [59]:
classifier.score(X_upsets_scaled, y_upsets)

0.3038229376257545

In [67]:
tourns_after_2000 = model_input_df.loc[(model_input_df['year'] == '2000') | (model_input_df['year'] == '2001') |
                                       (model_input_df['year'] == '2002') | (model_input_df['year'] == '2003') |
                                       (model_input_df['year'] == '2004') | (model_input_df['year'] == '2005') |
                                       (model_input_df['year'] == '2006') | (model_input_df['year'] == '2007') |
                                       (model_input_df['year'] == '2008') | (model_input_df['year'] == '2009') |
                                       (model_input_df['year'] == '2010') | (model_input_df['year'] == '2011') |
                                       (model_input_df['year'] == '2012') | (model_input_df['year'] == '2013') |
                                       (model_input_df['year'] == '2014') | (model_input_df['year'] == '2015') |
                                       (model_input_df['year'] == '2016') | (model_input_df['year'] == '2017') |
                                       (model_input_df['year'] == '2018'), :]
len(tourns_after_2000)

2736

In [68]:
X_2000 = tourns_after_2000[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_SOS', 'team_SRS', 
                    'opponent_team_SOS', 'opponent_team_SRS']]
y_2000 = tourns_after_2000['outcome']

In [74]:
X_train_2000, X_test_2000, y_train_2000, y_test_2000 = train_test_split(X_2000, y_2000, random_state=42, stratify=y_2000)


In [75]:
classifier_2000 = LogisticRegression()

In [76]:
classifier.fit(X_train_2000, y_train_2000)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [77]:
classifier.score(X_test_2000, y_test_2000)

0.7587719298245614

In [78]:
upsets_after_2000 = upsets.loc[(upsets['year'] == '2000') | (upsets['year'] == '2001') |
                                       (upsets['year'] == '2002') | (upsets['year'] == '2003') |
                                       (upsets['year'] == '2004') | (upsets['year'] == '2005') |
                                       (upsets['year'] == '2006') | (upsets['year'] == '2007') |
                                       (upsets['year'] == '2008') | (upsets['year'] == '2009') |
                                       (upsets['year'] == '2010') | (upsets['year'] == '2011') |
                                       (upsets['year'] == '2012') | (upsets['year'] == '2013') |
                                       (upsets['year'] == '2014') | (upsets['year'] == '2015') |
                                       (upsets['year'] == '2016') | (upsets['year'] == '2017') |
                                       (upsets['year'] == '2018'), :]
len(upsets)

994

In [79]:
X_upsets_2000 = upsets_after_2000[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 
                                   'team_SOS', 'team_SRS', 'opponent_team_SOS', 'opponent_team_SRS']]
y_upsets_2000 = upsets_after_2000['outcome']

In [80]:
classifier.score(X_upsets_2000, y_upsets_2000)

0.3293010752688172

In [81]:
X = tourns_after_2000[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_SRS', 
                       'opponent_team_SRS']]
y = tourns_after_2000['outcome']

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [83]:
classifier = LogisticRegression()

In [84]:
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [85]:
classifier.score(X_test, y_test)

0.7602339181286549

In [86]:
X_upsets = upsets_after_2000[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_SRS', 
                       'opponent_team_SRS']]
y_upsets = upsets_after_2000['outcome']

In [87]:
classifier.score(X_upsets, y_upsets)

0.3293010752688172

In [88]:
X = tourns_after_2000[['team_travel_miles', 'opponent_travel_miles', 'team_SRS', 'opponent_team_SRS']]
y = tourns_after_2000['outcome']

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [90]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [91]:
classifier.score(X_test, y_test)

0.7573099415204678

In [92]:
X_upsets = upsets_after_2000[['team_travel_miles', 'opponent_travel_miles', 'team_SRS', 'opponent_team_SRS']]
y_upsets = upsets_after_2000['outcome']

In [93]:
classifier.score(X_upsets, y_upsets)

0.25

In [96]:
tourns_after_2000.head(1)

Unnamed: 0,year,round,location,outcome,team_name,team_seed,team_score,team_location,team_travel_miles,opponent_name,opponent_seed,opponent_score,opponent_location,opponent_travel_miles,team_SOS,team_SRS,team_ast,team_blk,team_drb,team_fg,team_fg2,team_fg2_pct,team_fg2a,team_fg3,team_fg3_pct,team_fg3a,team_fg_pct,team_fga,team_ft,team_ft_pct,team_fta,team_g,team_mp,team_orb,team_pf,team_pts,team_pts_per_g,team_rank_SOS,team_rank_SRS,team_rank_ast,team_rank_blk,team_rank_drb,team_rank_fg,team_rank_fg2,team_rank_fg2_pct,team_rank_fg2a,team_rank_fg3,team_rank_fg3_pct,team_rank_fg3a,team_rank_fg_pct,team_rank_fga,team_rank_ft,team_rank_ft_pct,team_rank_fta,team_rank_g,team_rank_mp,team_rank_orb,team_rank_pf,team_rank_pts,team_rank_pts_per_g,team_rank_stl,team_rank_tov,team_rank_trb,team_stl,team_tov,team_trb,opp_g,opp_mp,opp_opp_ast,opp_opp_blk,opp_opp_drb,opp_opp_fg,opp_opp_fg2,opp_opp_fg2_pct,opp_opp_fg2a,opp_opp_fg3,opp_opp_fg3_pct,opp_opp_fg3a,opp_opp_fg_pct,opp_opp_fga,opp_opp_ft,opp_opp_ft_pct,opp_opp_fta,opp_opp_orb,opp_opp_pf,opp_opp_pts,opp_opp_pts_per_g,opp_opp_stl,opp_opp_tov,opp_opp_trb,opp_rank_g,opp_rank_mp,opp_rank_opp_ast,opp_rank_opp_blk,opp_rank_opp_drb,opp_rank_opp_fg,opp_rank_opp_fg2,opp_rank_opp_fg2_pct,opp_rank_opp_fg2a,opp_rank_opp_fg3,opp_rank_opp_fg3_pct,opp_rank_opp_fg3a,opp_rank_opp_fg_pct,opp_rank_opp_fga,opp_rank_opp_ft,opp_rank_opp_ft_pct,opp_rank_opp_fta,opp_rank_opp_orb,opp_rank_opp_pf,opp_rank_opp_pts,opp_rank_opp_pts_per_g,opp_rank_opp_stl,opp_rank_opp_tov,opp_rank_opp_trb,opponent_team_SOS,opponent_team_SRS,opponent_team_ast,opponent_team_blk,opponent_team_drb,opponent_team_fg,opponent_team_fg2,opponent_team_fg2_pct,opponent_team_fg2a,opponent_team_fg3,opponent_team_fg3_pct,opponent_team_fg3a,opponent_team_fg_pct,opponent_team_fga,opponent_team_ft,opponent_team_ft_pct,opponent_team_fta,opponent_team_g,opponent_team_mp,opponent_team_orb,opponent_team_pf,opponent_team_pts,opponent_team_pts_per_g,opponent_team_rank_SOS,opponent_team_rank_SRS,opponent_team_rank_ast,opponent_team_rank_blk,opponent_team_rank_drb,opponent_team_rank_fg,opponent_team_rank_fg2,opponent_team_rank_fg2_pct,opponent_team_rank_fg2a,opponent_team_rank_fg3,opponent_team_rank_fg3_pct,opponent_team_rank_fg3a,opponent_team_rank_fg_pct,opponent_team_rank_fga,opponent_team_rank_ft,opponent_team_rank_ft_pct,opponent_team_rank_fta,opponent_team_rank_g,opponent_team_rank_mp,opponent_team_rank_orb,opponent_team_rank_pf,opponent_team_rank_pts,opponent_team_rank_pts_per_g,opponent_team_rank_stl,opponent_team_rank_tov,opponent_team_rank_trb,opponent_team_stl,opponent_team_tov,opponent_team_trb,opponent_opp_g,opponent_opp_mp,opponent_opp_opp_ast,opponent_opp_opp_blk,opponent_opp_opp_drb,opponent_opp_opp_fg,opponent_opp_opp_fg2,opponent_opp_opp_fg2_pct,opponent_opp_opp_fg2a,opponent_opp_opp_fg3,opponent_opp_opp_fg3_pct,opponent_opp_opp_fg3a,opponent_opp_opp_fg_pct,opponent_opp_opp_fga,opponent_opp_opp_ft,opponent_opp_opp_ft_pct,opponent_opp_opp_fta,opponent_opp_opp_orb,opponent_opp_opp_pf,opponent_opp_opp_pts,opponent_opp_opp_pts_per_g,opponent_opp_opp_stl,opponent_opp_opp_tov,opponent_opp_opp_trb,opponent_opp_rank_g,opponent_opp_rank_mp,opponent_opp_rank_opp_ast,opponent_opp_rank_opp_blk,opponent_opp_rank_opp_drb,opponent_opp_rank_opp_fg,opponent_opp_rank_opp_fg2,opponent_opp_rank_opp_fg2_pct,opponent_opp_rank_opp_fg2a,opponent_opp_rank_opp_fg3,opponent_opp_rank_opp_fg3_pct,opponent_opp_rank_opp_fg3a,opponent_opp_rank_opp_fg_pct,opponent_opp_rank_opp_fga,opponent_opp_rank_opp_ft,opponent_opp_rank_opp_ft_pct,opponent_opp_rank_opp_fta,opponent_opp_rank_opp_orb,opponent_opp_rank_opp_pf,opponent_opp_rank_opp_pts,opponent_opp_rank_opp_pts_per_g,opponent_opp_rank_opp_stl,opponent_opp_rank_opp_tov,opponent_opp_rank_opp_trb
0,2018,1,"Pittsburgh, PA",WIN,Villanova,1,87,"Villanova, Pennsylvania",246.946684,Radford,16,61,"Radford, Virginia",230.496683,10.24,26.64,655,162,1056,1220,756,0.59,1282,464,0.401,1158,0.5,2440,559,0.779,718,40,8075,380,645,3463,86.6,10,1,3,31,4,1,16,4,96,1,13,2,5,1,32,10,73,,,80,273,1,1,28,182,8,259,426,1436,40,8075,532,105,928,1028,756,0.49,1543,272,0.317,858,0.428,2401,479,0.747,641,378,698,2807,70.2,193,512,1306,,,335,148,328,346,346,151,346,265,21,332,107,350,241,334,204,301,24,346,114,138,31,327,-4.08,-2.61,442,105,830,850,573,0.475,1207,277,0.349,793,0.425,2000,445,0.725,614,36,7325,414,596,2422,67.3,265,191,185,184,171,175,219,287,167,106,178,87,285,106,199,132,214,,,32,196,169,312,97,223,93,226,441,1244,36,7325,413,120,818,819,558,0.484,1152,261,0.35,745,0.432,1897,437,0.751,582,317,601,2336,64.9,198,469,1135,,,116,257,177,149,117,131,132,217,188,217,128,172,157,339,113,150,164,149,20,160,87,165


In [113]:
X = tourns_after_2000[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_rank_SOS', 
                       'team_rank_SRS', 'team_rank_ast', 'team_rank_blk', 'team_rank_fg', 
                      'team_rank_fg2', 'team_rank_fg2_pct', 'team_rank_fg2a', 'team_rank_fg3', 'team_rank_fg3_pct', 
                      'team_rank_fg3a', 'team_rank_fg_pct', 'team_rank_fga', 'team_rank_ft', 'team_rank_ft_pct', 
                      'team_rank_fta', 'team_rank_pts', 'team_rank_pts_per_g', 'team_rank_stl', 'team_rank_trb', 
                       'opponent_team_rank_SOS', 'opponent_team_rank_SRS', 'opponent_team_rank_ast', 
                       'opponent_team_rank_blk', 'opponent_team_rank_fg', 'opponent_team_rank_fg2', 
                       'opponent_team_rank_fg2_pct', 'opponent_team_rank_fg2a', 'opponent_team_rank_fg3', 
                       'opponent_team_rank_fg3_pct', 'opponent_team_rank_fg3a', 'opponent_team_rank_fg_pct', 
                       'opponent_team_rank_fga', 'opponent_team_rank_ft', 'opponent_team_rank_ft_pct', 
                       'opponent_team_rank_fta', 'opponent_team_rank_pts', 'opponent_team_rank_pts_per_g', 
                       'opponent_team_rank_stl', 'opponent_team_rank_trb']]
y = tourns_after_2000['outcome']

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [115]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [116]:
classifier.score(X_test, y_test)

0.8026315789473685

In [117]:
X_upsets = upsets_after_2000[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_rank_SOS', 
                       'team_rank_SRS', 'team_rank_ast', 'team_rank_blk', 'team_rank_fg', 
                      'team_rank_fg2', 'team_rank_fg2_pct', 'team_rank_fg2a', 'team_rank_fg3', 'team_rank_fg3_pct', 
                      'team_rank_fg3a', 'team_rank_fg_pct', 'team_rank_fga', 'team_rank_ft', 'team_rank_ft_pct', 
                      'team_rank_fta', 'team_rank_pts', 'team_rank_pts_per_g', 'team_rank_stl', 'team_rank_trb', 
                       'opponent_team_rank_SOS', 'opponent_team_rank_SRS', 'opponent_team_rank_ast', 
                       'opponent_team_rank_blk', 'opponent_team_rank_fg', 'opponent_team_rank_fg2', 
                       'opponent_team_rank_fg2_pct', 'opponent_team_rank_fg2a', 'opponent_team_rank_fg3', 
                       'opponent_team_rank_fg3_pct', 'opponent_team_rank_fg3a', 'opponent_team_rank_fg_pct', 
                       'opponent_team_rank_fga', 'opponent_team_rank_ft', 'opponent_team_rank_ft_pct', 
                       'opponent_team_rank_fta', 'opponent_team_rank_pts', 'opponent_team_rank_pts_per_g', 
                       'opponent_team_rank_stl', 'opponent_team_rank_trb']]
y_upsets = upsets_after_2000['outcome']

In [118]:
classifier.score(X_upsets, y_upsets)

0.5981182795698925

In [130]:
tourn_2018 = model_input_df.loc[(model_input_df['year'] == '2018') & (model_input_df['round'] == 1), :]
len(tourn_2018)

64

In [131]:
X_2018 = tourn_2018[['team_seed', 'team_travel_miles', 'opponent_seed', 'opponent_travel_miles', 'team_rank_SOS', 
                       'team_rank_SRS', 'team_rank_ast', 'team_rank_blk', 'team_rank_fg', 
                      'team_rank_fg2', 'team_rank_fg2_pct', 'team_rank_fg2a', 'team_rank_fg3', 'team_rank_fg3_pct', 
                      'team_rank_fg3a', 'team_rank_fg_pct', 'team_rank_fga', 'team_rank_ft', 'team_rank_ft_pct', 
                      'team_rank_fta', 'team_rank_pts', 'team_rank_pts_per_g', 'team_rank_stl', 'team_rank_trb', 
                       'opponent_team_rank_SOS', 'opponent_team_rank_SRS', 'opponent_team_rank_ast', 
                       'opponent_team_rank_blk', 'opponent_team_rank_fg', 'opponent_team_rank_fg2', 
                       'opponent_team_rank_fg2_pct', 'opponent_team_rank_fg2a', 'opponent_team_rank_fg3', 
                       'opponent_team_rank_fg3_pct', 'opponent_team_rank_fg3a', 'opponent_team_rank_fg_pct', 
                       'opponent_team_rank_fga', 'opponent_team_rank_ft', 'opponent_team_rank_ft_pct', 
                       'opponent_team_rank_fta', 'opponent_team_rank_pts', 'opponent_team_rank_pts_per_g', 
                       'opponent_team_rank_stl', 'opponent_team_rank_trb']]
y_2018 = tourn_2018['outcome']

In [132]:
classifier.score(X_2018, y_2018)

0.859375

In [133]:
predictions = classifier.predict(X_2018)
probabilities = classifier.predict_proba(X_2018)

In [140]:
import pickle

model_filename = 'ncaa_model.pkl'
with open(model_filename, 'wb') as file:  
    pickle.dump(classifier, file)

In [141]:
with open(model_filename, 'rb') as file:  
    restored_model = pickle.load(file)

In [142]:
restored_model.score(X_2018, y_2018)

0.859375

In [159]:
X_2018_test_game = X_2018.iloc[0:1,:]
y_2018_test_game = y_2018.iloc[0]

X_2018_test_game

Unnamed: 0,team_seed,team_travel_miles,opponent_seed,opponent_travel_miles,team_rank_SOS,team_rank_SRS,team_rank_ast,team_rank_blk,team_rank_fg,team_rank_fg2,team_rank_fg2_pct,team_rank_fg2a,team_rank_fg3,team_rank_fg3_pct,team_rank_fg3a,team_rank_fg_pct,team_rank_fga,team_rank_ft,team_rank_ft_pct,team_rank_fta,team_rank_pts,team_rank_pts_per_g,team_rank_stl,team_rank_trb,opponent_team_rank_SOS,opponent_team_rank_SRS,opponent_team_rank_ast,opponent_team_rank_blk,opponent_team_rank_fg,opponent_team_rank_fg2,opponent_team_rank_fg2_pct,opponent_team_rank_fg2a,opponent_team_rank_fg3,opponent_team_rank_fg3_pct,opponent_team_rank_fg3a,opponent_team_rank_fg_pct,opponent_team_rank_fga,opponent_team_rank_ft,opponent_team_rank_ft_pct,opponent_team_rank_fta,opponent_team_rank_pts,opponent_team_rank_pts_per_g,opponent_team_rank_stl,opponent_team_rank_trb
0,1,246.946684,16,230.496683,10,1,3,31,1,16,4,96,1,13,2,5,1,32,10,73,1,1,28,8,265,191,185,184,175,219,287,167,106,178,87,285,106,199,132,214,169,312,97,93


In [160]:
testing_probability = restored_model.predict_proba(X_2018_test_game)

In [161]:
testing_probability

array([[0.00220883, 0.99779117]])

In [162]:
restored_model.decision_function(X_2018_test_game)

array([6.1130788])