In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [2]:
teams = pd.read_csv("./data/proj2/teams.csv")
seasons = pd.read_csv("./data/proj2/seasons.csv")

season_compact_results = pd.read_csv("./data/proj2/regular_season_compact_results.csv")
season_detailed_results = pd.read_csv("./data/proj2/regular_season_detailed_results.csv")

tourney_compact_results = pd.read_csv("./data/proj2/tourney_compact_results.csv")
tourney_detailed_results = pd.read_csv("./data/proj2/tourney_detailed_results.csv")

tourney_seeds = pd.read_csv("./data/proj2/tourney_seeds.csv")
tourney_slots = pd.read_csv("./data/proj2/tourney_slots.csv")


In [3]:
teams.head()

Unnamed: 0,team_id,team_name
0,1101,Abilene Chr
1,1102,Air Force
2,1103,Akron
3,1104,Alabama
4,1105,Alabama A&M


In [4]:
seasons.head()

Unnamed: 0,season,dayzero,regionW,regionX,regionY,regionZ
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/02/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast


In [5]:
season_compact_results.head()

Unnamed: 0,season,daynum,wteam,wscore,lteam,lscore,wloc,numot
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [6]:
season_detailed_results.head()

Unnamed: 0,season,daynum,wteam,wscore,lteam,lscore,wloc,numot,wfgm,wfga,...,lfga3,lftm,lfta,lor,ldr,last,lto,lstl,lblk,lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [7]:
tourney_compact_results.head()

Unnamed: 0,season,daynum,wteam,wscore,lteam,lscore,wloc,numot
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [8]:
tourney_detailed_results.head()

Unnamed: 0,season,daynum,wteam,wscore,lteam,lscore,wloc,numot,wfgm,wfga,...,lfga3,lftm,lfta,lor,ldr,last,lto,lstl,lblk,lpf
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [9]:
tourney_seeds.head()

Unnamed: 0,season,seed,team
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [10]:
tourney_slots.head()

Unnamed: 0,season,slot,strongseed,weakseed
0,1985,R1W1,W01,W16
1,1985,R1W2,W02,W15
2,1985,R1W3,W03,W14
3,1985,R1W4,W04,W13
4,1985,R1W5,W05,W12


In [12]:
all_results = pd.concat([season_compact_results, tourney_compact_results]);
all_results

wins = all_results.groupby(['wteam', 'season'], as_index=False)['wscore'].count()
wins.rename(columns = {'wteam':'team', 'wscore':'wins'}, inplace = True)
wins

loses = all_results.groupby(['lteam', 'season'], as_index=False)['lscore'].count()
loses.rename(columns = {'lteam':'team', 'lscore':'loses'}, inplace = True)
loses

variables_per_team = pd.merge(wins, loses, on=['team', 'season'])
variables_per_team['WP'] = variables_per_team['wins'] / (variables_per_team['wins'] + variables_per_team['loses'])
variables_per_team

Unnamed: 0,team,season,wins,loses,WP
0,1101,2014,2,19,0.095238
1,1102,1985,5,19,0.208333
2,1102,1986,6,19,0.240000
3,1102,1987,8,15,0.347826
4,1102,1988,7,17,0.291667
...,...,...,...,...,...
9467,1464,2010,6,22,0.214286
9468,1464,2011,7,21,0.250000
9469,1464,2012,14,15,0.482759
9470,1464,2013,14,15,0.482759


In [14]:
for season in range(19):
    curr_season = all_results[all_results.season == season]
    for index, row in teams.iterrows():
        all_games_by_team = all_results

In [16]:
seasons
variables_per_team[variables_per_team.season ==2011]

Unnamed: 0,team,season,wins,loses,WP
27,1102,2011,14,14,0.500000
57,1103,2011,22,13,0.628571
87,1104,2011,21,11,0.656250
102,1105,2011,10,15,0.400000
132,1106,2011,15,18,0.454545
...,...,...,...,...,...
9348,1460,2011,17,14,0.548387
9378,1461,2011,8,21,0.275862
9408,1462,2011,24,8,0.750000
9438,1463,2011,14,13,0.518519
