## Blocking

In [1]:
# Import packages
import sys
sys.path.append('../')

import datetime
import pandas as pd

import collegebasketball as cbb
cbb.__version__

'0.3'

In [2]:
# Initialize some variables
dataset_names = ['kenpom', 'TRank', 'stats', 'all']
dataset_types = ['season', 'season_blocked', 'march', 'seed']
drop_cols = ['Seed', 'Seed_Fav', 'Seed_Diff']
path = '../Data/Training/'
rows = dict()
data = dict()

# Process each data source
for name in dataset_names:
    
    # Load in datasets
    for dt in dataset_types[0:-1]:
        data[name + dt] = pd.read_csv('{0}{1}_{2}.csv'.format(path, name, dt))
    
    # Apply seed based blocking rule
    df = data[name + 'season'].copy()
    data[name + 'seed'] = df[(df['Seed'].notnull()) & (df['Seed_Fav'].notnull())]
    
    # Drop extra columns
    for dt in dataset_types:
        data[name + dt] = data[name + dt].drop(['Seed', 'Seed_Fav', 'Seed_Diff'], axis=1)
    
    # Save sizes
    rows[name] = [name]
    for dt in dataset_types:
        rows[name].append(len(data[name + dt]))

size_df = pd.DataFrame.from_dict(rows, orient='index').drop(0, axis=1)
size_df.columns = pd.Index(dataset_types)
print('Dataset Sizes:')
size_df

Dataset Sizes:


Unnamed: 0,season,season_blocked,march,seed
kenpom,32563,32563,1152,4801
TRank,21672,21672,761,3275
stats,17889,17889,570,2736
all,17889,17889,570,2736


In [3]:
# Process each datasets
dataset_types.remove('march')
for name in dataset_names:
    print(name + ':')
    for dt in dataset_types:
        
        # See how similar each dataset is to the march data
        mcc, f1 = cbb.covariate_shift(data[name + dt], data[name + 'march'])
        print('   {0}: mcc = {1}, f1 = {2}'.format(dt, mcc, f1))

kenpom:
   season: mcc = 0.6747833696834589, f1 = 0.8333333333333333
   season_blocked: mcc = 0.7198041474182011, f1 = 0.8602150537634408
   seed: mcc = 0.2509409207857019, f1 = 0.5829383886255924
TRank:
   season: mcc = 0.5252302082964219, f1 = 0.7482993197278912
   season_blocked: mcc = 0.6746922520269371, f1 = 0.8376623376623377
   seed: mcc = 0.28826168278729336, f1 = 0.6232876712328766
stats:
   season: mcc = 0.442893395056564, f1 = 0.6956521739130436
   season_blocked: mcc = 0.5136818241297477, f1 = 0.7342995169082125
   seed: mcc = 0.050985399789998005, f1 = 0.4729064039408867
all:
   season: mcc = 0.29310025325672023, f1 = 0.6130653266331659
   season_blocked: mcc = 0.3167620715624992, f1 = 0.6451612903225806
   seed: mcc = 0.28497754047984003, f1 = 0.6100000000000001


In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [5]:
knn =  KNeighborsClassifier()
dt = DecisionTreeClassifier(min_samples_leaf=5)
rf = RandomForestClassifier(n_estimators=100, min_samples_split=5)
log = LogisticRegression(penalty='l1', C=10)

cls = [knn, dt, rf, log]
cl_names = ['KNN', 'Decision Tree', 'Random Forest', 'Logistic Regression']
exclude = ['Favored', 'Underdog', 'Year', 'Label']

In [24]:
train = data['kenpomseason'].copy()
test = data['kenpommarch'].copy()
cols = train.columns.to_series().reset_index(drop=True)
stats = cols[cols.apply(lambda x: 'Rank' not in x and
                                  'Fav' not in x and
                                  'Diff' not in x)].tolist()
rank_cols = cols[cols.apply(lambda x: 'Rank' in x or 'Label' in x or 'Win' in x)].tolist()
value_cols = cols[cols.apply(lambda x: 'Rank' not in x)].tolist()
diff_cols = cols[cols.apply(lambda x: 'Diff' not in x)].tolist()

In [7]:
cbb.evaluate(train, test, exclude, cls, cl_names)

Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.350785,0.238434,0.283898,0.110935,0.706597
1,Decision Tree,0.374172,0.402135,0.38765,0.180783,0.690104
2,Random Forest,0.449704,0.270463,0.337778,0.198683,0.741319
3,Logistic Regression,0.605042,0.256228,0.36,0.285406,0.777778


In [8]:
cbb.evaluate(train[rank_cols], test, ['Label'], cls, cl_names)

Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.352332,0.241993,0.28692,0.113245,0.706597
1,Decision Tree,0.300366,0.291815,0.296029,0.073246,0.661458
2,Random Forest,0.468571,0.291815,0.359649,0.221393,0.746528
3,Logistic Regression,0.612245,0.213523,0.316623,0.261523,0.775174


In [9]:
cbb.evaluate(train[value_cols], test, exclude, cls, cl_names)

Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.342246,0.227758,0.273504,0.100786,0.704861
1,Decision Tree,0.352751,0.3879,0.369492,0.153424,0.677083
2,Random Forest,0.427778,0.274021,0.334056,0.184233,0.733507
3,Logistic Regression,0.596639,0.252669,0.355,0.278765,0.776042


In [25]:
cbb.evaluate(train[diff_cols], test, exclude, cls, cl_names)

Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.373626,0.241993,0.293737,0.130825,0.716146
1,Decision Tree,0.357143,0.355872,0.356506,0.149393,0.686632
2,Random Forest,0.380117,0.231317,0.287611,0.132407,0.720486
3,Logistic Regression,0.605042,0.256228,0.36,0.285406,0.777778


In [11]:
# Extract the actual statistics in this dataset
train = data['kenpomseason'].copy()
test = data['kenpommarch'].copy()
cols = train.columns.drop(exclude).to_series().reset_index(drop=True)
stats = cols[cols.apply(lambda x: 'Rank' not in x and
                                  'Fav' not in x and
                                  'Diff' not in x)]
types = ['', '_Fav', '_Diff']

pca_objects = dict()
train_pca = list()
test_pca = list()

# Apply pca to columns for each stat
for stat in stats:
#     print(stat)
    
    for t in types:
        
        if stat not in ['Win_Loss', 'AdjEM']:
            stat_columns = [stat + t, stat + ' Rank' + t]
        elif stat == 'Win_Loss':
            stat_columns = [stat + t]
        else:
            stat_columns = ['AdjEM' + t, 'Rank' + t]
        
        pca = PCA(n_components=1)
        pca_objects[stat] = pca
        
#         print(stat + t + '\t' + str(list(stat_columns)))

        train_pca.append(pd.DataFrame(pca.fit_transform(data['kenpomseason'][stat_columns]), columns=[stat + t]))
        test_pca.append(pd.DataFrame(pca.transform(data['kenpommarch'][stat_columns]), columns=[stat + t]))

In [127]:
train = pd.concat(train_pca, axis=1)
test = pd.concat(test_pca, axis=1)
train['Label'] = data['kenpomseason']['Label']
test['Label'] = data['kenpommarch']['Label']

# cbb.evaluate(train, test, ['Label'], cls, cl_names)

In [12]:
train = data['kenpomseason']
test = data['kenpommarch']
print('Kenpom Data:')
cbb.evaluate(train, test, exclude, cls, cl_names)

Kenpom Data:


Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.350785,0.238434,0.283898,0.110935,0.706597
1,Decision Tree,0.375,0.405694,0.389744,0.182748,0.690104
2,Random Forest,0.465909,0.291815,0.358862,0.219506,0.74566
3,Logistic Regression,0.605042,0.256228,0.36,0.285406,0.777778


In [13]:
print('Kenpom Seed:')
train = data['kenpomseed']
cbb.evaluate(train, test, exclude, cls, cl_names)

Kenpom Seed:


Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.375,0.27758,0.319018,0.143272,0.710938
1,Decision Tree,0.339223,0.341637,0.340426,0.126637,0.677083
2,Random Forest,0.467005,0.327402,0.384937,0.235931,0.744792
3,Logistic Regression,0.542484,0.295374,0.382488,0.272073,0.767361


In [14]:
train = data['TRankseason']
test = data['TRankmarch']
print('T-Rank Data:')
cbb.evaluate(train, test, exclude, cls, cl_names)

T-Rank Data:


Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.296,0.198925,0.237942,0.053216,0.688568
1,Decision Tree,0.336634,0.365591,0.350515,0.128998,0.668857
2,Random Forest,0.382114,0.252688,0.304207,0.140691,0.717477
3,Logistic Regression,0.362637,0.177419,0.238267,0.101386,0.722733


In [15]:
print('T-Rank Seed:')
train = data['TRankseed']
cbb.evaluate(train, test, exclude, cls, cl_names)

T-Rank Seed:


Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.309524,0.27957,0.293785,0.080642,0.671485
1,Decision Tree,0.325688,0.38172,0.351485,0.119831,0.655716
2,Random Forest,0.414286,0.311828,0.355828,0.187685,0.724047
3,Logistic Regression,0.358974,0.225806,0.277228,0.113625,0.712221


In [16]:
train = data['statsseason']
test = data['statsmarch']
print('Stats Data:')
cbb.evaluate(train, test, exclude, cls, cl_names)

Stats Data:


Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.342105,0.181818,0.237443,0.08254,0.707018
1,Decision Tree,0.302198,0.384615,0.338462,0.081078,0.622807
2,Random Forest,0.363636,0.111888,0.171123,0.075228,0.72807
3,Logistic Regression,0.5,0.076923,0.133333,0.11514,0.749123


In [17]:
print('Stats Seed:')
train = data['statsseed']
cbb.evaluate(train, test, exclude, cls, cl_names)

Stats Seed:


Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.290076,0.265734,0.277372,0.049394,0.652632
1,Decision Tree,0.259459,0.335664,0.292683,0.013723,0.592982
2,Random Forest,0.37037,0.13986,0.203046,0.089168,0.724561
3,Logistic Regression,0.5,0.230769,0.315789,0.207952,0.749123


In [20]:
train = data['allseason'].drop(['Seed_Diff_x', 'Seed_Diff_y'], axis=1)
test = data['allmarch'].drop(['Seed_Diff_x', 'Seed_Diff_y'], axis=1)
print('All:')
cbb.evaluate(train, test, exclude, cls, cl_names)

All:


Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.363636,0.251748,0.297521,0.119248,0.701754
1,Decision Tree,0.320988,0.363636,0.340984,0.101907,0.647368
2,Random Forest,0.421053,0.27972,0.336134,0.175552,0.722807
3,Logistic Regression,0.388235,0.230769,0.289474,0.132643,0.715789


In [21]:
train = data['allseed'].drop(['Seed_Diff_x', 'Seed_Diff_y'], axis=1)
print('All:')
cbb.evaluate(train, test, exclude, cls, cl_names)

All:


Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.319328,0.265734,0.290076,0.081106,0.673684
1,Decision Tree,0.322368,0.342657,0.332203,0.099444,0.654386
2,Random Forest,0.417476,0.300699,0.349593,0.180478,0.719298
3,Logistic Regression,0.361111,0.272727,0.310757,0.122942,0.696491


In [31]:
train = data['allseason'].copy().drop(['Seed_Diff_x', 'Seed_Diff_y'], axis=1)
test = data['allmarch'].copy().drop(['Seed_Diff_x', 'Seed_Diff_y'], axis=1)
cols = train.columns.to_series().reset_index(drop=True)
stats = cols[cols.apply(lambda x: 'Rank' not in x and
                                  'Fav' not in x and
                                  'Diff' not in x)].tolist()
rank_cols = cols[cols.apply(lambda x: 'Rank' in x or 'Label' in x or 'Win' in x)].tolist()
value_cols = cols[cols.apply(lambda x: 'Rank' not in x)].tolist()
diff_cols = cols[cols.apply(lambda x: 'Diff' not in x)].tolist()

In [32]:
cbb.evaluate(train, test, exclude, cls, cl_names)

Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.363636,0.251748,0.297521,0.119248,0.701754
1,Decision Tree,0.310127,0.342657,0.325581,0.084636,0.64386
2,Random Forest,0.397727,0.244755,0.30303,0.144739,0.717544
3,Logistic Regression,0.388235,0.230769,0.289474,0.132643,0.715789


In [38]:
cbb.evaluate(train[rank_cols], test, ['Label'], cls, cl_names)

Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.368932,0.265734,0.308943,0.12789,0.701754
1,Decision Tree,0.323741,0.314685,0.319149,0.095449,0.663158
2,Random Forest,0.388889,0.244755,0.300429,0.137851,0.714035
3,Logistic Regression,0.553191,0.181818,0.273684,0.20905,0.757895


In [33]:
cbb.evaluate(train[value_cols], test, exclude, cls, cl_names)

Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.37,0.258741,0.304527,0.126747,0.703509
1,Decision Tree,0.337748,0.356643,0.346939,0.120296,0.663158
2,Random Forest,0.407407,0.230769,0.294643,0.146953,0.722807
3,Logistic Regression,0.464789,0.230769,0.308411,0.186126,0.740351


In [36]:
cbb.evaluate(train[diff_cols], test, exclude, cls, cl_names)

Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.375,0.251748,0.301255,0.128852,0.707018
1,Decision Tree,0.338028,0.335664,0.336842,0.115794,0.668421
2,Random Forest,0.380435,0.244755,0.297872,0.13111,0.710526
3,Logistic Regression,0.395062,0.223776,0.285714,0.135363,0.719298


In [37]:
only_value_cols = cols[cols.apply(lambda x: 'Diff' not in x and
                                            'Rank' not in x)].tolist()
cbb.evaluate(train[only_value_cols], test, exclude, cls, cl_names)

Unnamed: 0,Classifier,Precision,Recall,F1,MCC,Accuracy
0,KNN,0.391753,0.265734,0.316667,0.147158,0.712281
1,Decision Tree,0.372263,0.356643,0.364286,0.157498,0.687719
2,Random Forest,0.397436,0.216783,0.280543,0.134608,0.721053
3,Logistic Regression,0.428571,0.188811,0.262136,0.144488,0.733333


In [70]:
from sklearn.metrics import matthews_corrcoef, f1_score, precision_score, recall_score, accuracy_score
import numpy as np

In [49]:
[1, 2, 3].remove(3)

In [77]:
def rank_features(train, test, exclude, model):
    
    # Intialize some variables
    curr_features = list()
    feature_metrics = list()
    curr_metrics = {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
    feature_names = [x for x in train.columns if x not in exclude]
    
    for _ in range(len(feature_names)):
        #print('Outer loop:')
        
        max_diff = 0
        max_metric = None
        
        # Test a model with the current features plus one new feature
        for feature in feature_names:
            
            #print('\t' + feature + ':')
            
            model.fit(train[curr_features + [feature]], train[['Label']].values.ravel())
            predictions = model.predict(test[curr_features + [feature]])
            
            metrics = {'precision': precision_score(test['Label'], predictions),
                       'recall': recall_score(test['Label'], predictions), 
                       'f1': f1_score(test['Label'], predictions)}
            metrics['diff'] = np.prod([metrics[key] - curr_metrics[key] for key in metrics.keys()])
            
            #print('\t' + str(metrics))

            if metrics['diff'] > max_diff:
                max_diff = metrics['diff']
                max_metric = feature
                max_values = metrics
                
                #print('\t\tnew max')
            
        # If no additional feature improve the score, stop
        if max_metric is None:
             break
        else:
            curr_features.append(max_metric)
            curr_metrics = metrics
            feature_names.remove(feature)
            feature_metrics.append(max_values)
    
    return curr_features, feature_metrics

In [78]:
curr_features, feature_metrics = rank_features(train, test, exclude, cls[-1])
curr_features, feature_metrics

(['WAB_Diff',
  'OppO Rank_Diff',
  'AST_Fav',
  'Barthag',
  'NCSOS AdjEM Rank',
  'Rank_Diff',
  'WAB Rank',
  'AdjDE_Fav',
  'AdjT Rank',
  'FTR',
  'Rank_Fav',
  'AdjT Rank_Diff',
  'Rank',
  'OppO_Fav',
  'STL',
  'AdjO Rank_Fav',
  'Rk_Fav',
  'Rk_Fav',
  'Rank_Diff',
  'OppD Rank_Fav',
  'DRB Rank_Fav',
  'Rank_Fav',
  'Win_Loss_Diff',
  'AdjO Rank_Fav',
  'Win_Loss_Diff',
  'Rank_Fav'],
 [{'precision': 0.328125,
   'recall': 0.14685314685314685,
   'f1': 0.20289855072463767,
   'diff': 0.0097769078747339602},
  {'precision': 0.38571428571428573,
   'recall': 0.1888111888111888,
   'f1': 0.25352112676056338,
   'diff': 0.018463226913931138},
  {'precision': 0.40298507462686567,
   'recall': 0.1888111888111888,
   'f1': 0.25714285714285712,
   'diff': 0.00014095138394961068},
  {'precision': 0.38157894736842107,
   'recall': 0.20279720279720279,
   'f1': 0.26484018264840181,
   'diff': 4.409818884264691e-06},
  {'precision': 0.41428571428571431,
   'recall': 0.20279720279720279,
