In [9]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, confusion_matrix, log_loss
from sklearn.ensemble import RandomForestClassifier

In [0]:
# Folder on the mounted Drive that the notebook's CSV paths are built from.
DATA_FOLDER = '/content/drive/My Drive/Colab Notebooks/kaggle/ncaa'

In [0]:
def generate_mapping(data, swap=False):
    """Build a column-rename mapping from winner/loser names to team-slot names.

    ``WTeamID`` and ``LTeamID`` map to ``T1ID`` and ``T2ID``. Any other column
    whose name starts with ``'W'`` gets that leading character replaced by
    ``'T1'``, and any column starting with ``'L'`` gets it replaced by ``'T2'``.
    With ``swap=True`` the two slots are exchanged. Columns that match none of
    these rules are left out of the mapping.

    Only the leading prefix character is rewritten, so a later ``W`` or ``L``
    inside the column name is preserved (``'LLoc'`` becomes ``'T2Loc'``).

    Args:
        data: Object exposing ``columns``, an iterable of string column names.
        swap: When true, winner columns go to the ``T2`` slot and loser
            columns to the ``T1`` slot.

    Returns:
        dict mapping each renamed original column name to its new name.
    """
    winner_slot, loser_slot = ('T2', 'T1') if swap else ('T1', 'T2')
    id_columns = {'WTeamID': winner_slot + 'ID', 'LTeamID': loser_slot + 'ID'}
    prefix_slots = {'W': winner_slot, 'L': loser_slot}

    mapping = {}
    for col in data.columns:
        if col in id_columns:
            mapping[col] = id_columns[col]
        elif col[:1] in prefix_slots:
            # Swap only the first character; str.replace would also rewrite
            # any later 'W'/'L' in the name.
            mapping[col] = prefix_slots[col[:1]] + col[1:]

    return mapping

In [0]:
def transform_data(data):
    """Convert winner/loser game rows into symmetric T1/T2 rows with a label.

    Each input row produces two output rows: one with the winner in the T1
    slot (``Win == 1``) and one with the loser in the T1 slot (``Win == 0``).
    The ``T1Loc``/``T2Loc`` columns are dropped when present, rows containing
    any NaN are removed, and ``Season``, ``T1ID`` and ``T2ID`` are cast to
    ``int64``.

    Args:
        data: DataFrame with a ``Season`` column and ``W*``/``L*`` columns
            (including ``WTeamID`` and ``LTeamID``).

    Returns:
        A new DataFrame; the input frame is not modified. The original row
        index is kept, so each index label appears twice.
    """
    winner_first = data.rename(columns=generate_mapping(data)).assign(Win=1)
    loser_first = data.rename(columns=generate_mapping(data, swap=True)).assign(Win=0)

    # The two frames can differ in columns (a column present for only one
    # side lands in different slots). sort=False keeps the column order stable
    # and avoids the pandas FutureWarning about sorting non-aligned columns.
    combined = pd.concat([winner_first, loser_first], sort=False)

    # Location columns are discarded; errors='ignore' keeps this working for
    # inputs that have no location column at all.
    combined = combined.drop(columns=['T1Loc', 'T2Loc'], errors='ignore')

    # Drop all rows with at least one NaN.
    combined = combined.dropna(axis='index')

    # Transform identifier columns to proper data types.
    return combined.astype({'Season': 'int64', 'T1ID': 'int64', 'T2ID': 'int64'})

In [0]:
def train_test_split(data, features, test_season=2019):
    """Split the data by season into training and hold-out sets.

    Rows with ``Season`` earlier than ``test_season`` form the training set;
    rows with ``Season`` equal to ``test_season`` form the test set. Rows from
    later seasons are in neither set.

    Args:
        data: DataFrame with ``Season`` and ``Win`` columns plus the feature
            columns.
        features: Column names to use as model inputs.
        test_season: Season held out for evaluation (defaults to 2019).

    Returns:
        Tuple ``(train_x, train_y, test_x, test_y)``.
    """
    train_rows = data[data['Season'] < test_season]
    test_rows = data[data['Season'] == test_season]

    return (
        train_rows[features],
        train_rows['Win'],
        test_rows[features],
        test_rows['Win'],
    )

def train(train_x, train_y):
    """Fit a random forest classifier on the training data and return it.

    The forest uses 200 trees of maximum depth 5 and a fixed random seed (0).
    """
    forest = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
    forest.fit(train_x, train_y)
    return forest

def predict(model, test_x, test_y):
    """Predict on the hold-out set, print evaluation metrics, return outputs.

    Args:
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``.
        test_x: Feature rows to score.
        test_y: True labels for ``test_x``.

    Returns:
        Tuple ``(preds, probs)``: the predicted labels and the per-class
        probabilities from ``predict_proba``.
    """
    preds = model.predict(test_x)
    probs = model.predict_proba(test_x)

    # Pass the model's own class labels explicitly instead of letting the
    # metrics infer them from test_y: log_loss raises when test_y holds a
    # single class, and the confusion matrix would otherwise change shape.
    labels = model.classes_

    # All metrics are computed before anything is printed.
    metrics = [
        ('MSE', mean_squared_error(test_y, preds)),
        ('MAE', mean_absolute_error(test_y, preds)),
        ('Accuracy', accuracy_score(test_y, preds)),
        ('Confusion matrix', confusion_matrix(test_y, preds, labels=labels)),
        ('Log loss', log_loss(test_y, probs, labels=labels)),
    ]

    for name, value in metrics:
        print('{}: {}'.format(name, value))

    return (preds, probs)

In [0]:
def record_exists(src_data, row):
    """Tell whether ``src_data`` has any row with the same T1ID and T2ID as ``row``."""
    same_first_team = src_data.T1ID == row.T1ID
    same_second_team = src_data.T2ID == row.T2ID
    pair_matches = same_first_team & same_second_team
    return pair_matches.any()

In [0]:
def get_similar_data(row, src_data, feature):
    """Collect historical values of ``feature`` relevant to the pairing in ``row``.

    Rows of ``src_data`` are selected with progressively looser criteria,
    stopping at the first one that matches anything:

    1. same ``Season``, ``T1ID`` and ``T2ID`` as ``row``;
    2. same ``T1ID`` and ``T2ID`` in any season;
    3. same ``T2ID`` only when ``feature`` starts with ``'T2'``, otherwise
       same ``T1ID`` only.

    Args:
        row: Object with ``Season``, ``T1ID`` and ``T2ID`` attributes.
        src_data: DataFrame with ``Season``, ``T1ID`` and ``T2ID`` columns.
        feature: Name of the column to extract.

    Returns:
        Series of the selected ``feature`` values (possibly empty). If
        ``src_data`` has no such column, an empty ``float64`` Series.
    """
    if feature not in src_data.columns:
        # Explicit dtype: a bare pd.Series() is deprecated and yields an
        # object-dtype Series on recent pandas.
        return pd.Series(dtype='float64')

    same_t1 = src_data.T1ID == row.T1ID
    same_t2 = src_data.T2ID == row.T2ID

    similar_data = src_data[(src_data.Season == row.Season) & same_t1 & same_t2]
    if len(similar_data) == 0:
        similar_data = src_data[same_t1 & same_t2]
    if len(similar_data) == 0:
        similar_data = src_data[same_t2] if feature.startswith('T2') else src_data[same_t1]

    return similar_data[feature]

def populate_row(row, data_sources, dst_data, feature):
    """Estimate ``feature`` for ``row`` as the mean of similar historical values.

    Values returned by ``get_similar_data`` for every data source are pooled
    and averaged.

    Args:
        row: Row describing the pairing (passed through to ``get_similar_data``).
        data_sources: Iterable of DataFrames to search.
        dst_data: Unused; kept so existing callers keep working.
        feature: Name of the feature to estimate.

    Returns:
        Mean of the pooled values, or NaN when no source yields any value.
    """
    # Gather everything first and concatenate once, rather than growing a
    # Series inside the loop from an untyped empty pd.Series() seed.
    pooled = [
        similar_data
        for similar_data in (
            get_similar_data(row, data_source, feature) for data_source in data_sources
        )
        if len(similar_data) > 0
    ]

    if not pooled:
        return float('nan')

    return pd.concat(pooled).mean()

def populate_data(data_sources, dst_data, features):
    """Return a copy of ``dst_data`` with every missing feature column filled in.

    For each name in ``features`` that is not already a column, a new column
    is computed row by row with ``populate_row``. Existing columns are left
    untouched, and the input frame is not modified.
    """
    populated = dst_data.copy()

    for name in features:
        if name in populated.columns:
            continue

        def fill(row, name=name):
            return populate_row(row, data_sources, populated, name)

        populated[name] = populated.apply(fill, axis=1)

    return populated

In [0]:
# Load the four raw result tables from DATA_FOLDER: regular-season and
# tournament games, each in a detailed and a compact variant.
reg_det_data = pd.read_csv(f'{DATA_FOLDER}/WRegularSeasonDetailedResults.csv')
reg_comp_data = pd.read_csv(f'{DATA_FOLDER}/WRegularSeasonCompactResults.csv')
tourney_det_data = pd.read_csv(f'{DATA_FOLDER}/WNCAATourneyDetailedResults.csv')
tourney_comp_data = pd.read_csv(f'{DATA_FOLDER}/WNCAATourneyCompactResults.csv')

In [41]:
# Convert each raw table to the T1/T2 layout with a Win label; the raw frames
# are kept under their original names.
reg_det_data_t = transform_data(reg_det_data)
tourney_det_data_t = transform_data(tourney_det_data)
reg_comp_data_t = transform_data(reg_comp_data)
tourney_comp_data_t = transform_data(tourney_comp_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [0]:
# Stack the transformed regular-season and tournament detailed results row-wise.
total_data = pd.concat([reg_det_data_t, tourney_det_data_t], axis=0)

In [0]:
# Model inputs: every column except the label ('Win') and 'Season'/'DayNum'.
# Sorted so the column order is the same on every kernel restart; iteration
# order of a set of strings varies between Python processes, which would feed
# the seeded model its columns in a different order from run to run.
features = sorted(set(total_data.columns) - {'Win', 'Season', 'DayNum'})
float_features = [feature for feature in features if feature not in ('T1ID', 'T2ID')]

# Cast every non-identifier feature to float64 in a single step.
total_data = total_data.astype({feature: 'float64' for feature in float_features})

In [0]:
# Split by season into training and hold-out features/labels.
train_x, train_y, test_x, test_y = train_test_split(total_data, features)

In [0]:
# Fit the random forest on the training split.
model = train(train_x, train_y)

In [61]:
# Print evaluation metrics for the hold-out split and keep the predicted
# labels and class probabilities.
preds, probs = predict(model, test_x, test_y)

MSE: 0.03234732824427481
MAE: 0.03234732824427481
Accuracy: 0.9676526717557252
Confusion matrix: [[5073  167]
 [ 172 5068]]
Log loss: 0.26486876705803747


In [62]:
# Feature names paired with their importances, least important first.
sorted(zip(train_x.columns, model.feature_importances_), key=lambda pair: pair[1])

[('T1OR', 0.0),
 ('T2OR', 4.818359833713286e-07),
 ('T2ID', 1.7601812413682168e-06),
 ('T1ID', 2.4712002026378974e-06),
 ('T2FGA3', 2.9028974234939006e-05),
 ('T1FGA3', 0.0001010669236623396),
 ('T1Blk', 0.0001581316843938414),
 ('NumOT', 0.00039601260578857827),
 ('T2Blk', 0.0003972165781300794),
 ('T2FGA', 0.0009257383578958404),
 ('T1FGA', 0.0012211418223881598),
 ('T1Stl', 0.0019471345649651013),
 ('T2Stl', 0.0020814122617121298),
 ('T2TO', 0.0028729596275681767),
 ('T1TO', 0.002912109444225256),
 ('T1FGM3', 0.003096608584296594),
 ('T2FGM3', 0.0032435485211716624),
 ('T2PF', 0.00660628989852077),
 ('T1PF', 0.010987638571654327),
 ('T2FTA', 0.020804650741859523),
 ('T1FTA', 0.02138084480182391),
 ('T1FTM', 0.027797950087596238),
 ('T1DR', 0.028559493306459603),
 ('T2FTM', 0.03405685235915444),
 ('T2DR', 0.03767878407374024),
 ('T1Ast', 0.05553040721556849),
 ('T2Ast', 0.06666630094577575),
 ('T1FGM', 0.09539271032953167),
 ('T2FGM', 0.11861055179570944),
 ('T2Score', 0.207253582245

In [44]:
s_data = pd.read_csv(f'{DATA_FOLDER}/WSampleSubmissionStage2.csv')

# Each submission ID is split on '_' into Season, T1ID and T2ID (in that
# order); the three parts are stored as int64 columns.
id_parts = s_data['ID'].str.split('_', expand=True)
s_data_t = s_data.copy()
s_data_t['Season'] = id_parts[0].astype('int64')
s_data_t['T1ID'] = id_parts[1].astype('int64')
s_data_t['T2ID'] = id_parts[2].astype('int64')

# Submission rows whose (T1ID, T2ID) pairing already appears in total_data.
existing_data = s_data_t[s_data_t.apply(lambda row: record_exists(total_data, row), axis=1)]
# Count against s_data_t: the previous reference to `s_data_exp` is not
# defined anywhere in the notebook and fails on a fresh kernel.
print(f'number of missing records: {len(s_data_t) - len(existing_data)}')

number of missing records: 1339


In [0]:
# Add every model feature the submission frame lacks, each filled with the
# mean of similar rows drawn from the three historical sources.
s_data_p = populate_data([total_data, reg_comp_data_t, tourney_comp_data_t], s_data_t, features)

In [0]:
# Predicted class label for each submission row.
test_preds = model.predict(s_data_p[features])

In [75]:
# Sum of the predicted labels for the submission rows.
test_preds.sum()

987

In [0]:
# Per-class predicted probabilities for each submission row.
test_probs = model.predict_proba(s_data_p[features])

In [0]:
# Store the second probability column (index 1) of each row as the prediction.
s_data_p['Pred'] = [class_probs[1] for class_probs in test_probs]

In [78]:
# Display the stored prediction column.
s_data_p['Pred']

0       0.697634
1       0.841407
2       0.742942
3       0.030861
4       0.545935
          ...   
2011    0.102375
2012    0.451386
2013    0.234762
2014    0.132152
2015    0.620433
Name: Pred, Length: 2016, dtype: float64

In [0]:
# Keep only the ID and Pred columns for the submission.
submission_df = s_data_p[['ID', 'Pred']]

In [0]:
# Write the submission CSV into DATA_FOLDER without the index column.
submission_df.to_csv(f'{DATA_FOLDER}/stage2_submission5.csv', index=False)