In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from source.aggregated_stats import process_details, full_stats
from source.make_train_test import make_teams_target, make_training_data, add_seed, prepare_data

# Show up to 300 columns when a DataFrame is displayed.
# The fully qualified key is required: the short pattern "max_columns" also matches
# "styler.render.max_columns" on pandas >= 1.4 and raises OptionError.
pd.set_option("display.max_columns", 300)

In [2]:
# All raw men's input files live in the same Stage 1 directory.
raw_men_dir = 'data/raw_men/MDataFiles_Stage1/'

# Game-level results: detailed regular season, detailed tournament, compact tournament.
regular_season = raw_men_dir + 'MRegularSeasonDetailedResults.csv'
playoff = raw_men_dir + 'MNCAATourneyDetailedResults.csv'
playoff_compact = raw_men_dir + 'MNCAATourneyCompactResults.csv'

# Tournament seeds and Massey ordinal rankings.
seed = raw_men_dir + 'MNCAATourneySeeds.csv'
rank = raw_men_dir + 'MMasseyOrdinals.csv'

# Directory prefix for the processed CSVs written by the cells below.
save_loc = 'data/processed_men/'

In [None]:
# Read the detailed regular-season results and pass them through process_details
# (project helper imported from source.aggregated_stats), then save that frame to disk.
reg = process_details(pd.read_csv(regular_season))
reg.to_csv(save_loc + 'game_details_regular_extended.csv', index=False)

# Aggregate the processed games with full_stats and print the latest season present
# as a quick check that the newest data made it through.
regular_stats = full_stats(reg)
print(regular_stats['Season'].max())

regular_stats.head()

In [None]:
# Aggregate only the games played on DayNum 118 or later.
# NOTE(review): 118 is presumably the start of the final two weeks of the regular
# season (hence the name) - confirm against the data description.
last2weeks = (
    full_stats(reg.loc[reg['DayNum'] >= 118].copy())
    # Mark every aggregated column as a "last two weeks" feature...
    .add_prefix('L2W_')
    # ...but restore the plain names of the two join keys.
    .rename(columns={'L2W_Season': 'Season', 'L2W_TeamID': 'TeamID'})
)
last2weeks.head()

In [None]:
# Left-join the L2W_ columns onto the season-long stats, so every (Season, TeamID) row of
# regular_stats is kept and rows without a match in last2weeks get NaN in the L2W_ columns.
# Note: this rebinds regular_stats; re-running the cell without rebuilding regular_stats
# first would merge the L2W_ columns a second time (pandas then adds _x/_y suffixes).
regular_stats = regular_stats.merge(last2weeks, how='left', on=['Season', 'TeamID'])
print(regular_stats['Season'].max())

regular_stats.head()

In [None]:
# Read the detailed tournament results and pass them through process_details
# (project helper imported from source.aggregated_stats), then save that frame to disk.
play = process_details(pd.read_csv(playoff))
play.to_csv(save_loc + 'game_details_playoff_extended.csv', index=False)

# Aggregate the processed tournament games with full_stats and print the latest season present.
playoff_stats = full_stats(play)
print(playoff_stats['Season'].max())

playoff_stats.head()

In [None]:
# Read the compact tournament results and hand them to make_teams_target
# (project helper imported from source.make_train_test) for the men's competition.
target_data = make_teams_target(pd.read_csv(playoff_compact), 'men')

# Quick checks: table dimensions and the most recent season covered.
print(target_data.shape)
print(target_data['Season'].max())
target_data.head()

In [None]:
# Pass the seeds CSV path and the regular-season stats to add_seed
# (project helper imported from source.make_train_test); the result has a Seed column.
regular_stats = add_seed(seed, regular_stats)

# Fraction of rows whose Seed is missing.
print(regular_stats['Seed'].isna().mean())
regular_stats.head()

In [None]:
# Pass the seeds CSV path and the tournament stats to add_seed
# (project helper imported from source.make_train_test); the result has a Seed column.
playoff_stats = add_seed(seed, playoff_stats)

# Fraction of rows whose Seed is missing.
print(playoff_stats['Seed'].isna().mean())
playoff_stats.head()

In [None]:
# Combine regular_stats with target_data through make_training_data
# (project helper imported from source.make_train_test).
# NOTE(review): judging by the name this yields the model training table - confirm in the helper.
all_reg = make_training_data(regular_stats, target_data)

# Preview the first rows of the result.
all_reg.head()

# Testing `prepare_data` for both tournaments (men and women)

In [None]:
# Call prepare_data (project helper imported from source.make_train_test) for the men's data.
# NOTE(review): it takes only the league name, so it presumably performs the loading and
# merging steps internally - confirm in the helper.
# This rebinds all_reg, replacing whatever frame the name held before.
all_reg = prepare_data('men')

# Preview the first rows of the result.
all_reg.head()

In [None]:
# Call prepare_data (project helper imported from source.make_train_test) for the women's data.
# This rebinds all_reg again, so the men's result from the previous cell is no longer
# reachable under this name.
all_reg = prepare_data('women')

# Preview the first rows of the result.
all_reg.head()