In [33]:
import os
import re
import time
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

from itertools import permutations
import requests
import json

# Remove pandas' display truncation: every row and every column of a displayed
# frame is rendered. Keep previews small (e.g. `.head()`) to avoid huge outputs.
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [34]:
# Load the men's teams CSV (path is resolved relative to the current working
# directory). Later cells read its 'SR key' column to enumerate schools.
mens_path = os.path.abspath('../../data/mens_teams.csv')
mens_df = pd.read_csv(mens_path)
mens_df.head()

Unnamed: 0,School,"City, State",SR key,NCAA key,NCAA School,NCAA Name,background-color
0,Abilene Christian,"Abilene, Texas",abilene-christian,abilene-christian,Abilene Christian,Abilene Christian University,#582C83
1,Air Force,"USAF Academy, Colorado",air-force,air-force,Air Force,Air Force Academy,#0032A0
2,Akron,"Akron, Ohio",akron,akron,Akron,University of Akron,#0F192B
3,Alabama,"Tuscaloosa, Alabama",alabama,alabama,Alabama,University of Alabama,#9D2235
4,Alabama A&M,"Normal, Alabama",alabama-am,alabama-am,Alabama A&M,Alabama A&M University,#862633


In [35]:
# Load the women's teams CSV (path is resolved relative to the current working
# directory). Later cells read its 'SR key' column to enumerate schools.
womens_path = os.path.abspath('../../data/womens_teams.csv')
womens_df = pd.read_csv(womens_path)
womens_df.head()

Unnamed: 0,School,"City, State",SR key,NCAA key,NCAA School,NCAA Name,background-color
0,Abilene Christian,"Abilene, Texas",abilene-christian,abilene-christian,Abilene Christian,Abilene Christian University,#582C83
1,Air Force,"USAF Academy, Colorado",air-force,air-force,Air Force,Air Force Academy,#0032A0
2,Akron,"Akron, Ohio",akron,akron,Akron,University of Akron,#0F192B
3,Alabama,"Tuscaloosa, Alabama",alabama,alabama,Alabama,University of Alabama,#9D2235
4,Alabama A&M,"Normal, Alabama",alabama-am,alabama-am,Alabama A&M,Alabama A&M University,#862633


In [36]:
# School keys used to build sports-reference.com URLs and on-disk paths.
# Read the column directly instead of iterating row by row: it is faster and
# raises a KeyError if the 'SR key' column is missing, rather than silently
# producing a list of None values.
MENS_SR_SCHOOL_KEYS = mens_df['SR key'].tolist()
WOMENS_SR_SCHOOL_KEYS = womens_df['SR key'].tolist()

def get_gamelog_basic_url(school_key, season, isWomens = False):
    """Return the sports-reference.com URL of a school's basic gamelog page.

    Args:
        school_key: School key placed in the URL path.
        season: Season identifier placed in the URL (string or int).
        isWomens: When truthy, point at the women's page; otherwise the men's.

    Returns:
        The URL as a string.
    """
    if isWomens:
        gender = 'women'
    else:
        gender = 'men'
    return f'https://www.sports-reference.com/cbb/schools/{school_key}/{gender}/{season}-gamelogs.html'

def get_gamelog_advanced_url(school_key, season, isWomens = False):
    """Return the sports-reference.com URL of a school's advanced gamelog page.

    Args:
        school_key: School key placed in the URL path.
        season: Season identifier placed in the URL (string or int).
        isWomens: When truthy, point at the women's page; otherwise the men's.

    Returns:
        The URL as a string.
    """
    if isWomens:
        gender = 'women'
    else:
        gender = 'men'
    return f'https://www.sports-reference.com/cbb/schools/{school_key}/{gender}/{season}-gamelogs-advanced.html'

def get_team_season_file_path(school_key, season, filename, isWomens = False):
    """Return the absolute path of a per-team, per-season data file.

    The path has the form ``../../data/seasons/{season}/{school_key}/{men|women}/{filename}``
    resolved against the current working directory. As a side effect the
    containing directory is created if it does not exist yet, so the returned
    path can be opened for writing immediately. The file itself is not created.

    Args:
        school_key: School key used as a directory name.
        season: Season identifier used as a directory name (string or int).
        filename: Name of the file inside the team/season directory.
        isWomens: When truthy use the 'women' sub-directory, otherwise 'men'.

    Returns:
        The absolute file path as a string.
    """
    gender = 'women' if isWomens else 'men'
    file_path = os.path.abspath(f'../../data/seasons/{season}/{school_key}/{gender}/{filename}')

    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    return file_path

## Download gamelogs HTML
Download the basic and advanced gamelog HTML pages for each team.

In [37]:
def download_gamelog(school_key, season, isWomens = False, delay = 3, timeout = 30):
    """Download a school's basic and advanced gamelog pages and save them as HTML files.

    Both pages are fetched first (pausing before each request), then written to
    ``{school_key}_basic.html`` and ``{school_key}_advanced.html`` in the
    team/season directory returned by ``get_team_season_file_path``.

    Args:
        school_key: School key used in the URLs and file names.
        season: Season identifier used in the URLs and directory path.
        isWomens: When truthy download the women's pages, otherwise the men's.
        delay: Seconds to sleep before each request (default 3, as before).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run forever.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
        UnicodeDecodeError: If a response body is not valid UTF-8.

    NOTE(review): the HTTP status code is not checked, so error pages are saved
    as-is; presumably the parsing step skips pages without a table - confirm.
    """
    targets = [
        (get_gamelog_basic_url(school_key, season, isWomens), f'{school_key}_basic.html'),
        (get_gamelog_advanced_url(school_key, season, isWomens), f'{school_key}_advanced.html'),
    ]

    # Fetch everything before touching the disk, matching the original ordering.
    pages = []
    for url, filename in targets:
        time.sleep(delay)  # pause between requests
        pages.append((filename, requests.get(url, timeout=timeout).content))

    for filename, html in pages:
        file_path = get_team_season_file_path(school_key, season, filename, isWomens)
        # The body is decoded as UTF-8, so write it back as UTF-8 explicitly
        # instead of relying on the platform's default encoding.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(html.decode('utf-8'))

def download_gamelogs_for_single_season(season, isWomens = False):
    """Download the gamelog pages of every known school for one season.

    Iterates over the women's or men's school-key list (depending on
    ``isWomens``) with a tqdm progress bar and downloads each school in turn.
    """
    if isWomens:
        school_keys = WOMENS_SR_SCHOOL_KEYS
    else:
        school_keys = MENS_SR_SCHOOL_KEYS

    progress = tqdm(school_keys, unit=f'school ({season})')
    for key in progress:
        download_gamelog(key, season, isWomens)

def download_gamelogs(seasons, isWomens = False):
    """Download the gamelog pages of every known school for each given season.

    Args:
        seasons: Iterable of season identifiers, processed in order.
        isWomens: Forwarded unchanged to the per-season download.
    """
    for current_season in seasons:
        download_gamelogs_for_single_season(current_season, isWomens)

## Change Opp name column to Opp key
The opponent name shown in a gamelog does not always match the school name saved in our team data, so we re-parse the tables and use the opponent's SR key instead.

In [38]:
def get_opposing_school_keys(school_key, season, advanced = False, isWomens = False):
    """Extract the opponent school key of every game row in a saved gamelog page.

    Reads ``{school_key}_basic.html`` (or ``_advanced.html`` when ``advanced``
    is truthy) from the team/season directory, takes the first ``<table>`` and
    walks its rows after the first two (header) rows. For each row the key is
    the path segment following ``/schools/`` in the link of the third ``<td>``.

    Args:
        school_key: School whose saved gamelog page is parsed.
        season: Season identifier of the saved page.
        advanced: Parse the advanced page instead of the basic one.
        isWomens: Use the women's files; any ``_w`` in a key is then removed.

    Returns:
        A list with one entry per game row, in table order. Rows whose opponent
        has no usable link contribute ``''`` so the list stays aligned with the
        table rows; rows with fewer than three ``<td>`` cells (repeated header
        rows) are skipped.
    """
    opp_school_keys = []
    log_kind = 'advanced' if advanced else 'basic'
    html_file_path = get_team_season_file_path(school_key, season, f'{school_key}_{log_kind}.html', isWomens)

    with open(html_file_path, 'r') as file:
        soup = BeautifulSoup(file, 'html.parser')

    table = soup.find("table")
    rows = table.find_all('tr')

    for row in rows[2:]:
        cells = row.find_all('td')
        if len(cells) < 3:
            # repeating header row
            continue

        anchor = cells[2].find('a')
        href = anchor.get('href') if anchor is not None else None
        match = re.search(r'/schools/([^/]+)/', href) if href else None
        if match is None:
            # Opponent is missing, or its link has no '/schools/<key>/' part.
            # Keep a placeholder so the result stays aligned with the table rows
            # (previously a non-matching link raised an uncaught AttributeError).
            opp_school_keys.append('')
            continue

        key = match.group(1)
        if isWomens and '_w' in key:
            key = key.replace('_w', '')
        opp_school_keys.append(key)

    return opp_school_keys

## Basic gamelog CSV
Extract the basic gamelog to csv

In [39]:
def create_basic_gamelog(school_key, season, isWomens = False):
    """Convert a saved basic gamelog HTML page into ``{school_key}_basic.csv``.

    Steps: read the first table of the page, drop the 'Opponent' column group,
    flatten the two-level header to its second level, name the location column,
    drop the remaining unnamed columns and 'G', remove repeated header rows, and
    replace the opponent-name column (the first 'Opp' column) with the opponent
    school keys parsed from the same page under the name 'Opp key'.

    Args:
        school_key: School whose saved page is converted.
        season: Season identifier of the saved page.
        isWomens: Use the women's files when truthy.

    Raises:
        ValueError: From ``pd.read_html`` when the page cannot be parsed into a
            table, or when no 'Opp' column is present.
        AssertionError: If the number of game rows differs from the number of
            parsed opponent keys.
    """
    file_path = get_team_season_file_path(school_key, season, f'{school_key}_basic.html', isWomens)

    team_df = pd.read_html(file_path)[0]

    # drop the opponent's columns (top-level header contains 'Opponent')
    opponent_columns = [column for column in team_df.columns if 'Opponent' in column[0]]
    team_df = team_df.drop(opponent_columns, axis=1)

    # Use second level column names
    team_df.columns = [column[1] for column in team_df.columns]

    # rename to location column
    team_df = team_df.rename(columns={'Unnamed: 2_level_1': 'Location'})

    # remove unneeded columns
    unneeded_columns = [column for column in team_df.columns if 'Unnamed' in column] + ['G']
    team_df = team_df.drop(unneeded_columns, axis=1)

    # Drop repeating header rows; copy so the later column assignment works on
    # an independent frame rather than on a filtered view.
    is_game_row = (team_df['Tm'] != 'Tm') & (team_df['FG'] != 'School')
    team_df = team_df[is_game_row].copy()

    # Two columns are called 'Opp' (opponent name, then opponent score). Rename
    # only the first one by assigning a fresh list of labels: pandas Index
    # objects are immutable, so writing through `columns.values` is unsupported.
    column_names = team_df.columns.to_list()
    column_names[column_names.index('Opp')] = 'Opp key'
    team_df.columns = column_names

    # Opp names to Opp keys
    opp_school_keys = get_opposing_school_keys(school_key, season, False, isWomens)
    # Shape for both must match same rows
    assert team_df.shape[0] == len(opp_school_keys), (
        f'{school_key} {season}: {team_df.shape[0]} game rows but {len(opp_school_keys)} opponent keys'
    )
    team_df['Opp key'] = opp_school_keys

    # save file
    csv_file_path = get_team_season_file_path(school_key, season, f'{school_key}_basic.csv', isWomens)
    team_df.to_csv(csv_file_path, index=False)

def create_basic_gamelogs_for_single_season(season, isWomens = False):
    """Build the basic gamelog CSV of every known school for one season.

    Schools whose conversion raises ``ValueError`` are skipped silently
    (presumably pages without a parsable table - confirm); any other exception
    propagates.
    """
    if isWomens:
        school_keys = WOMENS_SR_SCHOOL_KEYS
    else:
        school_keys = MENS_SR_SCHOOL_KEYS

    for key in tqdm(school_keys, unit=f'school ({season} basic csv)'):
        try:
            create_basic_gamelog(key, season, isWomens)
        except ValueError:
            # best effort: move on to the next school
            pass

def create_basic_gamelogs(seasons, isWomens = False):
    """Build the basic gamelog CSVs of every known school for each given season.

    Args:
        seasons: Iterable of season identifiers, processed in order.
        isWomens: Forwarded unchanged to the per-season step.
    """
    for current_season in seasons:
        create_basic_gamelogs_for_single_season(current_season, isWomens)

In [40]:
# create_basic_gamelogs(['2020', '2021', '2022', '2023'], isWomens=True)

Quick visual verification:

In [41]:
# Spot-check: read back one women's basic gamelog CSV and preview its first rows.
# The CSV must already exist on disk (the generating call above is commented out).
school_key, season = 'connecticut', 2020
file_path = get_team_season_file_path(school_key, season, f'{school_key}_basic.csv', isWomens=True)
team_df = pd.read_csv(file_path)
team_df.head()

Unnamed: 0,Date,Location,Opp key,W/L,Tm,Opp,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,2019-11-10,,california,W,72,61,27,65,0.415,5,15,0.333,13,21,0.619,9,38,12,10,6,10,12
1,2019-11-13,@,vanderbilt,W,64,51,27,65,0.415,6,23,0.261,4,7,0.571,11,32,17,16,5,15,9
2,2019-11-17,@,temple,W,83,54,31,63,0.492,8,22,0.364,13,16,0.813,11,34,21,7,6,11,8
3,2019-11-19,,virginia,W,83,44,30,59,0.508,9,23,0.391,14,22,0.636,9,37,21,11,7,10,13
4,2019-11-24,@,ohio-state,W,73,62,29,61,0.475,8,21,0.381,7,8,0.875,8,39,19,7,4,14,9


## Advanced gamelog CSV
Extract the advanced gamelog to csv

In [42]:
def create_advanced_gamelog(school_key, season, isWomens = False):
    """Convert a saved advanced gamelog HTML page into ``{school_key}_advanced.csv``.

    Steps: read the first table of the page, drop the 'Defensive Four Factors'
    column group, flatten the two-level header to its second level, name the
    location column, drop the remaining unnamed columns and 'G', remove repeated
    header rows, and replace the opponent-name column (the first 'Opp' column)
    with the opponent school keys parsed from the same page under the name
    'Opp key'.

    Args:
        school_key: School whose saved page is converted.
        season: Season identifier of the saved page.
        isWomens: Use the women's files when truthy.

    Raises:
        ValueError: From ``pd.read_html`` when the page cannot be parsed into a
            table, or when no 'Opp' column is present.
        AssertionError: If the number of game rows differs from the number of
            parsed opponent keys.
    """
    file_path = get_team_season_file_path(school_key, season, f'{school_key}_advanced.html', isWomens)

    team_df = pd.read_html(file_path)[0]

    # drop columns from 'Defensive Four Factors'
    defensive_columns = [column for column in team_df.columns if 'Defensive' in column[0]]
    team_df = team_df.drop(defensive_columns, axis=1)

    # Use second level column names
    team_df.columns = [column[1] for column in team_df.columns]

    # rename to location column
    team_df = team_df.rename(columns={'Unnamed: 2_level_1': 'Location'})

    # remove unneeded columns
    unneeded_columns = [column for column in team_df.columns if 'Unnamed' in column] + ['G']
    team_df = team_df.drop(unneeded_columns, axis=1)

    # Drop repeating header rows; copy so the later column assignment works on
    # an independent frame rather than on a filtered view.
    is_game_row = (team_df['Tm'] != 'Tm') & (team_df['eFG%'] != 'Offensive Four Factors')
    team_df = team_df[is_game_row].copy()

    # Two columns are called 'Opp' (opponent name, then opponent score). Rename
    # only the first one by assigning a fresh list of labels: pandas Index
    # objects are immutable, so writing through `columns.values` is unsupported.
    column_names = team_df.columns.to_list()
    column_names[column_names.index('Opp')] = 'Opp key'
    team_df.columns = column_names

    # Opp names to Opp keys
    opp_school_keys = get_opposing_school_keys(school_key, season, True, isWomens)
    # Shape for both must match same rows
    assert team_df.shape[0] == len(opp_school_keys), (
        f'{school_key} {season}: {team_df.shape[0]} game rows but {len(opp_school_keys)} opponent keys'
    )
    team_df['Opp key'] = opp_school_keys

    # save file
    csv_file_path = get_team_season_file_path(school_key, season, f'{school_key}_advanced.csv', isWomens)
    team_df.to_csv(csv_file_path, index=False)

def create_advanced_gamelogs_for_single_season(season, isWomens = False):
    """Build the advanced gamelog CSV of every known school for one season.

    Schools whose conversion raises ``ValueError`` are skipped silently
    (presumably pages without a parsable table - confirm); any other exception
    propagates.
    """
    if isWomens:
        school_keys = WOMENS_SR_SCHOOL_KEYS
    else:
        school_keys = MENS_SR_SCHOOL_KEYS

    for key in tqdm(school_keys, unit=f'school ({season} advanced csv)'):
        try:
            create_advanced_gamelog(key, season, isWomens)
        except ValueError:
            # best effort: move on to the next school
            pass

def create_advanced_gamelogs(seasons, isWomens = False):
    """Build the advanced gamelog CSVs of every known school for each given season.

    Args:
        seasons: Iterable of season identifiers, processed in order.
        isWomens: Forwarded unchanged to the per-season step.
    """
    for current_season in seasons:
        create_advanced_gamelogs_for_single_season(current_season, isWomens)

In [43]:
# create_advanced_gamelogs(['2020', '2021', '2022', '2023'], isWomens=True)

Quick visual verification:

In [44]:
# Spot-check: read back one women's advanced gamelog CSV and preview its first rows.
# The CSV must already exist on disk (the generating call above is commented out).
school_key, season = 'connecticut', 2020
file_path = get_team_season_file_path(school_key, season, f'{school_key}_advanced.csv', isWomens=True)
team_df = pd.read_csv(file_path)
team_df.head()

Unnamed: 0,Date,Location,Opp key,W/L,Tm,Opp,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA
0,2019-11-10,,california,W,72,61,96.7,81.9,74.4,0.323,0.231,0.48,50.7,44.4,13.4,13.3,0.454,11.8,25.7,0.2
1,2019-11-13,@,vanderbilt,W,64,51,87.3,69.6,73.0,0.108,0.354,0.468,48.5,63.0,21.8,10.4,0.462,18.0,31.4,0.062
2,2019-11-17,@,temple,W,83,54,119.9,78.0,69.2,0.254,0.349,0.588,56.7,67.7,10.1,17.1,0.556,13.5,36.7,0.206
3,2019-11-19,,virginia,W,83,44,117.6,62.4,70.6,0.373,0.39,0.598,54.4,70.0,15.6,17.1,0.585,12.6,31.0,0.237
4,2019-11-24,@,ohio-state,W,73,62,100.0,84.9,73.0,0.131,0.344,0.563,54.2,65.5,9.6,10.5,0.541,17.8,26.7,0.115


## Combine basic and advanced gamelog CSVs

In [45]:
def combine_basic_advanced_gamelog(school_key, season, isWomens = False):
    """Join a school's basic and advanced gamelog CSVs into ``{school_key}_merged.csv``.

    The two files are merged on the shared game columns (date, location,
    opponent key, result and both scores). Missing 'Location' values are then
    replaced by 'H' to represent home games.

    Raises:
        FileNotFoundError: If either input CSV does not exist.
    """
    basic_csv = get_team_season_file_path(school_key, season, f'{school_key}_basic.csv', isWomens)
    advanced_csv = get_team_season_file_path(school_key, season, f'{school_key}_advanced.csv', isWomens)

    basic_stats = pd.read_csv(basic_csv)
    advanced_stats = pd.read_csv(advanced_csv)

    game_keys = ['Date', 'Location', 'Opp key', 'W/L', 'Tm', 'Opp']
    combined = basic_stats.merge(advanced_stats, on=game_keys)

    # a missing location means the game was played at home
    combined['Location'] = combined['Location'].fillna('H')

    output_csv = get_team_season_file_path(school_key, season, f'{school_key}_merged.csv', isWomens)
    combined.to_csv(output_csv, index=False)

def combine_basic_advanced_gamelogs_for_single_season(season, isWomens = False):
    """Build the merged gamelog CSV of every known school for one season.

    Schools for which an input CSV is missing (``FileNotFoundError``) are
    skipped silently; any other exception propagates.
    """
    if isWomens:
        school_keys = WOMENS_SR_SCHOOL_KEYS
    else:
        school_keys = MENS_SR_SCHOOL_KEYS

    for key in tqdm(school_keys, unit=f'school ({season} merged csv)'):
        try:
            combine_basic_advanced_gamelog(key, season, isWomens)
        except FileNotFoundError:
            # best effort: move on to the next school
            pass

def combine_basic_advanced_gamelogs(seasons, isWomens = False):
    """Build the merged gamelog CSVs of every known school for each given season.

    Args:
        seasons: Iterable of season identifiers, processed in order.
        isWomens: Forwarded unchanged to the per-season step.
    """
    for current_season in seasons:
        combine_basic_advanced_gamelogs_for_single_season(current_season, isWomens)

In [46]:
# combine_basic_advanced_gamelogs(['2020', '2021', '2022', '2023'], isWomens=True)

Quick visual verification:

In [47]:
# Spot-check: read back one women's merged gamelog CSV and preview its first rows.
# The CSV must already exist on disk (the generating call above is commented out).
school_key, season = 'connecticut', 2020
file_path = get_team_season_file_path(school_key, season, f'{school_key}_merged.csv', isWomens=True)
team_df = pd.read_csv(file_path)
team_df.head()

Unnamed: 0,Date,Location,Opp key,W/L,Tm,Opp,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA
0,2019-11-10,H,california,W,72,61,27,65,0.415,5,15,0.333,13,21,0.619,9,38,12,10,6,10,12,96.7,81.9,74.4,0.323,0.231,0.48,50.7,44.4,13.4,13.3,0.454,11.8,25.7,0.2
1,2019-11-13,@,vanderbilt,W,64,51,27,65,0.415,6,23,0.261,4,7,0.571,11,32,17,16,5,15,9,87.3,69.6,73.0,0.108,0.354,0.468,48.5,63.0,21.8,10.4,0.462,18.0,31.4,0.062
2,2019-11-17,@,temple,W,83,54,31,63,0.492,8,22,0.364,13,16,0.813,11,34,21,7,6,11,8,119.9,78.0,69.2,0.254,0.349,0.588,56.7,67.7,10.1,17.1,0.556,13.5,36.7,0.206
3,2019-11-19,H,virginia,W,83,44,30,59,0.508,9,23,0.391,14,22,0.636,9,37,21,11,7,10,13,117.6,62.4,70.6,0.373,0.39,0.598,54.4,70.0,15.6,17.1,0.585,12.6,31.0,0.237
4,2019-11-24,@,ohio-state,W,73,62,29,61,0.475,8,21,0.381,7,8,0.875,8,39,19,7,4,14,9,100.0,84.9,73.0,0.131,0.344,0.563,54.2,65.5,9.6,10.5,0.541,17.8,26.7,0.115


## Generating Moving Averages
Next, we generate a CSV that adds moving averages for each statistic.

In [48]:
# Placeholder written into 'Date' and 'Opp key' of the synthetic "latest" row.
LATEST = 'LATEST'

# Columns that describe a game rather than measure it; no moving averages are
# computed for these.
META_LABELS = [
    'Date', 'Location', 'Opp key',
    'W/L', 'Tm', 'Opp',
]

# Statistic columns of the merged gamelog: basic box-score columns first,
# followed by the advanced columns.
STAT_LABELS = [
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    'FT', 'FTA', 'FT%',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
    'ORtg', 'DRtg', 'Pace', 'FTr', '3PAr', 'TS%',
    'TRB%', 'AST%', 'STL%', 'BLK%',
    'eFG%', 'TOV%', 'ORB%', 'FT/FGA',
]

In [49]:
def generate_moving_averages_for_school(school_key, season, keep_latest = False, span = 5, isWomens = False):
    """Add moving-average columns to a school's merged gamelog and save the result.

    Reads ``{school_key}_merged.csv``, drops rows containing missing values and,
    for every column not listed in ``META_LABELS``, adds three columns:

    * ``<col>_SMA`` - simple moving average over the last ``span`` rows,
    * ``<col>_CMA`` - cumulative (expanding) average, needing at least ``span`` rows,
    * ``<col>_EMA`` - exponential moving average with ``span=span``, ``adjust=False``.

    Each average is shifted down by one row, so a game's averages are built only
    from the rows before it. Rows that still contain missing values afterwards
    (those without enough history) are dropped, and the frame is written to
    ``{school_key}_{span}ma.csv``.

    Args:
        school_key: School whose merged gamelog is processed.
        season: Season identifier of the merged gamelog.
        keep_latest: When truthy, append a copy of the last game with 'Date'
            and 'Opp key' set to ``LATEST``; after the shift that row carries
            averages which include the most recent game.
        span: Window / span used by the three averages.
        isWomens: Use the women's files when truthy.

    Raises:
        FileNotFoundError: If the merged CSV does not exist.
    """
    file_path = get_team_season_file_path(school_key, season, f'{school_key}_merged.csv', isWomens)

    # Drop any rows with NULL values
    team_df = pd.read_csv(file_path).dropna()

    # Guard against an empty frame: there is no last game to duplicate
    # (indexing the last row of an empty frame used to raise IndexError).
    if keep_latest and not team_df.empty:
        # Keep the latest statistics available for an upcoming game. Copying the
        # row as a DataFrame (not through `.values`) preserves column dtypes.
        latest_row = team_df.tail(1).copy()
        latest_row['Date'] = LATEST
        latest_row['Opp key'] = LATEST
        team_df = pd.concat([team_df, latest_row], ignore_index=True)

    # Collect all derived columns first and attach them in one concat; inserting
    # ~90 columns one at a time fragments the frame.
    moving_averages = {}
    for column in team_df.columns:
        if column in META_LABELS:
            continue

        values = team_df[column]

        # Simple moving average
        moving_averages[f"{column}_SMA"] = values.rolling(window=span).mean().shift(1)

        # Cumulative moving average
        moving_averages[f"{column}_CMA"] = values.expanding(min_periods=span).mean().shift(1)

        # Exponential moving average
        moving_averages[f"{column}_EMA"] = values.ewm(span=span, adjust=False).mean().shift(1)

    team_df = pd.concat([team_df, pd.DataFrame(moving_averages, index=team_df.index)], axis=1)

    # Drop any rows with NULL values (rows with no MA)
    team_df = team_df.dropna()

    ma_file_path = get_team_season_file_path(school_key, season, f'{school_key}_{span}ma.csv', isWomens)
    team_df.to_csv(ma_file_path, index=False)

def generate_moving_averages_for_single_season(season, keep_latest = False, span = 5, isWomens = False):
    """Build the moving-average CSV of every known school for one season.

    Schools whose merged CSV is missing (``FileNotFoundError``) are skipped
    silently; any other exception propagates.
    """
    if isWomens:
        school_keys = WOMENS_SR_SCHOOL_KEYS
    else:
        school_keys = MENS_SR_SCHOOL_KEYS

    for key in tqdm(school_keys, unit=f'school ({season} ma csv)'):
        try:
            generate_moving_averages_for_school(key, season, keep_latest, span, isWomens)
        except FileNotFoundError:
            # best effort: move on to the next school
            pass

def generate_moving_averages(seasons, keep_latest = False, span = 5, isWomens = False):
    """Build the moving-average CSVs of every known school for each given season.

    Args:
        seasons: Iterable of season identifiers, processed in order.
        keep_latest: Forwarded unchanged to the per-season step.
        span: Forwarded unchanged to the per-season step.
        isWomens: Forwarded unchanged to the per-season step.
    """
    for current_season in seasons:
        generate_moving_averages_for_single_season(current_season, keep_latest, span, isWomens)

In [50]:
# generate_moving_averages(['2020', '2021', '2022', '2023'], isWomens=True)

Quick visual verification:

In [51]:
# Spot-check: read back one women's moving-average CSV and preview its first rows.
# The CSV must already exist on disk (the generating call above is commented out).
school_key, season, span = 'connecticut', 2020, 5
file_path = get_team_season_file_path(school_key, season, f'{school_key}_{span}ma.csv', isWomens=True)
team_df = pd.read_csv(file_path)
team_df.head()

Unnamed: 0,Date,Location,Opp key,W/L,Tm,Opp,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA,FG_SMA,FG_CMA,FG_EMA,FGA_SMA,FGA_CMA,FGA_EMA,FG%_SMA,FG%_CMA,FG%_EMA,3P_SMA,3P_CMA,3P_EMA,3PA_SMA,3PA_CMA,3PA_EMA,3P%_SMA,3P%_CMA,3P%_EMA,FT_SMA,FT_CMA,FT_EMA,FTA_SMA,FTA_CMA,FTA_EMA,FT%_SMA,FT%_CMA,FT%_EMA,ORB_SMA,ORB_CMA,ORB_EMA,TRB_SMA,TRB_CMA,TRB_EMA,AST_SMA,AST_CMA,AST_EMA,STL_SMA,STL_CMA,STL_EMA,BLK_SMA,BLK_CMA,BLK_EMA,TOV_SMA,TOV_CMA,TOV_EMA,PF_SMA,PF_CMA,PF_EMA,ORtg_SMA,ORtg_CMA,ORtg_EMA,DRtg_SMA,DRtg_CMA,DRtg_EMA,Pace_SMA,Pace_CMA,Pace_EMA,FTr_SMA,FTr_CMA,FTr_EMA,3PAr_SMA,3PAr_CMA,3PAr_EMA,TS%_SMA,TS%_CMA,TS%_EMA,TRB%_SMA,TRB%_CMA,TRB%_EMA,AST%_SMA,AST%_CMA,AST%_EMA,STL%_SMA,STL%_CMA,STL%_EMA,BLK%_SMA,BLK%_CMA,BLK%_EMA,eFG%_SMA,eFG%_CMA,eFG%_EMA,TOV%_SMA,TOV%_CMA,TOV%_EMA,ORB%_SMA,ORB%_CMA,ORB%_EMA,FT/FGA_SMA,FT/FGA_CMA,FT/FGA_EMA
0,2019-11-26,@,dayton,W,75,37,31,62,0.5,10,19,0.526,3,4,0.75,8,44,19,7,8,12,8,107.3,53.0,69.9,0.065,0.306,0.587,57.9,61.3,10.0,18.6,0.581,15.8,26.7,0.048,28.8,28.8,28.925926,62.6,62.6,62.037037,0.461,0.461,0.467074,7.2,7.2,7.432099,20.8,20.8,20.604938,0.346,0.346,0.35937,10.2,10.2,10.333333,14.8,14.8,14.765432,0.7028,0.7028,0.732111,9.6,9.6,9.160494,36.0,36.0,36.925926,18.0,18.0,18.160494,10.2,10.2,9.37037,5.6,5.6,5.45679,12.0,12.0,11.975309,10.2,10.2,10.333333,104.3,104.3,104.953086,75.36,75.36,76.774074,72.04,72.04,72.180247,0.2378,0.2378,0.238654,0.3336,0.3336,0.33363,0.5394,0.5394,0.548704,52.9,52.9,53.360494,62.12,62.12,62.411111,14.1,14.1,12.962963,13.68,13.68,13.487654,0.5196,0.5196,0.528012,14.74,14.74,14.841975,30.3,30.3,29.403704,0.164,0.164,0.167148
1,2019-12-05,@,seton-hall,W,92,78,34,66,0.515,8,20,0.4,16,19,0.842,14,40,18,3,4,14,10,126.1,106.9,73.0,0.288,0.303,0.613,66.7,52.9,4.1,12.1,0.576,15.7,50.0,0.242,29.6,29.166667,29.617284,62.0,62.5,62.024691,0.478,0.4675,0.478049,8.2,7.666667,8.288066,21.6,20.5,20.069959,0.3846,0.376,0.414914,8.2,9.0,7.888889,11.4,13.0,11.176955,0.729,0.710667,0.738074,9.4,9.333333,8.773663,37.2,37.333333,39.283951,19.4,18.166667,18.440329,9.6,9.666667,8.580247,6.0,6.0,6.304527,12.4,12.0,11.983539,9.4,9.833333,9.555556,106.42,104.8,105.735391,69.58,71.633333,68.849383,71.14,71.683333,71.420165,0.1862,0.209,0.18077,0.3486,0.329,0.32442,0.5608,0.547333,0.561469,54.34,53.733333,54.873663,65.5,61.983333,62.040741,13.42,13.416667,11.975309,14.74,14.5,15.19177,0.545,0.529833,0.545675,15.54,14.916667,15.161317,30.5,29.7,28.502469,0.1336,0.144667,0.127432
2,2019-12-08,H,notre-dame,W,81,57,30,66,0.455,8,23,0.348,13,19,0.684,13,45,18,5,6,15,9,106.4,74.9,76.5,0.288,0.348,0.54,60.8,60.0,6.6,15.0,0.515,16.7,38.2,0.197,31.0,29.857143,31.078189,62.2,63.0,63.349794,0.498,0.474286,0.490366,8.6,7.714286,8.192044,21.0,20.428571,20.046639,0.4124,0.379429,0.409942,10.6,10.0,10.592593,13.8,13.857143,13.784636,0.7832,0.729429,0.772716,10.0,10.0,10.515775,38.8,37.714286,39.522634,19.6,18.142857,18.293553,7.0,8.714286,6.720165,5.8,5.714286,5.536351,12.2,12.285714,12.655693,9.6,9.857143,9.703704,114.18,107.842857,112.523594,77.04,76.671429,81.532922,71.14,71.871429,71.946776,0.2222,0.220286,0.216513,0.3384,0.325286,0.31728,0.5898,0.556714,0.578646,57.98,55.585714,58.815775,63.48,60.685714,58.993827,9.88,12.085714,9.350206,15.08,14.157143,14.16118,0.5678,0.536429,0.555783,15.08,15.028571,15.340878,34.22,32.6,35.668313,0.1696,0.158571,0.165621
3,2019-12-16,@,depaul,W,84,74,35,64,0.547,8,20,0.4,6,8,0.75,14,46,21,5,4,20,12,114.2,100.6,73.9,0.125,0.313,0.619,64.8,60.0,6.8,9.3,0.609,22.8,50.0,0.094,30.8,29.875,30.718793,62.8,63.375,64.233196,0.4906,0.471875,0.478578,8.6,7.75,8.128029,21.2,20.75,21.031093,0.4092,0.3755,0.389295,10.6,10.375,11.395062,14.4,14.5,15.523091,0.7574,0.72375,0.743144,10.4,10.375,11.34385,41.0,38.625,41.348422,19.0,18.125,18.195702,6.6,8.25,6.146776,5.8,5.75,5.690901,13.0,12.625,13.437128,9.8,9.75,9.469136,111.48,107.6625,110.482396,76.42,76.45,79.321948,72.6,72.45,73.464518,0.229,0.22875,0.240342,0.3382,0.328125,0.32752,0.5802,0.554625,0.565764,58.8,56.2375,59.477183,61.94,60.6,59.329218,9.18,11.4,8.433471,14.66,14.2625,14.440786,0.5596,0.53375,0.542189,15.72,15.2375,15.793919,34.52,33.3,36.512209,0.1678,0.163375,0.176081
4,2019-12-22,H,oklahoma,W,97,53,44,78,0.564,8,20,0.4,1,3,0.333,14,49,22,9,11,13,9,121.9,66.6,79.6,0.038,0.256,0.611,64.5,50.0,11.3,27.5,0.615,14.1,45.2,0.013,31.8,30.444444,32.145862,63.8,63.444444,64.155464,0.4984,0.480222,0.501385,8.4,7.777778,8.085353,20.6,20.666667,20.687395,0.411,0.378222,0.392863,9.0,9.888889,9.596708,11.6,13.777778,13.015394,0.7802,0.726667,0.745429,11.4,10.777778,12.229233,42.8,39.444444,42.898948,19.0,18.444444,19.130468,5.4,7.888889,5.764518,5.2,5.555556,5.127267,15.0,13.444444,15.624752,9.6,10.0,10.312757,110.8,108.388889,111.721597,84.06,79.133333,86.414632,73.26,72.611111,73.609678,0.1794,0.217222,0.201895,0.3228,0.326444,0.32268,0.5844,0.561778,0.583509,60.88,57.188889,61.251456,59.94,60.533333,59.552812,7.42,10.888889,7.88898,13.1,13.711111,12.727191,0.5644,0.542111,0.564459,17.76,16.077778,18.129279,38.32,35.155556,41.008139,0.1392,0.155667,0.148721


## Merge opponent data

In [52]:
# Every statistic column together with its three moving-average variants, in
# the order: stat, stat_SMA, stat_CMA, stat_EMA.
all_stat_cols = [
    f'{stat}{suffix}'
    for stat in STAT_LABELS
    for suffix in ('', '_SMA', '_CMA', '_EMA')
]
# The same names prefixed with 'opp_', used for the opposing side's columns.
opposing_stat_cols = [f'opp_{name}' for name in all_stat_cols]
# Rename mapping from a plain column name to its 'opp_'-prefixed counterpart.
rename_opposing_cols = dict(zip(all_stat_cols, opposing_stat_cols))

def merge_opponent_data_for_school(school_key, season, span = 5, isWomens = False):
    """Pair each of a school's games with the opponent's row and save the result.

    Reads ``{school_key}_{span}ma.csv``. For every game, the opponent's
    moving-average CSV is loaded and filtered to the row(s) for the same date
    against ``school_key``. Rows are sorted into a "home" and an "away"
    collection: when 'Location' is '@' the opponent is home, otherwise this
    school is. The away side's score columns are swapped, its 'Location',
    'Opp key' and 'W/L' columns dropped and its statistic columns prefixed with
    'opp_'; both sides are then inner-joined on date and scores, sorted by date
    and written to ``{school_key}_{span}span_full.csv``.

    Nothing is written when the school has no rows, or when none of its
    opponents' files could be read.

    Args:
        school_key: School whose games are processed.
        season: Season identifier.
        span: Span used in the moving-average file names.
        isWomens: Use the women's files when truthy.

    Raises:
        FileNotFoundError: If the school's own moving-average CSV is missing.
    """
    file_path = get_team_season_file_path(school_key, season, f'{school_key}_{span}ma.csv', isWomens)
    team_df = pd.read_csv(file_path)

    if team_df.shape[0] < 1:
        return

    # Each opponent file is read at most once; None marks a missing file.
    opponent_cache = {}
    # Collect the per-game slices and concatenate once after the loop instead
    # of growing two DataFrames row by row.
    home_frames, away_frames = [], []
    for _, game in team_df.iterrows():
        opponent_key = game.get('Opp key')
        game_date = game.get('Date')

        if opponent_key not in opponent_cache:
            try:
                opponent_file_path = get_team_season_file_path(opponent_key, season, f'{opponent_key}_{span}ma.csv', isWomens)
                opponent_cache[opponent_key] = pd.read_csv(opponent_file_path)
            except FileNotFoundError:
                opponent_cache[opponent_key] = None

        opponent_season_df = opponent_cache[opponent_key]
        if opponent_season_df is None:
            # no data for this opponent: skip the game
            continue

        opponent_rows = opponent_season_df.loc[(opponent_season_df['Opp key'] == school_key) & (opponent_season_df['Date'] == game_date)]
        current_rows = team_df[(team_df['Opp key'] == opponent_key) & (team_df['Date'] == game_date)]

        if game.get('Location') == '@':
            home_frames.append(opponent_rows)
            away_frames.append(current_rows)
        else:
            home_frames.append(current_rows)
            away_frames.append(opponent_rows)

    if not home_frames:
        # No opponent file was available for any game. Dropping columns from an
        # empty, column-less frame used to raise an uncaught KeyError here.
        return

    home_df, away_df = pd.concat(home_frames), pd.concat(away_frames)

    # flip score column names for away dataframe to match home dataframe
    away_df = away_df.rename(columns={'Tm': 'Opp', 'Opp': 'Tm'})

    away_df = away_df.drop(['Location', 'Opp key', 'W/L'], axis=1)
    away_df = away_df.rename(columns=rename_opposing_cols)

    merged_df = pd.merge(home_df, away_df, on=["Date", "Tm", "Opp"])
    merged_df = merged_df.sort_values(by='Date')

    merged_file_path = get_team_season_file_path(school_key, season, f'{school_key}_{span}span_full.csv', isWomens)
    merged_df.to_csv(merged_file_path, index=False)

def merge_opponent_data_for_single_season(season, span = 5, isWomens = False):
    """Build the full (team + opponent) CSV of every known school for one season.

    Schools whose moving-average CSV is missing (``FileNotFoundError``) are
    skipped silently; any other exception propagates.
    """
    if isWomens:
        school_keys = WOMENS_SR_SCHOOL_KEYS
    else:
        school_keys = MENS_SR_SCHOOL_KEYS

    for key in tqdm(school_keys, unit=f'school ({season} full csv)'):
        try:
            merge_opponent_data_for_school(key, season, span, isWomens)
        except FileNotFoundError:
            # best effort: move on to the next school
            pass

def merge_opponent_data(seasons, span = 5, isWomens = False):
    """Run the single-season opponent merge for each entry of ``seasons``, in order.

    ``span`` and ``isWomens`` are forwarded unchanged to
    ``merge_opponent_data_for_single_season``.
    """
    for current_season in seasons:
        merge_opponent_data_for_single_season(season=current_season, span=span, isWomens=isWomens)

In [53]:
# merge_opponent_data(['2020', '2021', '2022', '2023'], isWomens=True)

Quick visual verification:

In [54]:
# Spot check: load one merged "full" CSV (women's 'connecticut', season 2023,
# span 5) and preview its first rows.
school_key = 'connecticut'
season = 2023
span = 5

file_path = get_team_season_file_path(
    school_key,
    season,
    f'{school_key}_{span}span_full.csv',
    isWomens=True,
)
team_df = pd.read_csv(file_path)
team_df.head()

Unnamed: 0,Date,Location,Opp key,W/L,Tm,Opp,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF,ORtg,DRtg,Pace,FTr,3PAr,TS%,TRB%,AST%,STL%,BLK%,eFG%,TOV%,ORB%,FT/FGA,FG_SMA,FG_CMA,FG_EMA,FGA_SMA,FGA_CMA,FGA_EMA,FG%_SMA,FG%_CMA,FG%_EMA,3P_SMA,3P_CMA,3P_EMA,3PA_SMA,3PA_CMA,3PA_EMA,3P%_SMA,3P%_CMA,3P%_EMA,FT_SMA,FT_CMA,FT_EMA,FTA_SMA,FTA_CMA,FTA_EMA,FT%_SMA,FT%_CMA,FT%_EMA,ORB_SMA,ORB_CMA,ORB_EMA,TRB_SMA,TRB_CMA,TRB_EMA,AST_SMA,AST_CMA,AST_EMA,STL_SMA,STL_CMA,STL_EMA,BLK_SMA,BLK_CMA,BLK_EMA,TOV_SMA,TOV_CMA,TOV_EMA,PF_SMA,PF_CMA,PF_EMA,ORtg_SMA,ORtg_CMA,ORtg_EMA,DRtg_SMA,DRtg_CMA,DRtg_EMA,Pace_SMA,Pace_CMA,Pace_EMA,FTr_SMA,FTr_CMA,FTr_EMA,3PAr_SMA,3PAr_CMA,3PAr_EMA,TS%_SMA,TS%_CMA,TS%_EMA,TRB%_SMA,TRB%_CMA,TRB%_EMA,AST%_SMA,AST%_CMA,AST%_EMA,STL%_SMA,STL%_CMA,STL%_EMA,BLK%_SMA,BLK%_CMA,BLK%_EMA,eFG%_SMA,eFG%_CMA,eFG%_EMA,TOV%_SMA,TOV%_CMA,TOV%_EMA,ORB%_SMA,ORB%_CMA,ORB%_EMA,FT/FGA_SMA,FT/FGA_CMA,FT/FGA_EMA,opp_FG,opp_FGA,opp_FG%,opp_3P,opp_3PA,opp_3P%,opp_FT,opp_FTA,opp_FT%,opp_ORB,opp_TRB,opp_AST,opp_STL,opp_BLK,opp_TOV,opp_PF,opp_ORtg,opp_DRtg,opp_Pace,opp_FTr,opp_3PAr,opp_TS%,opp_TRB%,opp_AST%,opp_STL%,opp_BLK%,opp_eFG%,opp_TOV%,opp_ORB%,opp_FT/FGA,opp_FG_SMA,opp_FG_CMA,opp_FG_EMA,opp_FGA_SMA,opp_FGA_CMA,opp_FGA_EMA,opp_FG%_SMA,opp_FG%_CMA,opp_FG%_EMA,opp_3P_SMA,opp_3P_CMA,opp_3P_EMA,opp_3PA_SMA,opp_3PA_CMA,opp_3PA_EMA,opp_3P%_SMA,opp_3P%_CMA,opp_3P%_EMA,opp_FT_SMA,opp_FT_CMA,opp_FT_EMA,opp_FTA_SMA,opp_FTA_CMA,opp_FTA_EMA,opp_FT%_SMA,opp_FT%_CMA,opp_FT%_EMA,opp_ORB_SMA,opp_ORB_CMA,opp_ORB_EMA,opp_TRB_SMA,opp_TRB_CMA,opp_TRB_EMA,opp_AST_SMA,opp_AST_CMA,opp_AST_EMA,opp_STL_SMA,opp_STL_CMA,opp_STL_EMA,opp_BLK_SMA,opp_BLK_CMA,opp_BLK_EMA,opp_TOV_SMA,opp_TOV_CMA,opp_TOV_EMA,opp_PF_SMA,opp_PF_CMA,opp_PF_EMA,opp_ORtg_SMA,opp_ORtg_CMA,opp_ORtg_EMA,opp_DRtg_SMA,opp_DRtg_CMA,opp_DRtg_EMA,opp_Pace_SMA,opp_Pace_CMA,opp_Pace_EMA,opp_FTr_SMA,opp_FTr_CMA,opp_FTr_EMA,opp_3PAr_SMA,opp_3PAr_CMA,opp_3PAr_EMA,opp_TS%_SMA,opp_TS%_CMA,opp_TS%_EMA,opp_TRB%_SMA,opp_TRB%_CMA,opp_TRB%
_EMA,opp_AST%_SMA,opp_AST%_CMA,opp_AST%_EMA,opp_STL%_SMA,opp_STL%_CMA,opp_STL%_EMA,opp_BLK%_SMA,opp_BLK%_CMA,opp_BLK%_EMA,opp_eFG%_SMA,opp_eFG%_CMA,opp_eFG%_EMA,opp_TOV%_SMA,opp_TOV%_CMA,opp_TOV%_EMA,opp_ORB%_SMA,opp_ORB%_CMA,opp_ORB%_EMA,opp_FT/FGA_SMA,opp_FT/FGA_CMA,opp_FT/FGA_EMA
0,2022-12-02,H,providence,W,98,53,36,59,0.61,13,23,0.565,13,18,0.722,8,41,28,6,6,12,12,136.2,73.7,71.9,0.305,0.39,0.725,60.3,77.8,8.3,12.5,0.72,15.1,33.3,0.22,33.4,33.4,33.901235,63.6,63.6,64.259259,0.525,0.525,0.526901,8.0,8.0,7.851852,20.2,20.2,20.074074,0.4146,0.4146,0.405531,12.4,12.4,11.382716,16.8,16.8,15.432099,0.6816,0.6816,0.642901,10.2,10.2,10.160494,34.6,34.6,35.82716,21.4,21.4,22.666667,10.6,10.6,10.160494,3.4,3.4,3.839506,14.8,14.8,14.45679,15.8,15.8,14.777778,114.02,114.02,114.023457,82.12,82.12,82.585185,76.38,76.38,76.239506,0.2702,0.2702,0.247864,0.3132,0.3132,0.308815,0.6086,0.6086,0.607358,57.58,57.58,58.658025,63.56,63.56,66.104938,13.7,13.7,13.159259,10.56,10.56,12.07037,0.588,0.588,0.588086,17.12,17.12,16.779012,36.74,36.74,36.479012,0.1988,0.1988,0.183222,23,68,0.338,4,20,0.2,3,7,0.429,11,27,11,5,2,12,15,73.7,136.2,71.9,0.103,0.294,0.372,39.7,47.8,7.0,5.6,0.368,14.4,25.0,0.044,23.8,24.125,25.893004,58.0,57.875,60.180155,0.408,0.419,0.429846,5.2,4.875,6.015089,17.4,15.375,17.241427,0.3046,0.338875,0.362046,12.0,12.0,11.847279,17.0,17.625,16.994513,0.6996,0.67075,0.690275,8.6,8.875,9.163237,37.8,36.75,39.286694,12.6,12.0,13.951075,6.2,6.875,5.940558,3.4,3.375,3.555556,15.6,14.25,13.163237,11.4,11.25,9.540009,89.84,91.6625,97.694879,73.66,76.0125,76.609145,72.28,71.2125,71.433882,0.301,0.3095,0.285685,0.3,0.26525,0.287582,0.4888,0.493625,0.509711,53.66,52.75,54.79273,51.72,48.5,52.77037,8.62,9.6875,8.32684,9.34,10.1,9.777641,0.4522,0.4615,0.479488,19.16,17.7625,16.229995,25.9,26.875,27.805258,0.2116,0.20875,0.197694
1,2022-12-04,H,connecticut,W,74,60,32,57,0.561,6,13,0.462,4,7,0.571,4,30,15,7,6,17,17,102.4,83.0,72.3,0.123,0.228,0.613,54.5,46.9,9.7,15.4,0.614,22.0,18.2,0.07,30.4,30.428571,29.956104,63.4,62.857143,62.869684,0.481,0.486143,0.477394,5.2,5.428571,4.979424,14.8,14.857143,14.632373,0.337,0.355571,0.327313,20.8,21.142857,18.447188,30.4,29.285714,27.019204,0.6892,0.729,0.689815,14.8,13.857143,13.633745,44.0,42.857143,40.61454,17.6,18.285714,17.972565,7.2,8.428571,7.691358,4.6,5.285714,4.766804,15.6,15.428571,14.593964,17.4,17.571429,16.256516,111.02,111.971429,109.106996,82.92,82.442857,86.962551,78.24,78.142857,76.465295,0.48,0.475857,0.433449,0.234,0.237714,0.23352,0.559,0.571714,0.550882,62.66,61.114286,58.253361,57.66,59.957143,59.875857,9.14,10.771429,10.009191,9.44,10.914286,9.554321,0.522,0.529714,0.517075,16.64,16.642857,16.105487,46.0,44.085714,41.859122,0.3298,0.346143,0.29694,21,57,0.368,8,18,0.444,10,13,0.769,7,25,15,4,3,15,15,83.0,102.4,72.3,0.228,0.316,0.475,45.5,71.4,5.5,6.8,0.439,19.2,21.2,0.175,33.8,33.833333,34.600823,62.2,62.833333,62.506173,0.544,0.539167,0.554601,9.0,8.833333,9.567901,20.0,20.666667,21.049383,0.461,0.439667,0.458687,10.6,12.5,11.921811,15.2,17.0,16.288066,0.6568,0.688333,0.669267,9.2,9.833333,9.440329,35.2,35.666667,37.55144,23.2,22.5,24.444444,8.2,9.833333,8.773663,3.4,3.833333,4.559671,13.8,14.333333,13.63786,14.8,15.166667,13.851852,117.06,117.716667,121.415638,87.24,80.716667,79.623457,74.56,75.633333,74.793004,0.2524,0.276,0.266909,0.3184,0.326,0.335877,0.6286,0.628,0.646572,56.76,58.033333,59.20535,67.94,65.933333,70.003292,10.92,12.8,11.539506,9.18,10.883333,12.21358,0.6168,0.61,0.632058,16.58,16.783333,16.219342,32.56,36.166667,35.419342,0.1762,0.202333,0.195481
2,2022-12-08,H,princeton,W,69,64,26,44,0.591,6,12,0.5,11,17,0.647,4,31,17,5,4,25,20,91.3,84.7,75.6,0.386,0.273,0.663,55.4,65.4,6.6,10.0,0.659,32.4,20.0,0.25,31.4,32.0,30.067215,61.6,62.0,60.670782,0.5076,0.514714,0.492401,9.0,8.714286,9.045267,20.6,20.285714,20.032922,0.4432,0.440286,0.453791,10.8,12.142857,11.281207,14.2,16.428571,15.192044,0.7106,0.699857,0.702512,8.8,9.428571,8.626886,34.6,34.142857,33.367627,22.4,21.428571,21.296296,7.4,9.0,7.182442,3.8,3.714286,4.039781,13.6,14.428571,14.091907,13.8,15.142857,14.234568,111.82,112.757143,108.610425,87.72,83.814286,87.215638,73.82,75.157143,73.962003,0.238,0.269143,0.25394,0.3316,0.324571,0.329251,0.6026,0.606143,0.589381,54.88,56.242857,54.6369,70.7,66.714286,70.468861,9.92,11.757143,9.526337,10.02,10.3,10.409053,0.5812,0.585571,0.567705,16.64,17.128571,17.212894,29.88,34.028571,30.679561,0.1812,0.198429,0.188654,22,65,0.338,7,25,0.28,13,19,0.684,9,25,13,13,1,13,18,84.7,91.3,75.6,0.292,0.385,0.432,44.6,59.1,17.2,3.1,0.392,14.9,25.0,0.2,22.4,22.571429,22.368999,57.6,57.142857,55.807956,0.398,0.401714,0.405587,4.4,4.714286,4.139918,15.0,15.571429,15.198903,0.3172,0.305571,0.279671,14.4,13.571429,16.108368,18.2,18.428571,20.222222,0.7788,0.741286,0.788978,10.4,10.142857,9.288066,32.8,34.428571,33.510288,11.4,12.285714,12.334705,10.0,8.428571,8.802469,3.0,3.571429,2.426612,12.8,13.0,12.102881,16.0,16.142857,16.736626,92.78,92.328571,95.704801,87.82,87.471429,84.953224,69.02,69.142857,68.37476,0.3172,0.324714,0.362568,0.2562,0.269286,0.269831,0.4872,0.486429,0.500041,52.04,53.328571,53.802881,51.64,55.071429,55.483539,14.62,12.271429,13.063374,7.6,9.057143,6.152126,0.4362,0.442857,0.442545,16.12,16.414286,15.436077,28.18,28.814286,28.284774,0.2504,0.238143,0.288019
3,2022-12-11,H,connecticut,W,85,78,30,71,0.423,12,30,0.4,13,16,0.813,8,23,19,12,3,5,17,112.0,102.8,75.9,0.225,0.423,0.541,33.8,63.3,15.8,6.3,0.507,6.0,18.6,0.183,28.4,27.363636,27.855137,66.2,64.909091,66.996698,0.4296,0.424909,0.416946,6.4,6.818182,6.979119,18.4,19.454545,20.152805,0.3372,0.355909,0.338736,14.0,13.272727,12.289099,17.2,17.0,15.19367,0.8084,0.769636,0.800721,12.4,10.454545,12.032905,33.6,34.545455,32.308388,15.6,15.181818,14.909736,8.6,8.545455,8.859236,2.6,3.363636,2.076208,13.6,14.545455,14.497468,17.4,17.909091,18.640011,101.76,97.072727,98.140834,97.26,90.727273,98.157144,75.88,77.427273,76.503319,0.2636,0.266182,0.23211,0.2766,0.298909,0.298831,0.519,0.515091,0.506117,51.46,49.863636,50.464311,54.46,55.727273,53.190897,11.3,11.027273,11.549662,5.92,8.781818,5.108716,0.4776,0.477455,0.468361,15.44,16.681818,16.367021,36.12,30.927273,34.598393,0.2144,0.207818,0.18775,31,58,0.534,3,10,0.3,13,15,0.867,10,45,21,2,3,21,15,102.8,112.0,75.9,0.259,0.172,0.599,66.2,67.7,2.6,7.3,0.56,24.4,40.0,0.224,29.8,31.25,28.711477,56.8,59.75,55.113855,0.5258,0.52425,0.525267,8.2,8.375,8.030178,17.2,19.25,17.355281,0.4742,0.44775,0.469194,10.4,12.0,11.187471,14.8,16.5,15.794696,0.6542,0.69325,0.684008,7.6,8.75,7.084591,34.2,33.75,32.578418,21.4,20.875,19.864198,5.8,8.5,6.454961,4.0,3.75,4.02652,15.8,15.75,17.727938,14.2,15.75,16.156379,106.84,110.075,102.840283,87.04,83.925,86.377092,73.28,75.2125,74.508002,0.274,0.28375,0.29796,0.301,0.318125,0.310501,0.6132,0.61325,0.613921,56.1,56.1375,54.891267,70.84,66.55,68.779241,7.92,11.1125,8.550892,10.26,10.2625,10.272702,0.5982,0.59475,0.598137,19.96,19.0375,22.275263,28.0,32.275,27.119707,0.193,0.204875,0.209103
4,2022-12-18,N,florida-state,W,85,77,30,63,0.476,7,21,0.333,18,21,0.857,10,39,22,3,7,18,19,104.3,94.5,81.5,0.333,0.333,0.582,61.9,73.3,3.7,18.4,0.532,19.8,37.0,0.286,30.4,31.222222,29.474318,57.4,59.555556,56.075903,0.5308,0.525333,0.528178,7.6,7.777778,6.353452,16.6,18.222222,14.903521,0.4418,0.431333,0.412796,9.8,12.111111,11.791648,13.8,16.333333,15.529797,0.6676,0.712556,0.745005,7.6,8.888889,8.056394,36.0,35.0,36.718945,22.2,20.888889,20.242798,4.8,7.777778,4.969974,4.2,3.666667,3.684347,17.2,16.333333,18.818625,15.0,15.666667,15.770919,105.0,109.266667,102.826856,95.08,87.044444,94.918061,74.54,75.288889,74.972001,0.253,0.281,0.284973,0.2882,0.301889,0.264334,0.612,0.611667,0.608947,57.54,57.255556,58.660844,72.24,66.677778,68.419494,6.42,10.166667,6.567261,10.54,9.933333,9.281802,0.5974,0.590889,0.585424,21.28,19.633333,22.983509,29.1,33.133333,31.413138,0.1796,0.207,0.214069,26,67,0.388,8,29,0.276,17,21,0.81,7,24,12,12,7,12,26,94.5,104.3,81.5,0.313,0.433,0.5,38.1,46.2,14.7,16.7,0.448,13.5,19.4,0.254,32.8,32.083333,32.987084,67.8,70.25,70.285503,0.4848,0.458417,0.470775,8.0,7.25,8.00499,23.0,23.083333,24.137609,0.3546,0.328167,0.336078,20.2,17.0,20.610792,27.2,23.416667,28.186512,0.7398,0.712833,0.725852,14.0,12.916667,15.141848,45.2,43.5,46.183588,16.0,15.083333,15.4192,10.6,9.583333,12.568375,7.4,7.083333,8.081379,12.8,13.666667,12.43395,13.8,16.25,12.844621,117.94,107.625,116.382933,70.82,78.816667,65.305475,79.44,81.416667,80.993576,0.4,0.3355,0.400525,0.34,0.328333,0.34331,0.5818,0.5435,0.565725,60.02,56.05,59.386533,49.16,47.125,46.961167,13.2,11.583333,15.410752,19.28,15.875,21.722415,0.5438,0.510417,0.527566,13.76,14.391667,13.035712,41.12,36.1,40.79546,0.2976,0.24275,0.293253


## Generate dataset

In [55]:
# Column order of the generated datasets: for every stat below, its three
# moving-average columns (`_SMA`, `_CMA`, `_EMA`); then the same 90 columns
# again with an `opp_` prefix; then the `Neutral` flag and the `Win` label.
_FEATURE_STATS = [
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB',
    'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'ORtg', 'DRtg', 'Pace', 'FTr',
    '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA',
]
_FEATURE_AVERAGES = ['SMA', 'CMA', 'EMA']

_TEAM_FEATURES = [f'{stat}_{average}' for stat in _FEATURE_STATS for average in _FEATURE_AVERAGES]

FINAL_FEATURES = _TEAM_FEATURES + [f'opp_{feature}' for feature in _TEAM_FEATURES] + ['Neutral', 'Win']

def generate_season_dataset(season, span = 5, isWomens = False):
    """Build and save the model-ready dataset for one season.

    Reads every existing ``{school_key}_{span}span_full.csv`` of the season,
    stacks them, sorts by ``Date``, derives the ``Neutral`` feature and the
    ``Win`` label, drops the ``META_LABELS`` columns and the raw (non moving
    average) ``STAT_LABELS`` / ``opp_`` + ``STAT_LABELS`` columns, removes rows
    containing nulls and duplicate rows, orders the columns as
    ``FINAL_FEATURES`` and writes the result to
    ``../../data/dataset/{men|women}/{season}_{span}span_dataset.csv``.

    Args:
        season: Season identifier used in the input and output file names.
        span: Moving-average span used in the input and output file names.
        isWomens: Use the women's school keys and output folder when true.

    Returns:
        None. When no merged file exists for the season a notice is printed
        and nothing is written.

    Raises:
        AssertionError: If a column listed in ``FINAL_FEATURES`` is absent
            from the assembled data.
    """
    school_keys = WOMENS_SR_SCHOOL_KEYS if isWomens else MENS_SR_SCHOOL_KEYS

    # Collect the per-team frames and concatenate once: concatenating inside
    # the loop re-copies every accumulated row on each iteration.
    team_frames = []
    for school_key in tqdm(school_keys):
        file_path = get_team_season_file_path(school_key, season, f'{school_key}_{span}span_full.csv', isWomens)
        if os.path.exists(file_path):
            team_frames.append(pd.read_csv(file_path))

    if not team_frames:
        # With no input there is no 'Date' column to sort on; bail out
        # instead of failing with a KeyError further down.
        print(f'No {span}span_full.csv files found for season {season}; dataset not generated')
        return

    all_data_df = pd.concat(team_frames).sort_values(by="Date")

    # Feature: 1 when the game location is 'N' (neutral site), else 0
    all_data_df['Neutral'] = (all_data_df['Location'] == 'N').astype(int)

    # Label: 1 when the row's 'Tm' score exceeds its 'Opp' score, else 0
    all_data_df['Win'] = (all_data_df['Tm'] > all_data_df['Opp']).astype(int)

    # Meta columns and raw per-game stats are not model inputs; only the
    # moving-average columns (plus Neutral/Win) survive.
    raw_stat_columns = list(STAT_LABELS) + [f'opp_{col}' for col in STAT_LABELS]
    all_data_df = (
        all_data_df
        .drop(META_LABELS, axis=1)
        .drop(raw_stat_columns, axis=1)
        .dropna()
        .drop_duplicates()
    )

    # Validate BEFORE reindexing: reindex silently creates all-NaN columns for
    # missing names, so a check made afterwards could never fail.
    missing_features = [col for col in FINAL_FEATURES if col not in all_data_df.columns]
    assert not missing_features, f'Missing feature columns: {missing_features}'

    # Reorder (and restrict) columns to the canonical feature order
    all_data_df = all_data_df.reindex(FINAL_FEATURES, axis=1)

    gender_dir = 'women' if isWomens else 'men'
    training_data_path = os.path.abspath(f'../../data/dataset/{gender_dir}/{season}_{span}span_dataset.csv')
    all_data_df.to_csv(training_data_path, index=False)

def generate_datasets(seasons, span = 5, isWomens = False):
    """Generate the per-season dataset for each entry of ``seasons``, in order.

    ``span`` and ``isWomens`` are forwarded unchanged to
    ``generate_season_dataset``.
    """
    for current_season in seasons:
        generate_season_dataset(season=current_season, span=span, isWomens=isWomens)

In [56]:
# generate_datasets(['2020', '2021', '2022', '2023'], isWomens=True)

## Create full train-test split
A typical train-test split for machine learning models uses 70% of the data for training and the remaining 30% for testing.

In [57]:
def generate_test_train(split_neutral = False, span = 5, isWomens = False, test_size = 0.3, random_state = None):
    """Merge the per-season datasets of one span and write train/test CSVs.

    Every file in ``../../data/dataset/{men|women}/`` named exactly
    ``<4 digits>_{span}span_dataset.csv`` is loaded (in sorted name order) and
    concatenated; rows with nulls and duplicate rows are dropped, then the data
    is split with ``train_test_split`` and written back to the same folder.

    Args:
        split_neutral: When true, rows with ``Neutral == 1`` and rows with
            ``Neutral == 0`` are split and saved separately
            (``{span}span_neutral_*`` / ``{span}span_home_away_*``); otherwise
            a single ``{span}span_training_set.csv`` /
            ``{span}span_testing_set.csv`` pair is written.
        span: Moving-average span selecting which dataset files to use.
        isWomens: Use the women's dataset folder when true.
        test_size: Forwarded to ``train_test_split`` (default 0.3).
        random_state: Forwarded to ``train_test_split``; pass an int to make
            the split reproducible. ``None`` keeps the unseeded behaviour.

    Raises:
        ValueError: If no dataset file for ``span`` exists in the folder.
    """
    gender_dir = 'women' if isWomens else 'men'
    data_dir_path = os.path.abspath(f'../../data/dataset/{gender_dir}/')

    # fullmatch: the whole file name must fit, so e.g. a trailing ".bak" is rejected.
    pattern = re.compile(r"\d{4}_" + re.escape(str(span)) + r"span_dataset\.csv")
    # os.listdir order is arbitrary; sort so the merged row order is deterministic.
    filenames = sorted(name for name in os.listdir(data_dir_path) if pattern.fullmatch(name))
    if not filenames:
        raise ValueError(f'No {span}span_dataset.csv files found in {data_dir_path}')

    merged_df = pd.concat(
        [pd.read_csv(os.path.join(data_dir_path, name)) for name in filenames],
        ignore_index=True,
    )

    print(f'Original shape: {merged_df.shape}')
    merged_df = merged_df.dropna()
    print(f'Shape after dropping rows with null values: {merged_df.shape}')
    merged_df = merged_df.drop_duplicates()
    print(f'Shape after dropping duplicate rows: {merged_df.shape}')

    # Each entry: (label used in the printed summary, file-name infix, rows to split)
    if split_neutral:
        groups = [
            ('neutral ', 'neutral_', merged_df[merged_df['Neutral'] == 1]),
            ('home/away ', 'home_away_', merged_df[merged_df['Neutral'] == 0]),
        ]
    else:
        groups = [('', '', merged_df)]

    # Perform every split before writing anything, so a failing split does not
    # leave a partial set of output files behind.
    splits = [
        (label, infix, train_test_split(group_df, test_size=test_size, random_state=random_state))
        for label, infix, group_df in groups
    ]

    for label, _, (train_df, test_df) in splits:
        print(len(train_df), f'{label}train examples')
        print(len(test_df), f'{label}test examples')

    for _, infix, (train_df, test_df) in splits:
        train_df.to_csv(os.path.join(data_dir_path, f'{span}span_{infix}training_set.csv'), index=False)
        test_df.to_csv(os.path.join(data_dir_path, f'{span}span_{infix}testing_set.csv'), index=False)

In [58]:
# generate_test_train(isWomens=True)

In [59]:
# isWomens = True
# spans = [3, 5, 7]
# for span in spans:
#     generate_moving_averages(['2020', '2021', '2022', '2023'], span=span, isWomens=isWomens)
#     merge_opponent_data(['2020', '2021', '2022', '2023'], span=span, isWomens=isWomens)
#     generate_datasets(['2020', '2021', '2022', '2023'], span=span, isWomens=isWomens)
#     generate_test_train(split_neutral = False, span=span, isWomens=isWomens)

# Latest statistics for each team

In [60]:
def generate_latest_stats_dict(season = 2024, spans = [3, 5, 7], isWomens = False):
    """Refresh one season's game logs and export each team's latest moving averages.

    Runs the download / basic / advanced / combine game-log steps for
    ``season``; then, for every span, regenerates the moving averages with
    ``keep_latest = True`` and reads each team's ``{school_key}_{span}ma.csv``.
    From each file it takes the single row whose ``Date`` and ``Opp key`` both
    equal ``LATEST``, keeps the ``FINAL_FEATURES`` columns that do not contain
    ``opp_`` (and are not ``Neutral`` / ``Win``), and stores them together with
    the ``Date`` of the last row that is not a ``LATEST`` row. Teams without a
    file are skipped.

    The result is also written to ``men_data.json`` / ``women_data.json`` in
    the current working directory.

    Args:
        season: Season to refresh and read.
        spans: Moving-average spans to export. The default list is only
            iterated, never mutated.
        isWomens: Use the women's school keys and output file when true.

    Returns:
        dict: ``{span: {school_key: {'lastPlayed': date, 'stats': [values]}}}``.

    Raises:
        AssertionError: If a team file does not contain exactly one ``LATEST``
            row, or the selected columns do not line up with the expected
            feature list.
        IndexError: If a team file has no row other than ``LATEST`` rows.
    """
    download_gamelogs_for_single_season(season, isWomens)
    create_basic_gamelogs_for_single_season(season, isWomens)
    create_advanced_gamelogs_for_single_season(season, isWomens)
    combine_basic_advanced_gamelogs_for_single_season(season, isWomens)

    # Loop-invariant inputs, computed once instead of per span / per team.
    teams = WOMENS_SR_SCHOOL_KEYS if isWomens else MENS_SR_SCHOOL_KEYS
    final_features_no_opp = [
        feature for feature in FINAL_FEATURES
        if 'opp_' not in feature and feature not in ['Neutral', 'Win']
    ]

    latest_stats = {}
    for span in spans:
        latest_stats[span] = {}
        generate_moving_averages_for_single_season(season, keep_latest = True, span = span, isWomens = isWomens)

        for school_key in tqdm(teams):
            file_path = get_team_season_file_path(school_key, season, f'{school_key}_{span}ma.csv', isWomens)
            if not os.path.exists(file_path):
                continue

            team_df = pd.read_csv(file_path)

            # Date of the last row that is not flagged as LATEST
            played_rows = team_df.loc[(team_df['Date'] != LATEST) & (team_df['Opp key'] != LATEST)]
            date = played_rows['Date'].iat[-1]

            # Keep only the LATEST row; there must be exactly one
            team_df = team_df.loc[(team_df['Date'] == LATEST) & (team_df['Opp key'] == LATEST)]
            assert team_df.shape[0] == 1

            # Remove meta columns and raw (non moving average) columns, then
            # put the remaining columns in the canonical feature order
            team_df = (
                team_df
                .drop(META_LABELS, axis=1)
                .drop(STAT_LABELS, axis=1)
                .reindex(final_features_no_opp, axis=1)
            )

            columns = team_df.columns.to_list()
            assert len(columns) == len(final_features_no_opp)
            assert all([columns[i] == final_features_no_opp[i] for i in range(len(final_features_no_opp))])

            latest = team_df.iloc[-1:].to_numpy().tolist()
            latest_stats[span][school_key] = {'lastPlayed': date, 'stats': latest[0]}

    file_prefix = 'women' if isWomens else 'men'
    with open(f'{file_prefix}_data.json', 'w') as json_file:
        json.dump(latest_stats, json_file, indent=2)

    return latest_stats

In [61]:
# generate_latest_stats_dict(isWomens=False)
# generate_latest_stats_dict(isWomens=True)

# Top 25

In [62]:
# Model names are hard-coded; the commented lines below would instead derive
# them from the .pkl files on disk.
# mens_models = [model_filename.split('.pkl')[0] for model_filename in os.listdir('../../machine-learning/model/mens/')]
# womens_models = [model_filename.split('.pkl')[0] for model_filename in os.listdir('../../machine-learning/model/womens/')]
mens_models = [f"{model_span}span_ensemble" for model_span in (3, 5, 7)]
womens_models = [f"{model_span}span_ensemble" for model_span in (3, 5, 7)]

# School keys of the men's top-25 teams (presumably in poll order - verify).
mens_ap_top_25_teams = """
    connecticut purdue alabama houston tennessee
    illinois north-carolina iowa-state duke north-carolina-state
    marquette arizona creighton clemson gonzaga
    baylor san-diego-state auburn kansas kentucky
    saint-marys-ca utah-state washington-state dayton south-carolina
""".split()

# School keys of the women's top-25 teams (presumably in poll order - verify).
womens_ap_top_25_teams = """
    south-carolina iowa connecticut north-carolina-state southern-california
    louisiana-state texas oregon-state stanford ucla
    notre-dame indiana baylor gonzaga colorado
    ohio-state duke virginia-tech kansas-state syracuse
    oklahoma utah creighton west-virginia iowa-state
""".split()

def generate_top25(models, top_25_teams, isWomens = False, api_url = 'http://mlmb-api.azurewebsites.net/predict', chunk_size = 60, timeout = 120):
    """Score each team by summing predicted probabilities over all pairings.

    Builds one prediction request for every ordered pair of distinct teams,
    every model and both ``isNeutral`` values, posts them to the prediction
    API in chunks, and adds ``predictProba[0]`` to ``team1``'s total and
    ``predictProba[1]`` to ``team2``'s total (presumably each side's win
    probability - confirm against the API). Totals are rounded to two decimals
    and also written to ``men_top_25.json`` / ``women_top_25.json`` in the
    current working directory.

    Args:
        models: Model names sent in the ``model`` field of each request item.
        top_25_teams: Team keys to pair up.
        isWomens: Sent in each request item; also selects the output file name.
        api_url: Prediction endpoint that receives the POST requests.
        chunk_size: Number of request items sent per POST.
        timeout: Seconds to wait for each POST before giving up.

    Returns:
        dict: Team key -> summed probability, rounded to two decimals, in
        order of first appearance in the predictions.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    # Same ordering as nested loops: pairing, then model, then neutral flag.
    api_body = [
        {"model": model, "isNeutral": isNeutral, "team1": team1, "team2": team2, "isWomens": isWomens}
        for team1, team2 in permutations(top_25_teams, 2)
        for model in models
        for isNeutral in (True, False)
    ]

    chunks = [api_body[start:start + chunk_size] for start in range(0, len(api_body), chunk_size)]

    all_predictions = []
    for chunk in tqdm(chunks):
        response = requests.post(url=api_url, json=chunk, timeout=timeout)
        # Fail loudly on an HTTP error rather than merging an error payload
        # into the prediction list.
        response.raise_for_status()
        all_predictions.extend(response.json())

    # Accumulate unrounded totals and round once at the end: rounding after
    # every addition lets up to 0.005 of error creep in per prediction.
    totals = {}
    for prediction in all_predictions:
        team1_proba, team2_proba = prediction['predictProba'][0], prediction['predictProba'][1]
        totals[prediction['team1']] = totals.get(prediction['team1'], 0) + team1_proba
        totals[prediction['team2']] = totals.get(prediction['team2'], 0) + team2_proba

    scores = {team: round(total, 2) for team, total in totals.items()}

    file_prefix = 'women' if isWomens else 'men'
    with open(f'{file_prefix}_top_25.json', 'w') as json_file:
        json.dump(scores, json_file, indent=2)
    return scores

# generate_top25(mens_models, mens_ap_top_25_teams, isWomens = False)
# generate_top25(womens_models, womens_ap_top_25_teams, isWomens = True)

100%|██████████| 60/60 [04:20<00:00,  4.33s/it]
100%|██████████| 60/60 [03:18<00:00,  3.32s/it]


{'south-carolina': 218.46,
 'iowa': 164.46,
 'connecticut': 186.89,
 'north-carolina-state': 110.57,
 'southern-california': 143.06,
 'louisiana-state': 167.35,
 'texas': 192.32,
 'oregon-state': 126.77,
 'stanford': 167.46,
 'ucla': 167.59,
 'notre-dame': 149.81,
 'indiana': 162.91,
 'baylor': 121.72,
 'gonzaga': 163.51,
 'colorado': 139.22,
 'ohio-state': 131.64,
 'duke': 114.54,
 'virginia-tech': 139.98,
 'kansas-state': 123.08,
 'syracuse': 95.25,
 'oklahoma': 107.69,
 'utah': 141.92,
 'creighton': 140.08,
 'west-virginia': 116.95,
 'iowa-state': 106.77}