This file creates week folders 1-18, creates a CSV file for each game inside the correct week, and sorts the plays from start to finish and places into the game CSV file.

In [1]:
import pandas as pd
import re
import os

In [15]:
df = pd.read_csv('../../Data/pbp-2023.csv')

In [16]:
dropped_columns = ['Unnamed: 10', 'Unnamed: 12', 'Unnamed: 16', 'Unnamed: 17', 'IsMeasurement']
df = df.drop(columns = dropped_columns)

In [17]:
teams = []

for team in df['OffenseTeam']:
    if team not in teams:
        teams.append(team)

In [18]:
dates = (df['GameDate'].unique())
dates = pd.to_datetime(dates)
dates = sorted(dates)

In [7]:
def get_week(dates):
    week_numbers = []
    week_num = 1

    # Initialize week_start to the Thursday before the first game
    week_start = dates[0] - pd.Timedelta(days=(dates[0].weekday() + 4) % 7)

    for date in dates:
        # Check if we need to start a new week
        if date - week_start >= pd.Timedelta(days=7):  
            week_num += 1
            week_start = date - pd.Timedelta(days=(date.weekday() + 4) % 7)  # Adjust to the new Thursday

        date_str = date.strftime('%Y-%m-%d')
        week_numbers.append((date_str, week_num))

    return week_numbers

In [8]:
dates_with_weeks = get_week(dates)

In [9]:
for i in range(1, 19):
    file_dir = os.path.join('../../Clean_Data/Weeks', str(i))
    os.makedirs(file_dir)

In [10]:
dir = '../../Clean_Data/Weeks'
def create_files(week, game_id_arr):
    week_dir = os.path.join(dir, week)

    if not os.path.exists(week_dir):
        os.makedirs(week_dir)

    for game_id in game_id_arr:
        file_path = os.path.join(week_dir, f"{game_id}.csv")
        with open(file_path, 'w') as f:
            pass 
        


In [12]:
# Create blank placeholder files for each game inside of correct week folder

current_week = 1

game_ids = []

for date in dates_with_weeks:
    if date[1] is not current_week:
        create_files(str(current_week), game_ids)
        game_ids = []
        current_week += 1
       
    games_on_date = df[df['GameDate'] == date[0]]

    for _, row in games_on_date.iterrows():
        if row['GameId'] not in game_ids:
            game_ids.append(row['GameId'])

create_files(str(current_week), game_ids)


In [19]:
# Helper function for duration to order plays from start to finish of each game

def calc_duration(play):
    # elapsed
    quarter_second = {
        1 : 0,
        2 : 900,
        3 : 1800,
        4 : 2700,
        5 : 3600, # OT
    }

    quarter_elapsed_seconds = 900 - ((play['Minute'] * 60) + play['Second'])
    
    elapsed_seconds = quarter_second.get(play['Quarter'], 0) + quarter_elapsed_seconds

    return elapsed_seconds

In [20]:
# Place plays from each game into correct file where plays are ordered from beginning to end.

directory = '../../Clean_Data/Weeks'

for week in os.listdir(directory):
    week_path = os.path.join(directory, week)

    for game in os.listdir(os.path.join(directory, week)):
        game_path = os.path.join(week_path, game)

        game_id = game[:-4] # remove file ext

        game_df = df[df['GameId'] == int(game_id)]
        
        game_df['Duration'] = game_df.apply(calc_duration, axis=1)


        offense_team = game_df.iloc[0]['OffenseTeam']
        defense_team = game_df.iloc[0]['DefenseTeam']

        game_df[f'{offense_team}'] = 0
        game_df[f'{defense_team}'] = 0

        game_df.sort_values(by='Duration', inplace=True)
        game_df.to_csv(game_path, index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_df['Duration'] = game_df.apply(calc_duration, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_df[f'{offense_team}'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  game_df[f'{defense_team}'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i