In [1]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta
from correlate_event_to_game_schedule import correlate_event_to_game_schedule
from notes_filter import notes_filter

#--------------------------User Inputs---------------------------------
# Absolute paths of the two input CSV files read below
injurydata_file_path = '/Users/nya/NBA project/NBA Project/01/clean datasets/injurydata_cleaned.csv'
schedule_file_path = '/Users/nya/NBA project/NBA Project/02/processed datasets/teams_schedule_processed.csv'

#-------------------------Load Files-----------------------------------
# Both files are read the same way: as CSV, with the 'Date' column parsed
# into datetimes. The injury file is read first, then the schedule file.
injury_events_df, teams_schedule_df = (
    pd.read_csv(csv_path, parse_dates=['Date'])
    for csv_path in (injurydata_file_path, schedule_file_path)
)

#-----Processing - Section 1: Count Number of Missed Games--------
print('Counting number of missed games')

# Create the output columns up front so the row-wise .loc writes below
# target existing columns. Rows for which no schedule match is found keep
# these defaults (NaN / None).
injury_events_df['reg_games_missed'] = np.nan
injury_events_df['post_games_missed'] = np.nan
injury_events_df['Season'] = None
injury_events_df['Year'] = None
injury_events_df['Game_number'] = None

# Walk every injury event and match it to its team's schedule
for index, row in injury_events_df.iterrows():
    results = correlate_event_to_game_schedule(teams_schedule_df, row['Date'], row['Team'])
    if not results:
        # No usable match for this event: leave the defaults in place
        continue

    game_number_ie, game_date_ie, season_ie, year_ie, teams_total_ie = results

    # A missing note arrives from read_csv as NaN (a float), which would make
    # the substring checks below raise TypeError. Treat it as an empty note.
    # NOTE(review): an empty note is then counted like any other note that
    # lacks 'returned to lineup' - confirm that is the desired handling.
    notes = row['Notes'] if isinstance(row['Notes'], str) else ''

    # An event counts as a missed game when it is Day-To-Day (DTD) or is not a
    # 'returned to lineup' entry. A pure 'returned to lineup' note counts as
    # zero missed games.
    counts_as_missed = 'DTD' in notes or 'returned to lineup' not in notes

    # 1 is a placeholder value for missed games; it is credited to the season
    # type the event falls in ('regular' or 'post'), otherwise both stay 0.
    regular_season_games_missed = 1 if counts_as_missed and season_ie == 'regular' else 0
    post_season_games_missed = 1 if counts_as_missed and season_ie == 'post' else 0

    # Write the calculated values back onto this event's row
    injury_events_df.loc[index, 'reg_games_missed'] = regular_season_games_missed
    injury_events_df.loc[index, 'post_games_missed'] = post_season_games_missed
    injury_events_df.loc[index, 'Season'] = season_ie
    injury_events_df.loc[index, 'Year'] = year_ie
    injury_events_df.loc[index, 'Game_number'] = game_number_ie

# Coerce 'Year' to numbers (values that cannot be converted, such as None,
# become NaN) and store it as a nullable integer column
year_as_number = pd.to_numeric(injury_events_df['Year'], errors='coerce')
injury_events_df['Year'] = year_as_number.astype('Int64')

# Placeholder column so the layout matches the inactive lists format
injury_events_df['Out_for_Season'] = ""

# Keep only the rows whose 'Acquired' value is missing (this removes the
# 'activated' / 'acquired' rows), then discard the 'Acquired' column itself
acquired_is_missing = injury_events_df['Acquired'].isna()
injury_events_df = injury_events_df.loc[acquired_is_missing].drop(columns='Acquired')

#------------Processing - Section 2: Filter Notes --------------------
print('Filtering Notes')

# Run notes_filter on every row; result_type='expand' spreads each returned
# sequence across columns, so the first two results are in columns 0 and 1
notes_columns = injury_events_df.apply(notes_filter, axis=1, result_type='expand')

# Attach the two results to the events frame as named columns
injury_events_df['note_keyword'] = notes_columns[0]
injury_events_df['category'] = notes_columns[1]

#-----------------------Save Data-----------------------------
# Folder that receives the processed output, and the name of the output file
save_directory = '/Users/nya/NBA project/NBA Project/02/processed datasets'
output_file_name = 'injurydata_processed.csv'

# Full path of the CSV to write
cleaned_data_path = os.path.join(save_directory, output_file_name)

# Write the frame without its index column, then report where it went
injury_events_df.to_csv(cleaned_data_path, index=False)
print(f"Cleaned dataset saved to {cleaned_data_path}")

Counting number of missed games
Filtering Notes
Cleaned dataset saved to /Users/nya/NBA project/NBA Project/02/processed datasets/injurydata_processed.csv
