In [None]:
# Execute the companion notebook before anything else.
# NOTE(review): the cells below use `pd`, `metadata_df` and `rosters_df` without
# defining them, so they are presumably created by this run - confirm against
# clean_data.ipynb.
%run clean_data.ipynb

In [None]:
def process_game(game_id, base_path='C:/Users/erden/OneDrive/Desktop/PFFFC/'):
    """Load one game's JSONL file and keep only Argentina's game events.

    Parameters
    ----------
    game_id : int or str
        Identifier of the game; the file read is ``<base_path>/<game_id>.jsonl``.
    base_path : str
        Directory holding the per-game JSONL files. A trailing separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``game_event`` is a dict with ``team_name`` equal to
        ``'Argentina'``, with two columns appended: ``gameId`` (the
        ``game_id`` argument) and ``sequence`` (``game_event['sequence']``,
        missing when the key is absent).
    """
    # Function-scope import: this notebook has no import cell of its own.
    import os

    print(f'Starting {game_id}')

    # os.path.join copes with base_path given with or without a trailing
    # separator; plain string concatenation silently produced a wrong path.
    file_path = os.path.join(base_path, f'{game_id}.jsonl')
    game_df = pd.read_json(file_path, lines=True)

    # One pass covers both original filters: a null game_event is not a dict,
    # so it is rejected together with events of other teams. The cast keeps
    # the mask boolean even when the frame has no rows.
    is_argentina = game_df['game_event'].map(
        lambda event: isinstance(event, dict) and event.get('team_name') == 'Argentina'
    ).astype(bool)

    # .assign builds a new frame, so nothing is written into a filtered slice
    # (the original column assignment could emit SettingWithCopyWarning).
    game_argentina = game_df.loc[is_argentina].assign(
        gameId=game_id,
        sequence=lambda df: df['game_event'].map(lambda event: event.get('sequence')),
    )

    return game_argentina

In [None]:
# Ids of every match in which Argentina is the home or the away side.
# Game 10517 is excluded; the reason is not recorded in this notebook.
arg_games = metadata_df.loc[
    (
        (metadata_df['awayTeam.name'] == "Argentina")
        | (metadata_df['homeTeam.name'] == "Argentina")
    )
    & (metadata_df['id'] != 10517),
    'id',
]

In [None]:
# Load each selected match and stack the per-game frames into a single table
# with a fresh 0..n-1 index.
games_df = pd.concat(list(map(process_game, arg_games)), ignore_index=True)

In [None]:
# Keep only the rows that carry a possession event id.
games_df = games_df[games_df['possession_event_id'].notna()]

In [None]:
def create_cleaned_tracking_df(rosters_df, metadata_df, game_df):
    """Build a long tracking table with one row per player per frame.

    Steps:
      1. Left-join ``game_df`` to ``metadata_df`` (``gameId`` -> ``id``) to
         attach the home/away team ids and names and ``homeTeamStartLeft``.
      2. Explode ``homePlayersSmoothed`` and ``awayPlayersSmoothed`` so every
         player dict becomes its own row, with the dict keys as columns.
      3. Left-join the roster on (game, team, shirt number) to attach the
         player columns.
      4. Derive ``ToRight`` and the direction-normalised coordinates.

    Parameters
    ----------
    rosters_df : pandas.DataFrame
        Needs ``game_id``, ``team.id``, ``team.name``, ``shirtNumber``,
        ``player.id`` and ``player.nickname``. It is not modified.
    metadata_df : pandas.DataFrame
        Needs ``id``, ``homeTeam.id``, ``homeTeam.name``, ``awayTeam.id``,
        ``awayTeam.name`` and ``homeTeamStartLeft``.
    game_df : pandas.DataFrame
        Needs ``gameId``, ``sequence``, ``possession_event_id``,
        ``game_event_id``, ``frameNum``, ``period`` and the two player
        columns named above, each holding a list of dicts per row. The dicts
        are expected to provide ``jerseyNum``, ``x``, ``y`` and
        ``confidence`` (assumption - verify against the raw tracking feed).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``game_id``, ``sequence``, ``possession_event_id``,
        ``game_event_id``, ``frameNum``, ``period``, ``player.id``,
        ``player.nickname``, ``team.id``, ``team.name``, ``Home``,
        ``ToRight``, ``x``, ``y``, ``x_normalized``, ``y_normalized``,
        ``confidence``. Home-team rows come first, then away-team rows.
    """
    # Step 1: attach team information to every frame.
    team_columns = [
        'id', 'homeTeam.id', 'homeTeam.name',
        'awayTeam.id', 'awayTeam.name', 'homeTeamStartLeft',
    ]
    frames = game_df.merge(
        metadata_df[team_columns], left_on='gameId', right_on='id', how='left'
    )

    # Step 2: one row per player.
    def explode_tracking_data(df, col_name, team_id_col, team_name_col, home_indicator):
        # A fresh index keeps the column-wise concat below free of duplicate
        # index labels.
        exploded = df.explode(col_name).reset_index(drop=True)
        # Building the frame from the list of dicts in one go replaces
        # Series.apply(pd.Series), which constructs a Series per row.
        # Non-dict entries (e.g. NaN for a frame without players) become
        # rows of missing values.
        records = [
            player if isinstance(player, dict) else {}
            for player in exploded[col_name]
        ]
        players = pd.DataFrame(records, index=exploded.index)
        exploded = pd.concat([exploded.drop(columns=[col_name]), players], axis=1)
        # The team columns were merged in during step 1, so copy them row by
        # row instead of relying on index alignment with an outer Series.
        exploded['team.id'] = exploded[team_id_col]
        exploded['team.name'] = exploded[team_name_col]
        exploded['Home'] = home_indicator
        return exploded

    home_tracking = explode_tracking_data(
        frames, 'homePlayersSmoothed', 'homeTeam.id', 'homeTeam.name', 1
    )
    away_tracking = explode_tracking_data(
        frames, 'awayPlayersSmoothed', 'awayTeam.id', 'awayTeam.name', 0
    )
    tracking_data = pd.concat([home_tracking, away_tracking], ignore_index=True)

    # Step 3: attach player information. The shirt number is cast to str on
    # a derived frame, so the caller's rosters_df is left untouched.
    roster_lookup = rosters_df.drop(columns=['team.name']).assign(
        shirtNumber=lambda df: df['shirtNumber'].astype(str)
    )
    tracking_data = tracking_data.merge(
        roster_lookup,
        left_on=['gameId', 'team.id', 'jerseyNum'],
        right_on=['game_id', 'team.id', 'shirtNumber'],
        how='left',
    )
    # 'game_id' comes from the roster side of a left join and is therefore
    # missing for players without a roster match; the tracking side always
    # knows the game, so take the value from there.
    tracking_data['game_id'] = tracking_data['gameId']

    # Step 4: ToRight, vectorised. The boolean algebra mirrors the original
    # row-wise rule exactly, including a period that is neither odd nor even
    # (e.g. missing), which yields 0 for both teams.
    # Presumably ToRight marks the side a team attacks - confirm.
    start_left = tracking_data['homeTeamStartLeft'].astype(bool)
    period_parity = tracking_data['period'] % 2
    odd_period = period_parity == 1
    even_period = period_parity == 0
    home_to_right = (start_left & odd_period) | (~start_left & even_period)
    away_to_right = (~start_left & odd_period) | (start_left & even_period)
    is_home = tracking_data['Home'] == 1
    tracking_data['ToRight'] = home_to_right.where(is_home, away_to_right).astype(int)

    # Negate both coordinates for rows with ToRight == 0.
    keeps_sign = tracking_data['ToRight'] != 0
    tracking_data['x_normalized'] = tracking_data['x'].where(keeps_sign, -tracking_data['x'])
    tracking_data['y_normalized'] = tracking_data['y'].where(keeps_sign, -tracking_data['y'])

    # Step 5: select and order the output columns.
    cleaned_tracking_df = tracking_data[[
        'game_id', 'sequence', 'possession_event_id', 'game_event_id',
        'frameNum', 'period',
        'player.id', 'player.nickname', 'team.id', 'team.name', 'Home',
        'ToRight', 'x', 'y', 'x_normalized', 'y_normalized', 'confidence'
    ]].copy()

    return cleaned_tracking_df

In [None]:
# Build the per-player tracking table for the filtered Argentina events.
clean_tracking_df = create_cleaned_tracking_df(
    rosters_df=rosters_df,
    metadata_df=metadata_df,
    game_df=games_df,
)

In [None]:
# Persist the cleaned table; the DataFrame index is written as the first,
# unnamed CSV column (to_csv default).
clean_tracking_df.to_csv(
    path_or_buf='C:/Users/erden/OneDrive/Desktop/PFFFC/clean_tracking.csv'
)

In [None]:
# NOTE(review): in the cells shown here `home_tracking` and `away_tracking` are
# only assigned as locals inside create_cleaned_tracking_df. Unless
# clean_data.ipynb defines them at notebook scope, this cell raises NameError on
# a fresh kernel. It repeats the concat done inside that function - confirm
# whether it is a leftover that should be deleted.
tracking_data = pd.concat([home_tracking, away_tracking], ignore_index=True)