<a href="https://colab.research.google.com/github/marclamberts/football-analysis/blob/main/Substitute_Interval_Entropy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Load data
file_path = '/content/drive/MyDrive/Recruitment/Brøndby 1-1 Nordsjælland.csv'
df = pd.read_csv(file_path)

# Create output directory
output_dir = '/content/drive/MyDrive/Recruitment/Match Analysis'
os.makedirs(output_dir, exist_ok=True)

# Get all key events
interval_events = df[df['typeId'].isin([16, 18, 19])][['typeId', 'playerName', 'contestantId', 'timeMin']]
pass_events = df[(df['typeId'] == 1) & (df['outcome'] == 1)][['contestantId', 'timeMin', 'playerName']]
shot_events = df[df['typeId'].isin([13, 14, 15, 16])][['contestantId', 'timeMin', 'playerName']]

# Match structure
match_intervals = {
    'first_half_start': 0,
    'first_half_end': 45,
    'second_half_start': 45,
    'match_end': max(df['timeMin']) if not df.empty else 90
}

# Entropy calculation function
def calculate_entropy(actions):
    if len(actions) == 0:
        return 0
    proportions = np.array(actions) / sum(actions)
    return -np.sum(proportions * np.log2(proportions + 1e-10))

# ==================== PLAYER DATA ====================
player_data = []
for team_id in interval_events['contestantId'].unique():
    team_events = interval_events[interval_events['contestantId'] == team_id]
    team_passes = pass_events[pass_events['contestantId'] == team_id]
    team_shots = shot_events[shot_events['contestantId'] == team_id]

    subs_on = team_events[team_events['typeId'] == 18]
    subs_off = team_events[team_events['typeId'] == 19]

    for player in set(subs_on['playerName']).union(set(subs_off['playerName'])):
        on = sorted(subs_on[subs_on['playerName'] == player]['timeMin'])
        off = sorted(subs_off[subs_off['playerName'] == player]['timeMin'])

        if off and (not on or off[0] < on[0]):
            on.insert(0, match_intervals['first_half_start'])
        if on and (not off or on[-1] > off[-1]):
            off.append(match_intervals['match_end'])

        # Calculate per-interval actions
        interval_actions = []
        total_passes = 0
        total_shots = 0

        for start, end in zip(on, off):
            # Get actions during this specific interval
            passes = team_passes[
                (team_passes['playerName'] == player) &
                (team_passes['timeMin'] >= start) &
                (team_passes['timeMin'] < end)
            ]
            shots = team_shots[
                (team_shots['playerName'] == player) &
                (team_shots['timeMin'] >= start) &
                (team_shots['timeMin'] < end)
            ]

            pass_cnt = len(passes)
            shot_cnt = len(shots)
            total_passes += pass_cnt
            total_shots += shot_cnt

            if pass_cnt > 0 or shot_cnt > 0:
                interval_actions.append({
                    'start': start,
                    'end': end,
                    'passes': pass_cnt,
                    'shots': shot_cnt,
                    'duration': end - start
                })

        # Calculate entropy based on per-interval actions
        pass_entropy = calculate_entropy([ia['passes'] for ia in interval_actions])
        shot_entropy = calculate_entropy([ia['shots'] for ia in interval_actions])
        total_mins = sum(ia['duration'] for ia in interval_actions)

        player_data.append({
            'TeamID': team_id,
            'PlayerName': player,
            'TotalMinutes': total_mins,
            'TotalPasses': total_passes,
            'TotalShots': total_shots,
            'PassEntropy': pass_entropy,
            'ShotEntropy': shot_entropy,
            'PassPerMin': round(total_passes/total_mins, 2) if total_mins > 0 else 0,
            'ShotPerMin': round(total_shots/total_mins, 2) if total_mins > 0 else 0,
            'ActiveIntervals': len(interval_actions),
            'FirstIntervalStart': on[0],
            'LastIntervalEnd': off[-1]
        })

player_df = pd.DataFrame(player_data)

# ==================== TEAM SEGMENT DATA ====================
all_points = sorted(set(interval_events['timeMin'].tolist() + [
    match_intervals['first_half_start'],
    match_intervals['first_half_end'],
    match_intervals['second_half_start'],
    match_intervals['match_end']
]))

team_segments = []
for seg_num, (start, end) in enumerate(zip(all_points[:-1], all_points[1:]), 1):
    segment_data = {
        'Segment': seg_num,
        'StartMin': start,
        'EndMin': end,
        'Duration': end - start
    }

    for team_id in interval_events['contestantId'].unique():
        team_passes = pass_events[
            (pass_events['contestantId'] == team_id) &
            (pass_events['timeMin'] >= start) &
            (pass_events['timeMin'] < end)
        ]
        team_shots = shot_events[
            (shot_events['contestantId'] == team_id) &
            (shot_events['timeMin'] >= start) &
            (shot_events['timeMin'] < end)
        ]

        pass_entropy = calculate_entropy(team_passes['playerName'].value_counts())
        shot_entropy = calculate_entropy(team_shots['playerName'].value_counts())

        segment_data.update({
            f'Team_{team_id}_Passes': len(team_passes),
            f'Team_{team_id}_PassEntropy': pass_entropy,
            f'Team_{team_id}_Shots': len(team_shots),
            f'Team_{team_id}_ShotEntropy': shot_entropy,
            f'Team_{team_id}_PassPerMin': round(len(team_passes)/(end-start), 2) if (end-start) > 0 else 0,
            f'Team_{team_id}_ShotPerMin': round(len(team_shots)/(end-start), 2) if (end-start) > 0 else 0
        })

    team_segments.append(segment_data)

team_df = pd.DataFrame(team_segments)

# ==================== SAVE FILES ====================
team_file = f"{output_dir}/Team_Segment_Performance.xlsx"
player_file = f"{output_dir}/Player_Performance_Corrected.xlsx"

team_df.to_excel(team_file, index=False)
player_df.to_excel(player_file, index=False)

print("Files successfully created:")
print(f"- Team segment performance: {team_file}")
print(f"- Corrected player performance: {player_file}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files successfully created:
- Team segment performance: /content/drive/MyDrive/Recruitment/Match Analysis/Team_Segment_Performance.xlsx
- Corrected player performance: /content/drive/MyDrive/Recruitment/Match Analysis/Player_Performance_Corrected.xlsx


  proportions = np.array(actions) / sum(actions)


In [16]:
import pandas as pd
import numpy as np
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Load data
file_path = '/content/drive/MyDrive/Recruitment/Brøndby 1-1 Nordsjælland.csv'
df = pd.read_csv(file_path)

# Create output directory
output_dir = '/content/drive/MyDrive/Recruitment/Match Analysis'
os.makedirs(output_dir, exist_ok=True)

# Get all key events - corrected definitions
sub_events = df[df['typeId'].isin([18, 19])][['typeId', 'playerName', 'contestantId', 'timeMin']]
pass_events = df[(df['typeId'] == 1) & (df['outcome'] == 1)][['contestantId', 'timeMin', 'playerName']]
shot_events = df[df['typeId'].isin([13, 14, 15, 16])][['contestantId', 'timeMin', 'playerName', 'typeId']]

# Match structure
match_intervals = {
    'first_half_start': 0,
    'first_half_end': 45,
    'second_half_start': 45,
    'match_end': max(df['timeMin']) if not df.empty else 90
}

# Entropy calculation function
def calculate_entropy(actions):
    if len(actions) == 0:
        return 0
    proportions = np.array(actions) / sum(actions)
    return -np.sum(proportions * np.log2(proportions + 1e-10))

# ==================== PLAYER PERFORMANCE ====================
player_data = []
for team_id in sub_events['contestantId'].unique():
    team_subs = sub_events[sub_events['contestantId'] == team_id]
    team_passes = pass_events[pass_events['contestantId'] == team_id]
    team_shots = shot_events[shot_events['contestantId'] == team_id]

    subs_on = team_subs[team_subs['typeId'] == 18]
    subs_off = team_subs[team_subs['typeId'] == 19]

    for player in set(subs_on['playerName']).union(set(subs_off['playerName'])):
        # Get all intervals the player was active
        on_times = sorted(subs_on[subs_on['playerName'] == player]['timeMin'])
        off_times = sorted(subs_off[subs_off['playerName'] == player]['timeMin'])

        # Handle players who started (no sub-on before first sub-off)
        if off_times and (not on_times or off_times[0] < on_times[0]):
            on_times.insert(0, match_intervals['first_half_start'])
        # Handle players who weren't subbed off
        if on_times and (not off_times or on_times[-1] > off_times[-1]):
            off_times.append(match_intervals['match_end'])

        # Calculate stats for each active interval
        intervals = []
        total_passes = 0
        total_shots = 0

        for start, end in zip(on_times, off_times):
            # Passes in this interval
            passes = team_passes[
                (team_passes['playerName'] == player) &
                (team_passes['timeMin'] >= start) &
                (team_passes['timeMin'] < end)
            ]
            pass_cnt = len(passes)
            total_passes += pass_cnt

            # Shots in this interval
            shots = team_shots[
                (team_shots['playerName'] == player) &
                (team_shots['timeMin'] >= start) &
                (team_shots['timeMin'] < end)
            ]
            shot_cnt = len(shots)
            total_shots += shot_cnt

            intervals.append({
                'start': start,
                'end': end,
                'passes': pass_cnt,
                'shots': shot_cnt,
                'duration': end - start
            })

        # Calculate metrics
        total_mins = sum(i['duration'] for i in intervals)
        pass_entropy = calculate_entropy([i['passes'] for i in intervals])
        shot_entropy = calculate_entropy([i['shots'] for i in intervals])

        # Shot type breakdown
        player_shots = team_shots[team_shots['playerName'] == player]
        shot_types = player_shots['typeId'].value_counts().to_dict()

        player_data.append({
            'TeamID': team_id,
            'PlayerName': player,
            'TotalMinutes': total_mins,
            'Passes': total_passes,
            'PassesPerMin': round(total_passes/total_mins, 2) if total_mins > 0 else 0,
            'PassEntropy': pass_entropy,
            'Shots': total_shots,
            'ShotTypes': ', '.join(f"{k}:{v}" for k,v in shot_types.items()),
            'ShotsPerMin': round(total_shots/total_mins, 2) if total_mins > 0 else 0,
            'ShotEntropy': shot_entropy,
            'FirstActive': min(on_times) if on_times else 0,
            'LastActive': max(off_times) if off_times else 0,
            'ActiveIntervals': len(intervals)
        })

player_df = pd.DataFrame(player_data)

# ==================== TEAM SEGMENTS ====================
# Get all time points that create segments
segment_points = sorted(set(sub_events['timeMin'].tolist() + [
    match_intervals['first_half_start'],
    match_intervals['first_half_end'],
    match_intervals['second_half_start'],
    match_intervals['match_end']
]))

team_segments = []
for seg_num, (start, end) in enumerate(zip(segment_points[:-1], segment_points[1:]), 1):
    segment_data = {
        'Segment': seg_num,
        'StartMin': start,
        'EndMin': end,
        'Duration': end - start
    }

    for team_id in sub_events['contestantId'].unique():
        # Team passes
        passes = pass_events[
            (pass_events['contestantId'] == team_id) &
            (pass_events['timeMin'] >= start) &
            (pass_events['timeMin'] < end)
        ]
        pass_dist = passes['playerName'].value_counts()

        # Team shots
        shots = shot_events[
            (shot_events['contestantId'] == team_id) &
            (shot_events['timeMin'] >= start) &
            (shot_events['timeMin'] < end)
        ]
        shot_dist = shots['playerName'].value_counts()
        shot_type_dist = shots['typeId'].value_counts().to_dict()

        segment_data.update({
            f'Team_{team_id}_Passes': len(passes),
            f'Team_{team_id}_PassEntropy': calculate_entropy(pass_dist),
            f'Team_{team_id}_Shots': len(shots),
            f'Team_{team_id}_ShotTypes': ', '.join(f"{k}:{v}" for k,v in shot_type_dist.items()),
            f'Team_{team_id}_ShotEntropy': calculate_entropy(shot_dist),
            f'Team_{team_id}_PassPerMin': round(len(passes)/(end-start), 2) if (end-start) > 0 else 0,
            f'Team_{team_id}_ShotPerMin': round(len(shots)/(end-start), 2) if (end-start) > 0 else 0
        })

    team_segments.append(segment_data)

team_df = pd.DataFrame(team_segments)

# ==================== SAVE FILES ====================
team_file = f"{output_dir}/Team_Segment_Performance.xlsx"
player_file = f"{output_dir}/Player_Performance.xlsx"

team_df.to_excel(team_file, index=False)
player_df.to_excel(player_file, index=False)

print("Files successfully created:")
print(f"- Team segments: {team_file}")
print(f"- Player performance: {player_file}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files successfully created:
- Player passing impact (pivot): /content/drive/MyDrive/Recruitment/Match Analysis/Player_Passing_Impact.xlsx
- Detailed passing impact: /content/drive/MyDrive/Recruitment/Match Analysis/Player_Passing_Impact_Detailed.xlsx


In [22]:
import pandas as pd
import numpy as np
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Load data
file_path = '/content/drive/MyDrive/Recruitment/Brøndby 1-1 Nordsjælland.csv'
df = pd.read_csv(file_path)

# Create output directory
output_dir = '/content/drive/MyDrive/Recruitment/Match Analysis'
os.makedirs(output_dir, exist_ok=True)

# Get all events - CORRECTED FILTERS
sub_events = df[df['typeId'].isin([18, 19])][['typeId', 'playerName', 'contestantId', 'timeMin']]
pass_events = df[df['typeId'] == 1][['contestantId', 'timeMin', 'playerName', 'outcome']]  # All passes
starting_lineup = df[df['typeId'] == 35][['contestantId', 'playerName']].drop_duplicates()  # Starting players

# Match structure
match_intervals = {
    'first_half_start': 0,
    'first_half_end': 45,
    'second_half_start': 45,
    'match_end': max(df['timeMin']) if not df.empty else 90
}

# Get all time points that create segments
all_events = sorted(set(sub_events['timeMin'].tolist() + [
    match_intervals['first_half_start'],
    match_intervals['first_half_end'],
    match_intervals['second_half_start'],
    match_intervals['match_end']
]))

segments = [(start, end) for start, end in zip(all_events[:-1], all_events[1:])]

# ==================== PLAYER PASSING ANALYSIS ====================
# Get ALL players (starters + substitutes)
all_players = pd.concat([
    starting_lineup,
    sub_events[sub_events['typeId'] == 18][['contestantId', 'playerName']]
]).drop_duplicates()

player_data = []

for _, player in all_players.iterrows():
    team_id = player['contestantId']
    player_name = player['playerName']

    # Get player's substitution times
    sub_on = sub_events[(sub_events['playerName'] == player_name) &
                       (sub_events['typeId'] == 18)]['timeMin'].tolist()
    sub_off = sub_events[(sub_events['playerName'] == player_name) &
                        (sub_events['typeId'] == 19)]['timeMin'].tolist()

    # Determine active intervals
    active_intervals = []

    # Check if player started (either in starting lineup or subbed off before subbed on)
    if (player_name in starting_lineup[starting_lineup['contestantId'] == team_id]['playerName'].values) or \
       (len(sub_off) > 0 and (len(sub_on) == 0 or sub_off[0] < sub_on[0])):
        active_intervals.append([match_intervals['first_half_start'], sub_off[0] if len(sub_off) > 0 else match_intervals['match_end']])

    # Add middle intervals (between sub-on and sub-off)
    for i in range(len(sub_on)):
        start = sub_on[i]
        end = sub_off[i] if i < len(sub_off) else match_intervals['match_end']
        active_intervals.append([start, end])

    # Handle players who weren't subbed off
    if len(sub_on) > 0 and (len(sub_off) == 0 or sub_on[-1] > sub_off[-1]):
        active_intervals[-1][1] = match_intervals['match_end']

    # Get ALL passes for this player
    player_passes = pass_events[(pass_events['contestantId'] == team_id) &
                               (pass_events['playerName'] == player_name)]

    # Calculate for each segment
    for seg_num, (seg_start, seg_end) in enumerate(segments, 1):
        # Check if player was active in this segment
        active = any((interval[0] < seg_end and interval[1] > seg_start) for interval in active_intervals)

        if not active:
            player_data.append({
                'TeamID': team_id,
                'PlayerName': player_name,
                'Segment': seg_num,
                'SegmentStart': seg_start,
                'SegmentEnd': seg_end,
                'Passes': np.nan,
                'Status': 'Off'
            })
            continue

        # Count passes in this segment
        seg_passes = player_passes[
            (player_passes['timeMin'] >= seg_start) &
            (player_passes['timeMin'] < seg_end)
        ]

        # Team passes in this segment
        team_passes = pass_events[
            (pass_events['contestantId'] == team_id) &
            (pass_events['timeMin'] >= seg_start) &
            (pass_events['timeMin'] < seg_end)
        ]

        player_data.append({
            'TeamID': team_id,
            'PlayerName': player_name,
            'Segment': seg_num,
            'SegmentStart': seg_start,
            'SegmentEnd': seg_end,
            'Passes': len(seg_passes),
            'TeamPasses': len(team_passes),
            'PassImpact': round((len(seg_passes)/len(team_passes))*100, 2) if len(team_passes) > 0 else 0,
            'Status': 'On'
        })

# Create DataFrame
passing_df = pd.DataFrame(player_data)

# ==================== SAVE FILES ====================
# Detailed version
detailed_file = f"{output_dir}/Player_Passing_Detailed.xlsx"

# Pivoted version (pass counts)
pivot_passes = passing_df.pivot_table(
    index=['TeamID', 'PlayerName'],
    columns='Segment',
    values='Passes',
    aggfunc='first'
)

# Pivoted version (pass impact)
pivot_impact = passing_df.pivot_table(
    index=['TeamID', 'PlayerName'],
    columns='Segment',
    values='PassImpact',
    aggfunc='first'
)

with pd.ExcelWriter(f"{output_dir}/Player_Passing_Analysis.xlsx") as writer:
    # Detailed sheet
    passing_df.to_excel(writer, sheet_name='Detailed Data', index=False)

    # Pass counts sheet
    pivot_passes.to_excel(writer, sheet_name='Pass Counts')

    # Impact percentage sheet
    pivot_impact.to_excel(writer, sheet_name='Pass Impact %')

    # Segment info sheet
    pd.DataFrame([
        {'Segment': i+1, 'Start': start, 'End': end, 'Duration': end-start}
        for i, (start, end) in enumerate(segments)
    ]).to_excel(writer, sheet_name='Segment Info', index=False)

print("Files successfully created:")
print(f"- Detailed passing data: {detailed_file}")
print(f"- Analysis workbook: {output_dir}/Player_Passing_Analysis.xlsx")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files successfully created:
- Detailed passing data: /content/drive/MyDrive/Recruitment/Match Analysis/Player_Passing_Detailed.xlsx
- Analysis workbook: /content/drive/MyDrive/Recruitment/Match Analysis/Player_Passing_Analysis.xlsx
