In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Step 1: Load Data
file_path = "/Users/marclambertes/Downloads/Wyscout/Slavia Praha_Bohemians 1905_3941550.csv"

try:
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()  # Remove spaces in column names
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    exit()

# Step 2: Ensure Required Columns Exist
required_cols = {'player_name', 'team_name', 'team_id', 'location_x', 'location_y', 'end_location_x', 'end_location_y',
                 'event_type_name', 'under_pressure', 'pass_recipient_name', 'pass_success_probability',
                 'obv_for_before', 'obv_for_after', 'obv_for_net'}

missing_cols = required_cols - set(df.columns)
if missing_cols:
    print("Error: Missing columns:", missing_cols)
    exit()

# Step 3: Filter Passes and Defensive Actions
passes = df[df['event_type_name'] == 'Pass'].copy()
defensive_actions = df[df['event_type_name'].isin(['Pressure', 'Tackle', 'Interception'])].copy()

# Fill missing recipient names
passes['pass_recipient_name'] = passes['pass_recipient_name'].fillna('Unknown')

# Step 4: Match Defensive Actions to Passes
matched_actions = []
for _, pass_row in passes.iterrows():
    for _, def_row in defensive_actions.iterrows():
        if pass_row['team_id'] != def_row['team_id']:  # Opponent check
            # Distance between pass location and defensive action
            distance = np.sqrt((pass_row['end_location_x'] - def_row['location_x']) ** 2 +
                               (pass_row['end_location_y'] - def_row['location_y']) ** 2)

            # Determine pressing success based on under_pressure & pass outcome
            pressing_successful = (pass_row['under_pressure'] and pass_row['obv_for_after'] < pass_row['obv_for_before'])

            # Store matched data
            matched_actions.append({
                'defender': def_row['player_name'],
                'team_name': def_row['team_name'],  # Include team name
                'pressing_successful': 1 if pressing_successful else 0,
                'pass_success_probability': pass_row['pass_success_probability'],
                'obv_for_before': pass_row['obv_for_before'],
                'obv_for_after': pass_row['obv_for_after'],
                'obv_for_net': pass_row['obv_for_net'],
                'distance': distance
            })

# Step 5: Convert to DataFrame
matched_df = pd.DataFrame(matched_actions)

# Step 6: Handle Missing Values
matched_df.fillna(0, inplace=True)  # Replace all NaN with 0

# Step 7: Train xOBV Model
features = ['pressing_successful', 'pass_success_probability', 'distance']
X = matched_df[features]
y = matched_df['obv_for_net']  # The actual impact of defensive actions

# Split data for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
xobv_model = RandomForestRegressor(n_estimators=100, random_state=42)
xobv_model.fit(X_train, y_train)

# Predict xOBV for each defensive event
matched_df['xOBV'] = xobv_model.predict(X)

# Step 8: Calculate Totals for Each Defender (Now Including Team Name)
player_totals = matched_df.groupby(['defender', 'team_name']).agg(
    total_pressures=('pressing_successful', 'count'),  # Total defensive actions
    successful_pressures=('pressing_successful', 'sum'),  # Successful pressures
    total_obv_impact=('obv_for_net', 'sum'),  # Sum of actual OBV impact
    expected_obv_impact=('xOBV', 'sum'),  # Expected OBV impact (xOBV)
    avg_distance=('distance', 'mean')  # Average distance to the pass event
).reset_index()

# Calculate pressing success rate
player_totals['pressing_success_rate'] = player_totals['successful_pressures'] / player_totals['total_pressures']
player_totals.fillna(0, inplace=True)  # Ensure no NaNs due to division

# Normalize OBV Impact (Scale between 0 and 1)
scaler = MinMaxScaler()
player_totals[['normalized_obv_impact', 'normalized_xOBV']] = scaler.fit_transform(
    player_totals[['total_obv_impact', 'expected_obv_impact']]
)

# Calculate Overperformance (Actual vs Expected)
player_totals['overperformance'] = player_totals['normalized_obv_impact'] - player_totals['normalized_xOBV']

# Step 9: Save Player Stats (WITH TEAM NAME)
output_filename = "Player_Defensive_Impact_with_Team_and_xOBV.xlsx"
player_totals.to_excel(output_filename, index=False)
print(f"Player totals saved to '{output_filename}'")

# Step 10: Print Top Defenders by Overperformance
print("\n🔝 Top Overperforming Defenders (Prevent More Threat Than Expected):")
print(player_totals[['defender', 'team_name', 'normalized_obv_impact', 'normalized_xOBV', 'overperformance']].sort_values(
    by='overperformance', ascending=False).head(10))


Player totals saved to 'Player_Defensive_Impact_with_Team_and_xOBV.xlsx'

🔝 Top Overperforming Defenders (Prevent More Threat Than Expected):
                 defender      team_name  normalized_obv_impact  \
6   El Hadji Malick Diouf   Slavia Praha               0.062372   
19           Michal Kohút  Baník Ostrava               0.887899   
7             Erik Prekop  Baník Ostrava               0.588962   
2            David Buchta  Baník Ostrava               0.290026   
24             Tomáš Rigo  Baník Ostrava               0.588962   
15           Lukáš Provod   Slavia Praha               0.098014   
13             Jiří Boula  Baník Ostrava               0.738431   
14          Karel Pojezný  Baník Ostrava               0.514228   
9            Filip Kubala  Baník Ostrava               0.327393   
11              Igoh Ogbu   Slavia Praha               0.187117   

    normalized_xOBV  overperformance  
6          0.062130         0.000243  
19         0.887690         0.000209  
7  

In [30]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Step 1: Load Data
file_path = "/Users/marclambertes/Downloads/Wyscout/Slavia Praha_Baník Ostrava_3941530.csv"

try:
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()  # Remove spaces in column names
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    exit()

# Step 2: Ensure Required Columns Exist
required_cols = {'player_name', 'team_name', 'team_id', 'location_x', 'location_y', 'end_location_x', 'end_location_y',
                 'event_type_name', 'under_pressure', 'pass_recipient_name', 'pass_success_probability',
                 'obv_for_before', 'obv_for_after', 'obv_for_net'}

missing_cols = required_cols - set(df.columns)
if missing_cols:
    print("Error: Missing columns:", missing_cols)
    exit()

# Step 3: Filter Passes and Defensive Actions
passes = df[df['event_type_name'] == 'Pass'].copy()
defensive_actions = df[df['event_type_name'].isin(['Pressure', 'Tackle', 'Interception'])].copy()

# Fill missing recipient names
passes['pass_recipient_name'] = passes['pass_recipient_name'].fillna('Unknown')

# Step 4: Match Defensive Actions to Passes
matched_actions = []
for _, pass_row in passes.iterrows():
    for _, def_row in defensive_actions.iterrows():
        if pass_row['team_id'] != def_row['team_id']:  # Opponent check
            # Distance between pass location and defensive action
            distance = np.sqrt((pass_row['end_location_x'] - def_row['location_x']) ** 2 +
                               (pass_row['end_location_y'] - def_row['location_y']) ** 2)

            # Determine pressing success based on under_pressure & pass outcome
            pressing_successful = (pass_row['under_pressure'] and pass_row['obv_for_after'] < pass_row['obv_for_before'])

            # Assign weights based on event type (Pressure > Tackle > Interception)
            event_weight = {
                'Pressure': 1.2,  # Higher weight for pressing
                'Tackle': 1.0,
                'Interception': 0.8  # Lower weight for interceptions
            }.get(def_row['event_type_name'], 1.0)

            # Apply weight to OBV impact
            weighted_obv = pass_row['obv_for_net'] * event_weight

            # Store matched data
            matched_actions.append({
                'defender': def_row['player_name'],
                'team_name': def_row['team_name'],  # Include team name
                'event_type_name': def_row['event_type_name'],  # Defensive action type
                'pressing_successful': 1 if pressing_successful else 0,
                'pass_success_probability': pass_row['pass_success_probability'],
                'obv_for_before': pass_row['obv_for_before'],
                'obv_for_after': pass_row['obv_for_after'],
                'weighted_obv': weighted_obv,  # Weighted OBV impact
                'distance': distance
            })

# Step 5: Convert to DataFrame
matched_df = pd.DataFrame(matched_actions)

# Step 6: Handle Missing Values
matched_df.fillna(0, inplace=True)  # Replace all NaN with 0

# Step 7: Train xDEF Model (Learning from Weighted OBV)
features = ['pressing_successful', 'pass_success_probability', 'distance']
X = matched_df[features]
y = matched_df['weighted_obv']  # Using weighted OBV for training

# Split data for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
xdef_model = RandomForestRegressor(n_estimators=100, random_state=42)
xdef_model.fit(X_train, y_train)

# Predict xDEF for each defensive event
matched_df['xDEF'] = xdef_model.predict(X)

# Step 8: Calculate Totals for Each Defender by Event Type
player_totals = matched_df.groupby(['defender', 'team_name', 'event_type_name']).agg(
    aDEF=('weighted_obv', 'sum'),  # Actual Defensive Impact
    xDEF=('xDEF', 'sum')  # Expected Defensive Impact
).reset_index()

# Normalize aDEF & xDEF (Scale between 0 and 1)
scaler = MinMaxScaler()
player_totals[['aDEF', 'xDEF']] = scaler.fit_transform(player_totals[['aDEF', 'xDEF']])

# Pivot to get separate columns for Pressure, Tackle, Interception
player_totals_pivot = player_totals.pivot(index=['defender', 'team_name'], columns='event_type_name', values=['aDEF', 'xDEF'])

# Flatten MultiIndex columns
player_totals_pivot.columns = [f"{col[0]}_{col[1]}" for col in player_totals_pivot.columns]

# Reset index to make it a proper DataFrame
player_totals_pivot.reset_index(inplace=True)

# Step 9: Compute Total aDEF and xDEF
player_totals_pivot['aDEF_Total'] = player_totals_pivot.filter(like='aDEF_').sum(axis=1)
player_totals_pivot['xDEF_Total'] = player_totals_pivot.filter(like='xDEF_').sum(axis=1)

# Step 10: Generate Output Filename Based on Match Title
filename = os.path.basename(file_path)  # Get only the file name
match_title = "_".join(filename.split("_")[:2])  # Extract up to second underscore

# Construct output file name
output_filename = f"{match_title}_Defensive_Impact.xlsx"

# Save the file with NaN values replaced by 0
player_totals_pivot.to_excel(output_filename, index=False, na_rep=0)

print(f"✅ Player totals saved to '{output_filename}'")

# Step 11: Print Top Defenders by Total aDEF
print("\n🔝 Top Defenders Overall (Highest aDEF_Total):")
print(player_totals_pivot.sort_values(by='aDEF_Total', ascending=False).head(10))


✅ Player totals saved to 'Slavia Praha_Baník Ostrava_Defensive_Impact.xlsx'

🔝 Top Defenders Overall (Highest aDEF_Total):
           defender      team_name  aDEF_Interception  aDEF_Pressure  \
16     Matěj Chaluš  Baník Ostrava           0.077221       1.000000   
19     Michal Kohút  Baník Ostrava           0.077221       0.874167   
13       Jiří Boula  Baník Ostrava                NaN       0.832222   
18  Michal Frydrych  Baník Ostrava                NaN       0.706389   
24       Tomáš Rigo  Baník Ostrava                NaN       0.664444   
7       Erik Prekop  Baník Ostrava                NaN       0.664444   
14    Karel Pojezný  Baník Ostrava                NaN       0.580555   
9      Filip Kubala  Baník Ostrava                NaN       0.370833   
2      David Buchta  Baník Ostrava                NaN       0.328888   
17        Matěj Šín  Baník Ostrava                NaN       0.328888   

    xDEF_Interception  xDEF_Pressure  aDEF_Total  xDEF_Total  
16           0.09613

In [35]:
import pandas as pd

# Path to your CSV file
file_path = "/Users/marclambertes/Downloads/Wyscout/SkillCorner-2025-02-23.csv"

# Read the CSV with the correct delimiter
df = pd.read_csv(file_path, sep=";", encoding="utf-8")

# Print column names to check if they are correctly separated
print("Fixed Column Names:", df.columns.tolist())

# Ensure "Player" column exists before proceeding
if "Player" in df.columns:
    # Function to fix unclosed quotation marks in the "Player" column
    def fix_unclosed_quotes(player_name):
        if isinstance(player_name, str) and player_name.startswith("'") and not player_name.endswith("'"):
            return player_name + "'"  # Add the closing quote
        return player_name  # Keep it unchanged

    # Apply the function
    df["Player"] = df["Player"].apply(fix_unclosed_quotes)

    # Save the cleaned file
    df.to_csv("final_fixed_file.csv", index=False, sep=";")
    print("Fixed CSV saved as 'final_fixed_file.csv'.")
else:
    print("Error: 'Player' column not found after fixing headers!")


Fixed Column Names: ['Player', 'Position', 'Team', 'third', 'channel', 'Minutes played per match', 'Adjusted min TIP per match', 'Count performances that pass the quality check', 'Count performances that fail the quality check', 'Count Runs in sample', 'Count Runs per Match', 'Count Dangerous Runs per Match', 'Threat of Runs per Match', 'Count Runs leading to goal per Match', 'Count Runs targeted per Match', 'Count Runs received per Match', 'Count Runs leading to shot per Match', 'Threat of Runs targeted per Match', 'Threat of Runs received per Match', 'Count dangerous Runs targeted per Match', 'Count dangerous Runs received per Match']
Fixed CSV saved as 'final_fixed_file.csv'.


In [36]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# --- Step 1: Load First CSV (SkillCorner) and Rename Column ---
file_path1 = "/Users/marclambertes/Downloads/Wyscout/SkillCorner-2025-02-23.csv"

# Read with correct delimiter
df1 = pd.read_csv(file_path1, sep=";", encoding="utf-8")
df1.columns = df1.columns.str.replace('"', '').str.strip()  # Remove stray quotes and spaces

# Rename 'Player' to 'player_name' for merging
if "Player" in df1.columns:
    df1.rename(columns={"Player": "player_name"}, inplace=True)
else:
    print("Error: 'Player' column not found in first dataset!")
    exit()

# --- Step 2: Load Second CSV (Match Data) ---
file_path2 = "/Users/marclambertes/Downloads/Wyscout/Slavia Praha_Baník Ostrava_3941530.csv"

try:
    df2 = pd.read_csv(file_path2)
    df2.columns = df2.columns.str.strip()  # Remove spaces in column names
except FileNotFoundError:
    print(f"Error: File not found at {file_path2}")
    exit()

# --- Step 3: Merge Datasets on 'player_name' ---
df_merged = df1.merge(df2, on="player_name", how="inner")  # Use 'inner' to keep only matching rows

# --- Step 4: Ensure Required Columns Exist ---
required_cols = {'player_name', 'team_name', 'team_id', 'location_x', 'location_y', 'end_location_x', 'end_location_y',
                 'event_type_name', 'under_pressure', 'pass_recipient_name', 'pass_success_probability',
                 'obv_for_before', 'obv_for_after', 'obv_for_net'}

missing_cols = required_cols - set(df_merged.columns)
if missing_cols:
    print("Error: Missing columns after merge:", missing_cols)
    exit()

# --- Step 5: Filter Passes and Defensive Actions ---
passes = df_merged[df_merged['event_type_name'] == 'Pass'].copy()
defensive_actions = df_merged[df_merged['event_type_name'].isin(['Pressure', 'Tackle', 'Interception'])].copy()

# Fill missing recipient names
passes['pass_recipient_name'] = passes['pass_recipient_name'].fillna('Unknown')

# --- Step 6: Match Defensive Actions to Passes ---
matched_actions = []
for _, pass_row in passes.iterrows():
    for _, def_row in defensive_actions.iterrows():
        if pass_row['team_id'] != def_row['team_id']:  # Opponent check
            # Distance between pass location and defensive action
            distance = np.sqrt((pass_row['end_location_x'] - def_row['location_x']) ** 2 +
                               (pass_row['end_location_y'] - def_row['location_y']) ** 2)

            # Determine pressing success based on under_pressure & pass outcome
            pressing_successful = (pass_row['under_pressure'] and pass_row['obv_for_after'] < pass_row['obv_for_before'])

            # Assign weights based on event type (Pressure > Tackle > Interception)
            event_weight = {
                'Pressure': 1.2,  # Higher weight for pressing
                'Tackle': 1.0,
                'Interception': 0.8  # Lower weight for interceptions
            }.get(def_row['event_type_name'], 1.0)

            # Apply weight to OBV impact
            weighted_obv = pass_row['obv_for_net'] * event_weight

            # Store matched data
            matched_actions.append({
                'defender': def_row['player_name'],
                'team_name': def_row['team_name'],
                'event_type_name': def_row['event_type_name'],
                'pressing_successful': 1 if pressing_successful else 0,
                'pass_success_probability': pass_row['pass_success_probability'],
                'obv_for_before': pass_row['obv_for_before'],
                'obv_for_after': pass_row['obv_for_after'],
                'weighted_obv': weighted_obv,
                'distance': distance
            })

# --- Step 7: Convert to DataFrame ---
matched_df = pd.DataFrame(matched_actions)

# --- Step 8: Handle Missing Values ---
matched_df.fillna(0, inplace=True)

# --- Step 9: Train xDEF Model ---
features = ['pressing_successful', 'pass_success_probability', 'distance']
X = matched_df[features]
y = matched_df['weighted_obv']

# Split data for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
xdef_model = RandomForestRegressor(n_estimators=100, random_state=42)
xdef_model.fit(X_train, y_train)

# Predict xDEF for each defensive event
matched_df['xDEF'] = xdef_model.predict(X)

# --- Step 10: Calculate Totals for Each Defender by Event Type ---
player_totals = matched_df.groupby(['defender', 'team_name', 'event_type_name']).agg(
    aDEF=('weighted_obv', 'sum'),
    xDEF=('xDEF', 'sum')
).reset_index()

# Normalize aDEF & xDEF
scaler = MinMaxScaler()
player_totals[['aDEF', 'xDEF']] = scaler.fit_transform(player_totals[['aDEF', 'xDEF']])

# Pivot to separate columns for Pressure, Tackle, Interception
player_totals_pivot = player_totals.pivot(index=['defender', 'team_name'], columns='event_type_name', values=['aDEF', 'xDEF'])

# Flatten MultiIndex columns
player_totals_pivot.columns = [f"{col[0]}_{col[1]}" for col in player_totals_pivot.columns]
player_totals_pivot.reset_index(inplace=True)

# --- Step 11: Compute Total aDEF and xDEF ---
player_totals_pivot['aDEF_Total'] = player_totals_pivot.filter(like='aDEF_').sum(axis=1)
player_totals_pivot['xDEF_Total'] = player_totals_pivot.filter(like='xDEF_').sum(axis=1)

# --- Step 12: Generate Output Filename ---
filename = os.path.basename(file_path2)
match_title = "_".join(filename.split("_")[:2])
output_filename = f"{match_title}_Defensive_Impact.xlsx"

# Save the file
player_totals_pivot.to_excel(output_filename, index=False, na_rep=0)

print(f"✅ Player totals saved to '{output_filename}'")

# --- Step 13: Print Top Defenders ---
print("\n🔝 Top Defenders Overall (Highest aDEF_Total):")
print(player_totals_pivot.sort_values(by='aDEF_Total', ascending=False).head(10))


✅ Player totals saved to 'Slavia Praha_Baník Ostrava_Defensive_Impact.xlsx'

🔝 Top Defenders Overall (Highest aDEF_Total):
            defender      team_name  aDEF_Interception  aDEF_Pressure  \
11      Matěj Chaluš  Baník Ostrava           0.078095       1.000000   
13      Michal Kohút  Baník Ostrava           0.078095       0.874286   
8         Jiří Boula  Baník Ostrava                NaN       0.832381   
12   Michal Frydrych  Baník Ostrava                NaN       0.706666   
5        Erik Prekop  Baník Ostrava                NaN       0.664762   
9      Karel Pojezný  Baník Ostrava                NaN       0.580952   
2       David Buchta  Baník Ostrava                NaN       0.329523   
1      Daniel Holzer  Baník Ostrava                NaN       0.287619   
6          Igoh Ogbu   Slavia Praha           0.011430       0.157159   
15  Štěpán Chaloupek   Slavia Praha           0.005715       0.140014   

    xDEF_Interception  xDEF_Pressure  aDEF_Total  xDEF_Total  
11       

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# --- Step 1: Load First CSV (SkillCorner) and Rename Column ---
file_path1 = "/Users/marclambertes/Downloads/Wyscout/SkillCorner-2025-03-09.csv"

# Read with correct delimiter
df1 = pd.read_csv(file_path1, sep=";", encoding="utf-8")
df1.columns = df1.columns.str.replace('"', '').str.strip()  # Remove stray quotes and spaces

# Rename 'Player' to 'player_name' for merging
if "Player" in df1.columns:
    df1.rename(columns={"Player": "player_name"}, inplace=True)
else:
    print("Error: 'Player' column not found in first dataset!")
    exit()

# Select relevant columns for runs
run_cols = ["player_name", "Count Runs targeted per Match", "Count Runs received per Match"]
df_runs = df1[run_cols]

# --- Step 2: Load Second CSV (Match Data) ---
file_path2 = "/Users/marclambertes/Downloads/Wyscout/Slavia Praha_Bohemians 1905_3941550.csv"

try:
    df2 = pd.read_csv(file_path2)
    df2.columns = df2.columns.str.strip()  # Remove spaces in column names
except FileNotFoundError:
    print(f"Error: File not found at {file_path2}")
    exit()

# --- Step 3: Merge Datasets on 'player_name' ---
df_merged = df2.merge(df_runs, on="player_name", how="left")  # Include run statistics

# Fill missing values in run columns with 0
df_merged.fillna({"Count Runs targeted per Match": 0, "Count Runs received per Match": 0}, inplace=True)

# --- Step 4: Ensure Required Columns Exist ---
required_cols = {'player_name', 'team_name', 'team_id', 'location_x', 'location_y', 'end_location_x', 'end_location_y',
                 'event_type_name', 'under_pressure', 'pass_recipient_name', 'pass_success_probability',
                 'obv_for_before', 'obv_for_after', 'obv_for_net', 'Count Runs targeted per Match', 'Count Runs received per Match'}

missing_cols = required_cols - set(df_merged.columns)
if missing_cols:
    print("Error: Missing columns after merge:", missing_cols)
    exit()

# --- Step 5: Filter Passes and Defensive Actions ---
passes = df_merged[df_merged['event_type_name'] == 'Pass'].copy()
defensive_actions = df_merged[df_merged['event_type_name'].isin(['Pressure', 'Tackle', 'Interception'])].copy()

# Fill missing recipient names
passes['pass_recipient_name'] = passes['pass_recipient_name'].fillna('Unknown')

# --- Step 6: Match Defensive Actions to Passes ---
matched_actions = []
for _, pass_row in passes.iterrows():
    for _, def_row in defensive_actions.iterrows():
        if pass_row['team_id'] != def_row['team_id']:  # Opponent check
            # Distance between pass location and defensive action
            distance = np.sqrt((pass_row['end_location_x'] - def_row['location_x']) ** 2 +
                               (pass_row['end_location_y'] - def_row['location_y']) ** 2)

            # Determine pressing success based on under_pressure & pass outcome
            pressing_successful = (pass_row['under_pressure'] and pass_row['obv_for_after'] < pass_row['obv_for_before'])

            # Assign weights based on event type (Pressure > Tackle > Interception)
            event_weight = {
                'Pressure': 1.2,
                'Tackle': 1.0,
                'Interception': 0.8
            }.get(def_row['event_type_name'], 1.0)

            # Apply weight to OBV impact and incorporate run statistics
            weighted_obv = pass_row['obv_for_net'] * event_weight

            # Adjust defensive value based on runs
            run_targeted_weight = def_row['Count Runs targeted per Match'] * 0.5  # Weight for targeted runs
            run_received_weight = def_row['Count Runs received per Match'] * 0.3  # Weight for received runs
            adjusted_weighted_obv = weighted_obv + run_targeted_weight + run_received_weight

            # Store matched data
            matched_actions.append({
                'defender': def_row['player_name'],
                'team_name': def_row['team_name'],
                'event_type_name': def_row['event_type_name'],
                'pressing_successful': 1 if pressing_successful else 0,
                'pass_success_probability': pass_row['pass_success_probability'],
                'obv_for_before': pass_row['obv_for_before'],
                'obv_for_after': pass_row['obv_for_after'],
                'weighted_obv': adjusted_weighted_obv,
                'distance': distance
            })

# --- Step 7: Convert to DataFrame ---
matched_df = pd.DataFrame(matched_actions)

# --- Step 8: Handle Missing Values ---
matched_df.fillna(0, inplace=True)

# --- Step 9: Train xDEF Model ---
features = ['pressing_successful', 'pass_success_probability', 'distance']
X = matched_df[features]
y = matched_df['weighted_obv']

# Split data for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
xdef_model = RandomForestRegressor(n_estimators=100, random_state=42)
xdef_model.fit(X_train, y_train)

# Predict xDEF for each defensive event
matched_df['xDEF'] = xdef_model.predict(X)

# --- Step 10: Calculate Totals for Each Defender by Event Type ---
player_totals = matched_df.groupby(['defender', 'team_name', 'event_type_name']).agg(
    aDEF=('weighted_obv', 'sum'),
    xDEF=('xDEF', 'sum')
).reset_index()

# Normalize aDEF & xDEF
scaler = MinMaxScaler()
player_totals[['aDEF', 'xDEF']] = scaler.fit_transform(player_totals[['aDEF', 'xDEF']])

# Pivot to separate columns for Pressure, Tackle, Interception
player_totals_pivot = player_totals.pivot(index=['defender', 'team_name'], columns='event_type_name', values=['aDEF', 'xDEF'])

# Flatten MultiIndex columns
player_totals_pivot.columns = [f"{col[0]}_{col[1]}" for col in player_totals_pivot.columns]
player_totals_pivot.reset_index(inplace=True)

# --- Step 11: Compute Total aDEF and xDEF ---
player_totals_pivot['aDEF_Total'] = player_totals_pivot.filter(like='aDEF_').sum(axis=1)
player_totals_pivot['xDEF_Total'] = player_totals_pivot.filter(like='xDEF_').sum(axis=1)

# --- Step 12: Generate Output Filename ---
filename = os.path.basename(file_path2)
match_title = "_".join(filename.split("_")[:2])
output_filename = f"{match_title}_Defensive_Impact.xlsx"

# Save the file
player_totals_pivot.to_excel(output_filename, index=False, na_rep=0)

print(f"✅ Player totals saved to '{output_filename}'")

# --- Step 13: Print Top Defenders ---
print("\n🔝 Top Defenders Overall (Highest aDEF_Total):")
print(player_totals_pivot.sort_values(by='aDEF_Total', ascending=False).head(10))


✅ Player totals saved to 'Slavia Praha_Bohemians 1905_Defensive_Impact.xlsx'

🔝 Top Defenders Overall (Highest aDEF_Total):
                 defender       team_name  aDEF_Interception  aDEF_Pressure  \
20           Robert Hrubý  Bohemians 1905                NaN       1.000000   
26          Václav Drchal  Bohemians 1905                NaN       0.673850   
9         Dominik Pleštil  Bohemians 1905                NaN       0.653633   
15          Martin Dostál  Bohemians 1905           0.058589       0.527630   
10  El Hadji Malick Diouf    Slavia Praha                NaN       0.461645   
16          Mojmír Chytil    Slavia Praha                NaN       0.454673   
22            Tomáš Holeš    Slavia Praha           0.078489       0.353231   
21            Tomáš Chorý    Slavia Praha                NaN       0.402187   
17           Ondřej Lingr    Slavia Praha                NaN       0.308970   
3          Antonín Křapka  Bohemians 1905                NaN       0.234893   

    xD

In [5]:
# --- Step 1: Load First CSV (SkillCorner) and Rename Column ---
file_path1 = "/Users/marclambertes/Downloads/Wyscout/SkillCorner-2025-03-09.csv"

try:
    df1 = pd.read_csv(file_path1, sep=";", encoding="utf-8")
    df1.columns = df1.columns.str.replace('"', '').str.strip()  # Remove stray quotes and spaces

    # Rename 'Player' to 'player_name' for merging
    if "Player" in df1.columns:
        df1.rename(columns={"Player": "player_name"}, inplace=True)
    else:
        logging.error("Error: 'Player' column not found in first dataset!")
        exit()

    # Select relevant columns for runs
    run_cols = ["player_name", "Count Runs targeted per Match", "Count Runs received per Match"]
    
    # Check if the required columns exist in df1
    missing_run_cols = set(run_cols) - set(df1.columns)
    if missing_run_cols:
        logging.error(f"Error: Missing columns in first dataset: {missing_run_cols}")
        exit()

    df_runs = df1[run_cols]

except Exception as e:
    logging.error(f"Error loading first CSV: {e}")
    exit()

# --- Step 2: Load Second CSV (Match Data) ---
file_path2 = "/Users/marclambertes/Downloads/Wyscout/Slavia Praha_Bohemians 1905_3941550.csv"

try:
    df2 = pd.read_csv(file_path2)
    df2.columns = df2.columns.str.strip()  # Remove spaces in column names
except FileNotFoundError:
    logging.error(f"Error: File not found at {file_path2}")
    exit()
except Exception as e:
    logging.error(f"Error loading second CSV: {e}")
    exit()

# --- Step 3: Merge Datasets on 'player_name' ---
df_merged = df2.merge(df_runs, on="player_name", how="left")  # Include run statistics

# Check if the run columns were successfully merged
if 'Count Runs targeted per Match' not in df_merged.columns or 'Count Runs received per Match' not in df_merged.columns:
    logging.error("Error: Run columns not found after merge!")
    exit()

# Fill missing values in run columns with 0
df_merged.fillna({"Count Runs targeted per Match": 0, "Count Runs received per Match": 0}, inplace=True)

# --- Step 4: Ensure Required Columns Exist ---
required_cols = {'player_name', 'team_name', 'team_id', 'location_x', 'location_y', 'end_location_x', 'end_location_y',
                 'event_type_name', 'under_pressure', 'pass_recipient_name', 'pass_success_probability',
                 'obv_for_before', 'obv_for_after', 'obv_for_net', 'Count Runs targeted per Match', 'Count Runs received per Match'}

missing_cols = required_cols - set(df_merged.columns)
if missing_cols:
    logging.error(f"Error: Missing columns after merge: {missing_cols}")
    exit()

# --- Step 5: Filter Passes and Defensive Actions ---
passes = df_merged[df_merged['event_type_name'] == 'Pass'].copy()
defensive_actions = df_merged[df_merged['event_type_name'].isin(['Pressure', 'Tackle', 'Interception'])].copy()

# Fill missing recipient names
passes['pass_recipient_name'] = passes['pass_recipient_name'].fillna('Unknown')

# --- Step 6: Match Defensive Actions to Passes ---
matched_actions = []
for _, pass_row in passes.iterrows():
    for _, def_row in defensive_actions.iterrows():
        if pass_row['team_id'] != def_row['team_id']:  # Opponent check
            # Distance between pass location and defensive action
            distance = np.sqrt((pass_row['end_location_x'] - def_row['location_x']) ** 2 +
                               (pass_row['end_location_y'] - def_row['location_y']) ** 2)

            # Determine pressing success based on under_pressure & pass outcome
            pressing_successful = (pass_row['under_pressure'] and pass_row['obv_for_after'] < pass_row['obv_for_before'])

            # Assign weights based on event type (Pressure > Tackle > Interception)
            event_weight = {
                'Pressure': 1.2,
                'Tackle': 1.0,
                'Interception': 0.8
            }.get(def_row['event_type_name'], 1.0)

            # Apply weight to OBV impact and incorporate run statistics
            weighted_obv = pass_row['obv_for_net'] * event_weight

            # Adjust defensive value based on runs
            run_targeted_weight = def_row['Count Runs targeted per Match'] * 0.5  # Weight for targeted runs
            run_received_weight = def_row['Count Runs received per Match'] * 0.3  # Weight for received runs
            adjusted_weighted_obv = weighted_obv + run_targeted_weight + run_received_weight

            # Store matched data
            matched_actions.append({
                'defender': def_row['player_name'],
                'team_name': def_row['team_name'],
                'event_type_name': def_row['event_type_name'],
                'pressing_successful': 1 if pressing_successful else 0,
                'pass_success_probability': pass_row['pass_success_probability'],
                'obv_for_before': pass_row['obv_for_before'],
                'obv_for_after': pass_row['obv_for_after'],
                'weighted_obv': adjusted_weighted_obv,
                'distance': distance,
                'Count Runs targeted per Match': def_row['Count Runs targeted per Match'],  # Ensure this is included
                'Count Runs received per Match': def_row['Count Runs received per Match']   # Ensure this is included
            })

# --- Step 7: Convert to DataFrame ---
matched_df = pd.DataFrame(matched_actions)

# --- Step 8: Handle Missing Values ---
matched_df.fillna(0, inplace=True)

# --- Step 9: Feature Engineering ---
# Ensure the columns exist before creating new features
if 'Count Runs targeted per Match' in matched_df.columns and 'Count Runs received per Match' in matched_df.columns:
    matched_df['defensive_action_strength'] = matched_df['event_type_name'].map({'Pressure': 1.2, 'Tackle': 1.0, 'Interception': 0.8})
    matched_df['run_impact'] = matched_df['Count Runs targeted per Match'] * 0.5 + matched_df['Count Runs received per Match'] * 0.3
else:
    logging.error("Error: Run columns missing in matched_df!")
    exit()
    
# --- Step 10: Save to New Excel File ---
output_file_path = "/Users/marclambertes/Downloads/Wyscout/Processed_Defensive_Actions.xlsx"

try:
    # Save the DataFrame to an Excel file
    matched_df.to_excel(output_file_path, index=False, engine='openpyxl')
    logging.info(f"✅ Processed data saved to '{output_file_path}'")
except Exception as e:
    logging.error(f"Error saving to Excel: {e}")

2025-03-09 16:25:09,620 - INFO - ✅ Processed data saved to '/Users/marclambertes/Downloads/Wyscout/Processed_Defensive_Actions.xlsx'
