In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Step 1: Load Data
file_path = "/Users/marclambertes/Downloads/Wyscout/Slavia Praha_Baník Ostrava_3941530.csv"

try:
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()  # Remove spaces in column names
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    exit()

# Step 2: Ensure Required Columns Exist
required_cols = {'player_name', 'team_name', 'team_id', 'location_x', 'location_y', 'end_location_x', 'end_location_y',
                 'event_type_name', 'under_pressure', 'pass_recipient_name', 'pass_success_probability',
                 'obv_for_before', 'obv_for_after', 'obv_for_net'}

missing_cols = required_cols - set(df.columns)
if missing_cols:
    print("Error: Missing columns:", missing_cols)
    exit()

# Step 3: Filter Passes and Defensive Actions
passes = df[df['event_type_name'] == 'Pass'].copy()
defensive_actions = df[df['event_type_name'].isin(['Pressure', 'Tackle', 'Interception'])].copy()

# Fill missing recipient names
passes['pass_recipient_name'] = passes['pass_recipient_name'].fillna('Unknown')

# Step 4: Match Defensive Actions to Passes
matched_actions = []
for _, pass_row in passes.iterrows():
    for _, def_row in defensive_actions.iterrows():
        if pass_row['team_id'] != def_row['team_id']:  # Opponent check
            # Distance between pass location and defensive action
            distance = np.sqrt((pass_row['end_location_x'] - def_row['location_x']) ** 2 +
                               (pass_row['end_location_y'] - def_row['location_y']) ** 2)

            # Determine pressing success based on under_pressure & pass outcome
            pressing_successful = (pass_row['under_pressure'] and pass_row['obv_for_after'] < pass_row['obv_for_before'])

            # Store matched data
            matched_actions.append({
                'defender': def_row['player_name'],
                'team_name': def_row['team_name'],  # Include team name
                'pressing_successful': 1 if pressing_successful else 0,
                'pass_success_probability': pass_row['pass_success_probability'],
                'obv_for_before': pass_row['obv_for_before'],
                'obv_for_after': pass_row['obv_for_after'],
                'obv_for_net': pass_row['obv_for_net'],
                'distance': distance
            })

# Step 5: Convert to DataFrame
matched_df = pd.DataFrame(matched_actions)

# Step 6: Handle Missing Values
matched_df.fillna(0, inplace=True)  # Replace all NaN with 0

# Step 7: Train xOBV Model
features = ['pressing_successful', 'pass_success_probability', 'distance']
X = matched_df[features]
y = matched_df['obv_for_net']  # The actual impact of defensive actions

# Split data for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
xobv_model = RandomForestRegressor(n_estimators=100, random_state=42)
xobv_model.fit(X_train, y_train)

# Predict xOBV for each defensive event
matched_df['xOBV'] = xobv_model.predict(X)

# Step 8: Calculate Totals for Each Defender (Now Including Team Name)
player_totals = matched_df.groupby(['defender', 'team_name']).agg(
    total_pressures=('pressing_successful', 'count'),  # Total defensive actions
    successful_pressures=('pressing_successful', 'sum'),  # Successful pressures
    total_obv_impact=('obv_for_net', 'sum'),  # Sum of actual OBV impact
    expected_obv_impact=('xOBV', 'sum'),  # Expected OBV impact (xOBV)
    avg_distance=('distance', 'mean')  # Average distance to the pass event
).reset_index()

# Calculate pressing success rate
player_totals['pressing_success_rate'] = player_totals['successful_pressures'] / player_totals['total_pressures']
player_totals.fillna(0, inplace=True)  # Ensure no NaNs due to division

# Normalize OBV Impact (Scale between 0 and 1)
scaler = MinMaxScaler()
player_totals[['normalized_obv_impact', 'normalized_xOBV']] = scaler.fit_transform(
    player_totals[['total_obv_impact', 'expected_obv_impact']]
)

# Calculate Overperformance (Actual vs Expected)
player_totals['overperformance'] = player_totals['normalized_obv_impact'] - player_totals['normalized_xOBV']

# Step 9: Save Player Stats (WITH TEAM NAME)
output_filename = "Player_Defensive_Impact_with_Team_and_xOBV.xlsx"
player_totals.to_excel(output_filename, index=False)
print(f"Player totals saved to '{output_filename}'")

# Step 10: Print Top Defenders by Overperformance
print("\n🔝 Top Overperforming Defenders (Prevent More Threat Than Expected):")
print(player_totals[['defender', 'team_name', 'normalized_obv_impact', 'normalized_xOBV', 'overperformance']].sort_values(
    by='overperformance', ascending=False).head(10))


Player totals saved to 'Player_Defensive_Impact_with_Team_and_xOBV.xlsx'

🔝 Top Overperforming Defenders (Prevent More Threat Than Expected):
                 defender      team_name  normalized_obv_impact  \
6   El Hadji Malick Diouf   Slavia Praha               0.062372   
19           Michal Kohút  Baník Ostrava               0.887899   
7             Erik Prekop  Baník Ostrava               0.588962   
2            David Buchta  Baník Ostrava               0.290026   
24             Tomáš Rigo  Baník Ostrava               0.588962   
15           Lukáš Provod   Slavia Praha               0.098014   
13             Jiří Boula  Baník Ostrava               0.738431   
14          Karel Pojezný  Baník Ostrava               0.514228   
9            Filip Kubala  Baník Ostrava               0.327393   
11              Igoh Ogbu   Slavia Praha               0.187117   

    normalized_xOBV  overperformance  
6          0.062130         0.000243  
19         0.887690         0.000209  
7  

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Step 1: Load Data
file_path = "/Users/marclambertes/Downloads/Wyscout/Slavia Praha_Baník Ostrava_3941530.csv"

try:
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()  # Remove spaces in column names
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    exit()

# Step 2: Ensure Required Columns Exist
required_cols = {'player_name', 'team_name', 'team_id', 'location_x', 'location_y', 'end_location_x', 'end_location_y',
                 'event_type_name', 'under_pressure', 'pass_recipient_name', 'pass_success_probability',
                 'obv_for_before', 'obv_for_after', 'obv_for_net'}

missing_cols = required_cols - set(df.columns)
if missing_cols:
    print("Error: Missing columns:", missing_cols)
    exit()

# Step 3: Filter Passes and Defensive Actions
passes = df[df['event_type_name'] == 'Pass'].copy()
defensive_actions = df[df['event_type_name'].isin(['Pressure', 'Tackle', 'Interception'])].copy()

# Fill missing recipient names
passes['pass_recipient_name'] = passes['pass_recipient_name'].fillna('Unknown')

# Step 4: Match Defensive Actions to Passes
matched_actions = []
for _, pass_row in passes.iterrows():
    for _, def_row in defensive_actions.iterrows():
        if pass_row['team_id'] != def_row['team_id']:  # Opponent check
            # Distance between pass location and defensive action
            distance = np.sqrt((pass_row['end_location_x'] - def_row['location_x']) ** 2 +
                               (pass_row['end_location_y'] - def_row['location_y']) ** 2)

            # Determine pressing success based on under_pressure & pass outcome
            pressing_successful = (pass_row['under_pressure'] and pass_row['obv_for_after'] < pass_row['obv_for_before'])

            # Assign weights based on event type (Pressure > Tackle > Interception)
            event_weight = {
                'Pressure': 1.2,  # Higher weight for pressing
                'Tackle': 1.0,
                'Interception': 0.8  # Lower weight for interceptions
            }.get(def_row['event_type_name'], 1.0)

            # Apply weight to OBV impact
            weighted_obv = pass_row['obv_for_net'] * event_weight

            # Store matched data
            matched_actions.append({
                'defender': def_row['player_name'],
                'team_name': def_row['team_name'],  # Include team name
                'event_type_name': def_row['event_type_name'],  # Defensive action type
                'pressing_successful': 1 if pressing_successful else 0,
                'pass_success_probability': pass_row['pass_success_probability'],
                'obv_for_before': pass_row['obv_for_before'],
                'obv_for_after': pass_row['obv_for_after'],
                'weighted_obv': weighted_obv,  # Weighted OBV impact
                'distance': distance
            })

# Step 5: Convert to DataFrame
matched_df = pd.DataFrame(matched_actions)

# Step 6: Handle Missing Values
matched_df.fillna(0, inplace=True)  # Replace all NaN with 0

# Step 7: Train xOBV Model (Now Learning from Weighted OBV)
features = ['pressing_successful', 'pass_success_probability', 'distance']
X = matched_df[features]
y = matched_df['weighted_obv']  # Using weighted OBV for training

# Split data for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor
xobv_model = RandomForestRegressor(n_estimators=100, random_state=42)
xobv_model.fit(X_train, y_train)

# Predict xOBV for each defensive event
matched_df['xOBV'] = xobv_model.predict(X)

# Step 8: Calculate Totals for Each Defender (Now Incorporating Pressures)
player_totals = matched_df.groupby(['defender', 'team_name']).agg(
    total_pressures=('pressing_successful', 'count'),  # Total defensive actions
    successful_pressures=('pressing_successful', 'sum'),  # Successful pressures
    total_obv_impact=('weighted_obv', 'sum'),  # Sum of weighted OBV impact
    expected_obv_impact=('xOBV', 'sum'),  # Expected OBV impact (xOBV)
    avg_distance=('distance', 'mean')  # Average distance to the pass event
).reset_index()

# Calculate pressing success rate
player_totals['pressing_success_rate'] = player_totals['successful_pressures'] / player_totals['total_pressures']
player_totals.fillna(0, inplace=True)  # Ensure no NaNs due to division

# Normalize OBV Impact (Scale between 0 and 1)
scaler = MinMaxScaler()
player_totals[['normalized_obv_impact', 'normalized_xOBV']] = scaler.fit_transform(
    player_totals[['total_obv_impact', 'expected_obv_impact']]
)

# Calculate Overperformance (Actual vs Expected)
player_totals['overperformance'] = player_totals['normalized_obv_impact'] - player_totals['normalized_xOBV']

# Step 9: Save Player Stats (WITH PRESSURE INCORPORATED)
output_filename = "Player_Defensive_Impact_with_xOBV_and_Pressure.xlsx"
player_totals.to_excel(output_filename, index=False)
print(f"Player totals saved to '{output_filename}'")

# Step 10: Print Top Defenders by Overperformance
print("\n🔝 Top Overperforming Defenders (Prevent More Threat Than Expected):")
print(player_totals[['defender', 'team_name', 'normalized_obv_impact', 'normalized_xOBV', 'overperformance']].sort_values(
    by='overperformance', ascending=False).head(10))


Player totals saved to 'Player_Defensive_Impact_with_xOBV_and_Pressure.xlsx'

🔝 Top Overperforming Defenders (Prevent More Threat Than Expected):
             defender      team_name  normalized_obv_impact  normalized_xOBV  \
13         Jiří Boula  Baník Ostrava               0.767095         0.752583   
18    Michal Frydrych  Baník Ostrava               0.650642         0.638237   
14      Karel Pojezný  Baník Ostrava               0.534189         0.524440   
7         Erik Prekop  Baník Ostrava               0.611825         0.603259   
24         Tomáš Rigo  Baník Ostrava               0.611825         0.604570   
2        David Buchta  Baník Ostrava               0.301284         0.294934   
17          Matěj Šín  Baník Ostrava               0.301284         0.295067   
1       Daniel Holzer  Baník Ostrava               0.262467         0.257170   
9        Filip Kubala  Baník Ostrava               0.340102         0.335926   
0   Christos Zafeiris   Slavia Praha               0.1