In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# --- Step 1: Load Match Event Data ---
# Change this to your match file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
match_events_path = "/Users/marclambertes/Python/Matches/Men/2024-2025/Eredivisie 2024-2025/PSV 1-1 Willem II.csv"
df = pd.read_csv(match_events_path)

# --- Step 2: Infer Carries from Consecutive Events ---
def infer_carries(df):
    """Infer ball carries from neighbouring events of the same player.

    Events are ordered by player name, period and clock time. Two neighbouring
    events in that ordering form a carry when they belong to the same player
    and the same period, the second happens more than 0 and at most 10 seconds
    after the first, and the two locations are more than 2 units apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Event data with the columns ``playerName``, ``periodId``, ``timeMin``,
        ``timeSec``, ``x``, ``y`` and ``contestantId``. The caller's frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        One row per carry with the columns ``playerName``, ``x``, ``y``,
        ``Next_X``, ``Next_Y``, ``timeMin``, ``timeSec`` and ``contestantId``.
        The index holds the label that the carry's starting event has in
        ``df``, so index-based lookups into the event data refer to the event
        the carry came from. When no carry is found the frame is empty but
        still has all columns.
    """
    carry_columns = ['playerName', 'x', 'y', 'Next_X', 'Next_Y',
                     'timeMin', 'timeSec', 'contestantId']

    ordered = df.sort_values(by=['playerName', 'periodId', 'timeMin', 'timeSec'])
    # Row i of `upcoming` holds the event that follows row i of `ordered`.
    upcoming = ordered[['playerName', 'periodId', 'timeMin', 'timeSec', 'x', 'y']].shift(-1)

    same_player = ordered['playerName'].eq(upcoming['playerName'])
    # The clock alone cannot tell periods apart, so require the same period
    # explicitly instead of pairing events across a break.
    same_period = ordered['periodId'].eq(upcoming['periodId'])

    elapsed = (upcoming['timeMin'] * 60 + upcoming['timeSec']) - (ordered['timeMin'] * 60 + ordered['timeSec'])
    moved = np.sqrt((upcoming['x'] - ordered['x'])**2 + (upcoming['y'] - ordered['y'])**2)

    # Comparisons against NaN are False, so incomplete pairs are never carries.
    is_carry = (
        same_player & same_period & (elapsed > 0) & (elapsed <= 10) & (moved > 2)
    ).fillna(False).to_numpy(dtype=bool)

    carries = ordered.loc[is_carry, ['playerName', 'x', 'y', 'timeMin', 'timeSec', 'contestantId']].copy()
    carries['Next_X'] = upcoming['x'].to_numpy()[is_carry]
    carries['Next_Y'] = upcoming['y'].to_numpy()[is_carry]
    return carries[carry_columns]

df_carries = infer_carries(df)

# --- Step 3: Calculate Distance to Goal and Progressiveness ---
# Straight-line distance from each end of the carry to the reference point (105, 52.5).
# NOTE(review): (105, 52.5) is presumably the target goal — confirm against the coordinate system.
goal_x, goal_y = 105, 52.5
for distance_col, x_col, y_col in (('beginning', 'x', 'y'), ('end', 'Next_X', 'Next_Y')):
    dx = goal_x - df_carries[x_col]
    dy = goal_y - df_carries[y_col]
    df_carries[distance_col] = np.sqrt(dx**2 + dy**2)
# A carry counts as progressive when it ends at less than 75% of its starting distance.
df_carries['progressive'] = df_carries['end'].div(df_carries['beginning']).lt(0.75)

# --- Step 4: Load xT Grid and Calculate xT Gain ---
xT = pd.read_csv("xT_Grid.csv", header=None).to_numpy()
xT_rows, xT_cols = xT.shape

# Coerce the coordinates to numbers and discard carries whose start or end
# point is missing or not numeric.
coordinate_cols = ['x', 'y', 'Next_X', 'Next_Y']
df_carries = df_carries.assign(
    **{col: pd.to_numeric(df_carries[col], errors='coerce') for col in coordinate_cols}
).dropna(subset=coordinate_cols)

# Map each coordinate to a grid column (x) or grid row (y).
# NOTE(review): with an integer `bins`, pd.cut spreads equal-width bins over the
# observed min/max of each column rather than over fixed pitch dimensions —
# confirm that this is intended.
df_carries['x1_bin'] = pd.cut(df_carries['x'], bins=xT_cols, labels=False)
df_carries['y1_bin'] = pd.cut(df_carries['y'], bins=xT_rows, labels=False)
df_carries['x2_bin'] = pd.cut(df_carries['Next_X'], bins=xT_cols, labels=False)
df_carries['y2_bin'] = pd.cut(df_carries['Next_Y'], bins=xT_rows, labels=False)

# Look up xT[row, column] for the start and end zone of every carry. The bin
# columns are paired explicitly instead of indexing a label-indexed row Series
# with [0]/[1], which relies on pandas' deprecated positional fallback.
df_carries['start_zone_value'] = [
    xT[int(row_bin), int(col_bin)] if pd.notnull(col_bin) and pd.notnull(row_bin) else np.nan
    for col_bin, row_bin in zip(df_carries['x1_bin'], df_carries['y1_bin'])
]
df_carries['end_zone_value'] = [
    xT[int(row_bin), int(col_bin)] if pd.notnull(col_bin) and pd.notnull(row_bin) else np.nan
    for col_bin, row_bin in zip(df_carries['x2_bin'], df_carries['y2_bin'])
]
df_carries = df_carries.dropna(subset=['start_zone_value', 'end_zone_value'])

# xT gain of the carry: value of the end zone minus value of the start zone.
df_carries['xT'] = df_carries['end_zone_value'] - df_carries['start_zone_value']

# --- Step 5: Load EPV Grid and Calculate EPV Gain ---
epv = pd.read_csv("epv_grid.csv", header=None).to_numpy()
epv_rows, epv_cols = epv.shape

# Re-bin the carry end points at the EPV grid resolution; this overwrites the
# bin columns that were filled for the xT grid.
epv_bin_spec = (
    ('x1_bin', 'x', epv_cols),
    ('y1_bin', 'y', epv_rows),
    ('x2_bin', 'Next_X', epv_cols),
    ('y2_bin', 'Next_Y', epv_rows),
)
for bin_col, source_col, n_bins in epv_bin_spec:
    binned = pd.cut(df_carries[source_col], bins=n_bins, labels=False)
    # Nullable integer dtype keeps missing bins as <NA>.
    df_carries[bin_col] = binned.astype('Int64')

def get_epv_value(bin_indices, epv_grid):
    """Look up the grid cell addressed by a pair of bin indices.

    Parameters
    ----------
    bin_indices : sequence or pandas.Series
        Two entries read by position: the column (x) bin first, then the
        row (y) bin.
    epv_grid : 2-D array-like
        Value grid, indexed as ``epv_grid[row][column]``.

    Returns
    -------
    The grid value at ``[row][column]``, or ``np.nan`` when either bin index
    is missing.
    """
    # A DataFrame row arrives as a Series labelled by column name; plain ``[0]``
    # on such a Series relies on a deprecated positional fallback, so read by
    # position through ``.iloc`` whenever the input offers it.
    by_position = getattr(bin_indices, 'iloc', bin_indices)
    x_bin, y_bin = by_position[0], by_position[1]
    if pd.isnull(x_bin) or pd.isnull(y_bin):
        return np.nan
    return epv_grid[int(y_bin)][int(x_bin)]

# EPV of the start and end zone of each carry, coerced to numbers and limited to [0, 1].
for value_col, bin_cols in (('start_zone_value_epv', ['x1_bin', 'y1_bin']),
                            ('end_zone_value_epv', ['x2_bin', 'y2_bin'])):
    raw_values = df_carries[bin_cols].apply(get_epv_value, axis=1, args=(epv,))
    df_carries[value_col] = pd.to_numeric(raw_values, errors='coerce').clip(lower=0, upper=1)

# EPV gain of the carry: end-zone value minus start-zone value.
df_carries['epv'] = df_carries['end_zone_value_epv'].sub(df_carries['start_zone_value_epv'])

# --- Step 6: Look Ahead for Shot within Next 5 Events ---
def look_ahead_for_shot(index, max_lookahead=5, events=None):
    """Report whether a shot occurs among the events that follow a given row.

    Parameters
    ----------
    index : int
        Zero-based row position of the reference event in ``events``.
    max_lookahead : int, default 5
        Number of following rows to inspect.
    events : pandas.DataFrame, optional
        Event data with a ``typeId`` column. Defaults to the module-level
        ``df``.

    Returns
    -------
    bool
        True when any inspected row has a ``typeId`` of 13, 14, 15 or 16.

    Notes
    -----
    NOTE(review): ``index`` is used as a row position (``iloc``). Callers that
    pass index labels of another frame must make sure those labels match row
    positions in ``events``.
    """
    shot_types = [13, 14, 15, 16]  # Shot-related typeIds
    if events is None:
        events = df
    # iloc slices are truncated at the end of the frame, so events close to the
    # end are still checked against whatever rows remain after them.
    window = events.iloc[index + 1:index + max_lookahead + 1]
    return bool(window['typeId'].isin(shot_types).any())

# Binary xShot label: does a shot show up in the events after the carry?
# NOTE(review): the carry frame's index values are handed to look_ahead_for_shot,
# which treats them as row positions in df — confirm the two line up.
carry_positions = df_carries.index.to_series()
df_carries['leads_to_shot'] = carry_positions.map(look_ahead_for_shot)
df_carries['xShot'] = df_carries['leads_to_shot'].astype(int)

# --- Step 7: Create xCarry Label (Based on Progressiveness) ---
df_carries['xCarry'] = df_carries['progressive'].astype(int)

# --- Step 8: Train Models (xShot and xCarry) ---
# Both models share one feature matrix; missing feature values are replaced by
# the column mean.
features = ['beginning', 'end', 'xT', 'epv', 'progressive']
imputer = SimpleImputer(strategy='mean')
X = df_carries[features]
X_imputed = imputer.fit_transform(X)

# xShot model: fitted on the binary shot label, then scored on the same rows.
# The label column is replaced by the predicted probability (2 decimals, within [0, 1]).
y_shot = df_carries['xShot']
model_shot = LogisticRegression(solver='liblinear').fit(X_imputed, y_shot)
shot_probability = model_shot.predict_proba(X_imputed)[:, 1]
df_carries['xShot'] = np.clip(np.round(shot_probability, 2), 0, 1)

# xCarry model: same procedure with the progressive-carry label.
# NOTE(review): 'progressive' is both a feature and the source of this label —
# confirm that this is intended.
y_carry = df_carries['xCarry']
model_carry = LogisticRegression(solver='liblinear').fit(X_imputed, y_carry)
carry_probability = model_carry.predict_proba(X_imputed)[:, 1]
df_carries['xCarry'] = np.clip(np.round(carry_probability, 2), 0, 1)

# --- Step 9: Export or Display ---
# Write the per-carry results to an Excel workbook, without the index column.
carry_export_columns = [
    'playerName', 'contestantId',
    'x', 'y', 'Next_X', 'Next_Y',
    'progressive', 'xT', 'epv', 'xShot', 'xCarry',
]
df_carries.loc[:, carry_export_columns].to_excel("progressive_carries_output.xlsx", index=False)


In [7]:
# Per-player xCarry summary: number of carries, mean xCarry and total xCarry,
# ordered by total xCarry (highest first).
player_xcarry_stats = (
    df_carries.groupby('playerName')['xCarry']
    .agg(total_carries='count', avg_xCarry='mean', sum_xCarry='sum')
    .reset_index()
    .sort_values(by='sum_xCarry', ascending=False)
)

# Show the ten players with the highest total
top_players = player_xcarry_stats.head(10)
print(top_players)

# Optional: Save the full table to Excel
player_xcarry_stats.to_excel("xcarry_by_player.xlsx", index=False)


     playerName  total_carries  avg_xCarry  sum_xCarry
7        G. Til             15    0.521333        7.82
19      N. Lang             25    0.202400        5.06
1     A. Nagalo             41    0.097805        4.01
15   L. de Jong             17    0.187059        3.18
4     C. Sandra             20    0.154000        3.08
0      A. Fatah             13    0.233077        3.03
26   T. Malacia             18    0.166667        3.00
14    K. Vaesen             17    0.168235        2.86
13   J. Veerman             30    0.091333        2.74
10  J. Bakayoko             16    0.134375        2.15


In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# Load data: match events plus the xT and EPV value grids (grids have no header row).
# Change the match path to your own match file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
match_events_path = "/Users/marclambertes/Python/Matches/Men/2024-2025/Eredivisie 2024-2025/PSV 1-1 Willem II.csv"
df = pd.read_csv(match_events_path)
xT = pd.read_csv("xT_Grid.csv", header=None).to_numpy()
epv = pd.read_csv("epv_grid.csv", header=None).to_numpy()
# Grid shapes are (rows, columns).
(xT_rows, xT_cols), (epv_rows, epv_cols) = xT.shape, epv.shape

# Helper: EPV/xT grid value lookup
def get_epv_value(bin_indices, grid):
    """Look up the grid cell addressed by a pair of bin indices.

    Parameters
    ----------
    bin_indices : sequence or pandas.Series
        Two entries read by position: the column (x) bin first, then the
        row (y) bin.
    grid : 2-D array-like
        Value grid (EPV or xT), indexed as ``grid[row][column]``.

    Returns
    -------
    The grid value at ``[row][column]``, or ``np.nan`` when either bin index
    is missing.
    """
    # A DataFrame row arrives as a Series labelled by column name; plain ``[0]``
    # on such a Series relies on a deprecated positional fallback, so read by
    # position through ``.iloc`` whenever the input offers it.
    by_position = getattr(bin_indices, 'iloc', bin_indices)
    x_bin, y_bin = by_position[0], by_position[1]
    if pd.isnull(x_bin) or pd.isnull(y_bin):
        return np.nan
    return grid[int(y_bin)][int(x_bin)]

# Step 1: Infer Carries
def infer_carries(df):
    """Infer ball carries from neighbouring events of the same player.

    Events are ordered by player id, period and clock time. Two neighbouring
    events in that ordering form a carry when they belong to the same player
    and the same period, the second happens more than 0 and at most 10 seconds
    after the first, and the two locations are more than 2 units apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Event data with the columns ``playerId``, ``periodId``, ``timeMin``,
        ``timeSec``, ``x``, ``y`` and ``contestantId``. The caller's frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        One row per carry with the columns ``playerId``, ``x``, ``y``,
        ``Next_X``, ``Next_Y``, ``timeMin``, ``timeSec`` and ``contestantId``.
        The index holds the label that the carry's starting event has in
        ``df``, so index-based lookups into the event data refer to the event
        the carry came from. When no carry is found the frame is empty but
        still has all columns.
    """
    carry_columns = ['playerId', 'x', 'y', 'Next_X', 'Next_Y',
                     'timeMin', 'timeSec', 'contestantId']

    ordered = df.sort_values(by=['playerId', 'periodId', 'timeMin', 'timeSec'])
    # Row i of `upcoming` holds the event that follows row i of `ordered`.
    upcoming = ordered[['playerId', 'periodId', 'timeMin', 'timeSec', 'x', 'y']].shift(-1)

    same_player = ordered['playerId'].eq(upcoming['playerId'])
    # The clock alone cannot tell periods apart, so require the same period
    # explicitly instead of pairing events across a break.
    same_period = ordered['periodId'].eq(upcoming['periodId'])

    elapsed = (upcoming['timeMin'] * 60 + upcoming['timeSec']) - (ordered['timeMin'] * 60 + ordered['timeSec'])
    moved = np.sqrt((upcoming['x'] - ordered['x'])**2 + (upcoming['y'] - ordered['y'])**2)

    # Comparisons against NaN are False, so incomplete pairs are never carries.
    is_carry = (
        same_player & same_period & (elapsed > 0) & (elapsed <= 10) & (moved > 2)
    ).fillna(False).to_numpy(dtype=bool)

    carries = ordered.loc[is_carry, ['playerId', 'x', 'y', 'timeMin', 'timeSec', 'contestantId']].copy()
    carries['Next_X'] = upcoming['x'].to_numpy()[is_carry]
    carries['Next_Y'] = upcoming['y'].to_numpy()[is_carry]
    return carries[carry_columns]

df_carries = infer_carries(df)

# Step 2: Process Carries
# Coerce the coordinates to numbers and discard carries whose start or end
# point is missing or not numeric.
coordinate_cols = ['x', 'y', 'Next_X', 'Next_Y']
df_carries = df_carries.assign(
    **{col: pd.to_numeric(df_carries[col], errors='coerce') for col in coordinate_cols}
).dropna(subset=coordinate_cols)

# Straight-line distance from each end of the carry to the reference point (105, 52.5).
# NOTE(review): (105, 52.5) is presumably the target goal — confirm against the coordinate system.
goal_x, goal_y = 105, 52.5
for distance_col, x_col, y_col in (('beginning', 'x', 'y'), ('end', 'Next_X', 'Next_Y')):
    dx = goal_x - df_carries[x_col]
    dy = goal_y - df_carries[y_col]
    df_carries[distance_col] = np.sqrt(dx**2 + dy**2)
# A carry counts as progressive when it ends at less than 75% of its starting distance.
df_carries['progressive'] = df_carries['end'].div(df_carries['beginning']).lt(0.75)

# xT bins: map every coordinate to a grid column (x) or grid row (y)
xt_bin_spec = (
    ('x1_bin', 'x', xT_cols),
    ('y1_bin', 'y', xT_rows),
    ('x2_bin', 'Next_X', xT_cols),
    ('y2_bin', 'Next_Y', xT_rows),
)
for bin_col, source_col, n_bins in xt_bin_spec:
    df_carries[bin_col] = pd.cut(df_carries[source_col], bins=n_bins, labels=False)

# xT of the start and end zone, and the gain between them
start_bins = df_carries[['x1_bin', 'y1_bin']]
end_bins = df_carries[['x2_bin', 'y2_bin']]
df_carries['start_zone_value'] = start_bins.apply(get_epv_value, axis=1, args=(xT,))
df_carries['end_zone_value'] = end_bins.apply(get_epv_value, axis=1, args=(xT,))
df_carries['xT'] = df_carries['end_zone_value'].sub(df_carries['start_zone_value'])

# EPV bins: same binning at the EPV grid resolution, stored as nullable integers
# (this overwrites the bin columns filled for the xT grid)
epv_bin_spec = (
    ('x1_bin', 'x', epv_cols),
    ('y1_bin', 'y', epv_rows),
    ('x2_bin', 'Next_X', epv_cols),
    ('y2_bin', 'Next_Y', epv_rows),
)
for bin_col, source_col, n_bins in epv_bin_spec:
    binned = pd.cut(df_carries[source_col], bins=n_bins, labels=False)
    df_carries[bin_col] = binned.astype('Int64')

# EPV of the start and end zone, and the gain between them
df_carries['start_zone_value_epv'] = df_carries[['x1_bin', 'y1_bin']].apply(get_epv_value, axis=1, args=(epv,))
df_carries['end_zone_value_epv'] = df_carries[['x2_bin', 'y2_bin']].apply(get_epv_value, axis=1, args=(epv,))
epv_at_end = pd.to_numeric(df_carries['end_zone_value_epv'], errors='coerce')
epv_at_start = pd.to_numeric(df_carries['start_zone_value_epv'], errors='coerce')
df_carries['epv'] = epv_at_end - epv_at_start

# xShot label
def look_ahead_for_shot(index, max_lookahead=5, events=None):
    """Report whether a shot occurs among the events that follow a given row.

    Parameters
    ----------
    index : int
        Zero-based row position of the reference event in ``events``.
    max_lookahead : int, default 5
        Number of following rows to inspect.
    events : pandas.DataFrame, optional
        Event data with a ``typeId`` column. Defaults to the module-level
        ``df``.

    Returns
    -------
    bool
        True when any inspected row has a ``typeId`` of 13, 14, 15 or 16.

    Notes
    -----
    NOTE(review): ``index`` is used as a row position (``iloc``). Callers that
    pass index labels of another frame must make sure those labels match row
    positions in ``events``.
    """
    shot_types = [13, 14, 15, 16]
    if events is None:
        events = df
    # iloc slices are truncated at the end of the frame, so events close to the
    # end are still checked against whatever rows remain after them.
    window = events.iloc[index + 1:index + max_lookahead + 1]
    return bool(window['typeId'].isin(shot_types).any())

# Binary labels: shot in the events after the carry (xShot) and progressive carry (xCarry).
# NOTE(review): the carry frame's index values are handed to look_ahead_for_shot,
# which treats them as row positions in df — confirm the two line up.
carry_positions = df_carries.index.to_series()
df_carries['leads_to_shot'] = carry_positions.map(look_ahead_for_shot)
df_carries['xShot'] = df_carries['leads_to_shot'].astype(int)
df_carries['xCarry'] = df_carries['progressive'].astype(int)

# Step 3: Model xShot and xCarry
# Both models share one feature matrix; missing feature values are replaced by
# the column mean.
features = ['beginning', 'end', 'xT', 'epv', 'progressive']
imputer = SimpleImputer(strategy='mean')
X_carry = imputer.fit_transform(df_carries[features])

# Fit on the binary labels first, then overwrite the label columns with the
# predicted probabilities (2 decimals, within [0, 1]).
# NOTE(review): 'progressive' is both a feature and the source of the xCarry
# label — confirm that this is intended.
model_shot = LogisticRegression(solver='liblinear').fit(X_carry, df_carries['xShot'])
model_carry = LogisticRegression(solver='liblinear').fit(X_carry, df_carries['xCarry'])

shot_probability = model_shot.predict_proba(X_carry)[:, 1]
carry_probability = model_carry.predict_proba(X_carry)[:, 1]
df_carries['xShot'] = np.clip(np.round(shot_probability, 2), 0, 1)
df_carries['xCarry'] = np.clip(np.round(carry_probability, 2), 0, 1)

# Step 4: Model xDribble
# Dribble events are the rows with typeId == 3.
coordinate_cols = ['x', 'y', 'Next_X', 'Next_Y']
dribble_mask = df['typeId'].eq(3).to_numpy()
event_x = pd.to_numeric(df['x'], errors='coerce')
event_y = pd.to_numeric(df['y'], errors='coerce')

df_dribbles = df.loc[dribble_mask].copy()
df_dribbles['x'] = event_x.to_numpy()[dribble_mask]
df_dribbles['y'] = event_y.to_numpy()[dribble_mask]
# The end point of a dribble is the location of the event that directly follows
# it in the full event stream. The shift therefore happens before filtering;
# shifting the dribble-only subset would pair each dribble with the next
# dribble instead.
# NOTE(review): assumes df rows are in match order and that the following
# event's coordinates share the dribble's frame of reference — confirm.
df_dribbles['Next_X'] = event_x.shift(-1).to_numpy()[dribble_mask]
df_dribbles['Next_Y'] = event_y.shift(-1).to_numpy()[dribble_mask]
df_dribbles = df_dribbles.dropna(subset=coordinate_cols)

# Distance to the reference point (105, 52.5) before and after the dribble;
# progressive means ending at less than 75% of the starting distance.
df_dribbles['beginning'] = np.sqrt((105 - df_dribbles['x'])**2 + (52.5 - df_dribbles['y'])**2)
df_dribbles['end'] = np.sqrt((105 - df_dribbles['Next_X'])**2 + (52.5 - df_dribbles['Next_Y'])**2)
df_dribbles['progressive'] = (df_dribbles['end'] / df_dribbles['beginning']) < 0.75

# xT gain: bin the coordinates at the xT grid resolution and look up both zones.
df_dribbles['x1_bin'] = pd.cut(df_dribbles['x'], bins=xT_cols, labels=False)
df_dribbles['y1_bin'] = pd.cut(df_dribbles['y'], bins=xT_rows, labels=False)
df_dribbles['x2_bin'] = pd.cut(df_dribbles['Next_X'], bins=xT_cols, labels=False)
df_dribbles['y2_bin'] = pd.cut(df_dribbles['Next_Y'], bins=xT_rows, labels=False)
df_dribbles['start_zone_value'] = df_dribbles[['x1_bin', 'y1_bin']].apply(lambda x: get_epv_value(x, xT), axis=1)
df_dribbles['end_zone_value'] = df_dribbles[['x2_bin', 'y2_bin']].apply(lambda x: get_epv_value(x, xT), axis=1)
df_dribbles['xT'] = df_dribbles['end_zone_value'] - df_dribbles['start_zone_value']

# EPV gain: same procedure at the EPV grid resolution (bin columns are overwritten).
df_dribbles['x1_bin'] = pd.cut(df_dribbles['x'], bins=epv_cols, labels=False).astype('Int64')
df_dribbles['y1_bin'] = pd.cut(df_dribbles['y'], bins=epv_rows, labels=False).astype('Int64')
df_dribbles['x2_bin'] = pd.cut(df_dribbles['Next_X'], bins=epv_cols, labels=False).astype('Int64')
df_dribbles['y2_bin'] = pd.cut(df_dribbles['Next_Y'], bins=epv_rows, labels=False).astype('Int64')
df_dribbles['start_zone_value_epv'] = df_dribbles[['x1_bin', 'y1_bin']].apply(lambda x: get_epv_value(x, epv), axis=1)
df_dribbles['end_zone_value_epv'] = df_dribbles[['x2_bin', 'y2_bin']].apply(lambda x: get_epv_value(x, epv), axis=1)
df_dribbles['epv'] = pd.to_numeric(df_dribbles['end_zone_value_epv'], errors='coerce') - pd.to_numeric(df_dribbles['start_zone_value_epv'], errors='coerce')

# The event outcome is the training label; it is then replaced by the predicted
# probability (2 decimals, within [0, 1]). The imputer is refitted on the
# dribble features.
df_dribbles['xDribble'] = df_dribbles['outcome']
X_dribble = imputer.fit_transform(df_dribbles[features])

model_dribble = LogisticRegression(solver='liblinear')
model_dribble.fit(X_dribble, df_dribbles['xDribble'])
df_dribbles['xDribble'] = np.clip(np.round(model_dribble.predict_proba(X_dribble)[:, 1], 2), 0, 1)

# Step 5: Merge and Export
# Carries and dribbles share these columns; each frame additionally gets the
# other frame's metric columns filled with NaN so both can be stacked.
shared_columns = ['playerId', 'contestantId', 'x', 'y', 'Next_X', 'Next_Y',
                  'progressive', 'xT', 'epv']

carries_output = df_carries[shared_columns + ['xShot', 'xCarry']].assign(xDribble=np.nan)
dribbles_output = df_dribbles[shared_columns + ['xDribble']].assign(xShot=np.nan, xCarry=np.nan)

# Stack carries first, then dribbles, with a fresh index and a fixed column order.
combined_df = pd.concat([carries_output, dribbles_output], ignore_index=True)
combined_df = combined_df[shared_columns + ['xShot', 'xCarry', 'xDribble']]

# Export to Excel
combined_df.to_excel("xcarry_xdribble_xshot_combined.xlsx", index=False)
