# In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

# ==========================
# 1. Load and preprocess data
# ==========================
# Single-match event export that the rest of the script is built on.
file_path = '/Users/user/Event-data/Men/Finland 2025/HJK 3-1 Oulu.csv'  # Your local file
df = pd.read_csv(file_path)

# Keep relevant columns (.copy() so later column assignments go to an
# independent frame rather than a slice of the raw data)
df = df[['id', 'timeMin', 'timeSec', 'typeId', 'outcome', 'x', 'y', 'playerId', 'playerName', 'contestantId']].copy()

# Match clock in seconds, used only to put the events in chronological order.
df['time_seconds'] = df['timeMin'] * 60 + df['timeSec']

# Stable sort: events logged in the same second keep their original file
# order. The default quicksort gives no such guarantee, and every
# "previous action" feature and look-back label window depends on row order.
df.sort_values(by='time_seconds', kind='stable', inplace=True)
df.reset_index(drop=True, inplace=True)

# Normalize pitch coordinates
# (assumes x and y are on a 0-100 scale -- TODO confirm against the feed)
df['x_norm'] = df['x'] / 100
df['y_norm'] = df['y'] / 100

# ==========================
# 2. Feature Engineering
# ==========================
# Offsets from the event location to the goal centre, taken to be the
# right-side goal at (x=100, y=50) in raw pitch units.
goal_dx = 100 - df['x']
goal_dy = 50 - df['y']

# Straight-line distance to the goal centre.
df['distance_to_goal'] = np.sqrt(goal_dx**2 + goal_dy**2)

# Simplified shot angle; the small epsilon keeps the second argument
# non-zero for events located exactly on the goal line.
df['angle_to_goal'] = np.arctan2(abs(goal_dy), goal_dx + 1e-6)

# Previous-action context: new column -> (source column, value used for the
# first row, which has no predecessor).
previous_action_spec = {
    'prev_x': ('x_norm', 0.5),
    'prev_y': ('y_norm', 0.5),
    'prev_outcome': ('outcome', 1),
    'prev_type': ('typeId', 0),
}
for new_column, (source_column, first_row_default) in previous_action_spec.items():
    df[new_column] = df[source_column].shift(1).fillna(first_row_default)

# One indicator column per event type, appended to the frame.
event_types = pd.get_dummies(df['typeId'], prefix='event')
df = df.join(event_types)

# ==========================
# 3. Target creation (next 10 actions)
# ==========================
# Binary labels, from the acting team's point of view:
#   score_next_10   = 1 if the acting team scores within the look-ahead window
#   concede_next_10 = 1 if the opposing team scores within that window
df['score_next_10'] = 0
df['concede_next_10'] = 0

# NOTE(review): in Opta-style feeds only typeId 16 is a goal, while 17 and 18
# are usually card / player-off events -- confirm against your data, otherwise
# those events are labelled as goals too.
goal_event_ids = [16, 17, 18]  # Adjust based on your data
goal_indices = df[df['typeId'].isin(goal_event_ids)].index

for idx in goal_indices:
    team = df.loc[idx, 'contestantId']
    start_idx = max(0, idx - 10)

    # Rows start_idx..idx inclusive (.loc slicing includes both ends), i.e.
    # the goal event itself plus up to 10 events before it.
    window_teams = df.loc[start_idx:idx, 'contestantId']
    by_scoring_team = window_teams == team

    # Only ever switch labels ON. Assigning the whole 0/1 mask instead would
    # wipe positive labels written for an earlier goal whenever two goal
    # windows overlap.
    df.loc[by_scoring_team[by_scoring_team].index, 'score_next_10'] = 1
    df.loc[by_scoring_team[~by_scoring_team].index, 'concede_next_10'] = 1

# ==========================
# 4. Build feature matrix
# ==========================
# Location / previous-action features, followed by the one-hot event-type
# columns.
base_features = [
    'x_norm', 'y_norm', 'distance_to_goal', 'angle_to_goal',
    'prev_x', 'prev_y', 'prev_outcome', 'prev_type',
]
features = base_features + event_types.columns.tolist()

# Model inputs and the two binary targets.
X = df[features]
y_score, y_concede = df['score_next_10'], df['concede_next_10']

# ==========================
# 5. Train XGBoost models
# ==========================
# Split the features and BOTH targets in a single call so the train/test rows
# are aligned by construction, rather than relying on two separate calls
# producing the same shuffle through a repeated random_state.
(X_train, X_test,
 y_score_train, y_score_test,
 y_concede_train, y_concede_test) = train_test_split(
    X, y_score, y_concede, test_size=0.2, random_state=42
)

# One binary classifier per target. `use_label_encoder` is intentionally not
# passed: it is deprecated in recent XGBoost releases and only triggers a
# warning there.
model_score = xgb.XGBClassifier(eval_metric='logloss')
model_concede = xgb.XGBClassifier(eval_metric='logloss')

model_score.fit(X_train, y_score_train)
model_concede.fit(X_train, y_concede_train)

# ==========================
# 6. Compute probabilities and VAEP
# ==========================
# Per-action probabilities, from the acting team's point of view, that the
# team scores / concedes in the look-ahead window.
df['p_score'] = model_score.predict_proba(X)[:, 1]
df['p_concede'] = model_concede.predict_proba(X)[:, 1]

# The previous action's probabilities are from the point of view of whichever
# team performed THAT action. When the previous action belongs to the other
# team, that team's scoring chance is the current team's conceding chance
# (and vice versa), so the two values are swapped before taking the difference.
prev_p_score = df['p_score'].shift(1)
prev_p_concede = df['p_concede'].shift(1)
same_team_as_prev = df['contestantId'].eq(df['contestantId'].shift(1))

# The first row has no predecessor; its own probabilities are used as the
# baseline, so its VAEP is 0.
df['p_score_prev'] = (
    prev_p_score.where(same_team_as_prev, prev_p_concede)
    .fillna(df['p_score'].iloc[0])
)
df['p_concede_prev'] = (
    prev_p_concede.where(same_team_as_prev, prev_p_score)
    .fillna(df['p_concede'].iloc[0])
)

# VAEP calculation: gain in scoring chance minus gain in conceding chance.
df['vaep'] = (df['p_score'] - df['p_score_prev']) - (df['p_concede'] - df['p_concede_prev'])

# ==========================
# 7. Aggregate VAEP by player & team
# ==========================
# Total VAEP per player, highest first.
vaep_by_player = df['vaep'].groupby(df['playerName']).sum()
player_vaep = vaep_by_player.sort_values(ascending=False)

# Total VAEP per team (contestant), highest first.
vaep_by_team = df['vaep'].groupby(df['contestantId']).sum()
team_vaep = vaep_by_team.sort_values(ascending=False)

# ==========================
# 8. Visualization
# ==========================
# Horizontal bar chart of the ten highest summed VAEP values.
top_players = player_vaep.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_players.index, top_players.values)
ax.set_xlabel('VAEP Value')
ax.set_title('Top 10 Players by VAEP')
# Flip the y-axis so the best player is drawn at the top.
ax.invert_yaxis()
plt.show()

# ==========================
# 9. Save results locally
# ==========================
output_folder = '/Users/user/Downloads/'

# (table, file name, extra to_csv options). The enriched event frame is
# written without its index; the two rankings keep theirs (player name /
# contestant id).
exports = [
    (df, 'vaep_enriched.csv', {'index': False}),
    (player_vaep, 'vaep_player_ranking.csv', {}),
    (team_vaep, 'vaep_team_ranking.csv', {}),
]

saved_paths = []
for table, file_name, csv_options in exports:
    destination = output_folder + file_name
    table.to_csv(destination, **csv_options)
    saved_paths.append(destination)

# Report every written file once all exports have succeeded.
print("✅ Files saved:")
for destination in saved_paths:
    print(destination)


# --- Cell output: error raised when running this cell ---
# ValueError: Mismatched version between the Python package and the native shared object.  Python package version: 2.0.3. Shared object version: 3.0.2. Shared object is loaded from: /opt/anaconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib.
# Likely cause:
#   * XGBoost is first installed with anaconda then upgraded with pip. To fix it please remove one of the installations.