In [46]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
import joblib

# Folder with CSV files
folder_path = 'Event-data/Women/NSL 2025'
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
print(f"Found {len(csv_files)} CSV files to process.")

all_shots = []

def process_shots(file_path):
    print(f"Processing: {file_path}")
    shots = pd.read_csv(file_path)
    shots = shots[shots['typeId'].isin([13, 14, 15, 16])]
    type_cols = [col for col in shots.columns if col.startswith('qualifier')]

    is_blocked = (shots['typeId'] == 15) & (shots[type_cols] == 82).any(axis=1)

    def define_shot_type(row):
        if is_blocked.loc[row.name]:
            return 'blocked'
        elif row['typeId'] == 15:
            return 'on_target'
        elif row['typeId'] == 13:
            return 'missed'
        elif row['typeId'] == 14:
            return 'post'
        elif row['typeId'] == 16:
            return 'goal'
        else:
            return 'unknown'

    shots['shot_type'] = shots.apply(define_shot_type, axis=1)

    def get_bodypart(row):
        qualifiers = row[type_cols].values
        if 15 in qualifiers:
            return 'head'
        elif 72 in qualifiers:
            return 'left_foot'
        elif 20 in qualifiers:
            return 'right_foot'
        else:
            return 'unknown'

    shots['bodypart'] = shots.apply(get_bodypart, axis=1)

    play_types = {
        22: 'regular_play', 23: 'fast_break', 24: 'set_piece', 25: 'from_corner',
        26: 'free_kick', 112: 'scramble', 160: 'throw_in_set_piece', 9: 'penalty'
    }

    def get_type_of_play(row):
        qualifiers = row[type_cols].values
        for q in qualifiers:
            if q in play_types:
                return play_types[q]
        return 'unknown'

    shots['type_of_play'] = shots.apply(get_type_of_play, axis=1)

    def calc_distance_angle(x, y, goal_x=100, goal_y=50):
        dx = goal_x - x
        dy = goal_y - y
        distance = np.sqrt(dx**2 + dy**2)
        angle = np.arctan2(dy, dx)
        angle_deg = np.abs(np.degrees(angle))
        return distance, angle_deg

    shots[['distance_to_goal', 'angle_to_goal']] = shots.apply(
        lambda row: pd.Series(calc_distance_angle(row['x'], row['y'])), axis=1)

    fenwick_attempts = shots[shots['shot_type'] != 'blocked']
    fenwick_for = fenwick_attempts.groupby('contestantId').size().reset_index(name='fenwick_for')
    total_fenwicks = fenwick_attempts.shape[0]
    fenwick_for['fenwick_ratio'] = fenwick_for['fenwick_for'] / total_fenwicks

    corsi_for = shots.groupby('contestantId').size().reset_index(name='corsi_for')
    total_corsis = shots.shape[0]
    corsi_for['corsi_ratio'] = corsi_for['corsi_for'] / total_corsis

    shots = shots.merge(fenwick_for[['contestantId', 'fenwick_ratio']], on='contestantId', how='left')
    shots = shots.merge(corsi_for[['contestantId', 'corsi_ratio']], on='contestantId', how='left')

    shot_type_weights = {
        'goal': 1.2, 'on_target': 1.0, 'post': 0.7,
        'missed': 0.5, 'blocked': 0.0, 'unknown': 0.1
    }
    shots['shot_type_weight'] = shots['shot_type'].map(shot_type_weights).fillna(0)
    shots['inv_distance'] = 1 - (shots['distance_to_goal'] / 100)
    shots['angle_score'] = 1 - (shots['angle_to_goal'] / 90)

    shots['danger_score_fenwick'] = (
        0.4 * shots['inv_distance'] +
        0.3 * shots['angle_score'] +
        0.2 * shots['shot_type_weight'] +
        0.1 * shots['fenwick_ratio']
    )

    shots['danger_score_corsi'] = (
        0.4 * shots['inv_distance'] +
        0.3 * shots['angle_score'] +
        0.2 * shots['shot_type_weight'] +
        0.1 * shots['corsi_ratio']
    )

    shots['isIntentionalAssist'] = shots[type_cols].isin([154]).any(axis=1)

    if 'qualifier55' in shots.columns:
        shots['RelatedPlayerId'] = shots['qualifier55']
    else:
        shots['RelatedPlayerId'] = np.nan

    shots['isGoal'] = shots['typeId'] == 16

    all_shots.append(shots)

# Process all CSVs
for csv_file in csv_files:
    process_shots(csv_file)

# Combine all processed shots
combined_shots = pd.concat(all_shots, ignore_index=True)

# Save raw processed data
combined_shots.to_pickle('shots_combined_raw.pkl')

# Prepare features
combined_shots['RelatedPlayerId'] = combined_shots['RelatedPlayerId'].fillna(-1).astype(int)

categorical_cols = ['shot_type', 'bodypart', 'type_of_play']
numerical_cols = [
    'isIntentionalAssist', 'danger_score_fenwick', 'danger_score_corsi',
    'x', 'y', 'distance_to_goal', 'angle_to_goal', 'RelatedPlayerId'
]

# One-hot encode categorical features
X_encoded = pd.get_dummies(combined_shots[categorical_cols], dummy_na=True)

# Combine features
X = pd.concat([X_encoded, combined_shots[numerical_cols]], axis=1)
y = combined_shots['isGoal'].astype(int)

# Train-test split and model training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
base_model = LogisticRegression(max_iter=1000)
calibrated_model = CalibratedClassifierCV(base_model, method='sigmoid', cv=5)
calibrated_model.fit(X_train, y_train)

# Save the model
joblib.dump(calibrated_model, 'calibrated_logistic_model.pkl')

# Predict and scale danger score
danger_scores = calibrated_model.predict_proba(X)[:, 1]
combined_shots['danger_score_logit'] = (danger_scores - danger_scores.min()) / (danger_scores.max() - danger_scores.min())

# Save final output
combined_shots.to_pickle('shots_combined_with_logit.pkl')

print("✅ All shots processed and saved with logistic danger scores.")
print(combined_shots[['shot_type', 'danger_score_fenwick', 'danger_score_corsi', 'danger_score_logit']].head())


Found 28 CSV files to process.
Processing: Event-data/Women/NSL 2025/Ottawa Rapid 2-1 AFC Toronto.csv
Processing: Event-data/Women/NSL 2025/Vancouver Rise 1-0 Halifax Tides.csv
Processing: Event-data/Women/NSL 2025/Vancouver Rise 1-0 Calgary Wild.csv
Processing: Event-data/Women/NSL 2025/Montréal Roses 0-2 AFC Toronto.csv
Processing: Event-data/Women/NSL 2025/Calgary Wild 1-0 Montréal Roses.csv
Processing: Event-data/Women/NSL 2025/Calgary Wild 3-2 Halifax Tides.csv
Processing: Event-data/Women/NSL 2025/Vancouver Rise 2-1 Halifax Tides.csv
Processing: Event-data/Women/NSL 2025/AFC Toronto 3-1 Halifax Tides.csv
Processing: Event-data/Women/NSL 2025/Vancouver Rise 1-1 Calgary Wild.csv
Processing: Event-data/Women/NSL 2025/Vancouver Rise 1-3 Montréal Roses.csv
Processing: Event-data/Women/NSL 2025/AFC Toronto 2-1 Calgary Wild.csv
Processing: Event-data/Women/NSL 2025/Montréal Roses 0-0 Halifax Tides.csv
Processing: Event-data/Women/NSL 2025/Halifax Tides 2-1 Ottawa Rapid.csv
Processing: E

In [57]:
import joblib
import pandas as pd
import numpy as np

# Load new match data
new_shots = pd.read_csv('Event-data/Women/NSL 2025/Vancouver Rise 1-1 Calgary Wild.csv')

# === Preprocessing (same as training) ===
new_shots = new_shots[new_shots['typeId'].isin([13, 14, 15, 16])]
type_cols = [col for col in new_shots.columns if col.startswith('qualifier')]

is_blocked = (new_shots['typeId'] == 15) & (new_shots[type_cols] == 82).any(axis=1)

def define_shot_type(row):
    if is_blocked.loc[row.name]:
        return 'blocked'
    elif row['typeId'] == 15:
        return 'on_target'
    elif row['typeId'] == 13:
        return 'missed'
    elif row['typeId'] == 14:
        return 'post'
    elif row['typeId'] == 16:
        return 'goal'
    else:
        return 'unknown'

new_shots['shot_type'] = new_shots.apply(define_shot_type, axis=1)

def get_bodypart(row):
    qualifiers = row[type_cols].values
    if 15 in qualifiers:
        return 'head'
    elif 72 in qualifiers:
        return 'left_foot'
    elif 20 in qualifiers:
        return 'right_foot'
    else:
        return 'unknown'

new_shots['bodypart'] = new_shots.apply(get_bodypart, axis=1)

play_types = {
    22: 'regular_play', 23: 'fast_break', 24: 'set_piece', 25: 'from_corner',
    26: 'free_kick', 112: 'scramble', 160: 'throw_in_set_piece', 9: 'penalty'
}

def get_type_of_play(row):
    qualifiers = row[type_cols].values
    for q in qualifiers:
        if q in play_types:
            return play_types[q]
    return 'unknown'

new_shots['type_of_play'] = new_shots.apply(get_type_of_play, axis=1)

def calc_distance_angle(x, y, goal_x=100, goal_y=50):
    dx = goal_x - x
    dy = goal_y - y
    distance = np.sqrt(dx**2 + dy**2)
    angle = np.arctan2(dy, dx)
    angle_deg = np.abs(np.degrees(angle))
    return distance, angle_deg

new_shots[['distance_to_goal', 'angle_to_goal']] = new_shots.apply(
    lambda row: pd.Series(calc_distance_angle(row['x'], row['y'])), axis=1)

# Feature engineering
shot_type_weights = {
    'goal': 1.2, 'on_target': 1.0, 'post': 0.7,
    'missed': 0.5, 'blocked': 0.0, 'unknown': 0.1
}
new_shots['shot_type_weight'] = new_shots['shot_type'].map(shot_type_weights).fillna(0)
new_shots['inv_distance'] = 1 - (new_shots['distance_to_goal'] / 100)
new_shots['angle_score'] = 1 - (new_shots['angle_to_goal'] / 90)

new_shots['danger_score_fenwick'] = (
    0.4 * new_shots['inv_distance'] +
    0.3 * new_shots['angle_score'] +
    0.2 * new_shots['shot_type_weight'] +
    0.1
)

new_shots['danger_score_corsi'] = (
    0.4 * new_shots['inv_distance'] +
    0.3 * new_shots['angle_score'] +
    0.2 * new_shots['shot_type_weight'] +
    0.1
)

new_shots['isIntentionalAssist'] = new_shots[type_cols].isin([154]).any(axis=1)

if 'qualifier55' in new_shots.columns:
    new_shots['RelatedPlayerId'] = new_shots['qualifier55']
else:
    new_shots['RelatedPlayerId'] = np.nan
new_shots['RelatedPlayerId'] = new_shots['RelatedPlayerId'].fillna(-1).astype(int)

new_shots['isGoal'] = new_shots['typeId'] == 16

# === Model input preparation ===
categorical = ['shot_type', 'bodypart', 'type_of_play']
numerical = ['isIntentionalAssist', 'RelatedPlayerId', 'isGoal',
             'danger_score_fenwick', 'danger_score_corsi', 'x', 'y',
             'distance_to_goal', 'angle_to_goal']

# One-hot encode categorical variables
X_cat = pd.get_dummies(new_shots[categorical], prefix=categorical, dummy_na=True)
X_num = new_shots[numerical]
X_new = pd.concat([X_cat, X_num], axis=1)

# === Align with model features ===
model = joblib.load('calibrated_logistic_model.pkl')
expected_features = model.feature_names_in_

# Add missing columns as zeros
for col in expected_features:
    if col not in X_new.columns:
        X_new[col] = 0

# Ensure correct column order
X_new = X_new[expected_features]

# === Predict danger scores ===
danger_scores = model.predict_proba(X_new)[:, 1]
new_shots['danger_score_logit'] = (danger_scores - danger_scores.min()) / (danger_scores.max() - danger_scores.min())

# === Prepare output with all features and key columns ===
output_columns = ['playerName', 'contestantId', 'x', 'y'] + list(X_new.columns) + [
    'danger_score_fenwick', 'danger_score_corsi', 'danger_score_logit'
]

# Keep only columns present in the dataframe or feature set
output_columns = [col for col in output_columns if col in new_shots.columns or col in X_new.columns]

# Add model features columns to new_shots DataFrame
for col in X_new.columns:
    new_shots[col] = X_new[col]

# Save to CSV
new_shots[output_columns].to_csv('Vancouver Rise 1-1 Calgary Wild with all danger scores.csv', index=False)

print(new_shots[output_columns].head())


              playerName               contestantId     x     y  \
221               M. Lee  1ete1sj7sclaoohvow5g84gt0  90.7  62.9   
321           K. Johnson  2owrsfmi12xeajx5almb1kbv8  78.2  38.7   
511           J. Sawicki  2owrsfmi12xeajx5almb1kbv8  76.9  59.7   
609         J. Longhurst  1ete1sj7sclaoohvow5g84gt0  94.0  44.2   
685  M. Dougherty Howard  2owrsfmi12xeajx5almb1kbv8  75.0  50.9   

     shot_type_blocked  shot_type_goal  shot_type_missed  shot_type_on_target  \
221              False            True             False                False   
321              False            True             False                False   
511              False           False             False                 True   
609              False           False             False                 True   
685              False           False              True                False   

     shot_type_post  shot_type_nan  ...  danger_score_fenwick  \
221           False          False  ...      