In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Location of the Excel workbook that holds the event data
file_path = '/Users/marclambertes/Python/Matches/Women/2024-2025/Leagues - Project/Pass/Czech Republic - Pass.xlsx'  # Update with the actual file path on your system

# Step 2: Read the workbook from disk, keeping only the columns used below
data = pd.read_excel(
    file_path,
    usecols=[
        'playerName', 'contestantId',
        'x', 'y', 'endX', 'endY',
        'outcome', 'typeId',
    ],
)

# Step 3: Split the events into passes (typeId == 1) and defensive actions
# (typeId == 7 or 8, e.g. tackles, interceptions).
# `passes` gets new columns below, so take an explicit copy: assigning to a
# plain slice of `data` raises pandas' SettingWithCopyWarning and the write is
# not guaranteed to land where expected.
passes = data[data['typeId'] == 1].copy()
defensive_actions = data[data['typeId'].isin([7, 8])]  # read-only below, no copy needed

# Identify the receiver of each pass with outcome == 1: the first event in
# `data` that starts exactly at the pass end location and belongs to a
# different player. Passes without such an event keep receiverName = None.
# NOTE(review): the lookup is not restricted to events that come after the
# pass, nor to the passer's own team -- confirm that is intended.
passes['receiverName'] = None
completed_passes = passes[passes['outcome'] == 1]
for idx, pass_row in completed_passes.iterrows():
    starts_at_pass_end = (data['x'] == pass_row['endX']) & (data['y'] == pass_row['endY'])
    other_player = data['playerName'] != pass_row['playerName']
    candidates = data[starts_at_pass_end & other_player]
    if not candidates.empty:
        passes.at[idx, 'receiverName'] = candidates.iloc[0]['playerName']

# Add Pressure successful column: 'no' when a receiver was found, 'yes' otherwise
passes['Pressure successful'] = passes['receiverName'].notna().map({True: 'no', False: 'yes'})

# Largest distance at which a defensive action is still matched to a pass
threshold = 10  # meters


def calculate_distance_weight(distance, max_distance=threshold):
    """Return a weight that falls linearly from 1 at distance 0 to 0 at ``max_distance``.

    Any distance at or beyond ``max_distance`` yields 0; the weight is never
    negative.
    """
    weight = 1 - distance / max_distance
    return weight if weight > 0 else 0

# Pair every pass with each defensive action by the other team that happened
# within `threshold` of the pass end location, and score each pair.
matched_actions = []

# Build the defensive rows once instead of re-running iterrows() over the
# whole frame for every pass.
defensive_rows = [def_row for _, def_row in defensive_actions.iterrows()]

for _, pass_row in passes.iterrows():
    # Depends only on the pass, so compute it once per pass: penalty
    # proportional to the absolute direction angle of the pass (arctan2 of
    # the y and x displacement).
    angle_penalty = 0.1 * np.abs(
        np.arctan2(pass_row['endY'] - pass_row['y'], pass_row['endX'] - pass_row['x'])
    )

    for def_row in defensive_rows:
        # Only actions by the opposing team are relevant
        if pass_row['contestantId'] == def_row['contestantId']:
            continue

        # Spatial distance between the defensive action and the pass end location
        distance = np.sqrt(
            (pass_row['endX'] - def_row['x']) ** 2 + (pass_row['endY'] - def_row['y']) ** 2
        )
        # Written as "not <=" so a NaN distance (missing coordinates) is skipped too
        if not distance <= threshold:
            continue

        # Danger before the action, clipped to [0, 1]; halved afterwards when
        # the defensive action has outcome == 1
        pre_action_danger = max(0, min(1, 1 - (0.02 * distance + angle_penalty)))
        post_action_danger = pre_action_danger * (0.5 if def_row['outcome'] == 1 else 1.0)

        # xDef is the danger removed by the action, scaled by proximity
        distance_weight = calculate_distance_weight(distance)
        xDef = (pre_action_danger - post_action_danger) * distance_weight

        # xPass uses exactly the same formula and clipping as pre_action_danger,
        # so reuse the value instead of recomputing it
        xPass = pre_action_danger

        matched_actions.append({
            'passer': pass_row['playerName'],
            'receiver': pass_row['receiverName'],
            'Pressure successful': pass_row['Pressure successful'],
            'defender': def_row['playerName'],
            'xPass': xPass,
            'pre_action_danger': pre_action_danger,
            'post_action_danger': post_action_danger,
            'xDef': xDef,
            'distance_weight': distance_weight,
            'pass_start': (pass_row['x'], pass_row['y']),
            'pass_end': (pass_row['endX'], pass_row['endY']),
            'defensive_action': (def_row['x'], def_row['y']),
            'distance': distance,
        })

# Step 4: Convert matched actions to DataFrame
matched_df = pd.DataFrame(matched_actions)

# An empty frame has no columns, so every step below would fail with an
# opaque KeyError; stop here with an explicit message instead.
if matched_df.empty:
    raise ValueError(
        "No pass/defensive-action pairs were matched; nothing to train on or export."
    )

# Step 5: Define the target variable ('defensive_success') for model training.
# A row is labelled 1 when its defender appears among the players who have at
# least one defensive action with outcome == 1, otherwise 0. The set of those
# players is built once rather than re-filtered for every matched row.
# NOTE(review): this labels the player, not the individual matched action, and
# 'post_action_danger' / 'xDef' are themselves derived from the action outcome,
# so the reported accuracy may be optimistic -- confirm this is the intended target.
successful_defenders = set(
    defensive_actions.loc[defensive_actions['outcome'] == 1, 'playerName'].dropna()
)
matched_df['defensive_success'] = matched_df['defender'].isin(successful_defenders).astype(int)

# Select features for the model
features = ['xPass', 'pre_action_danger', 'post_action_danger', 'distance_weight', 'distance', 'xDef']
X = matched_df[features]

# Define the target variable
y = matched_df['defensive_success']

# Step 6: Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 8: Predict on the test set and evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 9: Feature importance, most important first
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_,
}).sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(importance_df)

# Step 10: Use the model to predict the probability of defensive success for new data
new_data = {
    'xPass': 0.8,
    'pre_action_danger': 0.5,
    'post_action_danger': 0.4,
    'distance_weight': 0.7,
    'distance': 5.0,
    'xDef': 0.3,
}

# Pin the column order to the one the model was trained with
new_df = pd.DataFrame([new_data], columns=features)

# Predict the probability of defensive success (class 1). predict_proba has
# one column per class seen during training, so look the column up through
# model.classes_ instead of assuming index 1 exists: if the training split
# contained a single class, a hard-coded [:, 1] raises IndexError.
success_columns = np.flatnonzero(model.classes_ == 1)
if success_columns.size:
    predicted_prob = model.predict_proba(new_df)[:, success_columns[0]]
else:
    # Class 1 never appeared in training, so the model assigns it probability 0
    predicted_prob = np.zeros(len(new_df))
print("Predicted Probability of Defensive Success:", predicted_prob[0])

# Step 11: Save the results to a new Excel file
output_filename = 'Matched_Passes_With_Defensive_Actions_Threshold_10_Meters.xlsx'
matched_df.to_excel(output_filename, index=False)

print(f"Results saved locally at '{output_filename}'")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  passes['receiverName'] = None
