# In [23]:
import os
import pandas as pd

# Define the folder containing CSV files
folder_path = "/Users/marclambertes/Python/Poland 2024-2025/xgCSV"  # Folder with input CSV files
output_folder = "/Users/marclambertes/Python/SPTI"  # Folder to save the results

# 'Type_of_play' values that count as set pieces (corners, free kicks, throw-ins).
# Defined once here because the list does not change between files.
set_piece_types = ["FromCorner", "SetPiece", "ThrowinSetPiece"]

# Columns a CSV must contain for the per-team statistics below to be computable
required_columns = ["Type_of_play", "TeamId", "xG", "isGoal"]


def _scale_by_max(values: pd.Series) -> pd.Series:
    """Scale *values* so that the largest entry becomes 1.

    Falls back to all zeros when the maximum is zero, negative or NaN
    (for example when no team scored from a set piece in the file).
    A plain ``values / values.max()`` would evaluate 0/0 in that case and
    turn the SPTI of every team in the file into NaN.
    """
    peak = values.max()
    if not peak > 0:  # also taken for NaN, which compares False with everything
        return pd.Series(0.0, index=values.index)
    return values / peak


# Initialize a list to store results from all files (one DataFrame per file)
all_results = []

# Iterate through all CSV files in the folder; sorted() makes the processing
# order independent of the order in which the filesystem lists the entries
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".csv"):
        continue
    file_path = os.path.join(folder_path, file_name)

    # Load the dataset
    data = pd.read_csv(file_path)

    # Skip files that lack any column used below instead of failing with a
    # KeyError halfway through the run
    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        print(f"Skipping {file_name}: missing columns {missing_columns}")
        continue

    # Keep only the rows whose type of play is one of the set-piece types
    set_piece_data = data[data["Type_of_play"].isin(set_piece_types)]
    if set_piece_data.empty:
        continue  # Nothing to rank in this file

    # Group by team and calculate set piece metrics
    team_set_piece_stats = set_piece_data.groupby("TeamId").agg(
        total_xG=("xG", "sum"),  # Sum of xG over the team's set-piece rows
        total_set_pieces=("Type_of_play", "count"),  # Number of set-piece rows
        goals_from_set_pieces=("isGoal", "sum"),  # Sum of isGoal over those rows
    ).reset_index()

    # Conversion rate: goals per set-piece row. Every group holds at least one
    # row, so the denominator is never zero.
    team_set_piece_stats["conversion_rate"] = (
        team_set_piece_stats["goals_from_set_pieces"] / team_set_piece_stats["total_set_pieces"]
    )

    # Normalize each metric to the best team within this file
    team_set_piece_stats["xG_normalized"] = _scale_by_max(team_set_piece_stats["total_xG"])
    team_set_piece_stats["frequency_normalized"] = _scale_by_max(
        team_set_piece_stats["total_set_pieces"]
    )
    team_set_piece_stats["conversion_normalized"] = _scale_by_max(
        team_set_piece_stats["conversion_rate"]
    )

    # Set Piece Threat Index (SPTI): weighted sum of the normalized metrics
    # (50% xG, 30% frequency, 20% conversion)
    team_set_piece_stats["SPTI"] = (
        team_set_piece_stats["xG_normalized"] * 0.5
        + team_set_piece_stats["frequency_normalized"] * 0.3
        + team_set_piece_stats["conversion_normalized"] * 0.2
    )

    # Add a column to identify the file name
    team_set_piece_stats["file_name"] = file_name

    # Append the results to the list
    all_results.append(team_set_piece_stats)

# Combine results from all files into a single DataFrame
if all_results:  # Check if there are any results to combine
    combined_results = pd.concat(all_results, ignore_index=True)

    # Collapse the per-file rows into one row per team
    final_results = combined_results.groupby("TeamId").agg(
        average_SPTI=("SPTI", "mean"),  # Average of the per-file SPTI values
        total_xG=("total_xG", "sum"),  # Total xG across all files
        total_set_pieces=("total_set_pieces", "sum"),  # Total set pieces across all files
        total_goals_from_set_pieces=("goals_from_set_pieces", "sum"),  # Total goals from set pieces
    ).reset_index()

    # Overall conversion rate from the summed totals (not the mean of the
    # per-file rates)
    final_results["overall_conversion_rate"] = (
        final_results["total_goals_from_set_pieces"] / final_results["total_set_pieces"]
    )

    # Sort by average SPTI for ranking, strongest team first
    final_results = final_results.sort_values(by="average_SPTI", ascending=False)

    # Make sure the output folder exists; writing the workbook fails when the
    # target directory is missing
    os.makedirs(output_folder, exist_ok=True)

    # Save the final results to an Excel file in the specified output folder
    output_file_path = os.path.join(output_folder, "SPTI Poland.xlsx")
    final_results.to_excel(output_file_path, index=False)
else:
    print("No valid files found with the required columns.")