# Premier League Player Role Discovery - Phase 2: Feature Engineering

This notebook implements the feature engineering phase for the Premier League Player Role Discovery project. It builds on the cleaned dataset from Phase 1 and creates:

1. Per-90 normalized versions of relevant statistics
2. Composite indices (PI, CCI, DA, FE)
3. Winsorized features to handle outliers

The final engineered dataset will be saved for use in subsequent clustering and analysis phases.

## IMPORTANT: Run Order Instructions

To avoid errors, please run the cells in the following order:

1. Run cells 1-2: Import Libraries and Define Constants
2. Run cells 6-7: Load and Prepare Data (Section 4)
3. Run cell 14: Helper Functions (contains all function definitions)
4. Run cells 3-5: Data Type Conversion and Column Analysis
5. Run cells 8-13: Per-90 Normalization, Composite Indices, and Winsorization
6. Run cells 15-16: Save Dataset and Summary

This order ensures that all functions are defined before they're used and data is loaded before it's processed.


## 1. Import Libraries


In [None]:
import os
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Tuple, Set, Union
import warnings
from scipy import stats
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# widen pandas' display limits: every column, up to 100 rows, 1000 characters per line
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)


## 2. Define Constants and Paths


In [None]:
# input / output locations (relative paths)
INPUT_FILE = '../data/processed/player_stats_cleaned.csv'
OUTPUT_FILE = '../data/processed/player_stats_engineered.csv'
METADATA_FILE = '../data/processed/feature_metadata.json'

# winsorization bounds, expressed as quantiles
WINSOR_LOWER = 0.05  # 5th percentile
WINSOR_UPPER = 0.95  # 95th percentile

# identifier columns: never normalized or winsorized
ID_COLUMNS = ['player', 'team', 'position', 'data_source', 'nation']

# composite index definitions: every index is a weighted sum of per-90 components;
# weights are listed in the same order as the components and sum to 1
COMPOSITE_INDICES = {
    # Progressive Index: progressive passes and carries count double the dribbles
    'PI': dict(
        components=['prgp_per90', 'prgc_per90', 'succ_per90'],
        weights=[0.4, 0.4, 0.2],
    ),
    # Creative Contribution Index: shot-creating actions weighted slightly above
    # key passes and expected assists
    'CCI': dict(
        components=['kp_per90', 'sca_per90', 'xag_per90'],
        weights=[0.3, 0.4, 0.3],
    ),
    # Defensive Activity Index: tackles and interceptions weighted above
    # pressures and blocks
    'DA': dict(
        components=['tkl_per90', 'int_per90', 'pressures_per90', 'blocks_per90'],
        weights=[0.3, 0.3, 0.2, 0.2],
    ),
    # Final Execution Index: goals and xG weighted above shots on target and
    # the goals-per-shot ratio
    'FE': dict(
        components=['gls_per90', 'xg_per90', 'sot_per90', 'g_per_sh'],
        weights=[0.3, 0.3, 0.2, 0.2],
    ),
}


## 3. Helper Functions

### ⚠️ IMPORTANT: The helper functions are defined in cell 14 (further down in this notebook). Run cell 14 before running the analysis cells below.


In [None]:
# Check data types and convert string columns to numeric where appropriate
print("Converting string columns to numeric...")

# Sample a few columns to see their content
print("\nSample of column content before conversion:")
for col in ['gls', 'xg', 'prgp', 'tkl', 'minutes']:
    if col in df.columns:
        print(f"{col}: {df[col].head(3).tolist()}")

# Count numeric columns before conversion
numeric_count_before = len(df.select_dtypes(include=[np.number]).columns)
print(f"\nNumeric columns before conversion: {numeric_count_before}")

# Convert string columns to numeric
for col in df.columns:
    if col in ID_COLUMNS or df[col].dtype != 'object':
        continue

    cleaned = (
        df[col].astype(str).str.strip()
        # drop thousands separators ("1,234" -> "1234")
        .str.replace(',', '', regex=False)
        # only a cell that is nothing but dashes is a "no value" placeholder;
        # replacing every hyphen would turn "-1.5" into "01.5" and lose the sign
        .str.replace(r'^-+$', '0', regex=True)
    )
    converted = pd.to_numeric(cleaned, errors='coerce')

    # keep free-text columns as they are instead of coercing them to all-NaN
    if converted.notna().any():
        df[col] = converted
    else:
        print(f"  Skipping {col}: no numeric values found")

# Count numeric columns after conversion
numeric_count_after = len(df.select_dtypes(include=[np.number]).columns)
print(f"Numeric columns after conversion: {numeric_count_after}")
print(f"Converted {numeric_count_after - numeric_count_before} columns to numeric")


In [None]:
# Debug: Let's examine the column names to better understand what's available
print("Analyzing column names...")

# Convert all remaining text columns to numeric where possible
for col in df.columns:
    if col in ID_COLUMNS or df[col].dtype != 'object':
        continue

    cleaned = (
        df[col].astype(str).str.strip()
        # drop thousands separators ("1,234" -> "1234")
        .str.replace(',', '', regex=False)
        # only a cell that is nothing but dashes is a "no value" placeholder;
        # replacing every hyphen would strip the sign from negative numbers
        .str.replace(r'^-+$', '0', regex=True)
    )
    converted = pd.to_numeric(cleaned, errors='coerce')

    # keep free-text columns as they are instead of coercing them to all-NaN
    if converted.notna().any():
        df[col] = converted

# Substrings that identify the stat columns needed for each composite index
key_stats = {
    'Progressive': ['prgp', 'prgc', 'prog', 'succ', 'carries', 'dribble'],
    'Creative': ['kp', 'sca', 'xag', 'xa', 'assist', 'key'],
    'Defensive': ['tkl', 'int', 'press', 'block', 'def', 'tack'],
    'Finishing': ['gls', 'xg', 'sot', 'shot', 'goal', 'finish']
}

# Search for columns matching our key stats (case-insensitive substring match)
found_columns = {}
for category, terms in key_stats.items():
    matches = [
        col
        for term in terms
        for col in df.columns
        if term.lower() in col.lower()
    ]
    # a column can match several terms (e.g. 'xag' and 'xa'); list it only once
    found_columns[category] = list(dict.fromkeys(matches))

# Print the results
for category, cols in found_columns.items():
    print(f"\n{category} columns found ({len(cols)}):")
    for col in sorted(cols)[:10]:  # Show first 10 to avoid too much output
        print(f"  - {col}")
    if len(cols) > 10:
        print(f"  ... and {len(cols) - 10} more")

# Create improved column mappings based on what we found
print("\nCreating improved column mappings based on found columns...")


In [None]:
# Create improved column mappings based on what we found
# The selected raw columns are normalized to per-90 in the next step

# Define our composite indices with more flexible column options
IMPROVED_INDICES = {
    'PI': {  # Progressive Index
        'components': [],  # Will be populated based on found columns
        'weight_groups': {
            'passes': ['prgp', 'prgp_passing', 'prgp_possession'],  # 40%
            'carries': ['prgc', 'prgc_possession', 'carries'],      # 40%
            'dribbles': ['succ', 'succ_possession', 'dribble']      # 20%
        },
        'group_weights': [0.4, 0.4, 0.2]
    },
    'CCI': {  # Creative Contribution Index
        'components': [],
        'weight_groups': {
            'key_passes': ['kp', 'kp_passing', '1_per_3'],          # 30%
            'shot_creation': ['sca', 'sca_creation', 'sca90'],      # 40%
            'expected_assists': ['xag', 'xag_passing', 'xa', 'a_xag'] # 30%
        },
        'group_weights': [0.3, 0.4, 0.3]
    },
    'DA': {  # Defensive Activity Index
        'components': [],
        'weight_groups': {
            'tackles': ['tkl', 'tkl_defense', 'tklw', 'tklw_defense'],  # 30%
            'interceptions': ['int', 'int_defense'],                    # 30%
            'pressures': ['press', 'pres', 'recov'],                    # 20%
            'blocks': ['blocks', 'blocks_defense', 'blocks_pass_types'] # 20%
        },
        'group_weights': [0.3, 0.3, 0.2, 0.2]
    },
    'FE': {  # Final Execution Index
        'components': [],
        'weight_groups': {
            'goals': ['gls', 'gls_shooting', 'gls1'],                # 30%
            'expected_goals': ['xg', 'xg_shooting', 'xg1', 'npxg'],  # 30%
            'shots_on_target': ['sot', 'sot_shooting', 'sh_shooting'], # 20%
            'conversion': ['g_per_sh', 'g_per_sot', 'g_xg']          # 20%
        },
        'group_weights': [0.3, 0.3, 0.2, 0.2]
    }
}

# Weight of the group behind each selected component, per index.
# 'components' only receives an entry for groups that found a column, so the
# weights are recorded at selection time; looking them up by list position
# afterwards would hand a skipped group's weight to the next component.
matched_group_weights = {}

# For each index, find the best matching column for each component
for index_name, index_info in IMPROVED_INDICES.items():
    print(f"\nPopulating components for {index_name}:")
    matched_group_weights[index_name] = []

    # weight_groups and group_weights are parallel: the i-th weight belongs to the i-th group
    groups_with_weights = zip(index_info['weight_groups'].items(), index_info['group_weights'])

    # For each weight group, find the best matching column
    for (group_name, search_terms), group_weight in groups_with_weights:
        best_match = None

        # Try each search term in order of preference
        for term in search_terms:
            # Look for an exact match first
            if term in df.columns:
                best_match = term
                break

            # Then fall back to the first column containing the term
            partial_matches = [col for col in df.columns if term.lower() in col.lower()]
            if partial_matches:
                best_match = partial_matches[0]
                break

        if best_match:
            index_info['components'].append(best_match)
            matched_group_weights[index_name].append(group_weight)
            print(f"  Found {group_name}: {best_match}")
        else:
            print(f"  No match found for {group_name}")

    # Print summary
    print(f"  Total components found: {len(index_info['components'])}/{len(index_info['weight_groups'])}")

# Update our composite indices with the improved components and weights
for index_name, index_info in IMPROVED_INDICES.items():
    if len(index_info['components']) >= 2:  # Require at least 2 components
        # Re-scale the weights of the groups that were found so they sum to 1
        raw_weights = matched_group_weights[index_name]
        weight_total = sum(raw_weights)
        weights = [w / weight_total for w in raw_weights]

        # Update the COMPOSITE_INDICES dictionary; components are stored under
        # the names their per-90 versions will carry (<column>_per90)
        COMPOSITE_INDICES[index_name] = {
            'components': [f"{comp}_per90" for comp in index_info['components']],
            'weights': weights
        }

        print(f"Updated {index_name} with {len(COMPOSITE_INDICES[index_name]['components'])} components")


In [None]:
# Enhanced per-90 normalization that ensures all needed columns are properly converted
def enhanced_normalize_per90(df: pd.DataFrame, minutes_col: str = "minutes",
                           target_columns: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Enhanced function to normalize specific columns to per-90 minutes values

    each normalized column is added as "<column>_per90" = value * 90 / minutes;
    rows whose minutes are missing, zero or negative get NaN instead of inf

    args:
        df: dataframe with player statistics
        minutes_col: name of column containing minutes played
        target_columns: specific columns to normalize (if None or empty, all
            numeric columns except ID_COLUMNS, the minutes column and columns
            that already look like rates/percentages are normalized)

    returns:
        copy of the dataframe with additional per-90 normalized columns

    raises:
        ValueError: if minutes_col is not a column of df
    """
    def _to_numeric(series: pd.Series) -> pd.Series:
        """parse text as numbers: drop thousands separators, lone dash -> 0, keep minus signs"""
        cleaned = (
            series.astype(str).str.strip()
            .str.replace(',', '', regex=False)
            # only a cell that is nothing but dashes is a placeholder;
            # replacing every hyphen would turn "-1" into "01"
            .str.replace(r'^-+$', '0', regex=True)
        )
        return pd.to_numeric(cleaned, errors='coerce')

    # create a copy to avoid modifying the original dataframe
    result_df = df.copy()

    # ensure minutes column exists and is numeric
    if minutes_col not in result_df.columns:
        raise ValueError(f"minutes column '{minutes_col}' not found in dataframe")

    result_df[minutes_col] = _to_numeric(result_df[minutes_col])

    # If specific target columns are provided, ensure they're all numeric
    if target_columns:
        print(f"Converting {len(target_columns)} target columns to numeric...")
        for col in target_columns:
            if col in result_df.columns and result_df[col].dtype == 'object':
                result_df[col] = _to_numeric(result_df[col])

    # Determine which columns to normalize
    if target_columns:
        # Only normalize specified columns that exist
        cols_to_normalize = [col for col in target_columns if col in result_df.columns]
    else:
        # Normalize all numeric columns except ID_COLUMNS and minutes
        exclude_cols = ID_COLUMNS + [minutes_col]
        # Also exclude columns that are already per90 or percentages
        per90_pattern = r'.*per90.*|.*_90.*|.*pct.*|.*_per_.*|.*ratio.*'
        exclude_cols.extend(result_df.filter(regex=per90_pattern).columns.tolist())

        numeric_cols = result_df.select_dtypes(include=[np.number]).columns
        cols_to_normalize = [col for col in numeric_cols if col not in exclude_cols]

    print(f"Normalizing {len(cols_to_normalize)} columns to per-90...")

    # a per-90 rate is undefined without playing time: mask non-positive minutes
    # so the division yields NaN rather than +/-inf
    playable_minutes = result_df[minutes_col].where(result_df[minutes_col] > 0)

    # normalize each column
    for col in cols_to_normalize:
        result_df[f"{col}_per90"] = result_df[col] * 90 / playable_minutes

    return result_df

# Collect every raw column selected as a composite index component
columns_for_indices = [
    component
    for index_spec in IMPROVED_INDICES.values()
    for component in index_spec['components']
]

# Normalize just those columns to per-90
print("Applying enhanced per-90 normalization...")
df_per90 = enhanced_normalize_per90(df, target_columns=columns_for_indices)

# Report how many columns the normalization added
original_count, new_count = len(df.columns), len(df_per90.columns)
print(f"Added {new_count - original_count} per-90 normalized columns")

# Preview the identifying columns next to (at most) five of the new per-90 columns
sample_cols = ['player', 'team', 'position', 'minutes']
candidate_names = [f"{base}_per90" for base in columns_for_indices]
per90_cols = [name for name in candidate_names if name in df_per90.columns][:5]
print("\nSample of per-90 normalized data for index components:")
if not per90_cols:
    print("No per-90 columns were created for index components")
else:
    print(df_per90[sample_cols + per90_cols].head())


In [None]:
# Build the composite indices from the per-90 normalized columns
print("\nCalculating composite indices...")

# Helper that resolves raw column names to their per-90 counterparts
def find_per90_columns(df: pd.DataFrame, base_columns: List[str]) -> Dict[str, str]:
    """
    map each base column to its "<base>_per90" column

    base columns whose per-90 version is not present in df are left out
    """
    return {
        base: f"{base}_per90"
        for base in base_columns
        if f"{base}_per90" in df.columns
    }

# For each index, calculate using the per90 columns
def _weights_by_component(index_info: Dict) -> Dict[str, float]:
    """
    map each selected component to the weight of the group it was selected for

    'components' holds one column per *matched* weight group, in group order, and
    every selected column contains one of its group's search terms. walking the
    groups in order and consuming components as they match therefore skips groups
    that found no column, instead of shifting their weight onto the next component.
    """
    pending = list(index_info['components'])
    weight_of = {}
    groups = zip(index_info['weight_groups'].values(), index_info['group_weights'])
    for search_terms, group_weight in groups:
        if pending and any(term.lower() in pending[0].lower() for term in search_terms):
            weight_of.setdefault(pending.pop(0), group_weight)

    # components that could not be traced to a group keep the positional weight
    for position, component in enumerate(index_info['components']):
        if component not in weight_of:
            weight_of[component] = index_info['group_weights'][position]
    return weight_of


indices_calculated = 0
for index_name, index_info in IMPROVED_INDICES.items():
    if len(index_info['components']) < 2:  # Require at least 2 components
        continue

    # Find per90 versions of our components
    per90_mapping = find_per90_columns(df_per90, index_info['components'])

    if len(per90_mapping) < 2:  # Require at least 2 per90 columns
        print(f"Not enough per90 columns found for {index_name}, skipping")
        continue

    print(f"\nCalculating {index_name} using {len(per90_mapping)} components:")

    # Get the per90 columns and the weight of the group each one represents
    per90_columns = list(per90_mapping.values())
    weight_of = _weights_by_component(index_info)
    weights = []
    for base_col, per90_col in per90_mapping.items():
        weight = weight_of[base_col]
        weights.append(weight)
        print(f"  - {per90_col}: weight={weight:.2f}")

    # Normalize weights to sum to 1 (groups may be missing)
    weight_total = sum(weights)
    weights = [w / weight_total for w in weights]

    try:
        # Calculate the composite index
        df_per90[index_name] = calculate_composite_index(
            df_per90, per90_columns, weights, index_name
        )
        indices_calculated += 1
        print(f"Successfully calculated {index_name}")
    except Exception as e:
        # best effort: report the failure and carry on with the remaining indices
        print(f"Error calculating {index_name}: {e}")

# Display summary statistics for the composite indices
print(f"\n{indices_calculated} composite indices calculated")
composite_indices_cols = [col for col in df_per90.columns if col in IMPROVED_INDICES.keys()]
if composite_indices_cols:
    print("\nSummary statistics for composite indices:")
    print(df_per90[composite_indices_cols].describe())
else:
    print("No composite indices were calculated")


# ⚠️ IMPORTANT: Run cell 14 first! ⚠️
# This cell requires the calculate_composite_index function to be defined


In [None]:
# Limit the influence of outliers by winsorizing the numeric features
print(f"\nApplying winsorization at {WINSOR_LOWER*100}th and {WINSOR_UPPER*100}th percentiles...")

# Identifier columns and the composite indices are left untouched
exclude_cols = ID_COLUMNS + composite_indices_cols

df_winsorized = winsorize_df(
    df_per90,
    lower=WINSOR_LOWER,
    upper=WINSOR_UPPER,
    exclude_cols=exclude_cols,
)

# Show what winsorization did to a handful of columns
print("\nEffect of winsorization on selected columns:")

# Inspect up to five columns, preferring per90 features when there are any
numeric_cols = df_per90.select_dtypes(include=[np.number]).columns
eligible_cols = [col for col in numeric_cols if col not in exclude_cols]
per90_cols = [col for col in eligible_cols if col.endswith('_per90')]
sample_numeric_cols = (per90_cols or eligible_cols)[:5]

# Print min / max / mean before and after for each sampled column
report_percentiles = [0.05, 0.25, 0.5, 0.75, 0.95]
for col in sample_numeric_cols:
    before = df_per90[col].describe(report_percentiles)
    after = df_winsorized[col].describe(report_percentiles)

    print(f"\n{col}:")
    print(f"  Before - min: {before['min']:.2f}, max: {before['max']:.2f}, mean: {before['mean']:.2f}")
    print(f"  After  - min: {after['min']:.2f}, max: {after['max']:.2f}, mean: {after['mean']:.2f}")

# The winsorized dataframe is the final dataset
df_final = df_winsorized


# ⚠️ IMPORTANT EXECUTION ORDER ⚠️

To avoid errors like "name 'winsorize_df' is not defined" or "name 'calculate_composite_index' is not defined", please run cell 14 (the large cell with all helper function definitions) BEFORE running this cell.

The correct execution order is:
1. Run cells 1-2: Import Libraries and Define Constants
2. Run cells 6-7: Load and Prepare Data (Section 4) 
3. Run cell 14: Helper Functions (contains all function definitions)
4. Then continue with the analysis cells


In [None]:
# Ensure output directory exists
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# Save the engineered dataset
print(f"\nSaving engineered dataset to {OUTPUT_FILE}...")
df_final.to_csv(OUTPUT_FILE, index=False)
print(f"Saved {df_final.shape[0]} rows and {df_final.shape[1]} columns")


def _weights_by_component(index_info: Dict) -> Dict[str, float]:
    """
    map each selected component to the weight of the group it was selected for

    'components' holds one column per *matched* weight group, in group order, and
    every selected column contains one of its group's search terms. walking the
    groups in order and consuming components as they match therefore skips groups
    that found no column, instead of shifting their weight onto the next component.
    """
    pending = list(index_info['components'])
    weight_of = {}
    groups = zip(index_info['weight_groups'].values(), index_info['group_weights'])
    for search_terms, group_weight in groups:
        if pending and any(term.lower() in pending[0].lower() for term in search_terms):
            weight_of.setdefault(pending.pop(0), group_weight)

    # components that could not be traced to a group keep the positional weight
    for position, component in enumerate(index_info['components']):
        if component not in weight_of:
            weight_of[component] = index_info['group_weights'][position]
    return weight_of


# Generate and save metadata
print(f"\nGenerating metadata...")
metadata = {
    "num_rows": len(df_final),
    "num_columns": len(df_final.columns),
    "feature_groups": {
        "original": [col for col in df_final.columns if col in original_columns],
        "per90": [col for col in df_final.columns if col.endswith('_per90')],
        "composite_indices": composite_indices_cols
    },
    "composite_indices_details": {}
}

# Human-readable description of each composite index
index_descriptions = {
    'PI': "Progressive Index - measures a player's ball progression abilities",
    'CCI': "Creative Contribution Index - measures a player's creativity and chance creation",
    'DA': "Defensive Activity Index - measures a player's defensive contributions",
    'FE': "Final Execution Index - measures a player's finishing and goal threat"
}

# Add details about the composite indices
for index_name in composite_indices_cols:
    if index_name not in df_final.columns:
        continue

    index_info = IMPROVED_INDICES[index_name]

    # per-90 components that exist in the final dataset, with the weight of the
    # group each one represents
    per90_mapping = find_per90_columns(df_final, index_info['components'])
    weight_of = _weights_by_component(index_info)
    components = list(per90_mapping.values())
    weights = [weight_of[base_col] for base_col in per90_mapping]

    # Normalize weights so they sum to 1
    if weights:
        weight_total = sum(weights)
        weights = [w / weight_total for w in weights]

    metadata['composite_indices_details'][index_name] = {
        'description': index_descriptions.get(index_name, "Composite index"),
        'components': components,
        'weights': [float(w) for w in weights],
        # cast numpy scalars to plain floats so json can serialize them
        'min_value': float(df_final[index_name].min()),
        'max_value': float(df_final[index_name].max()),
        'mean_value': float(df_final[index_name].mean()),
        'std_value': float(df_final[index_name].std())
    }

# Save metadata
print(f"Saving metadata to {METADATA_FILE}...")
os.makedirs(os.path.dirname(METADATA_FILE), exist_ok=True)
with open(METADATA_FILE, 'w') as f:
    json.dump(metadata, f, indent=2)
print("Metadata saved successfully")


In [None]:
# Recap of what this phase produced
per90_feature_count = sum(1 for col in df_final.columns if col.endswith('_per90'))
summary_lines = [
    "\nFeature Engineering Summary:",
    f"- Processed {df_final.shape[0]} player records",
    f"- Created {per90_feature_count} per-90 normalized features",
    f"- Calculated {len(composite_indices_cols)} composite indices",
    f"- Applied winsorization at {WINSOR_LOWER*100}th and {WINSOR_UPPER*100}th percentiles",
    f"- Final dataset has {df_final.shape[1]} columns",
]

# Outline of the following phase
next_step_lines = [
    "\nNext Steps (Phase 3):",
    "- Dimensionality Reduction with PCA",
    "- Clustering to identify player roles",
    "- Interpretation and visualization of clusters",
]

for line in summary_lines + next_step_lines:
    print(line)


In [None]:
def normalize_per90(df: pd.DataFrame, minutes_col: str = "minutes", 
                  exclude_cols: Optional[List[str]] = None) -> pd.DataFrame:
    """
    normalize raw counting stats to per-90 minutes values

    every eligible numeric column gets a companion "<column>_per90" column equal
    to value * 90 / minutes; rows whose minutes are missing, zero or negative
    get NaN instead of inf

    args:
        df: dataframe with player statistics
        minutes_col: name of column containing minutes played
        exclude_cols: list of columns to exclude from normalization
            (defaults to ID_COLUMNS); the minutes column and columns that
            already look like rates/percentages are always excluded

    returns:
        copy of the dataframe with additional per-90 normalized columns

    raises:
        ValueError: if minutes_col is not a column of df
    """
    # create a copy to avoid modifying the original dataframe
    result_df = df.copy()

    # ensure minutes column exists and is numeric
    if minutes_col not in result_df.columns:
        raise ValueError(f"minutes column '{minutes_col}' not found in dataframe")

    # convert minutes to numeric: drop thousands separators and treat a cell
    # made only of dashes as 0 (replacing every hyphen would mangle other text)
    minutes_text = (
        result_df[minutes_col].astype(str).str.strip()
        .str.replace(',', '', regex=False)
        .str.replace(r'^-+$', '0', regex=True)
    )
    result_df[minutes_col] = pd.to_numeric(minutes_text, errors='coerce')

    # work on a private copy of the exclusion list so the caller's list is not mutated
    exclude_cols = ID_COLUMNS.copy() if exclude_cols is None else exclude_cols.copy()

    # add minutes column to exclude list if not already there
    if minutes_col not in exclude_cols:
        exclude_cols.append(minutes_col)

    # identify columns that are already per90 or percentages
    per90_pattern = r'.*per90.*|.*_90.*|.*pct.*|.*_per_.*|.*ratio.*'
    exclude_cols.extend(result_df.filter(regex=per90_pattern).columns.tolist())

    # get numeric columns that are not in exclude_cols
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    cols_to_normalize = [col for col in numeric_cols if col not in exclude_cols]

    # a per-90 rate is undefined without playing time: mask non-positive minutes
    # so the division yields NaN rather than +/-inf
    playable_minutes = result_df[minutes_col].where(result_df[minutes_col] > 0)

    # normalize each column
    for col in cols_to_normalize:
        result_df[f"{col}_per90"] = result_df[col] * 90 / playable_minutes

    return result_df

def calculate_composite_index(df: pd.DataFrame, features: List[str], 
                             weights: List[float], name: str) -> pd.Series:
    """
    build a composite index as the weighted sum of standardized features

    args:
        df: dataframe with player statistics
        features: columns of df that make up the index
        weights: one weight per feature, in the same order (must sum to 1)
        name: name of the composite index (not used in the calculation)

    returns:
        series, indexed like df, holding the composite index

    raises:
        ValueError: if features and weights differ in length, the weights do
            not sum to 1 (tolerance 0.001), or a feature is not a column of df
    """
    # input checks, in order: lengths, weight total, column presence
    if len(weights) != len(features):
        raise ValueError("features and weights must have the same length")

    weight_total = sum(weights)
    if abs(weight_total - 1.0) > 0.001:
        raise ValueError("weights must sum to 1.0")

    missing_features = [column for column in features if column not in df.columns]
    if missing_features:
        raise ValueError(f"features {missing_features} not found in dataframe")

    # standardize a copy of the component columns with StandardScaler
    component_values = df[features].copy()
    standardized = pd.DataFrame(
        StandardScaler().fit_transform(component_values),
        columns=features,
        index=df.index,
    )

    # accumulate the weighted standardized components one at a time
    composite_index = pd.Series(0, index=df.index)
    for column, share in zip(features, weights):
        composite_index += standardized[column] * share

    return composite_index

def winsorize_df(df: pd.DataFrame, lower: float = 0.05, upper: float = 0.95,
                exclude_cols: Optional[List[str]] = None) -> pd.DataFrame:
    """
    apply winsorization to handle outliers in numeric columns

    in every eligible numeric column the lowest `lower` fraction and the
    highest `1 - upper` fraction of the observed (non-missing) values are
    replaced by the nearest remaining value; missing values stay missing and
    do not count towards those fractions

    args:
        df: dataframe with player statistics
        lower: lower percentile for winsorization (default: 0.05)
        upper: upper percentile for winsorization (default: 0.95)
        exclude_cols: list of columns to exclude from winsorization
            (defaults to ID_COLUMNS)

    returns:
        copy of the dataframe with winsorized values
    """
    # create a copy to avoid modifying the original dataframe
    result_df = df.copy()

    # default exclude columns if none provided
    if exclude_cols is None:
        exclude_cols = ID_COLUMNS.copy()
    excluded = set(exclude_cols)

    # fraction clipped at the top; rounded because 1 - upper carries float error
    # (1 - 0.9 == 0.0999...) that would truncate to one observation too few
    limits = (lower, round(1 - upper, 12))

    # winsorize each numeric column that is not excluded
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col in excluded:
            continue

        # winsorize only the observed values: NaN sorts last in mstats.winsorize,
        # so passing it through would overwrite the NaNs with the column maximum
        # and leave the real top values unclipped
        observed = result_df[col].notna()
        if not observed.any():
            continue  # column is entirely NaN

        clipped = stats.mstats.winsorize(
            result_df.loc[observed, col].to_numpy(), limits=limits
        )
        result_df.loc[observed, col] = np.asarray(clipped)

    return result_df

def generate_feature_metadata(df: pd.DataFrame, original_cols: List[str]) -> Dict:
    """
    describe the features of an engineered dataframe

    args:
        df: dataframe with player statistics
        original_cols: list of column names from the original dataframe

    returns:
        dictionary with the row/column counts, the columns grouped into
        original and per-90 features, the fixed list of composite index names,
        and the description, components and weights of every entry in
        COMPOSITE_INDICES
    """
    # human-readable description of each supported composite index
    descriptions = {
        "PI": "Progressive Index - measures a player's ball progression abilities",
        "CCI": "Creative Contribution Index - measures a player's creativity and chance creation",
        "DA": "Defensive Activity Index - measures a player's defensive contributions",
        "FE": "Final Execution Index - measures a player's finishing and goal threat"
    }

    # split the columns into those carried over from the input and the per-90 ones
    carried_over = [col for col in df.columns if col in original_cols]
    per90_features = [col for col in df.columns if col.endswith('_per90')]

    # one entry per configured index; an index without a description raises KeyError
    index_details = {}
    for index_name, spec in COMPOSITE_INDICES.items():
        index_details[index_name] = {
            "description": descriptions[index_name],
            "components": spec["components"],
            "weights": spec["weights"]
        }

    return {
        "num_rows": len(df),
        "num_columns": len(df.columns),
        "feature_groups": {
            "original": carried_over,
            "per90": per90_features,
            "composite_indices": ["PI", "CCI", "DA", "FE"]
        },
        "composite_indices": index_details
    }


## 4. Load and Prepare Data


In [None]:
# read the cleaned Phase 1 dataset; report a failed load and re-raise so the run stops
print(f"loading data from {INPUT_FILE}")
try:
    df = pd.read_csv(INPUT_FILE)
except Exception as load_error:
    print(f"error loading data: {load_error}")
    raise
else:
    row_count, column_count = df.shape
    print(f"loaded {row_count} rows and {column_count} columns")


In [None]:
# drop exact duplicate rows, if there are any
duplicate_count = df.duplicated().sum()
print(f"found {duplicate_count} duplicate rows")

if duplicate_count:
    print("removing duplicate rows...")
    df = df.drop_duplicates()
    print(f"dataset now has {df.shape[0]} rows")

# every row must have these fields; count the gaps per field
key_fields = ['player', 'team', 'position', 'minutes']
missing_key_fields = df[key_fields].isna().sum()
print("\nmissing values in key fields:")
print(missing_key_fields)

# drop rows that lack any of the key fields
if missing_key_fields.any():
    print("removing rows with missing key fields...")
    df = df.dropna(subset=key_fields)
    print(f"dataset now has {df.shape[0]} rows")

# remember the input columns so the metadata can tell them from engineered ones
original_columns = list(df.columns)

# overview of dtypes and non-null counts
print("\ndataset info:")
df.info()


## 5. Per-90 Normalization


In [None]:
# add a per-90 version of the raw counting stats
print("applying per-90 normalization...")
df_per90 = normalize_per90(df, minutes_col="minutes")

# report how many columns the normalization added
original_count = len(df.columns)
new_count = len(df_per90.columns)
added_count = new_count - original_count
print(f"added {added_count} per-90 normalized columns")

# preview the identifying columns next to the first five per-90 columns
sample_cols = ['player', 'team', 'position', 'minutes']
all_per90_cols = [col for col in df_per90.columns if col.endswith('_per90')]
per90_cols = all_per90_cols[:5]
print("\nsample of per-90 normalized data:")
preview = df_per90[sample_cols + per90_cols].head()
print(preview)


## 6. Calculate Composite Indices


In [None]:
# candidate column names for every composite index component, in order of
# preference; the dataset may carry a stat under any of the listed names
_progressive_names = {
    'prgp_per90': ['prgp_per90', 'prgp_passing_per90'],  # progressive passes
    'prgc_per90': ['prgc_per90', 'prgc_possession_per90'],  # progressive carries
    'succ_per90': ['succ_per90', 'succ_possession_per90'],  # successful dribbles
}
_creative_names = {
    'kp_per90': ['kp_per90', 'kp_passing_per90'],  # key passes
    'sca_per90': ['sca_per90', 'sca_creation_per90'],  # shot-creating actions
    'xag_per90': ['xag_per90', 'xag_passing_per90', 'xag1_per90'],  # expected assists
}
_defensive_names = {
    'tkl_per90': ['tkl_per90', 'tkl_defense_per90'],  # tackles
    'int_per90': ['int_per90', 'int_defense_per90'],  # interceptions
    'pressures_per90': ['pressures_per90', 'press_per90', 'pres_per90'],  # pressures
    'blocks_per90': ['blocks_per90', 'blocks_defense_per90'],  # blocks
}
_finishing_names = {
    'gls_per90': ['gls_per90', 'gls_shooting_per90', 'gls1_per90'],  # goals
    'xg_per90': ['xg_per90', 'xg_shooting_per90', 'xg1_per90'],  # expected goals
    'sot_per90': ['sot_per90', 'sot_shooting_per90'],  # shots on target
    'g_per_sh': ['g_per_sh', 'g_per_sh_shooting'],  # goal per shot ratio
}

# merged in index order: progressive, creative, defensive, final execution
column_mapping = {
    **_progressive_names,
    **_creative_names,
    **_defensive_names,
    **_finishing_names,
}


In [None]:
# resolve a component to whichever of its candidate names the dataframe has
def find_column(df: pd.DataFrame, possible_names: List[str]) -> Optional[str]:
    """return the first candidate that is a column of df, or None if none is"""
    return next((name for name in possible_names if name in df.columns), None)

# check which components are available in our dataset
print("checking for composite index components...")
available_components = {}  # index name -> resolved dataframe column names
# index name -> configured component name behind each resolved column (same order).
# kept so weights are looked up by the configured name; deriving that name from the
# resolved column by stripping substrings fails for aliases such as 'press_per90'
# and silently falls back to a weight of 1.0
component_sources = {}

for index_name, index_info in COMPOSITE_INDICES.items():
    available_components[index_name] = []
    component_sources[index_name] = []

    for component in index_info['components']:
        # get possible column names for this component (defaults to the component itself)
        possible_names = column_mapping.get(component, [component])

        # find the actual column name in the dataframe
        actual_col = find_column(df_per90, possible_names)

        if actual_col is not None:
            available_components[index_name].append(actual_col)
            component_sources[index_name].append(component)
            print(f"found {component} as {actual_col}")
        else:
            print(f"warning: component {component} not found in dataset")

# update composite indices with available components
updated_indices = {}
for index_name, components in available_components.items():
    if len(components) < 2:  # require at least 2 components
        print(f"warning: not enough components found for {index_name}, skipping")
        continue

    # map each configured component to its configured weight
    weight_map = dict(zip(
        COMPOSITE_INDICES[index_name]['components'],
        COMPOSITE_INDICES[index_name]['weights'],
    ))

    # weights for the components that were found, looked up by configured name;
    # a component without a configured weight keeps the previous default of 1.0
    raw_weights = [weight_map.get(source, 1.0) for source in component_sources[index_name]]

    # normalize weights to sum to 1 (undefined when they sum to zero)
    total_weight = sum(raw_weights)
    if total_weight == 0:
        print(f"warning: weights for {index_name} sum to zero, skipping")
        continue
    weights = [w / total_weight for w in raw_weights]

    updated_indices[index_name] = {
        'components': components,
        'weights': weights
    }

print("\nupdated composite indices:")
for index_name, index_info in updated_indices.items():
    print(f"{index_name}: {len(index_info['components'])} components")
    for comp, weight in zip(index_info['components'], index_info['weights']):
        print(f"  - {comp}: {weight:.2f}")


In [None]:
# add one dataframe column per resolved composite index
print("\ncalculating composite indices...")
for index_name, index_info in updated_indices.items():
    components, weights = index_info['components'], index_info['weights']

    try:
        # delegate the computation to the helper and store the result under the index name
        df_per90[index_name] = calculate_composite_index(df_per90, components, weights, index_name)
        print(f"calculated {index_name} using {len(components)} components")
    except Exception as e:
        # best effort: report the failure and move on to the next index
        print(f"error calculating {index_name}: {e}")

# describe whichever composite index columns ended up in the dataframe
print("\nsummary statistics for composite indices:")
composite_indices_cols = [col for col in df_per90.columns if col in updated_indices]
if composite_indices_cols:
    composite_summary = df_per90[composite_indices_cols].describe()
    print(composite_summary)


## 7. Apply Winsorization


In [None]:
# run the winsorization helper with the configured percentile bounds
print(f"applying winsorization at {WINSOR_LOWER*100}th and {WINSOR_UPPER*100}th percentiles...")

# ID columns and composite indices are passed to the helper as exclusions
exclude_cols = ID_COLUMNS + composite_indices_cols

df_winsorized = winsorize_df(
    df_per90,
    lower=WINSOR_LOWER,
    upper=WINSOR_UPPER,
    exclude_cols=exclude_cols,
)

# spot-check the effect on the first five non-excluded numeric columns
print("\neffect of winsorization on selected columns:")

numeric_cols = df_per90.select_dtypes(include=[np.number]).columns
sample_numeric_cols = list(filter(lambda name: name not in exclude_cols, numeric_cols))[:5]

# same percentile grid for the before and after summaries
percentiles = [0.05, 0.25, 0.5, 0.75, 0.95]
for col in sample_numeric_cols:
    before = df_per90[col].describe(percentiles)
    after = df_winsorized[col].describe(percentiles)

    print(f"\n{col}:")
    # "After " carries a trailing space so both report lines stay aligned
    for label, summary in (("Before", before), ("After ", after)):
        print(f"  {label} - min: {summary['min']:.2f}, max: {summary['max']:.2f}, mean: {summary['mean']:.2f}")

# the winsorized dataframe is the final dataset (same object, not a copy)
df_final = df_winsorized


## 8. Save Engineered Dataset


In [None]:
# make sure the destination folder is present before writing the csv
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# write the engineered dataset without the index column
print(f"saving engineered dataset to {OUTPUT_FILE}...")
df_final.to_csv(OUTPUT_FILE, index=False)
print(f"saved {df_final.shape[0]} rows and {df_final.shape[1]} columns")

# build the feature metadata via the helper
print("\ngenerating metadata...")
metadata = generate_feature_metadata(df_final, original_columns)

# attach per-index details for every composite index present in the final dataset;
# values are cast to plain floats before being handed to json
metadata['composite_indices_details'] = {
    index_name: {
        'components': index_info['components'],
        'weights': [float(w) for w in index_info['weights']],
        'min_value': float(df_final[index_name].min()),
        'max_value': float(df_final[index_name].max()),
        'mean_value': float(df_final[index_name].mean()),
        'std_value': float(df_final[index_name].std())
    }
    for index_name, index_info in updated_indices.items()
    if index_name in df_final.columns
}

# write the metadata next to the dataset
print(f"saving metadata to {METADATA_FILE}...")
os.makedirs(os.path.dirname(METADATA_FILE), exist_ok=True)
with open(METADATA_FILE, 'w') as f:
    json.dump(metadata, f, indent=2)
print("metadata saved successfully")


## 9. Summary and Next Steps


In [None]:
# recap what the feature engineering run produced
print("Feature Engineering Summary:")
print(f"- Processed {df_final.shape[0]} player records")
print(f"- Created {sum(1 for col in df_final.columns if col.endswith('_per90'))} per-90 normalized features")
print(f"- Calculated {sum(1 for col in df_final.columns if col in updated_indices)} composite indices")
print(f"- Applied winsorization at {WINSOR_LOWER*100}th and {WINSOR_UPPER*100}th percentiles")
print(f"- Final dataset has {df_final.shape[1]} columns")

# outline of the following phase
print("\nNext Steps (Phase 3):")
for step in (
    "- Dimensionality Reduction with PCA",
    "- Clustering to identify player roles",
    "- Interpretation and visualization of clusters",
):
    print(step)
