# Premier League Player Role Discovery - Phase 2: Feature Engineering

This notebook implements the feature engineering phase for the Premier League Player Role Discovery project. It builds on the cleaned dataset from Phase 1 and creates:

1. Per-90 normalized versions of relevant statistics
2. Composite indices (PI, CCI, DA, FE)
3. Winsorized features to handle outliers

The final engineered dataset will be saved for use in subsequent clustering and analysis phases.



## 1. Import Libraries


In [14]:
import os
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Tuple, Set, Union
import warnings
from scipy import stats
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# set display options for better dataframe viewing
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)


## 2. Helper Functions

In [15]:
## Helper Functions

def calculate_composite_index(df: pd.DataFrame, features: List[str], 
                             weights: List[float], name: str) -> pd.Series:
    """
    Calculate a composite index from multiple features using weights
    
    Args:
        df: dataframe with player statistics
        features: list of feature names to include in the index
        weights: list of weights for each feature (must sum to 1)
        name: name of the composite index
        
    Returns:
        series with the calculated composite index
    """
    # Validate inputs
    if len(features) != len(weights):
        raise ValueError("Features and weights must have the same length")
    
    if abs(sum(weights) - 1.0) > 0.001:
        raise ValueError("Weights must sum to 1.0")
    
    # Check if all features exist in the dataframe
    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        raise ValueError(f"Features {missing_features} not found in dataframe")
    
    # Create a copy of the features for standardization
    features_df = df[features].copy()
    
    # Standardize each feature
    scaler = StandardScaler()
    features_standardized = scaler.fit_transform(features_df)
    
    # Convert back to dataframe for easier handling
    features_std_df = pd.DataFrame(features_standardized, columns=features, index=df.index)
    
    # Calculate weighted sum
    composite_index = pd.Series(0, index=df.index)
    for feature, weight in zip(features, weights):
        composite_index += features_std_df[feature] * weight
    
    return composite_index

def winsorize_df(df: pd.DataFrame, lower: float = 0.05, upper: float = 0.95,
                exclude_cols: List[str] = None) -> pd.DataFrame:
    """
    Apply winsorization to handle outliers in numeric columns
    
    Args:
        df: dataframe with player statistics
        lower: lower percentile for winsorization (default: 0.05)
        upper: upper percentile for winsorization (default: 0.95)
        exclude_cols: list of columns to exclude from winsorization
        
    Returns:
        dataframe with winsorized values
    """
    # Create a copy to avoid modifying the original dataframe
    result_df = df.copy()
    
    # Default exclude columns if none provided
    if exclude_cols is None:
        # Use a default list of common ID columns instead of global variable
        default_exclude = ['player', 'team', 'position', 'data_source', 'nation']
        exclude_cols = default_exclude
    
    # Get numeric columns that are not in exclude_cols
    numeric_cols = result_df.select_dtypes(include=[np.number]).columns
    cols_to_winsorize = [col for col in numeric_cols if col not in exclude_cols]
    
    # Track columns that were winsorized
    winsorized_count = 0
    
    # Winsorize each column
    for col in cols_to_winsorize:
        # Skip columns with all NaN values
        if result_df[col].isna().all():
            continue
        
        try:    
            # Winsorize the column
            result_df[col] = stats.mstats.winsorize(result_df[col], limits=[lower, 1-upper])
            winsorized_count += 1
        except Exception as e:
            print(f"Warning: Could not winsorize column '{col}': {str(e)}")
    
    print(f"Winsorized {winsorized_count} columns")
    return result_df

## 3. Define Paths

In [16]:
# define paths
INPUT_FILE = '../data/processed/player_stats_cleaned.csv'
OUTPUT_FILE = '../data/processed/player_stats_engineered.csv'
METADATA_FILE = '../data/processed/feature_metadata.json'

# define constants for feature engineering
WINSOR_LOWER = 0.05  # 5th percentile
WINSOR_UPPER = 0.95  # 95th percentile

# columns that should not be normalized or winsorized
ID_COLUMNS = ['player', 'team', 'position', 'data_source', 'nation']

# columns that should not be converted to numeric (including columns with mixed data types)
NON_NUMERIC_COLUMNS = ID_COLUMNS + [
    'nation_misc', 'position_misc', 'team_misc',
    'nation_passing', 'position_passing', 'team_passing',
    'nation_playtime', 'position_playtime', 'team_playtime',
    'nation_defense', 'position_defense', 'team_defense',
    'nation_pass_types', 'position_pass_types', 'team_pass_types',
    'nation_creation', 'position_creation', 'team_creation',
    'nation_possession', 'position_possession', 'team_possession',
    'nation_shooting', 'position_shooting', 'team_shooting'
]

# define composite index components and weights
COMPOSITE_INDICES = {
    'PI': {  # Progressive Index
        'components': ['prgp_per90', 'prgc_per90', 'succ_per90'],
        'weights': [0.4, 0.4, 0.2]  # progressive passes and carries weighted more
    },
    'CCI': {  # Creative Contribution Index
        'components': ['kp_per90', 'sca_per90', 'xag_per90'],
        'weights': [0.3, 0.4, 0.3]  # equal weights with slight emphasis on shot-creating actions
    },
    'DA': {  # Defensive Activity Index
        'components': ['tkl_per90', 'int_per90', 'pressures_per90', 'blocks_per90'],
        'weights': [0.3, 0.3, 0.2, 0.2]  # tackles and interceptions weighted more
    },
    'FE': {  # Final Execution Index
        'components': ['gls_per90', 'xg_per90', 'sot_per90', 'g_per_sh'],
        'weights': [0.3, 0.3, 0.2, 0.2]  # goals and xG weighted more
    }
}


## 4. Load and Prepare Data


### Load cleaned dataset

In [17]:
# load the cleaned dataset
print(f"loading data from {INPUT_FILE}")
try:
    df = pd.read_csv(INPUT_FILE)
    print(f"loaded {df.shape[0]} rows and {df.shape[1]} columns")
except Exception as e:
    print(f"error loading data: {e}")
    raise


loading data from ../data/processed/player_stats_cleaned.csv
loaded 563 rows and 248 columns


### Check for Duplicate Rows

In [18]:
# check for duplicate rows
duplicate_count = df.duplicated().sum()
print(f"found {duplicate_count} duplicate rows")

if duplicate_count > 0:
    print("removing duplicate rows...")
    df = df.drop_duplicates()
    print(f"dataset now has {df.shape[0]} rows")

# check for missing values in key fields
key_fields = ['player', 'team', 'position', 'minutes']
missing_key_fields = df[key_fields].isna().sum()
print("\nmissing values in key fields:")
print(missing_key_fields)

if missing_key_fields.sum() > 0:
    print("removing rows with missing key fields...")
    df = df.dropna(subset=key_fields)
    print(f"dataset now has {df.shape[0]} rows")

# store original column names for metadata
original_columns = df.columns.tolist()

# display basic info about the dataset
print("\ndataset info:")
df.info()


found 0 duplicate rows

missing values in key fields:
player      0
team        0
position    0
minutes     0
dtype: int64

dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 563 entries, 0 to 562
Columns: 248 entries, player to data_source
dtypes: int64(9), object(239)
memory usage: 1.1+ MB


## 5. Data Type Conversion

### Conversions

In [19]:
# Check data types and convert string columns to numeric where appropriate
print("Converting string columns to numeric...")

# Sample a few columns to see their content
print("\nSample of column content before conversion:")
for col in ['gls', 'xg', 'prgp', 'tkl', 'minutes']:
    if col in df.columns:
        print(f"{col}: {df[col].head(3).tolist()}")

# Sample a few identifier columns we want to preserve
print("\nSample of identifier columns to preserve:")
for col in ['nation_misc', 'position_misc', 'team_misc'][:3]:
    if col in df.columns:
        print(f"{col}: {df[col].head(3).tolist()}")

# Count numeric columns before conversion
numeric_count_before = len(df.select_dtypes(include=[np.number]).columns)
print(f"\nNumeric columns before conversion: {numeric_count_before}")

# Track conversion issues
conversion_issues = []
conversion_success = 0

# Convert string columns to numeric, excluding identifier columns
for col in df.columns:
    if col not in NON_NUMERIC_COLUMNS and df[col].dtype == 'object':
        try:
            # Try to convert to numeric, handling commas and other formatting
            df[col] = pd.to_numeric(
                df[col].astype(str).str.replace(',', '').str.replace('-', '0'),
                errors='coerce'
            )
            conversion_success += 1
        except ValueError as e:
            # Handle value errors specifically
            conversion_issues.append(f"Value error in column '{col}': {str(e)}")
        except TypeError as e:
            # Handle type errors specifically
            conversion_issues.append(f"Type error in column '{col}': {str(e)}")
        except Exception as e:
            # Catch other unexpected errors
            conversion_issues.append(f"Unexpected error in column '{col}': {str(e)}")

# Count numeric columns after conversion
numeric_count_after = len(df.select_dtypes(include=[np.number]).columns)
print(f"Numeric columns after conversion: {numeric_count_after}")
print(f"Converted {numeric_count_after - numeric_count_before} columns to numeric")
print(f"Successful conversions: {conversion_success}")

# Print conversion issues (if any)
if conversion_issues:
    print(f"\nEncountered {len(conversion_issues)} conversion issues:")
    for issue in conversion_issues[:5]:  # Show first 5 issues
        print(f"  - {issue}")
    if len(conversion_issues) > 5:
        print(f"  ... and {len(conversion_issues) - 5} more issues")

# Verify that identifier columns were preserved
print("\nVerifying identifier columns are preserved:")
for col in ['nation_misc', 'position_misc', 'team_misc'][:3]:
    if col in df.columns:
        print(f"{col}: {df[col].head(3).tolist()}")


Converting string columns to numeric...

Sample of column content before conversion:
gls: ['0', '0', '2']
xg: ['0.2', '0', '1.2']
prgp: ['36', '0', '149']
tkl: ['11', '0', '70']
minutes: ['824', '2,700', '3,154']

Sample of identifier columns to preserve:
nation_misc: ['eng ENG', 'eng ENG', 'cd COD']
position_misc: ['DF', 'GK', 'DF']
team_misc: ['West Ham', 'Southampton', 'West Ham']

Numeric columns before conversion: 9
Numeric columns after conversion: 219
Converted 210 columns to numeric
Successful conversions: 210

Verifying identifier columns are preserved:
nation_misc: ['eng ENG', 'eng ENG', 'cd COD']
position_misc: ['DF', 'GK', 'DF']
team_misc: ['West Ham', 'Southampton', 'West Ham']


## 6. Analyze Column Values

### Analyze Column Mappings

In [20]:
# todo:examine the column names to better understand what's available
print("Analyzing column names...")

# Track conversion issues
column_conversion_issues = []

# Convert columns to numeric where appropriate, preserving identifier columns
for col in df.columns:
    if col not in NON_NUMERIC_COLUMNS and df[col].dtype == 'object':
        try:
            # Try to convert to numeric, handling commas and other formatting
            df[col] = pd.to_numeric(
                df[col].astype(str).str.replace(',', '').str.replace('-', '0'),
                errors='coerce'
            )
        except ValueError as e:
            # Handle value errors specifically
            column_conversion_issues.append(f"Value error in column '{col}': {str(e)}")
        except TypeError as e:
            # Handle type errors specifically
            column_conversion_issues.append(f"Type error in column '{col}': {str(e)}")
        except Exception as e:
            # Catch other unexpected errors
            column_conversion_issues.append(f"Unexpected error in column '{col}': {str(e)}")

# Print conversion issues if any occurred
if column_conversion_issues:
    print(f"Note: {len(column_conversion_issues)} conversion issues encountered during column analysis")

# Check for key stat columns we need for our indices
key_stats = {
    'Progressive': ['prgp', 'prgc', 'prog', 'succ', 'carries', 'dribble'],
    'Creative': ['kp', 'sca', 'xag', 'xa', 'assist', 'key'],
    'Defensive': ['tkl', 'int', 'press', 'block', 'def', 'tack'],
    'Finishing': ['gls', 'xg', 'sot', 'shot', 'goal', 'finish']
}

# Search for columns matching our key stats
found_columns = {}
for category, terms in key_stats.items():
    found_columns[category] = []
    for term in terms:
        matching_cols = [col for col in df.columns if term.lower() in col.lower()]
        if matching_cols:
            found_columns[category].extend(matching_cols)

# Print the results
for category, cols in found_columns.items():
    print(f"\n{category} columns found ({len(cols)}):")
    for col in sorted(cols)[:10]:  # Show first 10 to avoid too much output
        print(f"  - {col}")
    if len(cols) > 10:
        print(f"  ... and {len(cols) - 10} more")

# Create improved column mappings based on what we found
print("\nCreating improved column mappings based on found columns...")


Analyzing column names...

Progressive columns found (7):
  - carries
  - prgc
  - prgc_possession
  - prgp
  - prgp_passing
  - succ
  - succpct

Creative columns found (18):
  - a_xag
  - a_xag
  - kp
  - npxgplusxag
  - npxgplusxag
  - npxgplusxag1
  - npxgplusxag1
  - sca
  - sca90
  - xa
  ... and 8 more

Defensive columns found (30):
  - 90s_defense
  - age_defense
  - att_defense
  - blocks
  - blocks_pass_types
  - born_defense
  - def
  - def1
  - def_3rd
  - def_3rd_possession
  ... and 20 more

Finishing columns found (23):
  - g_per_sot
  - g_xg
  - gls
  - gls1
  - gls_shooting
  - np:g_xg
  - npxg
  - npxg1
  - npxg_per_sh
  - npxg_shooting
  ... and 13 more

Creating improved column mappings based on found columns...


### Improved Column Mappings

In [21]:
# Create improved column mappings based on what we found
# We'll create per90 versions of these columns in the next step

# Define our composite indices with more flexible column options
IMPROVED_INDICES = {
    'PI': {  # Progressive Index
        'components': [],  # Will be populated based on found columns
        'weight_groups': {
            'passes': ['prgp', 'prgp_passing', 'prgp_possession'],  # 40%
            'carries': ['prgc', 'prgc_possession', 'carries'],      # 40%
            'dribbles': ['succ', 'succ_possession', 'dribble']      # 20%
        },
        'group_weights': [0.4, 0.4, 0.2]
    },
    'CCI': {  # Creative Contribution Index
        'components': [],
        'weight_groups': {
            'key_passes': ['kp', 'kp_passing', '1_per_3'],          # 30%
            'shot_creation': ['sca', 'sca_creation', 'sca90'],      # 40%
            'expected_assists': ['xag', 'xag_passing', 'xa', 'a_xag'] # 30%
        },
        'group_weights': [0.3, 0.4, 0.3]
    },
    'DA': {  # Defensive Activity Index
        'components': [],
        'weight_groups': {
            'tackles': ['tkl', 'tkl_defense', 'tklw', 'tklw_defense'],  # 30%
            'interceptions': ['int', 'int_defense'],                    # 30%
            'pressures': ['press', 'pres', 'recov'],                    # 20%
            'blocks': ['blocks', 'blocks_defense', 'blocks_pass_types'] # 20%
        },
        'group_weights': [0.3, 0.3, 0.2, 0.2]
    },
    'FE': {  # Final Execution Index
        'components': [],
        'weight_groups': {
            'goals': ['gls', 'gls_shooting', 'gls1'],                # 30%
            'expected_goals': ['xg', 'xg_shooting', 'xg1', 'npxg'],  # 30%
            'shots_on_target': ['sot', 'sot_shooting', 'sh_shooting'], # 20%
            'conversion': ['g_per_sh', 'g_per_sot', 'g_xg']          # 20%
        },
        'group_weights': [0.3, 0.3, 0.2, 0.2]
    }
}

# For each index, find the best matching column for each component
for index_name, index_info in IMPROVED_INDICES.items():
    print(f"\nPopulating components for {index_name}:")
    
    # For each weight group, find the best matching column
    for group_name, search_terms in index_info['weight_groups'].items():
        best_match = None
        
        # Try each search term in order of preference
        for term in search_terms:
            # Look for exact matches first
            exact_matches = [col for col in df.columns if col == term]
            if exact_matches:
                best_match = exact_matches[0]
                break
                
            # Then look for partial matches
            partial_matches = [col for col in df.columns if term.lower() in col.lower()]
            if partial_matches:
                best_match = partial_matches[0]
                break
        
        if best_match:
            index_info['components'].append(best_match)
            print(f"  Found {group_name}: {best_match}")
        else:
            print(f"  No match found for {group_name}")
    
    # Print summary
    print(f"  Total components found: {len(index_info['components'])}/{len(index_info['weight_groups'])}")

# Update our composite indices with the improved components and weights
for index_name, index_info in IMPROVED_INDICES.items():
    if len(index_info['components']) >= 2:  # Require at least 2 components
        # Adjust weights based on available components
        weights = []
        for i, comp in enumerate(index_info['components']):
            group_idx = list(index_info['weight_groups'].keys()).index(list(index_info['weight_groups'].keys())[i])
            weights.append(index_info['group_weights'][group_idx])
        
        # Normalize weights to sum to 1
        weights = [w / sum(weights) for w in weights]
        
        # Update the COMPOSITE_INDICES dictionary
        COMPOSITE_INDICES[index_name] = {
            'components': [f"{comp}_per90" for comp in index_info['components']],  # We'll add _per90 suffix later
            'weights': weights
        }
        
        print(f"Updated {index_name} with {len(COMPOSITE_INDICES[index_name]['components'])} components")



Populating components for PI:
  Found passes: prgp
  Found carries: prgc
  Found dribbles: succ
  Total components found: 3/3

Populating components for CCI:
  Found key_passes: kp
  Found shot_creation: sca
  Found expected_assists: xag
  Total components found: 3/3

Populating components for DA:
  Found tackles: tkl
  Found interceptions: int
  Found pressures: recov
  Found blocks: blocks
  Total components found: 4/4

Populating components for FE:
  Found goals: gls
  Found expected_goals: xg
  Found shots_on_target: sot
  Found conversion: g_per_sh
  Total components found: 4/4
Updated PI with 3 components
Updated CCI with 3 components
Updated DA with 4 components
Updated FE with 4 components


## 7. Enhanced per-90 Normalization

In [22]:
# Enhanced per-90 normalization that ensures all needed columns are properly converted
def enhanced_normalize_per90(df: pd.DataFrame, minutes_col: str = "minutes",
                           target_columns: List[str] = None) -> pd.DataFrame:
    """
    Enhanced function to normalize specific columns to per-90 minutes values
    
    args:
        df: dataframe with player statistics
        minutes_col: name of column containing minutes played
        target_columns: specific columns to normalize (if None, will normalize all numeric columns)
        
    returns:
        dataframe with additional per-90 normalized columns
    """
    # create a copy to avoid modifying the original dataframe
    result_df = df.copy()
    
    # ensure minutes column exists and is numeric
    if minutes_col not in result_df.columns:
        raise ValueError(f"minutes column '{minutes_col}' not found in dataframe")
    
    # convert minutes to numeric, handling commas and other formatting
    try:
        result_df[minutes_col] = pd.to_numeric(
            result_df[minutes_col].astype(str).str.replace(',', '').str.replace('-', '0'),
            errors='coerce'
        )
    except Exception as e:
        raise ValueError(f"Failed to convert minutes column to numeric: {str(e)}")
    
    # Check for zero or negative minutes
    zero_minutes = (result_df[minutes_col] <= 0).sum()
    if zero_minutes > 0:
        print(f"Warning: {zero_minutes} players have zero or negative minutes. Their per-90 values will be NaN.")
    
    # If specific target columns are provided, ensure they're all numeric
    conversion_failures = []
    if target_columns:
        print(f"Converting {len(target_columns)} target columns to numeric...")
        for col in target_columns:
            if col in result_df.columns and result_df[col].dtype == 'object':
                try:
                    result_df[col] = pd.to_numeric(
                        result_df[col].astype(str).str.replace(',', '').str.replace('-', '0'),
                        errors='coerce'
                    )
                except ValueError as e:
                    conversion_failures.append(f"Value error in column '{col}': {str(e)}")
                    print(f"Warning: Could not convert {col} to numeric: {e}")
                except TypeError as e:
                    conversion_failures.append(f"Type error in column '{col}': {str(e)}")
                    print(f"Warning: Could not convert {col} to numeric: {e}")
                except Exception as e:
                    conversion_failures.append(f"Unexpected error in column '{col}': {str(e)}")
                    print(f"Warning: Could not convert {col} to numeric: {e}")
    
    # Determine which columns to normalize
    cols_to_normalize = []
    if target_columns:
        # Only normalize specified columns that exist
        cols_to_normalize = [col for col in target_columns if col in result_df.columns]
    else:
        # Normalize all numeric columns except common ID columns and minutes
        # Use a default list instead of relying on global variable
        default_exclude = ['player', 'team', 'position', 'data_source', 'nation']
        exclude_cols = default_exclude + [minutes_col]
        
        # Also exclude columns that are already per90 or percentages
        per90_pattern = r'.*per90.*|.*_90.*|.*pct.*|.*_per_.*|.*ratio.*'
        exclude_pattern = result_df.filter(regex=per90_pattern).columns.tolist()
        exclude_cols.extend(exclude_pattern)
        
        # Get all numeric columns that are not in exclude_cols
        numeric_cols = result_df.select_dtypes(include=[np.number]).columns
        cols_to_normalize = [col for col in numeric_cols if col not in exclude_cols]
    
    print(f"Normalizing {len(cols_to_normalize)} columns to per-90...")
    
    # Track normalization issues
    normalization_issues = 0
    
    # normalize each column
    for col in cols_to_normalize:
        try:
            # create new column name
            new_col = f"{col}_per90"
            # normalize to per-90
            result_df[new_col] = result_df[col] * 90 / result_df[minutes_col]
        except Exception as e:
            normalization_issues += 1
            print(f"Warning: Failed to normalize column '{col}': {str(e)}")
    
    if normalization_issues > 0:
        print(f"Warning: {normalization_issues} columns had normalization issues")
    
    return result_df

# Get a list of all columns we need for our composite indices
columns_for_indices = []
for index_info in IMPROVED_INDICES.values():
    columns_for_indices.extend(index_info['components'])

# Apply the enhanced normalization
print("Applying enhanced per-90 normalization...")
df_per90 = enhanced_normalize_per90(df, target_columns=columns_for_indices)

# Check how many per-90 columns were added
original_count = len(df.columns)
new_count = len(df_per90.columns)
print(f"Added {new_count - original_count} per-90 normalized columns")

# Display a sample of the original and per-90 columns for our index components
sample_cols = ['player', 'team', 'position', 'minutes']
per90_cols = [f"{col}_per90" for col in columns_for_indices if f"{col}_per90" in df_per90.columns][:5]
print("\nSample of per-90 normalized data for index components:")
if per90_cols:
    print(df_per90[sample_cols + per90_cols].head())
else:
    print("No per-90 columns were created for index components")


Applying enhanced per-90 normalization...
Converting 14 target columns to numeric...
Normalizing 14 columns to per-90...
Added 14 per-90 normalized columns

Sample of per-90 normalized data for index components:
               player             team position  minutes  prgp_per90  prgc_per90  succ_per90  kp_per90  sca_per90
0     Aaron Cresswell         West Ham       DF    824.0    3.932039    0.655340    0.109223  0.983010   1.638350
1      Aaron Ramsdale      Southampton       GK   2700.0    0.000000    0.000000    0.000000  0.000000   0.100000
2   Aaron Wan-Bissaka         West Ham       DF   3154.0    4.251744    3.138871    1.826252  0.941661   2.653773
3  Abdoulaye Doucouré          Everton       MF   2564.0    2.878315    1.474259    0.842434  0.982839   2.035881
4  Abdukodir Khusanov  Manchester City       DF    503.0    4.473161    0.178926    0.000000  0.357853   0.894632


## 8. Calculate Composite Indices

In [23]:
# Calculate composite indices using the per-90 normalized columns
print("\nCalculating composite indices...")

# Create a function to find per90 versions of our columns
def find_per90_columns(df: pd.DataFrame, base_columns: List[str]) -> Dict[str, str]:
    """Find per90 versions of the specified base columns"""
    per90_mapping = {}
    for base_col in base_columns:
        per90_col = f"{base_col}_per90"
        if per90_col in df.columns:
            per90_mapping[base_col] = per90_col
    return per90_mapping

# For each index, calculate using the per90 columns
indices_calculated = 0
for index_name, index_info in IMPROVED_INDICES.items():
    if len(index_info['components']) >= 2:  # Require at least 2 components
        # Find per90 versions of our components
        per90_mapping = find_per90_columns(df_per90, index_info['components'])
        
        if len(per90_mapping) >= 2:  # Require at least 2 per90 columns
            print(f"\nCalculating {index_name} using {len(per90_mapping)} components:")
            
            # Get the per90 columns and their corresponding weights
            per90_columns = list(per90_mapping.values())
            
            # Get weights for each component
            weights = []
            for base_col, per90_col in per90_mapping.items():
                # Find the component's position in the original components list
                idx = index_info['components'].index(base_col)
                # Get the corresponding weight group
                group_key = list(index_info['weight_groups'].keys())[idx]
                # Get the weight for this group
                group_idx = list(index_info['weight_groups'].keys()).index(group_key)
                weight = index_info['group_weights'][group_idx]
                weights.append(weight)
                print(f"  - {per90_col}: weight={weight:.2f}")
            
            # Normalize weights to sum to 1
            weights = [w / sum(weights) for w in weights]
            
            try:
                # Calculate the composite index
                df_per90[index_name] = calculate_composite_index(
                    df_per90, per90_columns, weights, index_name
                )
                indices_calculated += 1
                print(f"Successfully calculated {index_name}")
            except Exception as e:
                print(f"Error calculating {index_name}: {e}")
        else:
            print(f"Not enough per90 columns found for {index_name}, skipping")

# Display summary statistics for the composite indices
print(f"\n{indices_calculated} composite indices calculated")
composite_indices_cols = [col for col in df_per90.columns if col in IMPROVED_INDICES.keys()]
if composite_indices_cols:
    print("\nSummary statistics for composite indices:")
    print(df_per90[composite_indices_cols].describe())
else:
    print("No composite indices were calculated")



Calculating composite indices...

Calculating PI using 3 components:
  - prgp_per90: weight=0.40
  - prgc_per90: weight=0.40
  - succ_per90: weight=0.20
Successfully calculated PI

Calculating CCI using 3 components:
  - kp_per90: weight=0.30
  - sca_per90: weight=0.40
  - xag_per90: weight=0.30
Successfully calculated CCI

Calculating DA using 4 components:
  - tkl_per90: weight=0.30
  - int_per90: weight=0.30
  - recov_per90: weight=0.20
  - blocks_per90: weight=0.20
Successfully calculated DA

Calculating FE using 4 components:
  - gls_per90: weight=0.30
  - xg_per90: weight=0.30
  - sot_per90: weight=0.20
  - g_per_sh_per90: weight=0.20
Successfully calculated FE

4 composite indices calculated

Summary statistics for composite indices:
                 PI           CCI            DA          FE
count  5.620000e+02  5.620000e+02  5.620000e+02  457.000000
mean   1.896466e-17  1.580389e-17 -1.580389e-17    0.071869
std    7.270364e-01  9.212461e-01  5.258909e-01    0.702697
min   -7

## 9. Winsorization at 5th and 95th percentiles

In [24]:
# Apply winsorization to handle outliers
print(f"\nApplying winsorization at {WINSOR_LOWER*100}th and {WINSOR_UPPER*100}th percentiles...")

# Exclude ID columns and composite indices from winsorization
exclude_cols = ID_COLUMNS + composite_indices_cols

# Winsorize the dataframe
df_winsorized = winsorize_df(df_per90, lower=WINSOR_LOWER, upper=WINSOR_UPPER, exclude_cols=exclude_cols)

# Check the effect of winsorization on a few columns
print("\nEffect of winsorization on selected columns:")

# Select a few numeric columns to check, prioritizing per90 columns
numeric_cols = df_per90.select_dtypes(include=[np.number]).columns
per90_cols = [col for col in numeric_cols if col.endswith('_per90') and col not in exclude_cols]
sample_numeric_cols = per90_cols[:5] if per90_cols else [col for col in numeric_cols if col not in exclude_cols][:5]

# Compare before and after winsorization
for col in sample_numeric_cols:
    before = df_per90[col].describe([0.05, 0.25, 0.5, 0.75, 0.95])
    after = df_winsorized[col].describe([0.05, 0.25, 0.5, 0.75, 0.95])
    
    print(f"\n{col}:")
    print(f"  Before - min: {before['min']:.2f}, max: {before['max']:.2f}, mean: {before['mean']:.2f}")
    print(f"  After  - min: {after['min']:.2f}, max: {after['max']:.2f}, mean: {after['mean']:.2f}")

# Set the winsorized dataframe as our final dataset
df_final = df_winsorized



Applying winsorization at 5.0th and 95.0th percentiles...
Winsorized 233 columns

Effect of winsorization on selected columns:

prgp_per90:
  Before - min: 0.00, max: 54.00, mean: 3.34
  After  - min: 0.00, max: 7.06, mean: 3.08

prgc_per90:
  Before - min: 0.00, max: 36.00, mean: 1.79
  After  - min: 0.00, max: 5.05, mean: 1.61

succ_per90:
  Before - min: 0.00, max: 90.00, mean: 0.87
  After  - min: 0.00, max: 2.30, mean: 0.66

kp_per90:
  Before - min: 0.00, max: 24.55, mean: 0.93
  After  - min: 0.00, max: 2.44, mean: 0.82

sca_per90:
  Before - min: 0.00, max: 32.73, mean: 2.13
  After  - min: 0.00, max: 5.09, mean: 1.97


## 10. Save Dataset & MetaData

### Output Directory Check

In [25]:
# Ensure output directory exists
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# Save the engineered dataset
print(f"\nSaving engineered dataset to {OUTPUT_FILE}...")
df_final.to_csv(OUTPUT_FILE, index=False)
print(f"Saved {df_final.shape[0]} rows and {df_final.shape[1]} columns")

# Generate and save metadata
print(f"\nGenerating metadata...")
metadata = {
    "num_rows": len(df_final),
    "num_columns": len(df_final.columns),
    "feature_groups": {
        "original": [col for col in df_final.columns if col in original_columns],
        "per90": [col for col in df_final.columns if col.endswith('_per90')],
        "composite_indices": composite_indices_cols
    },
    "composite_indices_details": {}
}

# Add details about the composite indices
for index_name in composite_indices_cols:
    if index_name in df_final.columns:
        # Find the components used for this index
        components = []
        weights = []
        
        # Look through IMPROVED_INDICES to find the components
        for base_col, per90_col in find_per90_columns(df_final, IMPROVED_INDICES[index_name]['components']).items():
            components.append(per90_col)
            
            # Find the weight for this component
            idx = IMPROVED_INDICES[index_name]['components'].index(base_col)
            group_key = list(IMPROVED_INDICES[index_name]['weight_groups'].keys())[idx]
            group_idx = list(IMPROVED_INDICES[index_name]['weight_groups'].keys()).index(group_key)
            weight = IMPROVED_INDICES[index_name]['group_weights'][group_idx]
            weights.append(weight)
        
        # Normalize weights
        if weights:
            weights = [w / sum(weights) for w in weights]
        
        metadata['composite_indices_details'][index_name] = {
            'description': {
                'PI': "Progressive Index - measures a player's ball progression abilities",
                'CCI': "Creative Contribution Index - measures a player's creativity and chance creation",
                'DA': "Defensive Activity Index - measures a player's defensive contributions",
                'FE': "Final Execution Index - measures a player's finishing and goal threat"
            }.get(index_name, "Composite index"),
            'components': components,
            'weights': [float(w) for w in weights],
            'min_value': float(df_final[index_name].min()),
            'max_value': float(df_final[index_name].max()),
            'mean_value': float(df_final[index_name].mean()),
            'std_value': float(df_final[index_name].std())
        }

# Save metadata
print(f"Saving metadata to {METADATA_FILE}...")
os.makedirs(os.path.dirname(METADATA_FILE), exist_ok=True)
with open(METADATA_FILE, 'w') as f:
    json.dump(metadata, f, indent=2)
print("Metadata saved successfully")



Saving engineered dataset to ../data/processed/player_stats_engineered.csv...
Saved 563 rows and 266 columns

Generating metadata...
Saving metadata to ../data/processed/feature_metadata.json...
Metadata saved successfully


### Return Summary Statistics

In [26]:
# Print summary statistics
print("\nFeature Engineering Summary:")
print(f"- Processed {df_final.shape[0]} player records")
print(f"- Created {len([col for col in df_final.columns if col.endswith('_per90')])} per-90 normalized features")
print(f"- Calculated {len(composite_indices_cols)} composite indices")
print(f"- Applied winsorization at {WINSOR_LOWER*100}th and {WINSOR_UPPER*100}th percentiles")
print(f"- Final dataset has {df_final.shape[1]} columns")

# Print next steps
print("\nNext Steps (Phase 3):")
print("- Dimensionality Reduction with PCA")
print("- Clustering to identify player roles")
print("- Interpretation and visualization of clusters")



Feature Engineering Summary:
- Processed 563 player records
- Created 14 per-90 normalized features
- Calculated 4 composite indices
- Applied winsorization at 5.0th and 95.0th percentiles
- Final dataset has 266 columns

Next Steps (Phase 3):
- Dimensionality Reduction with PCA
- Clustering to identify player roles
- Interpretation and visualization of clusters
