In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
class BasketballFeatureEngineer:
    """Derive efficiency, contextual, situational and lineup features from stat tables.

    Most methods add their new columns directly onto the DataFrame they receive
    and return that same object, so the caller's frame is modified in place.
    ``prepare_modeling_features`` is the exception: it builds and returns a new
    scaled frame.
    """

    # Added to minute totals before dividing so zero-minute rows stay finite.
    _MIN_EPSILON = 1e-10

    def __init__(self):
        """Create the scaler and the per-stat weights used by ``ADV_EFF``."""
        self.scaler = StandardScaler()
        self.efficiency_weights = {
            'pts_weight': 1.0,
            'ast_weight': 0.7,
            'reb_weight': 0.8,
            'stl_weight': 1.2,
            'blk_weight': 1.2,
            'tov_weight': -1.0,
            'pf_weight': -0.3
        }

    def calculate_player_efficiency_rating(self, df):
        """Add per-minute efficiency columns to ``df`` and return it.

        Requires the columns PTS, AST, REB, STL, BLK, TOV, PF and MIN.

        Columns written:
            BASIC_EFF: (PTS + AST + REB + STL + BLK - TOV - 0.5 * PF) per minute.
            ADV_EFF: the same stats combined with ``self.efficiency_weights``,
                per minute.
            PACE_ADJ_EFF: ADV_EFF * (100 / PACE); only when a PACE column exists.

        Both per-minute metrics divide by ``MIN + 1e-10`` so a player with zero
        minutes yields a finite value instead of NaN/inf.
        """
        # Shared guarded denominator; previously only ADV_EFF was protected.
        minutes = df['MIN'] + self._MIN_EPSILON

        # Basic efficiency calculation
        df['BASIC_EFF'] = (
            df['PTS'] + df['AST'] + df['REB'] +
            df['STL'] + df['BLK'] - df['TOV'] - 0.5 * df['PF']
        ) / minutes

        # Weighted efficiency (negative weights penalise turnovers and fouls)
        weights = self.efficiency_weights
        df['ADV_EFF'] = (
            df['PTS'] * weights['pts_weight'] +
            df['AST'] * weights['ast_weight'] +
            df['REB'] * weights['reb_weight'] +
            df['STL'] * weights['stl_weight'] +
            df['BLK'] * weights['blk_weight'] +
            df['TOV'] * weights['tov_weight'] +
            df['PF'] * weights['pf_weight']
        ) / minutes

        # Pace-adjusted efficiency
        if 'PACE' in df.columns:
            df['PACE_ADJ_EFF'] = df['ADV_EFF'] * (100 / df['PACE'])

        return df

    def create_contextual_features(self, df, game_logs=None):
        """Add categorical/flag features describing a player's context.

        Each feature is created only when its source column is present:
            USG_PCT    -> USAGE_CATEGORY (Low / Medium / High / Elite)
            POSITION   -> POS_NUMERIC (G=1, F=2, C=3, anything else 2)
            AGE        -> AGE_PRIME (25-29 inclusive), AGE_VETERAN (>= 30)
            SEASON_EXP -> EXP_CATEGORY (Rookie / Young / Veteran / Elite_Veteran)

        ``game_logs`` is accepted for interface compatibility and is not used.
        Returns the same ``df`` object with the new columns attached.
        """
        # Usage rate categories.
        # NOTE(review): the bin edges assume USG_PCT is on a 0-100 scale — confirm
        # against the data source.
        if 'USG_PCT' in df.columns:
            df['USAGE_CATEGORY'] = pd.cut(
                df['USG_PCT'],
                bins=[0, 15, 20, 25, 100],
                labels=['Low', 'Medium', 'High', 'Elite'],
                include_lowest=True  # a usage of exactly 0 belongs to 'Low', not NaN
            )

        # Position-based encoding; unmapped positions fall back to 2 (same as 'F')
        position_map = {'G': 1, 'F': 2, 'C': 3}
        if 'POSITION' in df.columns:
            df['POS_NUMERIC'] = df['POSITION'].map(position_map).fillna(2)

        # Age-based performance curves
        if 'AGE' in df.columns:
            df['AGE_PRIME'] = np.where(
                (df['AGE'] >= 25) & (df['AGE'] <= 29), 1, 0
            )
            df['AGE_VETERAN'] = np.where(df['AGE'] >= 30, 1, 0)

        # Experience factors; the last bin is open-ended so players with more
        # than 20 seasons are still labelled instead of becoming NaN.
        if 'SEASON_EXP' in df.columns:
            df['EXP_CATEGORY'] = pd.cut(
                df['SEASON_EXP'],
                bins=[-1, 2, 5, 10, np.inf],
                labels=['Rookie', 'Young', 'Veteran', 'Elite_Veteran']
            )

        return df

    def create_situational_features(self, game_logs_df):
        """Add game-situation features to a game log and return it sorted by date.

        Requires MATCHUP, GAME_DATE, PTS, AST, REB, STL, BLK and TOV columns.
        Returns ``None`` when ``game_logs_df`` is ``None`` or empty.

        HOME_GAME and the parsed GAME_DATE are written onto the input frame;
        the remaining columns are added to the date-sorted copy that is returned.

        NOTE(review): DAYS_REST and the rolling windows run over the whole frame
        in date order. If logs for several players are passed together they will
        mix players — confirm callers pass one player's logs at a time.
        """
        if game_logs_df is None or len(game_logs_df) == 0:
            return None

        # Home vs away performance. 'vs.' is matched literally (regex=False);
        # as a regex the dot would also match strings such as 'Cavs @ BOS'.
        # Missing matchups count as not-home so the int cast cannot fail.
        game_logs_df['HOME_GAME'] = (
            game_logs_df['MATCHUP']
            .str.contains('vs.', regex=False, na=False)
            .astype(int)
        )

        # Back-to-back games: at most one day since the previous row's game
        game_logs_df['GAME_DATE'] = pd.to_datetime(game_logs_df['GAME_DATE'])
        game_logs_df = game_logs_df.sort_values('GAME_DATE')
        game_logs_df['DAYS_REST'] = game_logs_df['GAME_DATE'].diff().dt.days
        game_logs_df['BACK_TO_BACK'] = (game_logs_df['DAYS_REST'] <= 1).astype(int)

        # Recent form (last 10 games, needs at least 5 to produce a value)
        game_logs_df['RECENT_PPG'] = game_logs_df['PTS'].rolling(window=10, min_periods=5).mean()
        game_logs_df['RECENT_EFF'] = (
            (game_logs_df['PTS'] + game_logs_df['AST'] + game_logs_df['REB'] +
             game_logs_df['STL'] + game_logs_df['BLK'] - game_logs_df['TOV'])
            .rolling(window=10, min_periods=5).mean()
        )

        # Performance volatility
        game_logs_df['PTS_VOLATILITY'] = game_logs_df['PTS'].rolling(window=10, min_periods=5).std()

        return game_logs_df

    def create_opponent_adjusted_metrics(self, df, opponent_data=None):
        """Placeholder for opponent-strength adjustments.

        When ``opponent_data`` is not ``None``, OPP_ADJ_PTS and OPP_ADJ_EFF are
        added as unadjusted copies of PTS and ADV_EFF (so ADV_EFF must already
        exist on ``df``). ``opponent_data`` itself is not read yet.
        Returns the same ``df`` object.
        """
        if opponent_data is not None:
            # This would incorporate opponent defensive ratings
            # Placeholder for opponent-adjusted calculations
            df['OPP_ADJ_PTS'] = df['PTS']  # Would be adjusted based on opponent def rating
            df['OPP_ADJ_EFF'] = df['ADV_EFF']  # Would be adjusted

        return df

    def create_lineup_synergy_features(self, lineup_df):
        """Add pace-scaled ratings and a per-minute net rating to lineup data.

        Requires PACE, OFF_RATING, DEF_RATING, NET_RATING and MIN columns.
        Returns ``None`` when ``lineup_df`` is ``None`` or empty; otherwise
        returns the same frame with PACE_FACTOR, OFF_EFF_PACE_ADJ,
        DEF_EFF_PACE_ADJ and NET_PER_MIN added.
        """
        if lineup_df is None or len(lineup_df) == 0:
            return None

        # Offensive/defensive ratings scaled by pace relative to 100
        lineup_df['PACE_FACTOR'] = lineup_df['PACE'] / 100
        lineup_df['OFF_EFF_PACE_ADJ'] = lineup_df['OFF_RATING'] * lineup_df['PACE_FACTOR']
        lineup_df['DEF_EFF_PACE_ADJ'] = lineup_df['DEF_RATING'] * lineup_df['PACE_FACTOR']

        # Net rating per minute (epsilon guards zero-minute lineups)
        lineup_df['NET_PER_MIN'] = lineup_df['NET_RATING'] / (lineup_df['MIN'] + self._MIN_EPSILON)

        return lineup_df

    def prepare_modeling_features(self, df):
        """Build a scaled numeric feature matrix from ``df``.

        Steps:
            1. Keep numeric columns only.
            2. Drop identifier columns — names with ``ID`` as an
               underscore-delimited token (``PLAYER_ID``, ``ID``, ``team_id``).
               A plain substring test would also discard real features such as
               ``WIDE_OPEN_FGA``.
            3. Treat +/-inf as missing, then fill missing values with the
               column median.
            4. Fit ``self.scaler`` on the result and transform it.

        Returns:
            (df_scaled, numeric_features): the scaled DataFrame (same index as
            ``df``) and the list of column names it contains. ``df`` itself is
            not modified.
        """
        # Select numeric features for modeling
        numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()

        # Remove ID columns
        numeric_features = [
            col for col in numeric_features
            if 'ID' not in col.upper().split('_')
        ]

        # Handle missing values; infinities would make the scaler raise
        df_numeric = df[numeric_features].replace([np.inf, -np.inf], np.nan)
        df_clean = df_numeric.fillna(df_numeric.median())

        # Scale features
        df_scaled = pd.DataFrame(
            self.scaler.fit_transform(df_clean),
            columns=numeric_features,
            index=df_clean.index
        )

        return df_scaled, numeric_features


In [3]:
# Load the season stat table (path is relative to the working directory)
df = pd.read_csv('nba_player_stats_2024_25.csv')

# A single engineer instance drives every step below
fe = BasketballFeatureEngineer()

# Efficiency metrics first, then contextual features, applied as one pipeline
df_with_features = (
    df
    .pipe(fe.calculate_player_efficiency_rating)
    .pipe(fe.create_contextual_features)
)

# Scaled numeric matrix plus the names of the columns it holds
model_ready_data, feature_names = fe.prepare_modeling_features(df_with_features)


In [5]:
# Create the output folder first: to_parquet does not create missing parent directories.
Path("./data").mkdir(parents=True, exist_ok=True)
# Persist the scaled, numeric-only feature matrix.
model_ready_data.to_parquet("./data/02_data-for-modeling.parquet")

In [7]:
# Create the output folder first: to_parquet does not create missing parent directories.
Path("./data").mkdir(parents=True, exist_ok=True)
# Persist the full frame (original columns plus engineered features, unscaled).
df_with_features.to_parquet("./data/02_df-w-features.parquet")