In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter

def clean_nba_data(input_file_path, output_file_path=None):
    """
    Clean NBA dataset for prediction modeling

    Loads the raw CSV, prints a short summary (shape, player count, seasons,
    ten columns with the most missing values) and rebuilds every row as a
    record made of:

    * identity columns: player_id, player_name, season, age, pos
    * seasons_played: number of rows the player has in the input
    * career_season_number: 1-based position of the row among the player's
      rows ordered by season
    * "<stat>_current": taken from the input column "<stat>_next"
    * "<stat>_prev1" .. "<stat>_prev5": copied from the input only when the
      matching "has_prev{i}" flag is set; otherwise NaN

    Parameters:
    input_file_path (str): Path to the input CSV file (anything accepted by
        pandas.read_csv)
    output_file_path (str): Path to save cleaned data (optional)

    Returns:
    pd.DataFrame: Cleaned dataset. Players appear in order of first
        appearance in the input, each player's rows ordered by season. The
        column layout is fixed, so an empty input still yields every column.

    Raises:
    KeyError: If player_id, player_name, season, age or pos is missing from
        the input.
    """

    # Load the data
    print("Loading data...")
    df = pd.read_csv(input_file_path)

    # Display basic info about the dataset
    print(f"Original dataset shape: {df.shape}")
    print(f"Unique players: {df['player_name'].nunique()}")
    print(f"Seasons covered: {sorted(df['season'].unique())}")

    # Analyze missing data (only the ten worst columns are reported)
    print("\nMissing data analysis:")
    missing_data = df.isnull().sum()
    missing_data = missing_data[missing_data > 0].sort_values(ascending=False)
    for col, missing in missing_data.head(10).items():
        percentage = (missing / len(df)) * 100
        print(f"  {col}: {missing} ({percentage:.2f}%)")

    # Define stat columns
    stat_columns = ['MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'FG%',
                    '3P', '3PA', '3P%', 'FT%', 'FGA', 'FTA', 'ORB', 'DRB', 'PF']
    id_columns = ['player_id', 'player_name', 'season', 'age', 'pos']
    prev_offsets = range(1, 6)

    # Fixed output layout: keeps the column order stable and guarantees the
    # columns exist even when there are no records to infer them from.
    output_columns = (
        id_columns
        + ['seasons_played', 'career_season_number']
        + [f'{stat}_current' for stat in stat_columns]
        + [f'{stat}_prev{i}' for i in prev_offsets for stat in stat_columns]
    )

    # Group by player and process
    print("\nCleaning data...")
    cleaned_records = []

    # One groupby pass instead of a full-frame boolean mask per player.
    # sort=False keeps players in order of first appearance.
    # NOTE(review): grouping is by player_name, so two different players who
    # share a name are treated as one career - confirm whether player_id
    # should be the key.
    for _, player_rows in df.groupby('player_name', sort=False):
        player_rows = player_rows.sort_values('season')
        seasons_played = len(player_rows)

        for season_number, row in enumerate(player_rows.to_dict('records'), start=1):
            clean_record = {col: row[col] for col in id_columns}
            clean_record['seasons_played'] = seasons_played
            clean_record['career_season_number'] = season_number

            # Add current season stats (from "next" columns)
            for stat in stat_columns:
                clean_record[f'{stat}_current'] = row.get(f'{stat}_next', np.nan)

            # Add historical stats (prev1 through prev5)
            for i in prev_offsets:
                flag = row.get(f'has_prev{i}', 0)
                # NaN is truthy in Python; a missing flag must count as
                # "no data", the same as when the flag column is absent.
                has_prev_data = (not pd.isna(flag)) and bool(flag)
                for stat in stat_columns:
                    key = f'{stat}_prev{i}'
                    clean_record[key] = row.get(key, np.nan) if has_prev_data else np.nan

            cleaned_records.append(clean_record)

    # Create cleaned DataFrame
    cleaned_df = pd.DataFrame(cleaned_records, columns=output_columns)

    print(f"\nCleaned dataset shape: {cleaned_df.shape}")
    print(f"Columns in cleaned dataset: {len(cleaned_df.columns)}")

    # Save if output path provided
    if output_file_path:
        cleaned_df.to_csv(output_file_path, index=False)
        print(f"Cleaned dataset saved to: {output_file_path}")

    return cleaned_df

def analyze_cleaned_data(df):
    """
    Print a summary report of the cleaned dataset

    The report lists the record count, the number of distinct player names,
    the seasons present, rows per season, distinct players per
    seasons_played value, rows per position and the first ten rows of a few
    key columns.

    Parameters:
    df (pd.DataFrame): Cleaned dataset; must contain player_name, season,
        age, pos, PTS_current, PTS_prev1, seasons_played and
        career_season_number

    Returns:
    pd.DataFrame: The same object that was passed in, unmodified
    """
    rule = "=" * 50
    print("\n" + rule)
    print("CLEANED DATA ANALYSIS")
    print(rule)

    # Headline numbers
    names = df['player_name']
    print(f"Total records: {len(df)}")
    print(f"Unique players: {names.nunique()}")
    print(f"Seasons: {sorted(df['season'].unique())}")

    # Rows per season, in ascending season order
    print("\nSeason distribution:")
    rows_per_season = df['season'].value_counts().sort_index()
    for season, n_rows in rows_per_season.items():
        print(f"  {season}: {n_rows} players")

    # Distinct players for each career length, in ascending length order
    print("\nCareer length distribution:")
    career_len = df['seasons_played']
    for length in career_len.value_counts().sort_index().index:
        n_players = df.loc[career_len == length, 'player_name'].nunique()
        print(f"  {length} seasons: {n_players} players")

    # Rows per position, most frequent first
    print("\nPosition distribution:")
    for pos, n_rows in df['pos'].value_counts().items():
        print(f"  {pos}: {n_rows} records")

    # First ten rows of the most informative columns
    print("\nSample records:")
    preview = df[['player_name', 'season', 'age', 'pos', 'PTS_current',
                  'PTS_prev1', 'seasons_played', 'career_season_number']]
    print(preview.head(10).to_string(index=False))

    return df

def create_features_for_prediction(df):
    """
    Add derived feature columns to the cleaned dataset

    For every tracked stat three columns are appended, in this order:
    "<stat>_career_avg" (row-wise mean of "<stat>_prev1".."<stat>_prev5",
    ignoring NaN), "<stat>_last_season" (copy of "<stat>_prev1") and
    "<stat>_trend" ("<stat>_prev1" minus "<stat>_prev2"). Afterwards
    age_squared, is_rookie, is_veteran and years_experience are appended.

    Parameters:
    df (pd.DataFrame): Cleaned dataset with the "<stat>_prev1..5" columns,
        age and career_season_number. It is modified in place.

    Returns:
    pd.DataFrame: The same object that was passed in, with the new columns
    """
    print("\nCreating additional features...")

    tracked_stats = ['MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'FG%',
                     '3P', '3PA', '3P%', 'FT%', 'FGA', 'FTA', 'ORB', 'DRB', 'PF']

    for name in tracked_stats:
        # All five look-back columns for this stat
        history = df[[f'{name}_prev{k}' for k in range(1, 6)]]
        latest = df[f'{name}_prev1']

        # Mean over whichever previous seasons are present
        df[f'{name}_career_avg'] = history.mean(axis=1, skipna=True)

        # Most recent season (NaN when there is none)
        df[f'{name}_last_season'] = latest

        # Change from two seasons ago to last season
        df[f'{name}_trend'] = latest.sub(df[f'{name}_prev2'])

    # Age and experience features
    season_no = df['career_season_number']
    df['age_squared'] = df['age'].pow(2)
    df['is_rookie'] = season_no.eq(1).astype(int)
    df['is_veteran'] = season_no.ge(5).astype(int)
    df['years_experience'] = season_no.sub(1)

    print(f"Added features. New shape: {df.shape}")

    return df

# Main execution: clean the raw CSV, print the analysis report, add the
# derived features and write both the cleaned and the enhanced CSV files.
if __name__ == "__main__":
    import traceback

    # Set your file paths here
    input_file = "../data/nba_dataset_5yr.csv"  # Your input file
    output_file = "cleaned_nba_data.csv"  # Output file
    enhanced_output = "enhanced_nba_data.csv"  # Output file with extra features

    try:
        # Clean the data
        cleaned_data = clean_nba_data(input_file, output_file)

        # Analyze the cleaned data
        analyze_cleaned_data(cleaned_data)

        # Create additional features
        enhanced_data = create_features_for_prediction(cleaned_data)

        # Save enhanced version
        enhanced_data.to_csv(enhanced_output, index=False)
        print(f"\nEnhanced dataset saved to: {enhanced_output}")

        # Show final column list
        print(f"\nFinal columns ({len(enhanced_data.columns)}):")
        for i, col in enumerate(enhanced_data.columns, 1):
            print(f"  {i:2d}. {col}")

    except FileNotFoundError as e:
        # Only a missing file warrants the "check your path" hint.
        print(f"Error: {e}")
        print("Make sure your input file path is correct and the file exists.")
    except Exception as e:
        # Top-level boundary: keep the script from crashing, but report the
        # real failure (e.g. a missing column) instead of blaming the path.
        print(f"Error: {type(e).__name__}: {e}")
        traceback.print_exc()



Loading data...
Original dataset shape: (3862, 112)
Unique players: 1188
Seasons covered: [2018, 2019, 2020, 2021, 2022, 2023, 2024]

Missing data analysis:
  3P%_prev5: 3479 (90.08%)
  FT%_prev5: 3465 (89.72%)
  TRB_prev5: 3463 (89.67%)
  STL_prev5: 3463 (89.67%)
  AST_prev5: 3463 (89.67%)
  TOV_prev5: 3463 (89.67%)
  3P_prev5: 3463 (89.67%)
  3PA_prev5: 3463 (89.67%)
  MP_prev5: 3463 (89.67%)
  BLK_prev5: 3463 (89.67%)

Cleaning data...

Cleaned dataset shape: (3862, 109)
Columns in cleaned dataset: 109
Cleaned dataset saved to: cleaned_nba_data.csv

CLEANED DATA ANALYSIS
Total records: 3862
Unique players: 1188
Seasons: [2018, 2019, 2020, 2021, 2022, 2023, 2024]

Season distribution:
  2018: 541 players
  2019: 531 players
  2020: 530 players
  2021: 541 players
  2022: 606 players
  2023: 540 players
  2024: 573 players

Career length distribution:
  1 seasons: 340 players
  2 seasons: 228 players
  3 seasons: 163 players
  4 seasons: 109 players
  5 seasons: 104 players
  6 season