# NBA Playoff Predictor - Data Cleaning

This notebook focuses on cleaning and preprocessing the raw NBA data from Kaggle sources. We'll prepare the data for feature engineering by standardizing formats, handling missing values, and ensuring data quality across different sources.

## Data Sources and Cleaning Goals

1. NBA/ABA/BAA Stats (sumitrodatta)
   - Player Season Info: Contains individual player statistics per season
     - Cleaning focuses on standardizing team names, filtering for NBA-only data, and handling missing values
   - Team Stats Per Game: Contains team-level performance metrics
     - Cleaning involves normalizing team names and ensuring consistent statistical calculations

2. NBA Injury Stats (loganlauton)
   - Contains historical injury data from 1951 to 2023
   - Cleaning involves:
     - Standardizing injury descriptions
     - Converting dates to consistent format
     - Matching team names with other datasets
     - Removing duplicate entries

3. NBA Shots Data (mexwell)
   - Contains detailed shot location and outcome data
   - Cleaning involves:
     - Standardizing coordinate systems
     - Validating shot types and distances
     - Ensuring consistent player and team naming
     - Removing invalid or incomplete shot records

In [None]:
# Third-party and standard-library imports used throughout the notebook
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
from pathlib import Path

# Put the parent directory on the import path so the `src` package imports
# below resolve (presumably the notebook lives one level below the project
# root - confirm against the repository layout).
sys.path.append('..')

# Project-local cleaners, one per Kaggle source, plus the logging helper
from src.data.cleaners.sumitrodatta_cleaner import SumitrodattaCleaner
from src.data.cleaners.loganlauton_cleaner import LoganlautonCleaner
from src.data.cleaners.mexwell_cleaner import MexwellCleaner
from src.data.utils import setup_logging

# Logger shared by every cell below
logger = setup_logging()

# Apply seaborn's default theme to all matplotlib figures in this notebook
sns.set_theme()

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

## Clean NBA/ABA/BAA Stats

Process data from sumitrodatta's dataset. This section focuses on cleaning two key datasets:
1. Player season data - individual player statistics
2. Team stats data - aggregated team performance metrics

The cleaning process ensures consistent formatting and removes any anomalies that could affect our analysis.

In [None]:
# One cleaner instance handles both tables from the sumitrodatta source
sumitrodatta = SumitrodattaCleaner()

# Player-season table: read the raw CSV, clean it, report the row count
player_season_df = pd.read_csv(
    '../data/raw/kaggle/sumitrodatta/nba-aba-baa-stats/Player Season Info.csv'
)
cleaned_player_season = sumitrodatta.clean_player_season_data(
    player_season_df
)
logger.info(f"Cleaned {len(cleaned_player_season)} player season records")

# Per-game team statistics: same read / clean / report sequence
team_stats_df = pd.read_csv(
    '../data/raw/kaggle/sumitrodatta/nba-aba-baa-stats/Team Stats Per Game.csv'
)
cleaned_team_stats = sumitrodatta.clean_team_stats_data(
    team_stats_df
)
logger.info(f"Cleaned {len(cleaned_team_stats)} team statistics records")

Visualization for cleaned player season data

In [None]:
def visualize_player_season_cleaning(raw_df: pd.DataFrame, cleaned_df: pd.DataFrame):
    """Plot and print a before/after summary of the player-season cleaning step.

    Draws a 2x2 figure (missing-value percentage per column, games-played
    histogram, minutes-played box plot, cleaned records per season) and then
    prints record counts, season coverage and total missing-value counts.

    Args:
        raw_df: Player-season table as loaded from the raw CSV.
        cleaned_df: The same table after cleaning.

    Returns:
        None. The figure is shown and the summary is printed.

    Raises:
        KeyError: If neither accepted spelling of the games ('G'/'games'),
            minutes ('MP'/'minutes') or season ('season'/'Season') column is
            present in the frame being read.
    """
    # No plt.style.use('seaborn') here: current matplotlib releases no longer
    # accept that style name (it raises OSError). The notebook's setup cell
    # already applies the seaborn theme via sns.set_theme().
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Player Season Data Cleaning Effects', fontsize=14)
    ax_missing, ax_games, ax_minutes, ax_seasons = axes.ravel()

    # 1. Missing Values Comparison (Before vs After)
    missing_before = raw_df.isnull().sum() / len(raw_df) * 100
    missing_after = cleaned_df.isnull().sum() / len(cleaned_df) * 100

    # Each series gets its own x positions. Cleaning may add, drop or rename
    # columns, so the two frames need not have the same number of columns and
    # a single shared x array would make the second bar() call fail.
    width = 0.35
    x_before = np.arange(len(missing_before))
    x_after = np.arange(len(missing_after))
    ax_missing.bar(x_before - width / 2, missing_before.to_numpy(), width,
                   label='Before', alpha=0.5)
    ax_missing.bar(x_after + width / 2, missing_after.to_numpy(), width,
                   label='After', alpha=0.5)
    ax_missing.set_title('Missing Values % (Before vs After)')
    plt.setp(ax_missing.get_xticklabels(), rotation=45, ha='right')
    ax_missing.legend()

    # 2. Games Played Distribution
    # Accept either the raw ('G') or the cleaned ('games') column spelling
    games_col_raw = 'G' if 'G' in raw_df.columns else 'games'
    games_col_clean = 'games' if 'games' in cleaned_df.columns else 'G'

    sns.histplot(data=raw_df[games_col_raw], label='Before', color='red',
                 alpha=0.5, bins=30, ax=ax_games)
    sns.histplot(data=cleaned_df[games_col_clean], label='After', color='blue',
                 alpha=0.5, bins=30, ax=ax_games)
    ax_games.set_title('Games Played Distribution')
    ax_games.legend()

    # 3. Minutes Played Distribution
    mp_col_raw = 'MP' if 'MP' in raw_df.columns else 'minutes'
    mp_col_clean = 'minutes' if 'minutes' in cleaned_df.columns else 'MP'

    # Stack the two columns in long form by position rather than by index:
    # building one wide frame would align on index labels, padding with NaN
    # and failing outright if either index contains duplicates.
    minutes_long = pd.concat(
        [
            pd.DataFrame({'variable': 'Before',
                          'value': raw_df[mp_col_raw].to_numpy()}),
            pd.DataFrame({'variable': 'After',
                          'value': cleaned_df[mp_col_clean].to_numpy()}),
        ],
        ignore_index=True,
    )
    sns.boxplot(data=minutes_long, x='variable', y='value', ax=ax_minutes)
    ax_minutes.set_title('Minutes Played Distribution')

    # 4. Season Coverage
    season_col = 'season' if 'season' in cleaned_df.columns else 'Season'
    season_counts = cleaned_df[season_col].value_counts().sort_index()
    ax_seasons.plot(season_counts.index, season_counts.values)
    ax_seasons.set_title('Players per Season')
    ax_seasons.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()
    plt.show()

    # Print statistics
    print("\nPlayer Season Data Cleaning Summary:")
    print(f"Total Records Before: {len(raw_df):,}")
    print(f"Total Records After: {len(cleaned_df):,}")
    print(f"Records Changed: {abs(len(raw_df) - len(cleaned_df)):,}")

    # Calculate and display the season range
    season_range = cleaned_df[season_col].agg(['min', 'max'])
    print(f"\nSeasons Covered: {season_range['min']} to {season_range['max']}")
    print(f"Total Seasons: {len(cleaned_df[season_col].unique())}")

    # Show impact on missing values
    print("\nMissing Values Impact:")
    print(f"Before: {raw_df.isnull().sum().sum():,} missing values")
    print(f"After: {cleaned_df.isnull().sum().sum():,} missing values")

visualize_player_season_cleaning(player_season_df, cleaned_player_season)

Visualization for team stats cleaning

In [None]:
def visualize_team_stats_cleaning(raw_df: pd.DataFrame, cleaned_df: pd.DataFrame):
    """Plot and print a before/after summary of the team-stats cleaning step.

    Draws a 2x2 figure (missing-value heatmaps for the raw and cleaned
    frames, points-per-game density for both, and a box plot of the raw
    shooting percentages) and then prints record and missing-value counts.

    Args:
        raw_df: Team stats table as loaded from the raw CSV. Must contain the
            columns 'PTS', 'FG%', '3P%' and 'FT%'.
        cleaned_df: The same table after cleaning. Must contain the column
            'pts_per_game'.

    Returns:
        None. The figure is shown and the summary is printed.

    Raises:
        KeyError: If any of the columns listed above is missing.
    """
    # No plt.style.use('seaborn') here: current matplotlib releases no longer
    # accept that style name (it raises OSError). The notebook's setup cell
    # already applies the seaborn theme via sns.set_theme().
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Team Statistics Data Cleaning Effects', fontsize=14)
    ax_missing_raw, ax_missing_clean, ax_points, ax_shooting = axes.ravel()

    # 1. Missing Values Before/After
    sns.heatmap(raw_df.isnull(), yticklabels=False, cbar=True, cmap='viridis',
                ax=ax_missing_raw)
    ax_missing_raw.set_title('Missing Values Before Cleaning')

    sns.heatmap(cleaned_df.isnull(), yticklabels=False, cbar=True, cmap='viridis',
                ax=ax_missing_clean)
    ax_missing_clean.set_title('Missing Values After Cleaning')

    # 2. Points Distribution
    sns.kdeplot(data=raw_df['PTS'], label='Before', color='red', ax=ax_points)
    sns.kdeplot(data=cleaned_df['pts_per_game'], label='After', color='blue',
                ax=ax_points)
    ax_points.set_title('Points per Game Distribution')
    ax_points.legend()

    # 3. Shooting Percentages
    # DataFrame.boxplot forwards unknown keywords to matplotlib's boxplot,
    # which has no `alpha` parameter and raises TypeError. Draw with seaborn
    # instead and set the transparency on the box artists via `boxprops`.
    sns.boxplot(data=raw_df[['FG%', '3P%', 'FT%']], color='red',
                boxprops={'alpha': 0.5}, ax=ax_shooting)
    ax_shooting.set_title('Shooting Percentages Before Cleaning')

    fig.tight_layout()
    plt.show()

    # Print summary statistics
    print("\nCleaning Impact Summary:")
    print(f"Original Records: {len(raw_df)}")
    print(f"Cleaned Records: {len(cleaned_df)}")
    print("\nMissing Values:")
    print("Before:", raw_df.isnull().sum().sum())
    print("After:", cleaned_df.isnull().sum().sum())

visualize_team_stats_cleaning(team_stats_df, cleaned_team_stats)

## Clean NBA Injury Stats

Process data from loganlauton's dataset. This section handles injury data cleaning, which is crucial for:
- Understanding player availability
- Analyzing team performance impact from injuries
- Tracking injury patterns and their effect on playoff chances

In [None]:
# Read the raw injury CSV and run it through the loganlauton cleaner
loganlauton = LoganlautonCleaner()
injury_df = pd.read_csv(
    '../data/raw/kaggle/loganlauton/nba-injury-stats-1951-2023/NBA Player Injury Stats(1951 - 2023).csv'
)
cleaned_injuries = loganlauton.clean_injury_data(
    injury_df
)
# Report how many rows the cleaner returned
logger.info(f"Cleaned {len(cleaned_injuries)} injury records")

Visualization for injury data cleaning

In [None]:
def visualize_injury_cleaning(raw_df: pd.DataFrame, cleaned_df: pd.DataFrame):
    """Show a four-panel figure and a printed recap of the injury cleaning step.

    Panels: null-cell heatmaps for the raw and cleaned frames, a bar chart of
    the ten most frequent values in the cleaned 'injury' column, and a line
    of record counts per value of the cleaned 'season' column.

    Args:
        raw_df: Injury table as loaded from the raw CSV.
        cleaned_df: The same table after cleaning; must contain the columns
            'injury' and 'season'.

    Returns:
        None. The figure is shown and the recap is printed.
    """
    figure = plt.figure(figsize=(15, 12))
    figure.suptitle('Injury Data Cleaning Effects', fontsize=14)

    # Panels 1 and 2: where the nulls sit, before and after cleaning
    null_panels = (
        (1, raw_df, 'Missing Values Before Cleaning'),
        (2, cleaned_df, 'Missing Values After Cleaning'),
    )
    for position, frame, heading in null_panels:
        panel = figure.add_subplot(2, 2, position)
        sns.heatmap(frame.isnull(), yticklabels=False, cbar=True,
                    cmap='viridis', ax=panel)
        panel.set_title(heading)

    # Panel 3: the ten most frequent injury labels
    top_injuries_ax = figure.add_subplot(2, 2, 3)
    top_injuries = cleaned_df['injury'].value_counts().head(10)
    top_injuries.plot(kind='bar', ax=top_injuries_ax)
    top_injuries_ax.set_title('Top 10 Most Common Injuries')
    plt.setp(top_injuries_ax.get_xticklabels(), rotation=45)

    # Panel 4: number of injury records in each season
    per_season_ax = figure.add_subplot(2, 2, 4)
    injuries_by_season = cleaned_df.groupby('season').size()
    per_season_ax.plot(injuries_by_season.index, injuries_by_season.values)
    per_season_ax.set_title('Injuries by Season')
    plt.setp(per_season_ax.get_xticklabels(), rotation=45)

    figure.tight_layout()
    plt.show()

    # Text recap: row counts and number of distinct injury labels
    rows_before = len(raw_df)
    rows_after = len(cleaned_df)
    distinct_injuries = len(cleaned_df['injury'].unique())
    print("\nInjury Data Cleaning Summary:")
    print(f"Total Records Before: {rows_before:,}")
    print(f"Total Records After: {rows_after:,}")
    print("\nUnique Injury Types:", distinct_injuries)

visualize_injury_cleaning(injury_df, cleaned_injuries)

## Clean NBA Shots Data

Process data from mexwell's dataset. This section handles shot data cleaning, which provides insights into:
- Team shooting patterns and efficiency
- Player shooting preferences and success rates
- Spatial analysis of scoring

The cleaning process ensures accurate shot coordinates and consistent categorization of shot types.

In [None]:
# Load and clean shots data
#
# Reads every per-season shots CSV in the mexwell directory, concatenates
# them into `shots_df`, and runs the combined frame through the cleaner to
# produce `cleaned_shots`. Any failure is logged and re-raised so the
# notebook stops instead of continuing with missing data.
mexwell = MexwellCleaner()
shots_dir = Path('../data/raw/kaggle/mexwell/nba-shots')

logger.info(f"Looking for shot files in: {shots_dir.absolute()}")

shots_data = []

try:
    # Check the directory first so a wrong path is reported as such rather
    # than as "no files found"
    if not shots_dir.exists():
        raise FileNotFoundError(f"Directory not found: {shots_dir}")

    # Sorted so the files are loaded (and concatenated) in a stable order
    shots_files = sorted(shots_dir.glob('NBA_20[0-9][0-9]_Shots.csv'))
    if not shots_files:
        raise FileNotFoundError(f"No shot files found in {shots_dir}")

    logger.info(f"Found {len(shots_files)} shot files:")
    for file in shots_files:
        logger.info(f"- {file.name}")

    # Load one frame per file, logging each file's row count
    for file in shots_files:
        logger.info(f"Loading {file.name}...")
        df = pd.read_csv(file)
        logger.info(f"Loaded {len(df):,} records")
        shots_data.append(df)

    shots_df = pd.concat(shots_data, ignore_index=True)
    logger.info(f"Successfully combined {len(shots_df):,} total records")

    logger.info("Initial data overview:")
    logger.info(f"Shape: {shots_df.shape}")
    logger.info("Column types:")
    logger.info(shots_df.dtypes)

    # Clean the data
    cleaned_shots = mexwell.clean_shots_data(shots_df)
    logger.info(f"\nCleaned {len(cleaned_shots):,} shot records")

    # Show cleaning results
    logger.info("Cleaning summary:")
    logger.info(f"Original records: {len(shots_df):,}")
    logger.info(f"Cleaned records: {len(cleaned_shots):,}")
    logger.info(f"Records removed: {len(shots_df) - len(cleaned_shots):,}")

except Exception as e:
    # Log for the notebook's log output, then re-raise to halt execution
    logger.error(f"Error processing shots data: {str(e)}")
    raise

Visualization for shots data cleaning

In [None]:
def visualize_shots_cleaning(raw_df: pd.DataFrame, cleaned_df: pd.DataFrame):
    """Plot and print a before/after summary of the shots cleaning step.

    Draws a 2x2 figure (shot locations before and after cleaning, a pie
    chart of raw shot types, and the mean make rate per shot distance) and
    then prints shot counts and the overall success rate.

    Args:
        raw_df: Combined raw shots table. Must contain the columns 'LOC_X',
            'LOC_Y' and 'SHOT_TYPE'.
        cleaned_df: The same table after cleaning. Must contain the columns
            'loc_x', 'loc_y', 'shot_distance' and 'shot_made'.

    Returns:
        None. The figure is shown and the summary is printed.

    Raises:
        KeyError: If any of the columns listed above is missing.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Shot Data Cleaning Effects', fontsize=14)
    ax_loc_raw, ax_loc_clean, ax_types, ax_rate = axes.ravel()

    # 1. Shot Locations
    ax_loc_raw.scatter(raw_df['LOC_X'], raw_df['LOC_Y'], alpha=0.1, s=1)
    ax_loc_raw.set_title('Shot Locations Before Cleaning')

    ax_loc_clean.scatter(cleaned_df['loc_x'], cleaned_df['loc_y'], alpha=0.1, s=1)
    ax_loc_clean.set_title('Shot Locations After Cleaning')

    # 2. Shot Types
    shot_types = raw_df['SHOT_TYPE'].value_counts()
    ax_types.pie(shot_types.values, labels=shot_types.index, autopct='%1.1f%%')
    ax_types.set_title('Shot Types Before Cleaning')

    # 3. Shot Success Rate by Distance
    # 'shot_made' is averaged as a success flag in the summary below, so
    # scattering it against distance would only draw overplotted stripes.
    # Average it per distance value to show the actual make rate.
    # NOTE(review): assumes 'shot_distance' takes a modest number of distinct
    # values (e.g. whole feet); bin it first if it is continuous.
    success_by_distance = cleaned_df.groupby('shot_distance')['shot_made'].mean()
    ax_rate.plot(success_by_distance.index, success_by_distance.values)
    ax_rate.set_xlabel('shot_distance')
    ax_rate.set_ylabel('Share of shots made')
    ax_rate.set_title('Shot Success by Distance')

    fig.tight_layout()
    plt.show()

    # Print summary
    print("\nShot Data Cleaning Summary:")
    print(f"Total Shots Before: {len(raw_df):,}")
    print(f"Total Shots After: {len(cleaned_df):,}")
    print("\nSuccess Rate:", f"{(cleaned_df['shot_made'].mean()*100):.1f}%")

visualize_shots_cleaning(shots_df, cleaned_shots)

## Save Cleaned Data

Save each cleaned dataset to its appropriate location in the processed directory.
These cleaned datasets will be used in the feature engineering phase
to create predictive features for our playoff prediction model.

In [None]:
# Save cleaned data
processed_dir = Path('../data/processed/historical')

# to_csv does not create missing parent directories, so make sure the
# target directory exists before writing (no-op if it is already there).
processed_dir.mkdir(parents=True, exist_ok=True)

# Output file name -> cleaned frame produced earlier in the notebook
cleaned_outputs = {
    'player_season.csv': cleaned_player_season,
    'team_stats.csv': cleaned_team_stats,
    'injuries.csv': cleaned_injuries,
    'shots.csv': cleaned_shots,
}
for filename, frame in cleaned_outputs.items():
    frame.to_csv(processed_dir / filename, index=False)

print("Saved all cleaned data")