# Cleaning and EDA

## Overview: Loading, Initial Observations, and Cleaning of the Umpire Datasets


---
## Project Workflow:
1. **Step 1: Install and Import Necessary Libraries**
2. **Step 2: Access Data**
3. **Step 3: Check Packaging**
4. **Step 4: Clean Data**
5. **Step 5: Save Data**


In [None]:
#----- Step 1: Install and Import Necessary Libraries -----
import os

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Inputs: one CSV per umpire, read from the "data/" directory.
umpire_files = [
    "Umpire_1.csv",
    "Umpire_2.csv",
    "Umpire_3.csv",
]

# Columns removed from every file because they are not consistent across games.
columns_to_drop = [
    "on_1b",
    "on_2b",
    "on_3b",
    "at_bat_number",
]

#----- Step 3: Check Packaging -----
def write_diagnostics(df: pd.DataFrame, stage: str, path: str) -> None:
    """Append a diagnostic snapshot of *df* to the log file at *path*.

    The snapshot consists of a ``=== stage ===`` banner followed by the
    output of ``df.info()``, ``df.head()`` and ``df.describe(include='all')``.
    The file is opened in append mode, so successive calls accumulate in the
    same log.

    Args:
        df: DataFrame to summarise.
        stage: Label written into the banner that introduces the snapshot.
        path: Location of the log file; created if it does not exist.
    """
    # Explicit UTF-8 keeps the log independent of the platform's locale
    # encoding, so non-ASCII cell values cannot raise UnicodeEncodeError.
    with open(path, "a", encoding="utf-8") as log:
        log.write(f"\n\n=== {stage} ===\n\n")
        df.info(buf=log)
        log.write("\n\n--- Head ---\n")
        log.write(df.head().to_string())
        log.write("\n\n--- Describe ---\n")
        log.write(df.describe(include='all').to_string())

# Create the output tree up front. "data/cleaned/logs" sits inside
# "data/cleaned", so this single call covers both the per-file logs and the
# cleaned CSVs; without it the first open() fails on a fresh checkout.
os.makedirs("data/cleaned/logs", exist_ok=True)

# Target labels are mapped by hand (the original author noted that automatic
# encoding produced them backwards): 0 = correct call, 1 = incorrect call.
target_mapping = {'correct': 0, 'incorrect': 1}

for file in umpire_files:
    #----- Step 2: Access Data -----
    df = pd.read_csv("data/" + file)

    #----- Step 3: Check Packaging -----
    key = file.replace(".csv", "")
    log_path = f"data/cleaned/logs/{key}_log.txt"
    # Truncate any log left by a previous run; every later write appends.
    with open(log_path, "w", encoding="utf-8"):
        pass

    # Snapshot the data exactly as loaded, before anything is removed, so the
    # "RAW" section of the log really describes the raw file.
    write_diagnostics(df, f"{key} RAW", log_path)

    #----- Step 4: Clean Data -----

    # Drop irrelevant columns (errors='ignore': a file may lack some of them),
    # then drop every row that still contains a missing value.
    df = df.drop(columns=columns_to_drop, errors='ignore')
    df = df.dropna()
    write_diagnostics(df, f"{key} DROPPED", log_path)

    # Series.map() turns any label missing from the mapping into NaN without
    # complaint, which would silently corrupt the saved target column.
    # Fail loudly instead.
    unexpected = set(df['error_in_decision'].unique()) - set(target_mapping)
    if unexpected:
        raise ValueError(
            f"{file}: unexpected values in 'error_in_decision': "
            f"{sorted(unexpected, key=str)}"
        )
    df['error_in_decision'] = df['error_in_decision'].map(target_mapping)

    # One-hot encode every remaining non-numeric column.
    # NOTE(review): each file is encoded on its own, so the cleaned files can
    # end up with different dummy columns - confirm downstream code copes.
    non_numeric_cols = df.select_dtypes(exclude=['number']).columns
    df = pd.get_dummies(df, columns=non_numeric_cols, drop_first=True)
    write_diagnostics(df, f"{key} ENCODED", log_path)

    # Scaling: standardise numeric columns with more than two distinct values;
    # binary columns (including the 0/1 target) are left untouched.
    # NOTE(review): the scaler is fit on the whole file; if a train/test split
    # happens later, fitting here leaks test statistics - confirm intent.
    numeric_cols = df.select_dtypes(include='number').columns
    scaler = StandardScaler()
    cols_to_scale = [col for col in numeric_cols if df[col].nunique() > 2]
    if cols_to_scale:
        # fit_transform raises on an empty column selection, so skip the step
        # when there is nothing to scale.
        df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    write_diagnostics(df, f"{key} SCALED", log_path)

    #----- Step 5: Save Data -----
    df.to_csv(f"data/cleaned/{key}_cleaned.csv", index=False)

    #----- Step 3: Check Packaging -----
    # UTF-8 is required here: the summary line contains "→", which the
    # default encoding on some platforms (e.g. cp1252) cannot represent.
    with open(log_path, "a", encoding="utf-8") as log:
        log.write("\n\n=== FINAL SUMMARY ===\n\n")
        log.write(f"{key} cleaned → shape: {df.shape} | Target distribution:\n{df['error_in_decision'].value_counts().to_string()}\n")


Umpire_1 cleaned → shape: (57698, 47) | Target distribution:
error_in_decision
0    50339
1     7359

Umpire_2 cleaned → shape: (53621, 46) | Target distribution:
error_in_decision
0    46603
1     7018

Umpire_3 cleaned → shape: (57326, 46) | Target distribution:
error_in_decision
0    50394
1     6932

