In [3]:
import pandas as pd
from datascience import *


In [None]:
import pandas as pd
import numpy as np
from datascience import Table

# ============================================================================
# FINAL FIXED KC ROYALS ANALYSIS
# ============================================================================

# Step 1: Load Data
print("Step 1: Loading datasets...")
# Baseball Reference workbook: read the sheet at index 5 through the pyxlsb
# engine; the Savant batter export is a plain CSV.
df_br = pd.read_excel(
    'BR Baseball Data 2021-2025.xlsb',
    sheet_name=5,
    engine='pyxlsb',
)
df_savant = pd.read_csv('Savant Batter 2021-2025.csv')

# Step 2: Filter for KC Royals
print("Step 2: Filtering for KC Royals...")
# .copy() gives an independent frame so the Join_Key column added later does
# not write into a view of df_br.
kc_royals_df = df_br.loc[df_br['Team'] == 'KCR'].copy()

# Step 3: Create a Clean Join Key
# Both "Wacha, Michael" and "Michael Wacha" must normalize to "michael wacha"
# so the two sources can be matched on the player name.
print("Step 3: Standardizing names...")

def create_join_key(name):
    """Normalize a player name into a lowercase "first last" join key.

    Accepts both "Last, First" and "First Last" spellings so that names from
    the two data sources collapse to the same key.

    Args:
        name: Player name. May be missing (None/NaN) or a non-string value,
            which is converted with ``str()``.

    Returns:
        The lowercased key with double quotes removed and whitespace
        collapsed to single spaces. For comma-separated input the first two
        non-empty parts are swapped ("last, first" -> "first last") and any
        further parts (e.g. a suffix) are appended after them. Returns ""
        when the name is missing or contains no usable text.
    """
    if pd.isna(name):
        return ""

    # str.split() without arguments splits on any Unicode whitespace
    # (including non-breaking spaces), so this also trims and collapses runs.
    cleaned = " ".join(str(name).replace('"', '').lower().split())
    if ',' not in cleaned:
        return cleaned

    # Discard empty pieces so "wacha," does not yield a key with a stray
    # leading space.
    parts = [piece.strip() for piece in cleaned.split(',')]
    parts = [piece for piece in parts if piece]
    if len(parts) < 2:
        return parts[0] if parts else ""

    last, first, *extras = parts
    return " ".join([first, last, *extras])

kc_royals_df['Join_Key'] = kc_royals_df['Player'].apply(create_join_key)
df_savant['Join_Key'] = df_savant['last_name, first_name'].apply(create_join_key)

# Step 4: DEDUPLICATE SAVANT DATA
# If Wacha appears 5 times in Savant, this averages his stats into 1 row.
# Rows whose key is "" (create_join_key's result for a missing name) are
# excluded first: otherwise all unnamed Savant rows would be averaged together
# and then joined onto any Royals row that also lacks a player name.
# NOTE(review): mean(numeric_only=True) averages every numeric column, which
# would include identifier-like columns (e.g. a season or player id) if the
# CSV has them — confirm that is intended.
print("Step 4: Aggregating Savant stats to prevent duplicate rows...")
df_savant_unique = (
    df_savant[df_savant['Join_Key'] != '']
    .groupby('Join_Key')
    .mean(numeric_only=True)
    .reset_index()
)

# Step 5: Join
# Left join keeps every Royals row; rows without a matching key in the Savant
# aggregate get NaN for the Savant columns. Column names present in both
# frames keep the Royals name and the Savant copy is tagged with
# '_savant_duplicate'.
print("Step 5: Joining tables...")
combined_df = pd.merge(
    kc_royals_df,
    df_savant_unique,
    how='left',
    on='Join_Key',
    suffixes=('', '_savant_duplicate'),
)

# Step 6: Final Cleanup
# Keep everything except the helper join key and the Savant-side copies of
# overlapping columns (those tagged '_savant_duplicate' by the merge), then
# discard rows that are exact duplicates of an earlier row.
print("Step 6: Cleaning up columns and removing duplicate records...")
cols_to_keep = [
    c
    for c in combined_df.columns
    if c != 'Join_Key' and not c.endswith('_savant_duplicate')
]
combined_df = combined_df[cols_to_keep].drop_duplicates()

# Step 7: Convert to datascience Table
print("Step 7: Converting to final Table object...")
kc_royals_table = Table.from_df(combined_df)

# Summary banner followed by a preview of the first 10 rows
print("\n" + "=" * 80)
print(f"ANALYSIS COMPLETE: {kc_royals_table.num_rows} total rows.")
print("=" * 80)
kc_royals_table.show(10)

# Write the combined table to disk as CSV
kc_royals_table.to_csv('kc_royals_combined_analysis.csv')
print("Successfully exported to 'kc_royals_combined_analysis.csv'")

SyntaxError: invalid syntax. Perhaps you forgot a comma? (261463114.py, line 12)