In [None]:
# Install required libraries if not already present
!pip install azure-storage-blob pandas

import io
import os
from datetime import datetime, timezone

import pandas as pd
from azure.storage.blob import BlobServiceClient

# Connect to Azure Blob Storage.
# The connection string embeds the storage account key, so it must never be
# committed to source control; it is read from the environment instead.
# NOTE(review): the key that was previously hard-coded here should be rotated.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    raise RuntimeError(
        "AZURE_STORAGE_CONNECTION_STRING is not set; export the storage "
        "account connection string before running this notebook"
    )
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
# All reads and writes below go through the "silver" container.
container_client = blob_service_client.get_container_client("silver")

# Function to find the latest merged_rosters_players_*.csv
def get_latest_csv(container_client, prefix="merged_rosters_players_"):
    """Return the name of the newest timestamped CSV blob under *prefix*.

    A blob is a candidate only when its name is exactly
    ``<prefix>YYYYMMDD_HHMMSS.csv``. Requiring the timestamp to follow the
    prefix directly keeps derived files that merely share the prefix (for
    example ``merged_rosters_players_with_groups_<timestamp>.csv``, which this
    notebook writes back to the same container) from being selected as input.

    Args:
        container_client: Object exposing ``list_blobs(name_starts_with=...)``
            that yields items with a ``name`` attribute.
        prefix: Leading part of the blob name, up to the timestamp.

    Returns:
        The name of the blob with the latest timestamp in its name, or
        ``None`` when no blob matches the expected pattern.
    """
    suffix = ".csv"
    latest_blob = None
    latest_time = None
    for blob in container_client.list_blobs(name_starts_with=prefix):
        name = blob.name
        print(f"Processing blob: {name}")
        if not name.endswith(suffix):
            print(f"Skipping {name}: Not a .csv file")
            continue
        # Whatever sits between the prefix and the extension must be the
        # timestamp and nothing else.
        timestamp_str = name[len(prefix):-len(suffix)]
        try:
            timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
        except ValueError as e:
            print(f"Skipping {name}: Invalid timestamp ({e})")
            continue
        if latest_time is None or timestamp > latest_time:
            latest_time = timestamp
            latest_blob = name
    return latest_blob

# Find the newest roster export; stop immediately when there is none
latest_blob = get_latest_csv(container_client)
if not latest_blob:
    raise ValueError("No merged_rosters_players_*.csv files found")
print(f"Latest CSV found: {latest_blob}")

# Download the blob, decode it as UTF-8 text, and parse it into a DataFrame
print(f"Loading {latest_blob}")
blob_client = container_client.get_blob_client(latest_blob)
downloaded = blob_client.download_blob()
blob_data = downloaded.readall().decode("utf-8")
csv_buffer = io.StringIO(blob_data)
df = pd.read_csv(csv_buffer)

# Define position group mapping
def assign_position_group(pos):
    """Map a position code to its position group.

    Returns 'OFD' for CF, DH, LF and RF, 'P' for P, SP and RP, and 'Other'
    for every other value. Matching is exact, so differently-cased codes and
    missing values also fall through to 'Other'.
    """
    # Middle-infield (2B, SS) and corner-infield (1B, 3B) groups are currently
    # disabled, so those positions are reported as 'Other'.
    groupings = (
        ('OFD', ('CF', 'DH', 'LF', 'RF')),
        ('P', ('P', 'SP', 'RP')),
    )
    for group_name, members in groupings:
        if pos in members:
            return group_name
    return 'Other'  # Default for unmatched positions


# Add the new position_group column, derived row by row from the 'pos' column
df['position_group'] = df['pos'].apply(assign_position_group)


# Save the updated DataFrame back to silver under a UTC-timestamped name.
# datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
# formats to the same YYYYMMDD_HHMMSS string.
output_timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
output_blob_name = f"merged_rosters_players_with_groups_{output_timestamp}.csv"
output_blob_client = container_client.get_blob_client(output_blob_name)
output_csv = df.to_csv(index=False)
# overwrite=True replaces any existing blob that already has this name
output_blob_client.upload_blob(output_csv, overwrite=True)
print(f"Uploaded updated CSV to silver as {output_blob_name}")

# Display the updated DataFrame (using pandas print instead of Databricks display)
print("Updated DataFrame:")
print(df.head())  # Show first 5 rows