In [None]:
import pandas as pd
import glob
import os

def process_raw_bess_data(df):
    """
    Process raw BESS data to add EFA blocks and delivery dates.

    The input frame is copied, its column headers are stripped of surrounding
    whitespace, 'deliveryStart' is parsed to datetimes and 'executedQuantity'
    (when present) is renamed to 'Delivery capacity (MW)'. Missing values in
    'Delivery date' and 'EFA block of the day' are then derived from
    'deliveryStart'; both columns are created when the file does not have
    them. Values that are already present are left untouched.

    EFA block mapping (by hour of 'deliveryStart'):
        Block 1: 23:00-03:00 (delivery date is the next day for 23:xx)
        Block 2: 03:00-07:00
        Block 3: 07:00-11:00
        Block 4: 11:00-15:00
        Block 5: 15:00-19:00
        Block 6: 19:00-23:00

    Args:
        df: DataFrame with at least a 'deliveryStart' column that
            ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame; the caller's frame is not modified. Derived delivery
        dates are 'DD/MM/YYYY' strings. Rows whose 'deliveryStart' cannot be
        placed in time (NaT) keep missing values in the derived columns.

    Raises:
        KeyError: if there is no 'deliveryStart' column after header cleanup.
    """
    # Work on a copy so the caller's frame is never modified in place.
    df = df.copy()

    # Strip header whitespace first so names such as 'EFA block of the day '
    # (trailing space) or ' deliveryStart' resolve to their clean form.
    df.columns = df.columns.str.strip()
    df['deliveryStart'] = pd.to_datetime(df['deliveryStart'])

    # rename() ignores keys that are absent, so no membership check is needed.
    df = df.rename(columns={'executedQuantity': 'Delivery capacity (MW)'})

    start = df['deliveryStart']
    hour = start.dt.hour

    # Six 4-hour blocks beginning at 23:00: shifting the hour by one aligns
    # the block edges with multiples of four, and the modulo wraps 23:xx
    # back round to block 1.
    efa_block = ((hour + 1) // 4) % 6 + 1

    # 23:xx already belongs to the following calendar day's block 1.
    delivery_day = start.mask(hour >= 23, start + pd.Timedelta(days=1))
    delivery_date = delivery_day.dt.strftime('%d/%m/%Y')

    date_col = 'Delivery date'
    if date_col not in df.columns:
        df[date_col] = delivery_date
    else:
        missing_date = df[date_col].isna()
        if missing_date.any():
            # Cast to object first so string dates can sit alongside whatever
            # dtype the parser inferred for a partly empty column.
            df[date_col] = df[date_col].astype('object').mask(
                missing_date, delivery_date
            )

    block_col = 'EFA block of the day'
    if block_col not in df.columns:
        df[block_col] = efa_block
    else:
        missing_block = df[block_col].isna()
        if missing_block.any():
            df[block_col] = df[block_col].mask(missing_block, efa_block)

    return df

# Merge every raw BESS unit CSV into one processed dataset and write a mapping
# file restricted to the units that actually have data.
#
# Folders are addressed by explicit path instead of os.chdir(), so an error
# part-way through can never leave the kernel in the wrong working directory.
original_dir = os.getcwd()  # retained so the name stays defined as before
raw_dir = os.path.join('..', 'raw_data')
processed_dir = os.path.join('..', 'processed_data')

mapping_name = 'Battery name mapping.csv'
filtered_mapping_name = 'Battery name mapping filtered.csv'
merged_name = 'merged_17_units_updated.csv'

# Raw unit files are every CSV except processed outputs and the mapping files.
# The filtered mapping is written into raw_data further down, so it must be
# excluded here or a re-run would try to process it as a unit file.
# Sorting makes the processing order (and the merged row order) deterministic.
non_unit_files = {mapping_name, filtered_mapping_name}
csv_files = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(raw_dir, '*.csv'))
    if not path.endswith('_updated.csv')
    and os.path.basename(path) not in non_unit_files
)

print(f"Found {len(csv_files)} raw BESS unit files:")
for file in csv_files:
    print(f"  - {file}")

# Load the battery mapping to check which units have actual data files
mapping_df = pd.read_csv(os.path.join(raw_dir, mapping_name))
print(f"\nOriginal mapping file has {len(mapping_df)} units")

# Unit codes seen in the data files; mapping entries without a data file are
# dropped from the filtered mapping below.
units_with_data_files = set()

# Process and combine all raw CSV files
dataframes = []
for file in csv_files:
    print(f"\nProcessing: {file}")

    # Best effort by design: one unreadable file is reported and skipped so
    # the remaining units are still merged.
    try:
        # Load raw data
        df = pd.read_csv(os.path.join(raw_dir, file))

        # Process the data (add EFA blocks, delivery dates, etc.)
        df = process_raw_bess_data(df)

        # Add source identifier: the filename without its extension
        unit_name = os.path.splitext(file)[0]
        df['source_file'] = unit_name

        # Track which units actually have data files (NaN codes are ignored
        # so the sorted() calls below cannot fail on mixed types)
        if 'auctionUnit' in df.columns:
            units_with_data_files.update(df['auctionUnit'].dropna().unique())

        dataframes.append(df)
        print(f"  ✓ Processed {len(df):,} records")

    except Exception as e:
        print(f"  ✗ Error processing {file}: {e}")

# Create a filtered mapping file that only includes units with actual data
units_with_data_files = sorted(units_with_data_files)
filtered_mapping = mapping_df[mapping_df['Unit (NESO)'].isin(units_with_data_files)]
excluded_units = sorted(
    set(mapping_df['Unit (NESO)'].dropna()) - set(units_with_data_files)
)

print(f"\nFiltered mapping:")
print(f"  Units with data files: {units_with_data_files}")
print(f"  Units excluded (no data files): {excluded_units}")
print(f"  Filtered mapping has {len(filtered_mapping)} units (down from {len(mapping_df)})")

# Save the filtered mapping file next to the original mapping
filtered_mapping.to_csv(os.path.join(raw_dir, filtered_mapping_name), index=False)
print(f"  ✓ Saved filtered mapping: {filtered_mapping_name}")

# Combine all dataframes
if dataframes:
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Save the merged dataset into processed_data
    os.makedirs(processed_dir, exist_ok=True)
    merged_df.to_csv(os.path.join(processed_dir, merged_name), index=False)

    print(f"\n{'='*50}")
    print(f"✓ Merged dataset saved: {merged_name}")
    print(f"✓ Total records: {len(merged_df):,}")
    print(f"✓ Total units with data: {merged_df['auctionUnit'].nunique()}")
    print(f"✓ Total companies: {merged_df['source_file'].str.split(' - ').str[0].nunique()}")
    print(f"✓ Date range: {merged_df['deliveryStart'].min()} to {merged_df['deliveryStart'].max()}")
    print(f"✓ Columns: {list(merged_df.columns)}")
    # Report the exclusions actually computed above rather than a fixed list
    print(f"✓ Units excluded from original mapping (no data files): {', '.join(map(str, excluded_units))}")

    # Show sample of units: one pass over the groups instead of re-filtering
    # the merged frame twice per unit
    print(f"\nUnits included:")
    for unit, unit_rows in merged_df.groupby('source_file', sort=True):
        unit_code = unit_rows['auctionUnit'].iloc[0]
        print(f"  - {unit} ({unit_code}): {len(unit_rows):,} records")

else:
    print("❌ No files were successfully processed!")

Found 17 raw BESS unit files:
  - Centrica - Roosecote.csv
  - Conrad - Blackpool.csv
  - Conrad - Midsomer.csv
  - Conrad - Torquay.csv
  - Conrad - Winchester.csv
  - EDF - Bustleholm.csv
  - EDF - Coventry.csv
  - EDF - Cowley.csv
  - EDF - Kemsley.csv
  - EDF - Tye Lane.csv
  - GS - Breach Farm.csv
  - GS - Cenin.csv
  - GS - Hulley Road.csv
  - GS - Larport.csv
  - GS - Lascar.csv
  - GS - Port of Tilbury.csv
  - Scottish - Whitelee.csv

Processing: Conrad - Midsomer.csv
  ✓ Processed 376 records

Processing: GS - Lascar.csv
  ✓ Processed 1,835 records

Processing: Scottish - Whitelee.csv
  ✓ Processed 5,506 records

Processing: Conrad - Winchester.csv
  ✓ Processed 1,136 records

Processing: GS - Port of Tilbury.csv
  ✓ Processed 2,624 records

Processing: GS - Breach Farm.csv
  ✓ Processed 5,506 records

Processing: Conrad - Winchester.csv
  ✓ Processed 1,136 records

Processing: GS - Port of Tilbury.csv
  ✓ Processed 2,624 records

Processing: GS - Breach Farm.csv
  ✓ Processed

In [None]:
# Clean up any existing individual updated files
import os
import glob

# Remove the per-unit '*_updated.csv' files from processed_data, keeping only
# the merged dataset. The folder is addressed by path instead of os.chdir(),
# so an unexpected error from os.remove() (e.g. a permission problem) cannot
# leave the kernel in the wrong working directory.
original_dir = os.getcwd()  # retained so the name stays defined as before
processed_dir = os.path.join('..', 'processed_data')

updated_files = [
    os.path.basename(path)
    for path in glob.glob(os.path.join(processed_dir, '*_updated.csv'))
    if os.path.basename(path) != 'merged_17_units_updated.csv'
]

print(f"Cleaning up {len(updated_files)} individual updated files...")
for file in updated_files:
    try:
        os.remove(os.path.join(processed_dir, file))
        print(f"  ✓ Removed: {file}")
    except FileNotFoundError:
        # The file disappeared between listing and removal; nothing to do
        print(f"  - File not found: {file}")

print("Cleanup complete! Only merged_17_units_updated.csv remains in processed_data.")

Cleaning up 17 individual updated files...
  ✓ Removed: GS - Breach Farm_updated.csv
  ✓ Removed: EDF - Coventry_updated.csv
  ✓ Removed: GS - Hulley Road_updated.csv
  ✓ Removed: EDF - Tye Lane_updated.csv
  ✓ Removed: Conrad - Blackpool_updated.csv
  ✓ Removed: EDF - Kemsley_updated.csv
  ✓ Removed: EDF - Bustleholm_updated.csv
  ✓ Removed: Centrica - Roosecote_updated.csv
  ✓ Removed: EDF - Cowley_updated.csv
  ✓ Removed: Scottish - Whitelee_updated.csv
  ✓ Removed: GS - Larport_updated.csv
  ✓ Removed: Conrad - Midsomer_updated.csv
  ✓ Removed: Conrad - Torquay_updated.csv
  ✓ Removed: GS - Cenin_updated.csv
  ✓ Removed: Conrad - Winchester_updated.csv
  ✓ Removed: GS - Port of Tilbury_updated.csv
  ✓ Removed: GS - Lascar_updated.csv
Cleanup complete! Only merged_17_units_updated.csv remains in processed_data.
