In [None]:
# Import required libraries
import pandas as pd
from datetime import datetime, timedelta
import os

# Banner describing what this notebook does, emitted as one multi-line print
print(
    "2025 DC Price Cleaning Notebook",
    "===============================",
    "This notebook processes raw DC price data and separates it into DCH and DCL products",
    "with proper EFA block mapping (1-6) and Date, EFA, Clearing Price format.",
    sep="\n",
)

2025 DC Price Cleaning Notebook
This notebook processes raw DC price data and separates it into DCH and DCL products
with proper EFA block mapping (1-6) and Date, EFA, Clearing Price format.


In [3]:
# Read the raw DC price CSV and print a quick overview of its contents
raw_data_path = "Results for EDA/Raw DC Price.csv"
df_raw = pd.read_csv(raw_data_path)

# Overall size and column names
print("Raw data shape:", df_raw.shape)
print("\nColumns:", df_raw.columns.tolist())

# Leading rows, to eyeball the layout
print("\nFirst few rows:", df_raw.head(15), sep="\n")

# Distinct values of the Product column
print("\nUnique products:", df_raw['Product'].unique(), sep="\n")

# First 20 distinct Delivery Start values in sorted order
print(
    "\nDelivery time examples:",
    "Delivery Start unique times (first 20):",
    sorted(df_raw['Delivery Start'].unique())[:20],
    sep="\n",
)

Raw data shape: (2172, 5)

Columns: ['Auction ID', 'Product', 'Delivery Start', 'Delivery End', 'Clearing Price']

First few rows:
    Auction ID Product       Delivery Start         Delivery End  \
0          906     DCL  2024-12-31T23:00:00  2025-01-01T03:00:00   
1          906     DCH  2024-12-31T23:00:00  2025-01-01T03:00:00   
2          906     DCH  2025-01-01T03:00:00  2025-01-01T07:00:00   
3          906     DCH  2025-01-01T07:00:00  2025-01-01T11:00:00   
4          906     DCH  2025-01-01T11:00:00  2025-01-01T15:00:00   
5          906     DCH  2025-01-01T15:00:00  2025-01-01T19:00:00   
6          906     DCH  2025-01-01T19:00:00  2025-01-01T23:00:00   
7          906     DCL  2025-01-01T03:00:00  2025-01-01T07:00:00   
8          906     DCL  2025-01-01T07:00:00  2025-01-01T11:00:00   
9          906     DCL  2025-01-01T11:00:00  2025-01-01T15:00:00   
10         906     DCL  2025-01-01T15:00:00  2025-01-01T19:00:00   
11         906     DCL  2025-01-01T19:00:00  2025-01-

In [4]:
# Function to map delivery start time to EFA block
def get_efa_block(delivery_start_str):
    """
    Map a delivery start timestamp to its EFA block (1-6).

    Nominal block start times, and the one-hour-earlier variants that are
    also accepted:
    1: 23:00 (or 22:00) - starts on the previous calendar day
    2: 03:00 (or 02:00)
    3: 07:00 (or 06:00)
    4: 11:00 (or 10:00)
    5: 15:00 (or 14:00)
    6: 19:00 (or 18:00)

    NOTE(review): the one-hour-earlier starts presumably come from UTC
    timestamps during British Summer Time - confirm against the data source.

    Parameters:
        delivery_start_str: timestamp text of the form 'YYYY-MM-DDTHH:MM...'.

    Returns:
        The EFA block number (int), or None when the value is not a string,
        has no 'T'-separated time part, or its HH:MM is not a known block
        start. Returning None (instead of raising) lets callers report such
        rows as "missing EFA mapping" rather than aborting the whole run.
    """
    # Missing cells read by pandas arrive as float NaN / None, not str
    if not isinstance(delivery_start_str, str):
        return None

    # Split off the time component; a date-only value has no 'T' at all
    _, separator, time_part = delivery_start_str.partition('T')
    if not separator:
        return None
    time_str = time_part[:5]  # Get HH:MM

    # Map time to EFA block
    time_to_efa = {
        '23:00': 1,  # 11pm
        '22:00': 1,  # 10pm (edge case)
        '03:00': 2,  # 3am
        '02:00': 2,  # 2am (edge case)
        '07:00': 3,  # 7am
        '06:00': 3,  # 6am (edge case)
        '11:00': 4,  # 11am
        '10:00': 4,  # 10am (edge case)
        '15:00': 5,  # 3pm
        '14:00': 5,  # 2pm (edge case)
        '19:00': 6,  # 7pm
        '18:00': 6,  # 6pm (edge case)
    }

    return time_to_efa.get(time_str)

# Function to get the delivery date (accounting for EFA block 1 starting previous day)
def get_delivery_date(delivery_start_str, efa_block):
    """
    Return the delivery date as a 'YYYY-MM-DD' string.

    The calendar date is the part of ``delivery_start_str`` before the 'T'.
    When ``efa_block`` equals 1 the result is moved forward by one day,
    because that block starts late in the evening of the day before the
    one it is delivered on. Any other ``efa_block`` value leaves the date
    unchanged.
    """
    calendar_day = datetime.strptime(delivery_start_str.split('T')[0], '%Y-%m-%d')

    # Only block 1 rolls over to the following day
    day_shift = timedelta(days=1) if efa_block == 1 else timedelta(0)

    return (calendar_day + day_shift).strftime('%Y-%m-%d')

# Confirmation message so running this cell produces visible output once
# get_efa_block and get_delivery_date have been (re)defined
print("EFA mapping functions defined successfully!")

EFA mapping functions defined successfully!


In [5]:
# Smoke-test the EFA mapping functions on one start time per block, plus the
# 22:00 variant of block 1, and fail loudly if any result is unexpected
print("Testing EFA mapping:")
test_times = ['2025-01-01T23:00:00', '2025-01-01T22:00:00', '2025-01-01T03:00:00', 
              '2025-01-01T07:00:00', '2025-01-01T11:00:00', '2025-01-01T15:00:00', '2025-01-01T19:00:00']

# Expected (EFA block, delivery date) for each entry of test_times, in order
expected_results = [
    (1, '2025-01-02'),
    (1, '2025-01-02'),
    (2, '2025-01-01'),
    (3, '2025-01-01'),
    (4, '2025-01-01'),
    (5, '2025-01-01'),
    (6, '2025-01-01'),
]

for time_str, expected in zip(test_times, expected_results):
    efa = get_efa_block(time_str)
    date = get_delivery_date(time_str, efa)
    print(f"{time_str} -> EFA {efa}, Delivery Date: {date}")
    # Printing alone lets a broken mapping go unnoticed; make it an error
    assert (efa, date) == expected, (
        f"{time_str}: expected (EFA, date) {expected}, got {(efa, date)}"
    )

Testing EFA mapping:
2025-01-01T23:00:00 -> EFA 1, Delivery Date: 2025-01-02
2025-01-01T22:00:00 -> EFA 1, Delivery Date: 2025-01-02
2025-01-01T03:00:00 -> EFA 2, Delivery Date: 2025-01-01
2025-01-01T07:00:00 -> EFA 3, Delivery Date: 2025-01-01
2025-01-01T11:00:00 -> EFA 4, Delivery Date: 2025-01-01
2025-01-01T15:00:00 -> EFA 5, Delivery Date: 2025-01-01
2025-01-01T19:00:00 -> EFA 6, Delivery Date: 2025-01-01


In [6]:
# Process the raw data to create cleaned format
def clean_dc_data(df_raw):
    """
    Clean the raw DC data and separate into DCH and DCL products.

    Parameters:
        df_raw: pandas DataFrame containing at least the columns
            'Delivery Start', 'Product' and 'Clearing Price'. It is copied,
            not modified. An empty frame is accepted.

    Returns:
        (df_dch, df_dcl): two DataFrames with columns
        ['Date', 'EFA', 'Clearing Price'], holding the rows whose Product is
        'DCH' and 'DCL' respectively, sorted by Date then EFA with a fresh
        0..n-1 index. Rows with any other Product value are dropped.

    Rows whose start time has no EFA mapping are kept (EFA is missing) and
    a warning showing up to 10 of them is printed.
    """
    # Create copy of data
    df_clean = df_raw.copy()

    # Add EFA block
    df_clean['EFA'] = df_clean['Delivery Start'].apply(get_efa_block)

    # Add delivery date. Iterate over just the two columns needed: a
    # row-wise DataFrame.apply(axis=1) returns a DataFrame rather than a
    # Series when the frame is empty, which makes this assignment raise.
    df_clean['Date'] = [
        get_delivery_date(start, efa)
        for start, efa in zip(df_clean['Delivery Start'], df_clean['EFA'])
    ]

    # Select only needed columns
    df_clean = df_clean[['Date', 'EFA', 'Product', 'Clearing Price']]

    # Check for any missing EFA mappings
    missing_efa = df_clean[df_clean['EFA'].isna()]
    if not missing_efa.empty:
        print("Warning: Found rows with missing EFA mapping:")
        print(missing_efa.head(10))

    def _extract_product(product):
        # Rows of one product, reduced to the output columns and ordered
        # by date and EFA block
        selected = df_clean.loc[
            df_clean['Product'] == product, ['Date', 'EFA', 'Clearing Price']
        ].copy()
        return selected.sort_values(['Date', 'EFA']).reset_index(drop=True)

    # Separate DCH and DCL
    return _extract_product('DCH'), _extract_product('DCL')

# Run the cleaning step on the raw frame (df_raw); the result is one frame
# per product, kept under the names the later cells read
print("Processing raw data...")
df_dch_clean, df_dcl_clean = clean_dc_data(df_raw)

# Report the (rows, columns) shape of each product's cleaned frame
print("DCH cleaned data shape:", df_dch_clean.shape)
print("DCL cleaned data shape:", df_dcl_clean.shape)
print("Data processing completed!")

Processing raw data...
DCH cleaned data shape: (1086, 3)
DCL cleaned data shape: (1086, 3)
Data processing completed!


In [7]:
# Show the first 12 rows of each cleaned frame, heading and table in one print
print("DCH sample data:", df_dch_clean.head(12), sep="\n")

print("\nDCL sample data:", df_dcl_clean.head(12), sep="\n")

DCH sample data:
          Date  EFA  Clearing Price
0   2025-01-01    1            3.00
1   2025-01-01    2            4.50
2   2025-01-01    3            2.50
3   2025-01-01    4            4.60
4   2025-01-01    5            1.75
5   2025-01-01    6            2.00
6   2025-01-02    1            4.00
7   2025-01-02    2            5.50
8   2025-01-02    3            3.48
9   2025-01-02    4            2.00
10  2025-01-02    5            0.91
11  2025-01-02    6            2.48

DCL sample data:
          Date  EFA  Clearing Price
0   2025-01-01    1            2.50
1   2025-01-01    2            3.96
2   2025-01-01    3            2.25
3   2025-01-01    4            2.14
4   2025-01-01    5            6.25
5   2025-01-01    6            3.34
6   2025-01-02    1            2.00
7   2025-01-02    2            4.69
8   2025-01-02    3            3.61
9   2025-01-02    4            2.00
10  2025-01-02    5            5.30
11  2025-01-02    6            2.50


In [8]:
# Write the cleaned DCL and DCH frames to their per-product CSV files
print("Saving cleaned data to CSV files...")

# Destination folders, one per product; created if they don't exist yet
dcl_output_dir = "../Cleaned market prices/DC/DCL/"
dch_output_dir = "../Cleaned market prices/DC/DCH/"
os.makedirs(dcl_output_dir, exist_ok=True)
os.makedirs(dch_output_dir, exist_ok=True)

# DCL: build the target path, then write without the index column
dcl_output_path = os.path.join(dcl_output_dir, "2025 DCL Price.csv")
df_dcl_clean.to_csv(dcl_output_path, index=False)

# DCH: same steps
dch_output_path = os.path.join(dch_output_dir, "2025 DCH Price.csv")
df_dch_clean.to_csv(dch_output_path, index=False)

print(f"DCL data saved to: {dcl_output_path}")
print(f"DCH data saved to: {dch_output_path}")

# Row counts of the frames that were just written
print("\nFinal file sizes:")
print(f"DCL file: {len(df_dcl_clean)} rows")
print(f"DCH file: {len(df_dch_clean)} rows")

Saving cleaned data to CSV files...
DCL data saved to: ../Cleaned market prices/DC/DCL/2025 DCL Price.csv
DCH data saved to: ../Cleaned market prices/DC/DCH/2025 DCH Price.csv

Final file sizes:
DCL file: 1086 rows
DCH file: 1086 rows
