In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from pathlib import Path

# Notebook-wide pandas display settings: no cap on the number of columns shown,
# no fixed display width, and cell text limited to 50 characters.
display_options = {
    'display.max_columns': None,
    'display.width': None,
    'display.max_colwidth': 50,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Record the library versions this run used: pandas via print, numpy as the
# cell's last expression.
print(f"Pd --v: {pd.__version__}")
np.__version__

Pd --v: 2.3.1


'2.3.2'

In [6]:
# The project root is taken to be the parent of the current working directory;
# raw inputs and processed outputs live under <root>/data.
project_root = Path.cwd().parent
data_raw_path = project_root.joinpath("data", "raw")
data_processed_path = project_root.joinpath("data", "processed")

# Create the output folder (and any missing parents); no error if it exists.
data_processed_path.mkdir(parents=True, exist_ok=True)

epex_file = data_raw_path.joinpath("Day-ahead_prices_202507070000_202508040000_Hour_Quarterhour.csv")
hupx_file = data_raw_path.joinpath("Labs_DAM_Aggregated_Trading_Data_20250802_190631.csv")

print("File paths:")
for path_label, path_value in (
    ("EPEX (DE) data", epex_file),
    ("HUPX (HU) data", hupx_file),
    ("Processed data output", data_processed_path),
):
    print(f"{path_label}: {path_value}")

# Report whether each input file is present before any attempt to read it.
print("\nFile existence check:")
for source_label, source_path in (("EPEX", epex_file), ("HUPX", hupx_file)):
    print(f"{source_label} file exists: {source_path.exists()}")

File paths:
EPEX (DE) data: c:\Users\micha\code\power-market-analysis-de-hu\data\raw\Day-ahead_prices_202507070000_202508040000_Hour_Quarterhour.csv
HUPX (HU) data: c:\Users\micha\code\power-market-analysis-de-hu\data\raw\Labs_DAM_Aggregated_Trading_Data_20250802_190631.csv
Processed data output: c:\Users\micha\code\power-market-analysis-de-hu\data\processed

File existence check:
EPEX file exists: True
HUPX file exists: True


In [7]:
print("=" * 60)

# The EPEX export is read as a semicolon-delimited file.
epex_raw = pd.read_csv(epex_file, sep=';')

epex_column_names = epex_raw.columns
print(f"EPEX data shape: {epex_raw.shape}")
print(f"Columns ({len(epex_column_names)}):")
# Numbered listing, 1-based, with the position right-aligned to two characters.
for position, column_name in enumerate(epex_column_names, start=1):
    print(f"  {position:2d}. {column_name}")

print("\nFirst few rows:")
print(epex_raw.head(3))

print("\nData types:")
print(epex_raw.dtypes)

EPEX data shape: (2688, 19)
Columns (19):
   1. Start date
   2. End date
   3. Germany/Luxembourg [€/MWh] Original resolutions
   4. ∅ DE/LU neighbours [€/MWh] Original resolutions
   5. Belgium [€/MWh] Original resolutions
   6. Denmark 1 [€/MWh] Original resolutions
   7. Denmark 2 [€/MWh] Original resolutions
   8. France [€/MWh] Original resolutions
   9. Netherlands [€/MWh] Original resolutions
  10. Norway 2 [€/MWh] Original resolutions
  11. Austria [€/MWh] Original resolutions
  12. Poland [€/MWh] Original resolutions
  13. Sweden 4 [€/MWh] Original resolutions
  14. Switzerland [€/MWh] Original resolutions
  15. Czech Republic [€/MWh] Original resolutions
  16. DE/AT/LU [€/MWh] Original resolutions
  17. Northern Italy [€/MWh] Original resolutions
  18. Slovenia [€/MWh] Original resolutions
  19. Hungary [€/MWh] Original resolutions

First few rows:
             Start date              End date  \
0  Jul 7, 2025 12:00 AM  Jul 7, 2025 12:15 AM   
1  Jul 7, 2025 12:15 AM  Jul 7

In [None]:
# The HUPX export is read with pandas' default (comma) delimiter.
hupx_raw = pd.read_csv(hupx_file)

hupx_column_names = hupx_raw.columns
print(f"HU dataa shape: {hupx_raw.shape}")
print(f"Cols ({len(hupx_column_names)}):")
# Numbered listing of the columns, 1-based.
for position, column_name in enumerate(hupx_column_names, start=1):
    print(f"  {position}. {column_name}")

print("\nFirst few rows:")
print(hupx_raw.head(3))

print("\nData types:")
print(hupx_raw.dtypes)

# Frequency of each 'Status' value, to see which record states are present.
status_counts = hupx_raw['Status'].value_counts()
print("\nUnique values in 'Status' column:")
print(status_counts)

HU dataa shape: (504, 6)
Cols (6):
  1. Delivery day
  2. Hour
  3. Price
  4. Traded volume
  5. Baseload price
  6. Status

First few rows:
           Delivery day  Hour   Price  Traded volume  Baseload price Status
0  2025-07-27T00:00:00Z    24  121.78         2925.7           92.04  final
1  2025-07-27T00:00:00Z    23  163.93         3235.0           92.04  final
2  2025-07-27T00:00:00Z    22  180.00         3562.3           92.04  final

Data types:
Delivery day       object
Hour                int64
Price             float64
Traded volume     float64
Baseload price    float64
Status             object
dtype: object

Unique values in 'Status' column:
Status
final    504
Name: count, dtype: int64


In [None]:
print("=" * 60)
print("CLEANING EPEX DATA")
print("=" * 60)

# Keep only the interval start and the DE/LU and HU price columns, renamed by
# source column name so the mapping is explicit.
EPEX_COLUMN_MAP = {
    'Start date': 'timestamp_start',
    'Germany/Luxembourg [€/MWh] Original resolutions': 'de_price_epex',
    'Hungary [€/MWh] Original resolutions': 'hu_price_epex',
}
epex_clean = epex_raw[list(EPEX_COLUMN_MAP)].rename(columns=EPEX_COLUMN_MAP).copy()

# Timestamps in this export look like "Jul 7, 2025 12:00 AM". Passing the format
# explicitly avoids pandas guessing the layout element by element (which emits a
# warning on this column) and makes an unexpected layout raise instead of being
# parsed inconsistently.
EPEX_TIMESTAMP_FORMAT = '%b %d, %Y %I:%M %p'
epex_clean['timestamp_start'] = pd.to_datetime(
    epex_clean['timestamp_start'], format=EPEX_TIMESTAMP_FORMAT
)

# Non-numeric price entries are coerced to NaN rather than raising.
for price_column in ('de_price_epex', 'hu_price_epex'):
    epex_clean[price_column] = pd.to_numeric(epex_clean[price_column], errors='coerce')

# Drop rows where BOTH prices are missing; a row with at least one price is kept.
# The original row index is intentionally left as-is (not reset).
epex_clean = epex_clean.dropna(subset=['de_price_epex', 'hu_price_epex'], how='all')

print(f"EPEX cleaned data shape: {epex_clean.shape}")
print(f"Date range: {epex_clean['timestamp_start'].min()} to {epex_clean['timestamp_start'].max()}")
print("Missing values:")
print(epex_clean.isnull().sum())
print("\nFirst few rows:")
print(epex_clean.head())

CLEANING EPEX DATA
EPEX cleaned data shape: (672, 3)
Date range: 2025-07-07 00:00:00 to 2025-08-03 23:00:00
Missing values:
timestamp_start    0
de_price_epex      0
hu_price_epex      0
dtype: int64

First few rows:
       timestamp_start  de_price_epex  hu_price_epex
0  2025-07-07 00:00:00         118.84         117.99
4  2025-07-07 01:00:00         107.42         105.64
8  2025-07-07 02:00:00         101.92         100.98
12 2025-07-07 03:00:00          99.12          97.67
16 2025-07-07 04:00:00         101.00          99.34


  epex_clean['timestamp_start'] = pd.to_datetime(epex_clean['timestamp_start'])


In [None]:
print("CHANGE EPEX DATA TO HOURLY RESOLUTION")

# Bucket every record into the hour it starts in. The lowercase 'h' alias is
# used because the uppercase 'H' spelling is deprecated in current pandas and
# triggers a FutureWarning.
epex_clean['hour'] = epex_clean['timestamp_start'].dt.floor('h')

# One row per hour: average each price over the records that fall in that hour,
# then expose the bucket as the 'timestamp' column.
epex_hourly = (
    epex_clean
    .groupby('hour')
    .agg({'de_price_epex': 'mean', 'hu_price_epex': 'mean'})
    .reset_index()
    .rename(columns={'hour': 'timestamp'})
)

print(f"EPEX hourly data shape: {epex_hourly.shape}")
print(f"Date range: {epex_hourly['timestamp'].min()} to {epex_hourly['timestamp'].max()}")
print("Missing values:")
print(epex_hourly.isnull().sum())
print("\nFirst few rows:")
print(epex_hourly.head())

# Continuity check: consecutive timestamps should be exactly one hour apart.
# The first diff is NaT and is dropped before comparing.
time_diff = epex_hourly['timestamp'].diff().dropna()
print("\nTime gaps check:")
print(f"Standard interval: {time_diff.mode().iloc[0]}")
print(f"Any non-standard intervals: {(time_diff != pd.Timedelta('1h')).any()}")

CHANGE EPEX DATA TO HOURLY RESOLUTION
EPEX hourly data shape: (672, 3)
Date range: 2025-07-07 00:00:00 to 2025-08-03 23:00:00
Missing values:
timestamp        0
de_price_epex    0
hu_price_epex    0
dtype: int64

First few rows:
            timestamp  de_price_epex  hu_price_epex
0 2025-07-07 00:00:00         118.84         117.99
1 2025-07-07 01:00:00         107.42         105.64
2 2025-07-07 02:00:00         101.92         100.98
3 2025-07-07 03:00:00          99.12          97.67
4 2025-07-07 04:00:00         101.00          99.34

Time gaps check:
Standard interval: 0 days 01:00:00
Any non-standard intervals: False


  epex_clean['hour'] = epex_clean['timestamp_start'].dt.floor('H')
  print(f"Any non-standard intervals: {(time_diff != pd.Timedelta('1H')).any()}")


In [None]:
print("=" * 60)
print("CLEANING HUPX DATA")
print("=" * 60)

# Remove the columns not needed downstream and rename the rest BY NAME.
# Renaming by name (rather than assigning a positional list of labels) keeps the
# labels correct even if the column order in the source file changes.
columns_to_drop = ['Status', 'Baseload price', 'Traded volume']
hupx_clean = (
    hupx_raw
    .drop(columns=columns_to_drop)
    .rename(columns={
        'Delivery day': 'delivery_day',
        'Hour': 'hour',
        'Price': 'hu_price_hupx',
    })
)

# Delivery-day strings carry a trailing 'Z', so they parse as UTC-aware datetimes.
# NOTE(review): whether the delivery day is really UTC or local market time
# cannot be determined from this notebook alone — confirm against the data source.
hupx_clean['delivery_day'] = pd.to_datetime(hupx_clean['delivery_day'])

# The preview of the raw file shows 'Hour' values up to 24, i.e. 1-based hours;
# subtract 1 so hour 1 maps to an offset of 0 h from the delivery day.
hupx_clean['hour_0based'] = hupx_clean['hour'] - 1
hupx_clean['timestamp'] = hupx_clean['delivery_day'] + pd.to_timedelta(hupx_clean['hour_0based'], unit='h')

# Keep only the timestamp and price, in chronological order with a fresh index.
hupx_clean = (
    hupx_clean[['timestamp', 'hu_price_hupx']]
    .copy()
    .sort_values('timestamp')
    .reset_index(drop=True)
)

print(f"HUPX cleaned data shape: {hupx_clean.shape}")
print(f"Date range: {hupx_clean['timestamp'].min()} to {hupx_clean['timestamp'].max()}")
print("Missing values:")
print(hupx_clean.isnull().sum())
print("\nFirst few rows:")
print(hupx_clean.head())

# Continuity check: consecutive timestamps should be exactly one hour apart.
# Lowercase '1h' is used because the uppercase '1H' spelling is deprecated in
# current pandas and triggers a FutureWarning.
time_diff = hupx_clean['timestamp'].diff().dropna()
print("\nTime gaps check:")
print(f"Standard interval: {time_diff.mode().iloc[0]}")
print(f"Any non-standard intervals: {(time_diff != pd.Timedelta('1h')).any()}")

CLEANING HUPX DATA
HUPX cleaned data shape: (504, 2)
Date range: 2025-07-07 00:00:00+00:00 to 2025-07-27 23:00:00+00:00
Missing values:
timestamp        0
hu_price_hupx    0
dtype: int64

First few rows:
                  timestamp  hu_price_hupx
0 2025-07-07 00:00:00+00:00         117.99
1 2025-07-07 01:00:00+00:00         105.64
2 2025-07-07 02:00:00+00:00         100.98
3 2025-07-07 03:00:00+00:00          97.67
4 2025-07-07 04:00:00+00:00          99.34

Time gaps check:
Standard interval: 0 days 01:00:00
Any non-standard intervals: False


  print(f"Any non-standard intervals: {(time_diff != pd.Timedelta('1H')).any()}")


In [None]:

print("\nStandardizing timestamps to timezone-naive...")

# Strip any timezone information from both join keys so they share one dtype
# and can be matched on equality. The frames are updated in place.
for price_frame in (epex_hourly, hupx_clean):
    price_frame['timestamp'] = pd.to_datetime(price_frame['timestamp']).dt.tz_localize(None)

print("After standardization:")
print(f"EPEX timestamp type: {epex_hourly['timestamp'].dtype}")
print(f"HUPX timestamp type: {hupx_clean['timestamp'].dtype}")

# Outer join keeps every hour present in either source; hours missing from one
# side get NaN in that side's price column. Result is ordered chronologically.
merged_data = (
    epex_hourly
    .merge(hupx_clean, on='timestamp', how='outer')
    .sort_values('timestamp')
    .reset_index(drop=True)
)

print(f"Merged data shape: {merged_data.shape}")
print(f"Date range: {merged_data['timestamp'].min()} to {merged_data['timestamp'].max()}")
print("Missing values:")
print(merged_data.isnull().sum())

# Count of non-missing hours for each price series.
print("\nData coverage analysis:")
for series_label, price_column in (
    ("EPEX DE", 'de_price_epex'),
    ("EPEX HU", 'hu_price_epex'),
    ("HUPX HU", 'hu_price_hupx'),
):
    print(f"{series_label} prices available: {merged_data[price_column].notna().sum()} hours")
print(f"Total time periods: {len(merged_data)} hours")

print("\nFirst few rows:")
print(merged_data.head(10))

print("\nLast few rows:")
print(merged_data.tail(10))

MERGING DATASETS
EPEX timestamp info:
  Type: datetime64[ns]
  Timezone: None
  Sample: 2025-07-07 00:00:00

HUPX timestamp info:
  Type: datetime64[ns, UTC]
  Timezone: UTC
  Sample: 2025-07-07 00:00:00+00:00

Standardizing timestamps to timezone-naive...
After standardization:
EPEX timestamp type: datetime64[ns]
HUPX timestamp type: datetime64[ns]
Merged data shape: (672, 4)
Date range: 2025-07-07 00:00:00 to 2025-08-03 23:00:00
Missing values:
timestamp          0
de_price_epex      0
hu_price_epex      0
hu_price_hupx    168
dtype: int64

Data coverage analysis:
EPEX DE prices available: 672 hours
EPEX HU prices available: 672 hours
HUPX HU prices available: 504 hours
Total time periods: 672 hours

First few rows:
            timestamp  de_price_epex  hu_price_epex  hu_price_hupx
0 2025-07-07 00:00:00         118.84         117.99         117.99
1 2025-07-07 01:00:00         107.42         105.64         105.64
2 2025-07-07 02:00:00         101.92         100.98         100.98
3 20

In [None]:
banner = "=" * 60
print(banner)
print("ADDING DERIVED COLUMNS FOR ANALYSIS")
print(banner)

# Calendar attributes derived from the timestamp (column order matters for the
# exported files, so they are added in a fixed sequence).
timestamp_accessor = merged_data['timestamp'].dt
merged_data['date'] = timestamp_accessor.date
merged_data['hour'] = timestamp_accessor.hour
merged_data['weekday'] = timestamp_accessor.day_name()
merged_data['week_number'] = timestamp_accessor.isocalendar().week
merged_data['month'] = timestamp_accessor.month

# Primary HU price: the HUPX value where present, otherwise the EPEX HU value.
merged_data['hu_price_primary'] = merged_data['hu_price_hupx'].fillna(merged_data['hu_price_epex'])

# Spreads: DE minus primary HU, and EPEX-reported HU minus HUPX-reported HU.
merged_data['de_hu_spread'] = merged_data['de_price_epex'] - merged_data['hu_price_primary']
merged_data['epex_hupx_hu_spread'] = merged_data['hu_price_epex'] - merged_data['hu_price_hupx']

# Boolean availability flags, one per price series.
for flag_column, price_column in (
    ('has_de_price', 'de_price_epex'),
    ('has_hu_hupx', 'hu_price_hupx'),
    ('has_hu_epex', 'hu_price_epex'),
):
    merged_data[flag_column] = merged_data[price_column].notna()

print(f"Enhanced data shape: {merged_data.shape}")
print(f"Columns: {list(merged_data.columns)}")

weeks_covered = sorted(merged_data['week_number'].dropna().unique())
print(f"\nWeeks covered: {weeks_covered}")

# Restrict to the ISO weeks under study; copy so later edits to the subset do
# not touch merged_data.
target_weeks = [28, 29, 30]
in_target_weeks = merged_data['week_number'].isin(target_weeks)
target_data = merged_data.loc[in_target_weeks].copy()

print(f"\nTarget weeks (28, 29, 30) data shape: {target_data.shape}")
print(f"Date range for target weeks: {target_data['timestamp'].min()} to {target_data['timestamp'].max()}")

print("\nData availability by week:")
for week in target_weeks:
    week_data = target_data.loc[target_data['week_number'] == week]
    hours_in_week = len(week_data)
    de_hours = week_data['has_de_price'].sum()
    hupx_hours = week_data['has_hu_hupx'].sum()
    print(f"Week {week}: {hours_in_week} hours, DE prices: {de_hours}, HU HUPX: {hupx_hours}")

ADDING DERIVED COLUMNS FOR ANALYSIS
Enhanced data shape: (672, 15)
Columns: ['timestamp', 'de_price_epex', 'hu_price_epex', 'hu_price_hupx', 'date', 'hour', 'weekday', 'week_number', 'month', 'hu_price_primary', 'de_hu_spread', 'epex_hupx_hu_spread', 'has_de_price', 'has_hu_hupx', 'has_hu_epex']

Weeks covered: [np.uint32(28), np.uint32(29), np.uint32(30), np.uint32(31)]

Target weeks (28, 29, 30) data shape: (504, 15)
Date range for target weeks: 2025-07-07 00:00:00 to 2025-07-27 23:00:00

Data availability by week:
Week 28: 168 hours, DE prices: 168, HU HUPX: 168
Week 29: 168 hours, DE prices: 168, HU HUPX: 168
Week 30: 168 hours, DE prices: 168, HU HUPX: 168


In [None]:
print("=" * 60)
print("DATA QUALITY REPORT")
print("=" * 60)

print("SUMMARY STATISTICS FOR WEEKS 28, 29, 30:")
print("=" * 50)

# Descriptive statistics for the price series and the DE-HU spread.
stats_columns = ['de_price_epex', 'hu_price_hupx', 'hu_price_epex',
                 'hu_price_primary', 'de_hu_spread']
summary_stats = target_data[stats_columns].describe()
print(summary_stats)

print("\nMISSING DATA ANALYSIS:")
print("=" * 30)
# Per-column count of missing values, also reported as a share of all target hours.
missing_analysis = target_data[['de_price_epex', 'hu_price_hupx', 'hu_price_epex']].isnull().sum()
total_hours = len(target_data)
print(f"Total hours in target weeks: {total_hours}")
for column_name in missing_analysis.index:
    missing_count = missing_analysis[column_name]
    pct_missing = (missing_count / total_hours) * 100
    print(f"{column_name}: {missing_count} missing ({pct_missing:.1f}%)")

print("\nDATA AVAILABILITY BY WEEK:")
print("=" * 30)
for week in target_weeks:
    week_data = target_data.loc[target_data['week_number'] == week]
    total_week_hours = len(week_data)
    de_available = week_data['de_price_epex'].notna().sum()
    hu_hupx_available = week_data['hu_price_hupx'].notna().sum()
    de_share = de_available/total_week_hours*100
    hu_hupx_share = hu_hupx_available/total_week_hours*100

    print(f"\nWeek {week} ({total_week_hours} hours):")
    print(f"  DE prices: {de_available}/{total_week_hours} ({de_share:.1f}%)")
    print(f"  HU HUPX:   {hu_hupx_available}/{total_week_hours} ({hu_hupx_share:.1f}%)")

print("\nPRICE ANOMALY CHECK:")
print("=" * 25)
# Hours with a price strictly below zero.
neg_de = target_data['de_price_epex'].lt(0).sum()
neg_hu = target_data['hu_price_primary'].lt(0).sum()
print(f"Negative DE prices: {neg_de}")
print(f"Negative HU prices: {neg_hu}")

# Hours with a price strictly above 500.
high_de = target_data['de_price_epex'].gt(500).sum()
high_hu = target_data['hu_price_primary'].gt(500).sum()
print(f"Very high DE prices (>500): {high_de}")
print(f"Very high HU prices (>500): {high_hu}")

DATA QUALITY REPORT
SUMMARY STATISTICS FOR WEEKS 28, 29, 30:
       de_price_epex  hu_price_hupx  hu_price_epex  hu_price_primary  \
count     504.000000     504.000000     504.000000        504.000000   
mean       88.260556     103.049325     103.049325        103.049325   
std        30.603741      47.015873      47.015873         47.015873   
min        -0.010000      -0.010000      -0.010000         -0.010000   
25%        79.650000      85.220000      85.220000         85.220000   
50%        94.025000     100.965000     100.965000        100.965000   
75%       108.167500     116.512500     116.512500        116.512500   
max       154.160000     386.030000     386.030000        386.030000   

       de_hu_spread  
count    504.000000  
mean     -14.788770  
std       31.987004  
min     -276.750000  
25%      -20.912500  
50%       -2.640000  
75%        0.662500  
max       29.050000  

MISSING DATA ANALYSIS:
Total hours in target weeks: 504
de_price_epex: 0 missing (0.0%)
hu_

In [None]:

# 1) Full merged dataset, written without the row index.
full_output_file = data_processed_path.joinpath("merged_spot_prices_full.csv")
merged_data.to_csv(full_output_file, index=False)
print(f"Full dataset exported to: {full_output_file}")

# 2) Target-week subset, same layout.
target_output_file = data_processed_path.joinpath("spot_prices_weeks_28_29_30.csv")
target_data.to_csv(target_output_file, index=False)
print(f"Target weeks dataset exported to: {target_output_file}")

# 3) Daily summary: mean/min/max of the prices and spread per (week, date),
#    plus the number of hours with a DE price and with a HUPX price.
price_stats = ['mean', 'min', 'max']
daily_aggregations = {
    'de_price_epex': price_stats,
    'hu_price_primary': price_stats,
    'de_hu_spread': price_stats,
    'has_de_price': 'sum',
    'has_hu_hupx': 'sum',
}
summary_data = (
    target_data
    .groupby(['week_number', 'date'])
    .agg(daily_aggregations)
    .round(2)
)

# Flatten the two-level column labels into "<column>_<statistic>" names before
# turning the group keys back into ordinary columns.
flattened_names = []
for column_levels in summary_data.columns:
    flattened_names.append('_'.join(column_levels).strip())
summary_data.columns = flattened_names
summary_data = summary_data.reset_index()

summary_output_file = data_processed_path.joinpath("daily_summary_weeks_28_29_30.csv")
summary_data.to_csv(summary_output_file, index=False)
print(f"Daily summary exported to: {summary_output_file}")

print("Files created:")
print(f"  1. {full_output_file.name} - Full merged dataset ({len(merged_data)} hours)")
print(f"  2. {target_output_file.name} - Target weeks only ({len(target_data)} hours)")
print(f"  3. {summary_output_file.name} - Daily summaries ({len(summary_data)} days)")


EXPORTING CLEANED DATA
Full dataset exported to: c:\Users\micha\code\power-market-analysis-de-hu\data\processed\merged_spot_prices_full.csv
Target weeks dataset exported to: c:\Users\micha\code\power-market-analysis-de-hu\data\processed\spot_prices_weeks_28_29_30.csv
Daily summary exported to: c:\Users\micha\code\power-market-analysis-de-hu\data\processed\daily_summary_weeks_28_29_30.csv

Data processing completed successfully!
Files created:
  1. merged_spot_prices_full.csv - Full merged dataset (672 hours)
  2. spot_prices_weeks_28_29_30.csv - Target weeks only (504 hours)
  3. daily_summary_weeks_28_29_30.csv - Daily summaries (21 days)

FINAL PREVIEW - WEEKS 28, 29, 30:
            timestamp  week_number  de_price_epex  hu_price_hupx  \
0 2025-07-07 00:00:00           28         118.84         117.99   
1 2025-07-07 01:00:00           28         107.42         105.64   
2 2025-07-07 02:00:00           28         101.92         100.98   
3 2025-07-07 03:00:00           28          9