In [83]:
import pandas as pd

In [84]:
file_path = '../data/raw/energy_dataset.csv'
df = pd.read_csv(file_path)

print("Original Dataset Preview:")
print(df.head())
orig_rows, orig_cols = df.shape

Original Dataset Preview:
                        time  generation biomass  \
0  2015-01-01 00:00:00+01:00               447.0   
1  2015-01-01 01:00:00+01:00               449.0   
2  2015-01-01 02:00:00+01:00               448.0   
3  2015-01-01 03:00:00+01:00               438.0   
4  2015-01-01 04:00:00+01:00               428.0   

   generation fossil brown coal/lignite  generation fossil coal-derived gas  \
0                                 329.0                                 0.0   
1                                 328.0                                 0.0   
2                                 323.0                                 0.0   
3                                 254.0                                 0.0   
4                                 187.0                                 0.0   

   generation fossil gas  generation fossil hard coal  generation fossil oil  \
0                 4844.0                       4821.0                  162.0   
1                 5196.0  

In [85]:
missing_counts = df.isnull().sum()
print("\nMissing Values per Column:")
print(missing_counts)


Missing Values per Column:
time                                               0
generation biomass                                19
generation fossil brown coal/lignite              18
generation fossil coal-derived gas                18
generation fossil gas                             18
generation fossil hard coal                       18
generation fossil oil                             19
generation fossil oil shale                       18
generation fossil peat                            18
generation geothermal                             18
generation hydro pumped storage aggregated     35064
generation hydro pumped storage consumption       19
generation hydro run-of-river and poundage        19
generation hydro water reservoir                  18
generation marine                                 19
generation nuclear                                17
generation other                                  18
generation other renewable                        18
generation solar  

In [None]:
threshold = int(0.8 * len(df.columns))
df_cleaned = df.dropna(thresh=threshold)
df_cleaned = df_cleaned.dropna(axis=1, how='all')               # drop fully NaN columns
df_cleaned = df_cleaned.loc[:, (df_cleaned != 0).any(axis=0)]   # drop all-zero columns

print("Cleaned shape:", df_cleaned.shape)
print("\nRemaining columns:")
print(df_cleaned.columns.tolist())

Cleaned shape: (35064, 27)

Remaining columns:
['time', 'generation biomass', 'generation fossil brown coal/lignite', 'generation fossil coal-derived gas', 'generation fossil gas', 'generation fossil hard coal', 'generation fossil oil', 'generation fossil oil shale', 'generation fossil peat', 'generation geothermal', 'generation hydro pumped storage consumption', 'generation hydro run-of-river and poundage', 'generation hydro water reservoir', 'generation marine', 'generation nuclear', 'generation other', 'generation other renewable', 'generation solar', 'generation waste', 'generation wind offshore', 'generation wind onshore', 'forecast solar day ahead', 'forecast wind onshore day ahead', 'total load forecast', 'total load actual', 'price day ahead', 'price actual']


In [88]:
medians = df.median(numeric_only=True)

print("\nMedian values per column:")
for col, val in medians.items():
    print(f"{col}: {val}")


Median values per column:
generation biomass: 367.0
generation fossil brown coal/lignite: 509.0
generation fossil coal-derived gas: 0.0
generation fossil gas: 4969.0
generation fossil hard coal: 4474.0
generation fossil oil: 300.0
generation fossil oil shale: 0.0
generation fossil peat: 0.0
generation geothermal: 0.0
generation hydro pumped storage aggregated: nan
generation hydro pumped storage consumption: 68.0
generation hydro run-of-river and poundage: 906.0
generation hydro water reservoir: 2164.0
generation marine: 0.0
generation nuclear: 6566.0
generation other: 57.0
generation other renewable: 88.0
generation solar: 616.0
generation waste: 279.0
generation wind offshore: 0.0
generation wind onshore: 4849.0
forecast solar day ahead: 576.0
forecast wind offshore eday ahead: nan
forecast wind onshore day ahead: 4855.0
total load forecast: 28906.0
total load actual: 28901.0
price day ahead: 50.52
price actual: 58.02


In [89]:
cleaned_medians = df_cleaned.median(numeric_only=True)

print("\nMedian values per column after removing NULL values:")
for col, val in cleaned_medians.items():
    print(f"{col}: {val}")


Median values per column after removing NULL values:
generation biomass: 367.0
generation fossil brown coal/lignite: 509.0
generation fossil coal-derived gas: 0.0
generation fossil gas: 4969.0
generation fossil hard coal: 4474.0
generation fossil oil: 300.0
generation fossil oil shale: 0.0
generation fossil peat: 0.0
generation geothermal: 0.0
generation hydro pumped storage consumption: 68.0
generation hydro run-of-river and poundage: 906.0
generation hydro water reservoir: 2164.0
generation marine: 0.0
generation nuclear: 6566.0
generation other: 57.0
generation other renewable: 88.0
generation solar: 616.0
generation waste: 279.0
generation wind offshore: 0.0
generation wind onshore: 4849.0
forecast solar day ahead: 576.0
forecast wind onshore day ahead: 4855.0
total load forecast: 28906.0
total load actual: 28901.0
price day ahead: 50.52
price actual: 58.02


In [90]:
cleaned_rows, cleaned_cols = df_cleaned.shape

row_drop_pct = (orig_rows - cleaned_rows) / orig_rows * 100
col_drop_pct = (orig_cols - cleaned_cols) / orig_cols * 100

print("Cleaned shape:", df_cleaned.shape)
print(f"\nDropped rows: {orig_rows - cleaned_rows} ({row_drop_pct:.2f}%)")
print(f"Dropped columns: {orig_cols - cleaned_cols} ({col_drop_pct:.2f}%)")

Cleaned shape: (35064, 27)

Dropped rows: 0 (0.00%)
Dropped columns: 2 (6.90%)
