In [22]:
import pandas as pd

In [23]:
# Load the raw energy dataset from its relative path into a DataFrame.
file_path = '../data/raw/energy_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Emit the heading and the first rows in one call; the newline separator keeps
# the printed text identical to two consecutive print statements.
print("Original Dataset Preview:", df.head(), sep="\n")

Original Dataset Preview:
                        time  generation biomass  \
0  2015-01-01 00:00:00+01:00               447.0   
1  2015-01-01 01:00:00+01:00               449.0   
2  2015-01-01 02:00:00+01:00               448.0   
3  2015-01-01 03:00:00+01:00               438.0   
4  2015-01-01 04:00:00+01:00               428.0   

   generation fossil brown coal/lignite  generation fossil coal-derived gas  \
0                                 329.0                                 0.0   
1                                 328.0                                 0.0   
2                                 323.0                                 0.0   
3                                 254.0                                 0.0   
4                                 187.0                                 0.0   

   generation fossil gas  generation fossil hard coal  generation fossil oil  \
0                 4844.0                       4821.0                  162.0   
1                 5196.0  

In [24]:
# Tally the missing entries of every column (`isna` is the canonical spelling
# of `isnull`; both return the same boolean mask).
missing_counts = df.isna().sum()

# Heading (preceded by a blank line) followed by the per-column counts.
print("\nMissing Values per Column:", missing_counts, sep="\n")


Missing Values per Column:
time                                               0
generation biomass                                19
generation fossil brown coal/lignite              18
generation fossil coal-derived gas                18
generation fossil gas                             18
generation fossil hard coal                       18
generation fossil oil                             19
generation fossil oil shale                       18
generation fossil peat                            18
generation geothermal                             18
generation hydro pumped storage aggregated     35064
generation hydro pumped storage consumption       19
generation hydro run-of-river and poundage        19
generation hydro water reservoir                  18
generation marine                                 19
generation nuclear                                17
generation other                                  18
generation other renewable                        18
generation solar  

In [25]:
# No column is declared categorical, so `numerical_cols` ends up holding every
# column of `df` -- including non-numeric ones such as the text `time` column.
# Numeric handling below is therefore gated on the actual dtype.
categorical_cols = []
numerical_cols = [col for col in df.columns if col not in categorical_cols]

# A row must have at least this share of its columns populated to be kept.
ROW_COMPLETENESS_THRESHOLD = 0.7

# Filter sparse rows BEFORE imputing. Imputing first fills every numeric hole,
# so the threshold could never drop anything and mostly-empty rows would be
# kept as fabricated data. `.copy()` makes `df_cleaned` an independent frame:
# `df` stays raw (earlier outputs remain valid, the cell is re-runnable) and
# later column assignments do not raise SettingWithCopyWarning.
df_cleaned = df.dropna(
    thresh=int(ROW_COMPLETENESS_THRESHOLD * len(df.columns))
).copy()

# Impute numeric columns: median when the distribution is strongly skewed
# (|skew| > 1), mean otherwise. `select_dtypes` covers every numeric width,
# not only int64/float64, and excludes booleans and text.
columns_to_impute = df_cleaned[numerical_cols].select_dtypes(include="number").columns
for col in columns_to_impute:
    observed = df_cleaned[col].dropna()
    if observed.empty:
        # Entirely empty column: there is no statistic to impute from, so it
        # is left as-is (filling with a NaN mean would be a silent no-op).
        continue
    # A NaN skewness (too few distinct observations) falls through to the mean.
    fill_value = observed.median() if abs(observed.skew()) > 1 else observed.mean()
    df_cleaned[col] = df_cleaned[col].fillna(fill_value)

# Impute categorical columns with their most frequent value.
for col in categorical_cols:
    modes = df_cleaned[col].mode()
    if not modes.empty:  # mode() is empty when the column has no observed values
        df_cleaned[col] = df_cleaned[col].fillna(modes.iloc[0])

print("\nDataset After Handling Missing Values:")
print(df_cleaned.head())


Dataset After Handling Missing Values:
                        time  generation biomass  \
0  2015-01-01 00:00:00+01:00               447.0   
1  2015-01-01 01:00:00+01:00               449.0   
2  2015-01-01 02:00:00+01:00               448.0   
3  2015-01-01 03:00:00+01:00               438.0   
4  2015-01-01 04:00:00+01:00               428.0   

   generation fossil brown coal/lignite  generation fossil coal-derived gas  \
0                                 329.0                                 0.0   
1                                 328.0                                 0.0   
2                                 323.0                                 0.0   
3                                 254.0                                 0.0   
4                                 187.0                                 0.0   

   generation fossil gas  generation fossil hard coal  generation fossil oil  \
0                 4844.0                       4821.0                  162.0   
1           

In [26]:
# Work on an independent copy: `df_cleaned` may be a filtered slice of another
# frame, and assigning columns to such a slice can raise SettingWithCopyWarning.
df_cleaned = df_cleaned.copy()

# Convert text columns that actually hold numbers. `numerical_cols` can contain
# genuinely non-numeric columns (e.g. the `time` timestamps); blindly coercing
# those would replace every value with NaN and destroy the column.
for col in numerical_cols:
    column = df_cleaned[col]
    if pd.api.types.is_numeric_dtype(column):
        continue  # already numeric; to_numeric would be a no-op
    coerced = pd.to_numeric(column, errors='coerce')
    if column.notna().any() and not coerced.notna().any():
        continue  # nothing parsed as a number: keep the original values
    df_cleaned[col] = coerced

# Only truly numeric columns can be compared with 0 or summarised by quantiles.
numeric_cols = df_cleaned[numerical_cols].select_dtypes(include='number').columns

# Collect the negative entries of each numeric column, keyed by column name.
negative_values = {}
for col in numeric_cols:
    negatives = df_cleaned.loc[df_cleaned[col] < 0, col]
    if not negatives.empty:
        negative_values[col] = negatives

if negative_values:
    print("Columns with negative values where not expected:")
    for col, vals in negative_values.items():
        print(f"- {col}: {len(vals)} negative entries")
else:
    print("No unexpected negative values found.")

# Count, per numeric column, the values outside the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. Columns without outliers are omitted.
outlier_summary = {}
for col in numeric_cols:
    Q1, Q3 = df_cleaned[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    is_outlier = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)
    outlier_count = int(is_outlier.sum())
    if outlier_count:
        outlier_summary[col] = outlier_count

if outlier_summary:
    print("\nColumns with potential outliers (using IQR method):")
    for col, count in outlier_summary.items():
        print(f"- {col}: {count} potential outliers")
else:
    print("No significant outliers detected with IQR method.")


No unexpected negative values found.

Columns with potential outliers (using IQR method):
- generation biomass: 87 potential outliers
- generation fossil gas: 2187 potential outliers
- generation fossil oil: 246 potential outliers
- generation hydro pumped storage consumption: 3772 potential outliers
- generation hydro water reservoir: 344 potential outliers
- generation nuclear: 76 potential outliers
- generation other: 1267 potential outliers
- generation other renewable: 5 potential outliers
- generation waste: 328 potential outliers
- generation wind onshore: 380 potential outliers
- forecast wind onshore day ahead: 436 potential outliers
- price day ahead: 841 potential outliers
- price actual: 699 potential outliers
