In [3]:
import pandas as pd

# Read the raw CSV file
df = pd.read_csv('raw_dam_idm_price.csv')

# Remove square brackets from date column and convert to datetime
df['Date (WET)'] = df['Date (WET)'].str.strip('[]')
df['Date (WET)'] = pd.to_datetime(df['Date (WET)'], format='%d/%m/%Y %H:%M')

# Localize to WET, convert to UK time, then remove timezone info
df['Date (UK)'] = df['Date (WET)'].dt.tz_localize('WET', ambiguous='NaT').dt.tz_convert('Europe/London').dt.tz_localize(None)

# Keep only the columns we need: Date, DAM, IDM
df = df[['Date (UK)', 'DAM price', 'IDM price']]

# Remove rows with empty UK date first
df = df.dropna(subset=['Date (UK)'])

# Check for missing values
print("Missing values check:")
print(df.isnull().sum())
print(f"\nTotal rows: {len(df)}")
print(f"Complete rows (no missing values): {df.dropna().shape[0]}")

# Fill missing values with column means for price columns only
df['DAM price'].fillna(df['DAM price'].mean(), inplace=True)
df['IDM price'].fillna(df['IDM price'].mean(), inplace=True)

print(f"Final rows after cleaning: {len(df)}")

# Display the result
print("\nFirst 5 rows of cleaned data:")
print(df.head())

# Save directly to final cleaned file
df.to_csv('cleaned_dam_idm_data.csv', index=False)
print("\nData saved to 'cleaned_dam_idm_data.csv'")

Missing values check:
Date (UK)      0
DAM price      0
IDM price    220
dtype: int64

Total rows: 87652
Complete rows (no missing values): 87432
Final rows after cleaning: 87652

First 5 rows of cleaned data:
            Date (UK)  DAM price  IDM price
0 2015-01-01 00:00:00      39.60      37.76
1 2015-01-01 01:00:00      37.18      32.60
2 2015-01-01 02:00:00      32.40      30.28
3 2015-01-01 03:00:00      29.29      25.99
4 2015-01-01 04:00:00      27.24      25.54

Data saved to 'cleaned_dam_idm_data.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DAM price'].fillna(df['DAM price'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['IDM price'].fillna(df['IDM price'].mean(), inplace=True)
