# EV Charging — Data cleanup & aggregation

This notebook performs data cleaning and aggregation to produce a building-level hourly time series suitable for baseline models and neural networks. It follows the steps you suggested: load, fix dates, convert text-numbers (commas), fill missing values, aggregate to hourly totals, and ensure continuous hourly index.

Save the output `df_hourly` and inspect its shape; the target is a single-column hourly series (e.g., `Flex_7_2kW`).

In [13]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import sys
# Ensure notebook runs from the repository root so relative paths work
cwd = Path.cwd()
repo_root = cwd
for p in [cwd] + list(cwd.parents):
    if (p / 'README.md').exists() or (p / '.git').exists() or (p / 'pyproject.toml').exists():
        repo_root = p
        break
if repo_root != cwd:
    print('Changing working dir from:', cwd, 'to repo root:', repo_root)
    os.chdir(repo_root)
print('cwd:', Path.cwd())
print('python:', sys.executable)
# If you keep a .env file in the repo, optionally load it so DATASET_PATH is available

cwd: /Users/cyrils/Developer/Python/NeuralNetworks
python: /Users/cyrils/Developer/Python/NeuralNetworks/.venv/bin/python


## 1) Load data

The notebook will try the project-relative path and fall back to the `DATASET_PATH` environment variable if set.

In [5]:
# Try to find the CSV locally and load robustly (handles semicolon + decimal comma)
candidates = [
    'data/trondheim/Dataset_2_Hourly_EV_per_user.csv',
    'data/trondheim/Dataset_2_Hourly_EV_per_user_sample.csv',
    'data/trondheim/Dataset_2_Hourly_EV_per_user_*.csv',
]
dataset_path = None
for p in candidates:
    if '*' in p:
        import glob
        matches = glob.glob(os.path.join(os.getcwd(), p))
        if matches:
            dataset_path = matches[0]
            break
    else:
        fp = os.path.join(os.getcwd(), p)
        if os.path.exists(fp):
            dataset_path = fp
            break

if dataset_path is None:
    dataset_path = os.getenv('DATASET_PATH')

# Read with several fallbacks to handle common CSV variants
if dataset_path is None or not os.path.exists(dataset_path):
    print('Dataset not found. Please download and place the CSV in data/trondheim/ or set DATASET_PATH.')
    df = pd.DataFrame()
else:
    print('Attempting to load', dataset_path)
    read_attempts = [
        {'sep': ',', 'engine': 'c', 'low_memory': False},
        {'sep': ';', 'engine': 'c', 'low_memory': False, 'decimal': ','},
        {'sep': ',', 'engine': 'python', 'low_memory': False, 'on_bad_lines': 'skip'},
        {'sep': ';', 'engine': 'python', 'low_memory': False, 'decimal': ',', 'on_bad_lines': 'skip'},
    ]
    last_err = None
    for opts in read_attempts:
        try:
            print('Trying read_csv with options:', opts)
            df = pd.read_csv(dataset_path, **opts)
            print('Success with options:', opts)
            break
        except Exception as e:
            last_err = e
            print('read_csv failed for opts', opts, ' — ', e)
    else:
        print('All read attempts failed:', last_err)
        df = pd.DataFrame()

    if not df.empty:
        print('Loaded rows:', len(df))
        try:
            display(df.head())
        except Exception:
            pass
        print('Columns:', list(df.columns))

Attempting to load /Users/cyrils/Developer/Python/NeuralNetworks/data/trondheim/Dataset_2_Hourly_EV_per_user.csv
Trying read_csv with options: {'sep': ',', 'engine': 'c', 'low_memory': False}
read_csv failed for opts {'sep': ',', 'engine': 'c', 'low_memory': False}  —  Error tokenizing data. C error: Expected 4 fields in line 360, saw 5

Trying read_csv with options: {'sep': ';', 'engine': 'c', 'low_memory': False, 'decimal': ','}
Success with options: {'sep': ';', 'engine': 'c', 'low_memory': False, 'decimal': ','}
Loaded rows: 88156


Unnamed: 0,date_from,date_to,User_ID,session_ID,Synthetic_3_6kW,Synthetic_7_2kW,Flex_3_6kW,Flex_7_2kW
0,21.12.2018 10:00,21.12.2018 11:00,AdO3-4,1.0,0.3,0.3,,0.06
1,21.12.2018 10:00,21.12.2018 11:00,AdO3-4,2.0,0.87,0.87,,0.114
2,21.12.2018 11:00,21.12.2018 12:00,AdO3-4,3.0,1.62,3.24,,
3,21.12.2018 12:00,21.12.2018 13:00,AdO3-4,3.0,3.6,7.2,,
4,21.12.2018 13:00,21.12.2018 14:00,AdO3-4,3.0,3.6,7.2,,


Columns: ['date_from', 'date_to', 'User_ID', 'session_ID', 'Synthetic_3_6kW', 'Synthetic_7_2kW', 'Flex_3_6kW', 'Flex_7_2kW']


In [6]:
# Quick check: ensure `df` exists and show a short summary
try:
    print('df exists:', 'df' in globals())
    if 'df' in globals() and not df.empty:
        print('rows,cols =', df.shape)
        print('columns =', list(df.columns))
    else:
        print('df is empty or not defined')
except Exception as e:
    print('Error checking df:', e)

df exists: True
rows,cols = (88156, 8)
columns = ['date_from', 'date_to', 'User_ID', 'session_ID', 'Synthetic_3_6kW', 'Synthetic_7_2kW', 'Flex_3_6kW', 'Flex_7_2kW']


## 2) Fix dates

Convert the primary timestamp column to pandas datetime. Update the column name if your CSV uses a different field (e.g., `date_from` vs `timestamp`).

In [7]:
# Common timestamp column names to try
ts_candidates = ['date_from', 'timestamp', 'datetime', 'start_time', 'time', 'date']
ts_col = next((c for c in ts_candidates if c in df.columns), None)
if ts_col is None:
    print('No timestamp column found automatically — please set ts_col to the correct column name')
else:
    print('Using timestamp column:', ts_col)
    # Try parsing with dayfirst=True (many European datasets use DD.MM.YYYY formats)
    df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce', dayfirst=True)
    # If parsing failed for many rows, try a strict format fallback
    if df[ts_col].isna().sum() > 0:
        try:
            df[ts_col] = pd.to_datetime(df[ts_col], format='%d.%m.%Y %H:%M', errors='coerce')
        except Exception:
            pass
    print('Parsed timestamps — min/max:', df[ts_col].min(), df[ts_col].max())
    print('NaT count:', df[ts_col].isna().sum())

Using timestamp column: date_from
Parsed timestamps — min/max: 2018-12-21 10:00:00 2020-02-01 04:00:00
NaT count: 0


## 3) Fix text numbers (commas -> dots) for energy columns

Replace commas with dots and convert to numeric for energy columns. Update `energy_cols` if your file uses different names.

In [8]:
# Edit this list to match your CSV column names if needed
energy_cols = ['Synthetic_3_6kW', 'Synthetic_7_2kW', 'Flex_3_6kW', 'Flex_7_2kW']
energy_cols = [c for c in energy_cols if c in df.columns]
print('Energy columns found:', energy_cols)

for col in energy_cols:
    # coerce to string, replace comma with dot, then numeric
    df[col] = df[col].astype(str).str.replace(',', '.')
    df[col] = pd.to_numeric(df[col], errors='coerce')

print('After conversion — sample stats:')
for col in energy_cols:
    print(col, df[col].describe())

Energy columns found: ['Synthetic_3_6kW', 'Synthetic_7_2kW', 'Flex_3_6kW', 'Flex_7_2kW']
After conversion — sample stats:
Synthetic_3_6kW count    3.111000e+04
mean     2.812241e+00
std      1.134489e+00
min      4.000000e-16
25%      2.040000e+00
50%      3.600000e+00
75%      3.600000e+00
max      3.600000e+00
Name: Synthetic_3_6kW, dtype: float64
Synthetic_7_2kW count    1.895700e+04
mean     4.614975e+00
std      2.464613e+00
min      1.600000e-15
25%      2.380000e+00
50%      4.950000e+00
75%      7.200000e+00
max      7.200000e+00
Name: Synthetic_7_2kW, dtype: float64
Flex_3_6kW count    6.208600e+04
mean     3.288170e+00
std      8.331529e-01
min      4.000000e-16
25%      3.600000e+00
50%      3.600000e+00
75%      3.600000e+00
max      3.600000e+00
Name: Flex_3_6kW, dtype: float64
Flex_7_2kW count    7.262400e+04
mean     6.605624e+00
std      1.584428e+00
min      7.990000e-16
25%      7.200000e+00
50%      7.200000e+00
75%      7.200000e+00
max      7.200000e+00
Name: Flex_

## 4) Handle missing data

Assumption: NaN means zero energy for that slot (common for sparse charging datasets). Change strategy if your data semantics differ.

In [9]:
if len(energy_cols) > 0 and not df.empty:
    df[energy_cols] = df[energy_cols].fillna(0)
    print('Filled NaNs with 0 for energy columns')
else:
    print('No energy columns found or empty dataframe — skipping fillna')

Filled NaNs with 0 for energy columns


## 5) Aggregate to building-level hourly series

Select one target variable (e.g., `Flex_7_2kW`) to forecast. We'll sum across rows for each timestamp and resample hourly to ensure continuity.

In [10]:
# Choose target column
target_col = 'Flex_7_2kW' if 'Flex_7_2kW' in df.columns else (energy_cols[0] if len(energy_cols)>0 else None)
if target_col is None or df.empty:
    print('No target column available — cannot create hourly series')
    df_hourly = pd.DataFrame()
else:
    # Group by timestamp and sum the energy across rows (building-level)
    df_tmp = df[[ts_col, target_col]].copy()
    df_tmp = df_tmp.dropna(subset=[ts_col])
    df_grouped = df_tmp.groupby(ts_col)[target_col].sum().reset_index()
    # set index and resample hourly (fill missing hours with 0)
    df_hourly = df_grouped.set_index(ts_col).resample('H').sum().fillna(0)
    df_hourly = df_hourly.rename(columns={target_col: 'total_energy_kWh'})
    print('Final Shape for NN (rows,cols):', df_hourly.shape)
    display(df_hourly.head())

Final Shape for NN (rows,cols): (9763, 1)


  df_hourly = df_grouped.set_index(ts_col).resample('H').sum().fillna(0)


Unnamed: 0_level_0,total_energy_kWh
date_from,Unnamed: 1_level_1
2018-12-21 10:00:00,0.174
2018-12-21 11:00:00,0.0
2018-12-21 12:00:00,0.0
2018-12-21 13:00:00,0.0
2018-12-21 14:00:00,0.0


## 6) Save cleaned result (optional)

Save `df_hourly` to `data/processed/ev_hourly.csv` for downstream notebooks.

In [11]:
out_dir = Path('data/processed')
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / 'ev_hourly.csv'
if not df_hourly.empty:
    df_hourly.to_csv(out_path)
    print('Saved processed hourly CSV to', out_path)
else:
    print('No hourly data to save')

Saved processed hourly CSV to data/processed/ev_hourly.csv


In [12]:
# Merge hourly weather into EV hourly data, create time/lags/rolling features, and save features CSV
from pathlib import Path
import pandas as pd
import numpy as np

ev_path = Path('data/processed/ev_hourly.csv')
weather_path = Path('data/trondheim/Norway_Trondheim_ExactLoc_Weather.csv')
out_path = Path('data/processed/ev_features.csv')

assert ev_path.exists(), 'Run earlier cells to produce ev_hourly.csv'
ev = pd.read_csv(ev_path, index_col=0, parse_dates=True).sort_index()

# Load weather (daily); try to parse known datetime column names
if weather_path.exists():
    w = pd.read_csv(weather_path, sep=None, engine='python')
    dt_cols = [c for c in w.columns if 'date' in c.lower() or 'time' in c.lower() or 'datetime' in c.lower()]
    if len(dt_cols):
        dtc = dt_cols[0]
        w[dtc] = pd.to_datetime(w[dtc], dayfirst=True, errors='coerce')
        w = w.set_index(dtc).sort_index()
        # Collapse duplicate timestamps by averaging numeric columns, then resample/interpolate to hourly
        w_num = w.select_dtypes(include=[float, int, 'number'])
        if not w_num.empty:
            w_num = w_num.groupby(w_num.index).mean()
            w_hour = w_num.resample('h').interpolate(method='time').ffill().bfill()
            df = ev.join(w_hour, how='left')
            df = df.fillna(method='ffill').fillna(method='bfill')
        else:
            # no numeric weather columns to merge
            print('Weather had no numeric columns; skipping numeric merge')
            df = ev.copy()
    else:
        print('No datetime-like column found in weather file; skipping weather merge')
        df = ev.copy()
else:
    print('Weather file not found; creating features from EV only')
    df = ev.copy()

# Feature engineering: time features
if 'total_energy_kWh' not in df.columns:
    df['total_energy_kWh'] = ev['total_energy_kWh']

df['hour'] = df.index.hour
df['dow'] = df.index.dayofweek
# cyclic encoding for hour and day-of-week

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

df['dow_sin'] = np.sin(2 * np.pi * df['dow'] / 7)
df['dow_cos'] = np.cos(2 * np.pi * df['dow'] / 7)

# Lags and rolling stats
for lag in [1, 24, 168]:
    df[f'lag_{lag}'] = df['total_energy_kWh'].shift(lag)

df['roll_3h'] = df['total_energy_kWh'].rolling(3, min_periods=1).mean()
df['roll_24h'] = df['total_energy_kWh'].rolling(24, min_periods=1).mean()

# Drop initial rows with NaN lags if any
df = df.dropna(subset=['lag_1'])

out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path)
print('Saved features to', out_path, 'shape=', df.shape)


Saved features to data/processed/ev_features.csv shape= (9762, 46)


  df = df.fillna(method='ffill').fillna(method='bfill')


## Notes and next steps

- Check column names in the raw CSV and edit `ts_candidates` and `energy_cols` accordingly.
- If NaN semantics differ, replace `fillna(0)` with a strategy appropriate for your data.
- You can now use `notebooks/ev_charging_analysis.ipynb` to load `data/processed/ev_hourly.csv` and continue with EDA and modelling.