# Eesti tarbimiskõverate 7-päevased protsentprofiilid

**Eesmärk.** Luua järgmise 7 päeva tunniprofiilid protsentides (sum=100) kasutades ajalooliselt temperatuurilt sarnaseid päevi ning päevatüüpe (workday/saturday/sunday/holiday).

In [1]:

# --- Config & imports ---
from pathlib import Path
import sys, importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta, timezone

PROJECT_ROOT = Path(".").resolve()
LOCAL_TZ = "Europe/Tallinn"
MIN_MATCHES = 10

OUTPUT_DIR = PROJECT_ROOT / "output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
FORECAST_CSV = OUTPUT_DIR / "forecast_profiles_next7d.csv"

print("PROJECT_ROOT =", PROJECT_ROOT)
print("OUTPUT_DIR   =", OUTPUT_DIR)


PROJECT_ROOT = C:\Users\tarmo\OneDrive\Dokumendid\GIT Andmetarkus\portfolio\Electricity-Consumption-Forecast
OUTPUT_DIR   = C:\Users\tarmo\OneDrive\Dokumendid\GIT Andmetarkus\portfolio\Electricity-Consumption-Forecast\output


In [2]:

# --- Import project modules (historical consumption & temperature) ---
sys.path.append(str(PROJECT_ROOT))

ecw = importlib.import_module("el_consumption_weekday")
tp  = importlib.import_module("temp")

try:
    tf = importlib.import_module("temp_forecast")  # optional 7d forecast
    HAS_TEMP_FORECAST = True
except Exception:
    tf = None
    HAS_TEMP_FORECAST = False

hourly_df = ecw.sum_hourly_el_consumption.copy()
daily_df  = ecw.sum_daily_el_consumption.copy()
temp_df   = tp.avg_day_temp.copy()

hourly_df.head(3), daily_df.head(3), temp_df.head(3)


[warn] holidays library missing or failed; set hourly is_holiday=False. Install: pip install holidays

=== Data Quality Report (hourly) ===
Total rows: 17543
sum_cons_time present: 17543 | missing: 0
sum_el_hourly_value present: 17543 | missing: 0
Duplicate timestamps: 0
Date range: 2023-09-17 07:00:00+00:00 … 2025-09-17 06:00:00+00:00
Imputed (mean of neighbors): 1
Remaining missing after imputation: 0
[warn] holidays library missing or failed; set daily is_holiday=False. Install: pip install holidays

=== Hourly preview ===
              sum_cons_time  sum_el_hourly_value  imputed weekday  is_weekend  \
0 2023-09-17 07:00:00+00:00                609.6    False  Sunday        True   
1 2023-09-17 08:00:00+00:00                680.9    False  Sunday        True   
2 2023-09-17 09:00:00+00:00                805.5    False  Sunday        True   
3 2023-09-17 10:00:00+00:00                817.8    False  Sunday        True   
4 2023-09-17 11:00:00+00:00                838.8    False  Sund

ModuleNotFoundError: No module named 'temp'

In [None]:

# --- Build historical daily percent profiles ---
hourly = hourly_df.copy()
hourly['sum_cons_time'] = pd.to_datetime(hourly['sum_cons_time'], utc=True, errors='coerce')
hourly = hourly.dropna(subset=['sum_cons_time']).sort_values('sum_cons_time')
hourly['time_local'] = hourly['sum_cons_time'].dt.tz_convert(LOCAL_TZ)
hourly['date_local'] = hourly['time_local'].dt.date
hourly['hour_local'] = hourly['time_local'].dt.hour

def daytype_from_row(weekday_name, is_weekend, is_holiday):
    if bool(is_holiday): return 'holiday'
    if bool(is_weekend):
        if weekday_name == 'Saturday': return 'saturday'
        if weekday_name == 'Sunday':   return 'sunday'
        return 'weekend'
    return 'workday'

hourly['daytype'] = [
    daytype_from_row(w, iw, ih)
    for w, iw, ih in zip(hourly.get('weekday', pd.Series(['']*len(hourly))), 
                         hourly.get('is_weekend', pd.Series([False]*len(hourly))), 
                         hourly.get('is_holiday', pd.Series([False]*len(hourly))))]

daily_hourly = (hourly.groupby(['date_local','hour_local'], as_index=False)['sum_el_hourly_value']
                .sum(min_count=1).rename(columns={'sum_el_hourly_value':'consumption_hour'}))
daily_tot = (daily_hourly.groupby('date_local', as_index=False)['consumption_hour']
             .sum(min_count=1).rename(columns={'consumption_hour':'consumption_day'}))
profiles = daily_hourly.merge(daily_tot, on='date_local', how='left')
profiles['percent'] = np.where(profiles['consumption_day']>0,
                               profiles['consumption_hour']/profiles['consumption_day']*100.0, np.nan)

daytypes = (hourly.groupby('date_local', as_index=False).agg({'daytype':'first'}))
profiles = profiles.merge(daytypes, on='date_local', how='left')

temp_df2 = temp_df.copy()
temp_df2['avg_day_temp_date'] = pd.to_datetime(temp_df2['avg_day_temp_date']).dt.date
temp_df2['temp_c'] = pd.to_numeric(temp_df2['hour_day_value'], errors='coerce')
temp_df2 = temp_df2[['avg_day_temp_date','temp_c']].dropna()

profiles = profiles.merge(temp_df2, left_on='date_local', right_on='avg_day_temp_date', how='left')
profiles['temp_round'] = profiles['temp_c'].round().astype('Int64')
profiles = profiles.dropna(subset=['percent','temp_round'])

# Helper base (only full 24h days)
base = (profiles[['date_local','daytype','temp_round','hour_local','percent']]
        .rename(columns={'hour_local':'hour'}))
counts = base.groupby('date_local')['hour'].count()
full_days = counts[counts>=24].index
base = base[base['date_local'].isin(full_days)].copy()

base.head(3)


Unnamed: 0,date_local,daytype,temp_round,hour,percent
15,2023-09-18,sunday,12,0,3.712008
16,2023-09-18,sunday,12,1,3.576925
17,2023-09-18,sunday,12,2,3.52876


In [None]:

# --- Get next 7 days forecast temperatures ---
def get_next7d_forecast():
    if tf is not None:
        import pandas as _pd
        for name in dir(tf):
            obj = getattr(tf, name)
            if isinstance(obj, _pd.DataFrame):
                cols = [c.lower() for c in obj.columns]
                if any('date' in c for c in cols) and any(('temp' in c) or ('tavg' in c) for c in cols):
                    fc = obj.copy()
                    cols_map = {c.lower(): c for c in fc.columns}
                    date_col = next((c for c in fc.columns if 'date' in c.lower()), None)
                    temp_col = next((c for c in fc.columns if ('temp' in c.lower()) or ('tavg' in c.lower())), None)
                    if date_col and temp_col:
                        out = fc[[date_col, temp_col]].copy()
                        out.columns = ['date','temp_c']
                        out['date'] = pd.to_datetime(out['date']).dt.date
                        out['temp_c'] = pd.to_numeric(out['temp_c'], errors='coerce')
                        out = out.dropna().sort_values('date').head(7)
                        if not out.empty:
                            return out
    # fallback: skeleton for manual input
    today_local = datetime.now(timezone.utc).astimezone().date()
    dummy = pd.DataFrame({'date':[today_local + timedelta(days=i) for i in range(1,8)],
                          'temp_c':[np.nan]*7})
    print("[info] temp_forecast.py puudub või struktuur tundmatu. Täida 'temp_c' käsitsi (°C).")
    return dummy

forecast7 = get_next7d_forecast()

def infer_daytype_from_date(d):
    wd = pd.Timestamp(d).dayofweek
    return 'workday' if wd<=4 else ('saturday' if wd==5 else 'sunday')

if 'daytype' not in forecast7.columns:
    forecast7['daytype'] = [infer_daytype_from_date(d) for d in forecast7['date']]

forecast7


[info] temp_forecast.py puudub või struktuur tundmatu. Täida 'temp_c' käsitsi (°C).


Unnamed: 0,date,temp_c,daytype
0,2025-09-18,,workday
1,2025-09-19,,workday
2,2025-09-20,,saturday
3,2025-09-21,,sunday
4,2025-09-22,,workday
5,2025-09-23,,workday
6,2025-09-24,,workday


In [None]:

# --- Sampling rules & profile aggregation ---
def select_sample_for_day(target_daytype, target_temp_c, base_df, min_matches=10):
    import pandas as _pd
    if _pd.isna(target_temp_c):
        return _pd.DataFrame(columns=['date_local','hour','percent'])

    T = int(round(float(target_temp_c)))
    pool = base_df.copy()

    if target_daytype == 'workday':
        pool = pool[pool['daytype']=='workday']
        pool_h = None; pool_wknd = None
    elif target_daytype == 'saturday':
        pool = pool[pool['daytype']=='saturday']
        pool_h = None; pool_wknd = None
    elif target_daytype == 'sunday':
        pool = pool[pool['daytype']=='sunday']
        pool_h = None; pool_wknd = None
    elif target_daytype == 'holiday':
        pool_h = pool[pool['daytype']=='holiday']
        pool_wknd = pool[pool['daytype'].isin(['saturday','sunday'])]
        pool = None
    else:
        pool_h = None; pool_wknd = None  # use all if unknown

    def by_temp_range(df, T, d):
        return df[df['temp_round'].between(T-d, T+d)] if df is not None else df

    if target_daytype == 'holiday':
        sel = by_temp_range(pool_h, T, 0)
        d = 0
        while sel['date_local'].nunique() < min_matches:
            extra = by_temp_range(pool_wknd, T, d)
            if extra is not None:
                sel = pd.concat([sel, extra])
            if sel['date_local'].nunique() >= min_matches:
                break
            d += 1
            sel = by_temp_range(pool_h, T, d)
    else:
        sel = by_temp_range(pool, T, 0)
        d = 0
        while sel['date_local'].nunique() < min_matches:
            d += 1
            sel = by_temp_range(pool, T, d)

    sel = sel[['date_local','hour','percent','daytype','temp_round']].drop_duplicates()
    return sel

def aggregate_profile(sample_df):
    if sample_df.empty: return None
    prof = sample_df.groupby('hour', as_index=False)['percent'].mean()
    s = prof['percent'].sum()
    if s>0: prof['percent'] = prof['percent']/s*100.0
    return prof


In [None]:
# --- Build 7-day forecast profiles ---
profile_rows, debug_rows = [], []

for _, row in forecast7.iterrows():
    d, t, dtp = row['date'], row['temp_c'], row['daytype']
    sel = select_sample_for_day(dtp, t, base, min_matches=MIN_MATCHES)
    prof = aggregate_profile(sel)
    if prof is None or prof.empty: 
        continue
    for _, r in prof.iterrows():
        profile_rows.append({'date': d, 'daytype': dtp, 'hour': int(r['hour']), 'percent': float(r['percent'])})
    picked_days = sel[['date_local']].drop_duplicates().sort_values('date_local')
    debug_rows.append({
        'date': d,
        'daytype': dtp,
        'forecast_temp_c': float(t) if pd.notna(t) else None,
        'temp_target_round': int(round(t)) if pd.notna(t) else None,
        'temp_min_round': int(sel['temp_round'].min()) if not sel.empty else None,
        'temp_max_round': int(sel['temp_round'].max()) if not sel.empty else None,
        'n_days_used': int(picked_days.shape[0]),
        'picked_dates': ','.join(picked_days['date_local'].astype(str).tolist())
    })

if profile_rows and 'date' in profile_rows[0] and 'hour' in profile_rows[0]:
    forecast_profiles = pd.DataFrame(profile_rows).sort_values(['date','hour']).reset_index(drop=True)
    forecast_profiles.to_csv(FORECAST_CSV, index=False)
    print("Saved:", FORECAST_CSV)
    display(forecast_profiles.head(24))
else:
    print("No forecast profiles generated. Check input data and sampling logic.")
    forecast_profiles = pd.DataFrame()

forecast_meta = pd.DataFrame(debug_rows).sort_values('date').reset_index(drop=True)

No forecast profiles generated. Check input data and sampling logic.


KeyError: 'date'

In [None]:
# --- DEBUG: Check forecast7 and base ---
print("forecast7:")
print(forecast7)
print("\nbase sample:")
print(base.head(10))
print("\nbase daytypes:", base['daytype'].unique())
print("base temp_round range:", base['temp_round'].min(), "to", base['temp_round'].max())

# Try to select sample for the first forecast day
if not forecast7.empty:
    first = forecast7.iloc[0]
    print("\nFirst forecast day:", first)
    sel = select_sample_for_day(first['daytype'], first['temp_c'], base, min_matches=MIN_MATCHES)
    print("Sample for first forecast day:")
    print(sel)
    print("n_days_used:", sel['date_local'].nunique())

In [None]:

# --- Plots for quick visual check ---
if not forecast_profiles.empty:
    for d in sorted(forecast_profiles['date'].unique()):
        sub = forecast_profiles[forecast_profiles['date']==d]
        plt.figure(figsize=(9,4))
        plt.plot(sub['hour'], sub['percent'], marker='o')
        plt.title(f"{d} – {sub['daytype'].iloc[0]}: 24h % profile (sum=100)")
        plt.xlabel("Hour"); plt.ylabel("% of day"); plt.grid(True)
        plt.show()

    # sample-temperature histogram for the first day
    first = forecast_meta.iloc[0]
    sel0 = select_sample_for_day(first['daytype'], first['forecast_temp_c'], base, min_matches=MIN_MATCHES)
    if not sel0.empty:
        tmp = sel0[['date_local','temp_round']].drop_duplicates()
        plt.figure(figsize=(6,3.5))
        tmp['temp_round'].hist(bins=range(int(tmp['temp_round'].min()-1), int(tmp['temp_round'].max()+2)))
        plt.title(f"Sample temp overview: {first['date']} ({first['daytype']}), target≈{first['temp_target_round']}°C")
        plt.xlabel("Temp (°C, rounded)"); plt.ylabel("Days"); plt.grid(True)
        plt.show()


In [None]:

# --- Optional CSV exports (commented) ---
# (Uncomment if you want these for auditing)
# profiles.to_csv(OUTPUT_DIR / "historical_profiles_percent.csv", index=False)
# base.to_csv(OUTPUT_DIR / "historical_daily_vectors.csv", index=False)
# forecast_meta.to_csv(OUTPUT_DIR / "forecast_profiles_meta.csv", index=False)
print("Optional audit CSVs are commented out. Remove '#' to export them.")
