In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from ydata_synthetic.synthesizers.timeseries import TimeSeriesSynthesizer
from ydata_synthetic.synthesizers.timeseries.timegan.model import (
    ModelParameters, TrainParameters
)

In [None]:
def make_sequences(gdf, seq_len, stride, cols):
    """Cut one group's feature columns into fixed-length sliding windows.

    Parameters
    ----------
    gdf : pandas.DataFrame
        Rows of a single group, already in the order the windows should follow.
    seq_len : int
        Number of consecutive rows in every window.
    stride : int
        Step between the starting rows of consecutive windows.
    cols : list
        Columns to extract; they are converted to a float array.

    Returns
    -------
    list of numpy.ndarray
        One ``(seq_len, len(cols))`` array per window. The list is empty when
        the group has fewer than ``seq_len`` rows.
    """
    values = gdf[cols].to_numpy(dtype=float)
    last_start = len(values) - seq_len
    windows = []
    for start in range(0, last_start + 1, stride):
        windows.append(values[start:start + seq_len])
    return windows

def sequences_to_long(arr3d, cols, start_id=1):
    """Flatten a batch of sequences into a long (one row per time step) frame.

    Parameters
    ----------
    arr3d : iterable of 2-D arrays
        Sequences; each item is indexed as ``s[t]`` and exposes ``s.shape[0]``
        as its number of time steps.
    cols : iterable of str
        Feature names, one per value in ``s[t]``. Any iterable is accepted
        (list, tuple, pandas Index, ...).
    start_id : int, default 1
        Identifier given to the first sequence; later sequences count up.

    Returns
    -------
    pandas.DataFrame
        Columns ``['synthetic_id', 't', *cols]``. ``synthetic_id`` is the
        sequence identifier and ``t`` the 0-based position inside the sequence.
    """
    # list(cols) so a tuple / Index of names no longer breaks the concatenation.
    columns = ['synthetic_id', 't'] + list(cols)
    rows = [
        (seq_id, t, *seq[t])
        for seq_id, seq in enumerate(arr3d, start=start_id)
        for t in range(seq.shape[0])
    ]
    return pd.DataFrame(rows, columns=columns)

def argmax_back(df_in, prefix, new_col):
    """Collapse a group of one-hot style columns back into one label column.

    For every row, the column (among those whose name starts with ``prefix``)
    holding the largest value wins; its name with ``prefix`` stripped becomes
    the row's label.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing the ``prefix``-named columns. It is not modified.
    prefix : str
        Common prefix of the columns to collapse, e.g. ``'region_'``.
    new_col : str
        Name of the label column written to the result.

    Returns
    -------
    pandas.DataFrame
        A new frame without the ``prefix`` columns and with ``new_col`` added.
        When no column matches ``prefix`` the input frame itself is returned
        unchanged.
    """
    group_cols = [c for c in df_in.columns if c.startswith(prefix)]
    if not group_cols:
        return df_in
    # Position of the row-wise maximum within the group of columns.
    winners = df_in[group_cols].to_numpy().argmax(axis=1)
    labels = np.array([c[len(prefix):] for c in group_cols], dtype=object)
    # Drop first (this yields a new frame), then attach the labels, so the
    # caller's frame is never written to and new_col cannot be dropped again.
    out = df_in.drop(columns=group_cols)
    out[new_col] = labels[winners]
    return out

# Hist compare (kWh)
def quick_hist_compare(real_df, syn_df, col='kwh', bins=60):
    """Overlay density histograms of one column for real vs. synthetic data.

    Parameters
    ----------
    real_df, syn_df : pandas.DataFrame
        Frames that both contain ``col``; missing values are dropped.
    col : str, default 'kwh'
        Column to compare.
    bins : int, str or sequence, default 60
        Bin specification, interpreted by ``numpy.histogram_bin_edges`` over
        the pooled real + synthetic values.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    real_vals = real_df[col].dropna()
    syn_vals = syn_df[col].dropna()
    # Both histograms must share the same bin edges; otherwise each one gets
    # its own range and bin width and the overlaid densities are not comparable.
    pooled = np.concatenate([real_vals.to_numpy(), syn_vals.to_numpy()])
    edges = np.histogram_bin_edges(pooled, bins=bins)

    plt.figure()
    plt.hist(real_vals, bins=edges, density=True, alpha=0.5, label='real')
    plt.hist(syn_vals, bins=edges, density=True, alpha=0.5, label='synthetic')
    plt.title(f'Distribution: {col}')
    plt.xlabel(col)
    plt.ylabel('density')
    plt.legend()
    plt.show()
    
# Autocorrelation at lag 1 (kwh)
def acf1_seq(arr3d, k_idx):
    """Mean lag-1 autocorrelation of one feature across a batch of sequences.

    Parameters
    ----------
    arr3d : iterable of 2-D arrays
        Sequences indexed as ``s[:, k_idx]`` (time on the first axis).
    k_idx : int
        Column index of the feature to analyse.

    Returns
    -------
    float
        Average Pearson correlation between ``x[:-1]`` and ``x[1:]`` over all
        sequences where it is defined; ``nan`` when it is defined for none.
    """
    vals = []
    for s in arr3d:
        x = s[:, k_idx]
        if x.size < 2:
            continue
        lead, lag = x[:-1], x[1:]
        # The correlation is undefined when either lagged slice is constant
        # (e.g. [1, 1, 1, 2]); skip those instead of letting np.corrcoef emit
        # RuntimeWarnings and NaN entries.
        if np.std(lead) > 0 and np.std(lag) > 0:
            vals.append(np.corrcoef(lead, lag)[0, 1])
    return float(np.nanmean(vals)) if vals else np.nan
    
def nn_gap(real3d, syn3d):
    """Distance from each real sequence to its nearest synthetic sequence.

    Every sequence is flattened to one vector, a 1-nearest-neighbour index is
    fitted on the synthetic vectors and queried with the real ones.

    Parameters
    ----------
    real3d, syn3d : numpy.ndarray
        Batches of sequences; the first axis enumerates sequences and the
        remaining axes are flattened.

    Returns
    -------
    tuple of float
        ``(median, 5th percentile)`` of the nearest-neighbour distances.
    """
    real_flat = real3d.reshape(len(real3d), -1)
    syn_flat = syn3d.reshape(len(syn3d), -1)

    index = NearestNeighbors(n_neighbors=1)
    index.fit(syn_flat)
    nearest = index.kneighbors(real_flat)[0]

    median_gap = float(np.median(nearest))
    low_gap = float(np.percentile(nearest, 5))
    return median_gap, low_gap

In [None]:
# Load the raw panel and order rows by household and time step; the
# forward/backward fill below relies on this ordering.
df_energy = pd.read_csv("synthetic_energy_panel.csv")
# sort_values already returns a new frame, so df_energy stays untouched.
df = df_energy.sort_values(['house_id','t'])

# Impute kwh within household: carry the last observation forward, then
# back-fill any leading gap (transform keeps the original row index).
df['kwh'] = df.groupby('house_id', group_keys=False)['kwh'].transform(lambda s: s.ffill().bfill())

# One-hot categoricals (keep all levels to allow reconstruction later)
cat_cols = ['dwelling_type','income_class','region']
df = pd.get_dummies(df, columns=cat_cols, drop_first=False)

# Feature set to model (you can add/remove)
base_feats = [
    'kwh','high_usage','temperature','price_index','is_weekend','efficiency',
    'treatment','post'
]
oh_feats = [c for c in df.columns if c.startswith(('dwelling_type_', 'income_class_', 'region_'))]
feat_cols = base_feats + oh_feats

# Fill remaining gaps with the household mean, then with the global mean for
# households that have no observed value at all. One vectorised group-mean
# replaces a Python-level groupby/lambda pass per column.
house_means = df.groupby('house_id')[feat_cols].transform('mean')
df[feat_cols] = df[feat_cols].fillna(house_means)
df[feat_cols] = df[feat_cols].fillna(df[feat_cols].mean())


# Scale features to [0,1]
# NOTE(review): the scaler is fitted on the full panel, i.e. before any
# train/test split made later in the notebook.
scaler = MinMaxScaler()
df[feat_cols] = scaler.fit_transform(df[feat_cols])

In [None]:
# SEQ_LEN is the single source of truth for the sequence length. The real
# evaluation windows, the TimeGAN `sequence_length` and the training blocks
# all use it, so real and synthetic sequences have the same shape. (Previously
# the real windows used 60 steps while the model was trained and sampled with
# a hard-coded 104, which makes the flattened vectors in nn_gap incompatible.)
# Set SEQ_LEN = 104 to reproduce the earlier training configuration.
SEQ_LEN = 60   # window length (try 90/120 for more seasonality capture)
STRIDE  = 1    # sliding step

seqs = []
for _, g in df.groupby('house_id', sort=False):
    seqs.extend(make_sequences(g, SEQ_LEN, STRIDE, feat_cols))
if not seqs:
    raise ValueError("No sequences formed. Lower `SEQ_LEN` or ensure entities have enough rows.")
seqs = np.asarray(seqs)  # (N, SEQ_LEN, N_FEATURES)
print("Sequences:", seqs.shape)

N_FEATURES = seqs.shape[-1]
# NOTE(review): `train` is not used below; the model is fitted on blocks cut
# from the full panel, so `test` windows overlap the data the model has seen.
train, test = train_test_split(seqs, test_size=0.2, random_state=42, shuffle=True)

feature_cols = feat_cols

print(feature_cols)

D = len(feature_cols)

model_params = ModelParameters(batch_size=128, lr=1e-3, noise_dim=32, layers_dim=24)
train_params = TrainParameters(epochs=150, sequence_length=SEQ_LEN, number_sequences=D)


synth = TimeSeriesSynthesizer(modelname="timegan", model_parameters=model_params)
window = SEQ_LEN
blocks = []

stride = window  # non-overlap
for _, g in df.groupby("house_id", sort=False):
    X = g[feature_cols].to_numpy()
    if len(X) >= window:
        for s in range(0, len(X) - window + 1, stride):
            blocks.append(X[s:s+window])  # shape (window, D)

if not blocks:
    raise ValueError("No windows formed. Lower `window` or ensure entities have enough rows.")


train_mat = np.vstack(blocks)                 # (N*window, D)
df_train = pd.DataFrame(train_mat, columns=feature_cols)

# The keyword for the training parameters is tried under both spellings.
# NOTE(review): a TypeError raised inside training would also trigger the
# second call — confirm which keyword the installed version expects.
try:
    synth.fit(data=df_train, num_cols=feature_cols, cat_cols=[], train_arguments=train_params)
except TypeError:
    synth.fit(data=df_train, num_cols=feature_cols, cat_cols=[], train_args=train_params)


# NOTE(review): depending on the ydata-synthetic version, sample() appears to
# return either a 3-D array (possibly with more than n_samples rows) or a list
# of per-sequence frames — TODO confirm. Normalise to one 3-D float array.
syn_raw = synth.sample(n_samples=len(test))
syn = np.stack([np.asarray(s, dtype=float) for s in syn_raw])[:len(test)]   # (N, SEQ_LEN, N_FEATURES)
if syn.shape[1:] != test.shape[1:]:
    raise ValueError(
        f"Synthetic sequences have shape {syn.shape[1:]} but real sequences have {test.shape[1:]}."
    )
syn_unscaled = scaler.inverse_transform(syn.reshape(-1, N_FEATURES)).reshape(syn.shape)

syn_long = sequences_to_long(syn_unscaled, feat_cols)
print("Synthetic panel shape:", syn_long.shape)
print(syn_long.head())


# Collapse each one-hot group back to a single label column (row-wise argmax).
syn_tidy = syn_long.copy()
syn_tidy = argmax_back(syn_tidy, 'dwelling_type_', 'dwelling_type')
syn_tidy = argmax_back(syn_tidy, 'income_class_', 'income_class')
syn_tidy = argmax_back(syn_tidy, 'region_', 'region')

# Region labels come back as strings; cast to int only when the column exists
# and every label is a whole number, instead of swallowing every exception.
if 'region' in syn_tidy.columns:
    region_num = pd.to_numeric(syn_tidy['region'], errors='coerce')
    if region_num.notna().all() and (region_num % 1 == 0).all():
        syn_tidy['region'] = region_num.astype(int)

print("Tidy synthetic panel columns:", syn_tidy.columns.tolist())
print(syn_tidy.head())


# Build a comparable "real" panel from test sequences (inverse-scaled)
real_unscaled = scaler.inverse_transform(test.reshape(-1, N_FEATURES)).reshape(test.shape)
real_long = sequences_to_long(real_unscaled, feat_cols)

quick_hist_compare(real_long, syn_long, 'kwh')

k_kwh = feat_cols.index('kwh')
print("ACF1 (real): ", acf1_seq(real_unscaled, k_kwh))
print("ACF1 (synth):", acf1_seq(syn_unscaled,  k_kwh))

# Nearest-neighbor privacy heuristic (bigger distances -> safer)
med, p5 = nn_gap(real_unscaled, syn_unscaled)
print(f"NN distance median={med:.3f}, 5th pct={p5:.3f}")

Sequences: (335500, 60, 19)
['kwh', 'high_usage', 'temperature', 'price_index', 'is_weekend', 'efficiency', 'treatment', 'post', 'dwelling_type_apartment', 'dwelling_type_detached', 'dwelling_type_semi', 'income_class_high', 'income_class_low', 'income_class_mid', 'region_1', 'region_2', 'region_3', 'region_4', 'region_5']
A DataProcessor is not available for the TimeGAN.


2025-09-02 12:22:31.757622: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-09-02 12:22:31.757689: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-09-02 12:22:31.757705: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-09-02 12:22:31.757987: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-09-02 12:22:31.758207: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Emddeding network training:   0%|          | 0/150 [00:00<?, ?it/s]2025-09-02 12:23:15.997674: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin op

KeyboardInterrupt: 