## Read time series data and split

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pickle

In [2]:
# Load the raw measurements (the resample below needs a datetime-like index)
df = pd.read_parquet("dehli.parquet")

# Columns of interest -- selected *before* resampling so that unused
# (possibly non-numeric) columns are never averaged
cols = ['Power demand', 'temp', 'dwpt', 'rhum', 'wdir', 'wspd', 'pres']
df = df[cols]

# Downsample from 5-min resolution to hourly means
df = df.resample("h").mean()

# Change to numeric index (remove datetime)
df = df.reset_index(drop=True)

# Forward-fill gaps with the last known value. ffill cannot fill rows at the
# very start of the series (nothing precedes them), so those are back-filled
# from the first valid observation; otherwise NaNs would reach PCA.
df = df.ffill().bfill()

# Chronological 80/20 split (no shuffling): X = train part, Y = test part
split_idx = int(len(df) * 0.8)
X = df.iloc[:split_idx].copy()
Y = df.iloc[split_idx:].copy()

## Transform to orthogonal subspace
We view the time series as a sequence of transitions from one state to the next, where each transition is represented by a vector. A vector is defined by its start and end points (the states before and after the transition) in the subspace.
* Training data is first scaled and transformed to subspace using PCA.
* Cluster data and assign centroid membership to each data point
* Create a transition vector from each data point's cluster to the cluster of the next time step

In [4]:
# Hyperparameters
# Fraction of variance the retained principal components must explain
pca_exp_var = 0.95
# One cluster per 100 training rows (floor division)
n_clusters = len(X) // 100
print(n_clusters)

276


In [5]:
# Scale and PCA
# Fit on the feature columns only (`cols`). This cell adds `cluster` and
# `next_cluster` to X, so fitting on the whole frame would feed those labels
# back into the scaler/PCA if the cell is ever re-run.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[cols])
# A float in (0, 1) asks PCA to keep as many components as are needed to
# explain that fraction of the variance
pca = PCA(pca_exp_var)
X_pca = pca.fit_transform(X_scaled)

# Cluster each subspace point
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
X["cluster"] = kmeans.fit_predict(X_pca)

# Compute next_cluster: the cluster of the following row. The last row has
# no successor, hence the nullable Int64 dtype (<NA> instead of a float NaN).
X["next_cluster"] = X["cluster"].shift(-1).astype("Int64")

## Vectorize transitions

In [6]:
# Rows whose successor lies in a different cluster. The comparison yields
# <NA> for a row without a successor; treat that explicitly as "no transition"
# instead of relying on how boolean indexing handles <NA>.
is_transition = (X["cluster"] != X["next_cluster"]).fillna(False)

# Build unique transitions table (excluding self transitions).
# drop_duplicates keeps the first occurrence, so ids follow first appearance.
unique_transitions = (
    X.loc[is_transition, ["cluster", "next_cluster"]]
      .drop_duplicates()
      .reset_index(drop=True)
)

# Assign a unique vector_id to each unique transition (0 is reserved below)
unique_transitions["vector_id"] = range(1, len(unique_transitions) + 1)

# Merge back to X (left join) — duplicates preserved.
# A stale `vector_id` from a previous run of this cell is removed first;
# otherwise the merge would create vector_id_x / vector_id_y and the lookup
# below would raise a KeyError. `validate` guards against duplicated keys on
# the right side, which would silently multiply rows of X.
X = X.drop(columns="vector_id", errors="ignore").merge(
    unique_transitions,
    on=["cluster", "next_cluster"],
    how="left",
    validate="many_to_one",
)

# Self-transitions --> vector_id = 0
X["vector_id"] = X["vector_id"].fillna(0).astype(int)

# Drop only the row(s) without a successor; a bare dropna() would also remove
# any row with a missing feature and silently break the time ordering.
X = X.dropna(subset=["next_cluster"])

In [7]:
# Forward vocabulary: vector_id -> (cluster, next_cluster)
pairs_by_id = unique_transitions.set_index("vector_id")[["cluster", "next_cluster"]]
vocab = pairs_by_id.apply(tuple, axis=1).to_dict()

# Reverse vocabulary: (cluster, next_cluster) -> vector_id
reverse_vocab = {}
for row in unique_transitions.itertuples():
    reverse_vocab[(row.cluster, row.next_cluster)] = row.vector_id

## Store results and vocabulary

In [8]:
# Persist the labelled training frame
X.to_parquet("X.parquet")

# Bundle both lookup directions so they are written as a single pickle
data = dict(vocab=vocab, reverse_vocab=reverse_vocab)

with open("vocab.pickle", "wb") as handle:
    pickle.dump(data, handle)