In [2]:
import numpy as np
import pandas as pd
from scipy.stats import kurtosis
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
from torch.utils.data import TensorDataset, DataLoader

In [4]:
# -----------------------------
# CONFIG - à ajuster
# -----------------------------
FILE_PATH = "D:\Formation_Data_Engineer\Data_FullStack\Data_Engineer_Full_Stack\Projet_groupe\pf_2020-03-30_filtered_downsampled.csv"  # remplacer si besoin
time_col = "time"
amp_col = "amplitude"
chan_col = "channel"
station_col = "station"

# fenêtrage (exemples)
fs = 1/60                # échantillonnage supposé (1/60 Hz données en minute)
win_minut = 10        # taille fenêtre pour features en secondes (ex : 60 -> 1 min)
step_min = 10       # pas entre fenêtres (ici pas chevauchement)
env_window = 200        # smoothing median window (200 valeurs comme l'étude)

# séquences (en nombre de fenêtres)
seq_length = 60         # nombre de fenêtres par séquence
step_seq = 1            # sliding step pour construire sequences

# split temporel (appliqué après concat de toutes les sequences)
train_frac = 0.7
val_frac = 0.15
test_frac = 0.15

# éruption (adapter / paramétrer par fichier si nécessaire)
eruption_start = pd.to_datetime("2020-04-02T08:20:00Z")
eruption_end   = pd.to_datetime("2020-04-06T09:30:00Z")

# random seed reproducibility
RND = 42
np.random.seed(RND)
torch.manual_seed(RND)

eruption_start = eruption_start.tz_localize(None)
eruption_end   = eruption_end.tz_localize(None)


# -----------------------------
# Fonctions utilitaires
# -----------------------------
def type_component(ch):
    c = str(ch).upper()
    return 0 if c.endswith("Z") else 1  # 0 vertical, 1 horizontal

def shannon_entropy(segment, bins=50):
    p, _ = np.histogram(segment, bins=bins, density=True)
    p = p[p > 0]
    if p.size == 0:
        return 0.0
    return -np.sum(p * np.log2(p))

def compute_delay_hours(ts):
    """
    Retourne le délai (en heures) avant le début de l'éruption (eruption_start).
    - si ts > eruption_end -> np.nan (on ignore après la période)
    - si ts in [eruption_start, eruption_end] -> 0.0
    - else -> (eruption_start - ts) en heures
    """
    if pd.isna(ts):
        return np.nan
    if ts > eruption_end:
        return np.nan
    if eruption_start <= ts <= eruption_end:
        return 0.0
    return (eruption_start - ts).total_seconds() / 3600.0

def compute_delay_class(hours):

    # si hours == np.nan -> renvoie vecteur de 0 (ou préférence: class 0?). ici retourne 0 vecteur (aucune classe positive)
    if pd.isna(hours):
        return np.array([1,0,0,0,0,0], dtype=int)  # garder "classe 0" par défaut pour NaN
    conds = [
        hours >= 24,
        hours < 24,
        hours < 16,
        hours < 12,
        hours < 1,
        hours <= 0
    ]
    return np.array(conds, dtype=int)

# -----------------------------
# 1) Lecture et checks
# -----------------------------
df = pd.read_csv(FILE_PATH)
# normaliser noms colonnes
df.columns = [c.strip().lower() for c in df.columns]

# parse time
df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
df = df.dropna(subset=[time_col])
df[time_col]   = df[time_col].dt.tz_localize(None)
# ajouter component_flag (0 vertical, 1 horizontal) derived from channel
df["component_flag"] = df[chan_col].apply(type_component).astype(int)

# ajouter colonnes temporelles utiles (optionnelles, seront récupérées par window-end)
df["year"] = df[time_col].dt.year.astype(int)
df["month"] = df[time_col].dt.month.astype(int)
df["day"] = df[time_col].dt.day.astype(int)
df["hour"] = df[time_col].dt.hour.astype(int)
df["minute"] = df[time_col].dt.minute.astype(int)
df["seconde"] = df[time_col].dt.second.astype(int)

# -----------------------------
# 2) Construction des features PAR COUPLE (station, channel)
#    -> calcule features par fenêtre (alignées sur la fin de fenêtre),
#       applique smoothing local, puis construit sequences (avant concat)
# -----------------------------
all_seqs = []        # sequences (seq_length, feat_dim)
all_labels = []      # target vectors length 6 (np.int8)
all_times = []       # timestamp of sequence end (for temporal split)
all_stations = []    # station id (string) per sequence
all_components = []  # component_flag per sequence

win = int(win_minut)    # 1 sample/min
step = int(step_min)

grouped = df.groupby([station_col, chan_col])

for (st, ch), g in grouped:
    g = g.sort_values(time_col).reset_index(drop=True)
    sig = g[amp_col].values.astype(float)
    times = g[time_col].values
    n = len(sig)
    if n < win:
        continue

    # features per window
    feat_rows = []
    feat_times = []
    for i in range(0, n - win + 1, step):
        seg = sig[i:i+win]
        t_end = pd.to_datetime(times[i + win - 1])  # window end timestamp (exists in g)
        SE = shannon_entropy(seg)
        K  = float(kurtosis(seg, fisher=True, bias=False))
        std = float(np.std(seg))
        mean = float(np.mean(seg))
        med = float(np.median(seg))
        p90 = float(np.percentile(seg, 90))
        p10 = float(np.percentile(seg, 10))
        tens = p90 - p10
        feat_rows.append([SE, K, std, mean, med, p90, p10, tens])
        feat_times.append(t_end)

    feat_df = pd.DataFrame(feat_rows, columns=[
        "SE","Kurtosis","std","mean","median","per90","per10","tension"
    ])
    feat_df["time"] = pd.to_datetime(feat_times)

    # local smoothing (median envelope) per this channel/station
    for col in ["SE","Kurtosis","std","mean","median","per90","per10","tension"]:
        if col in feat_df.columns:
            feat_df[col + "_env"] = feat_df[col].rolling(window=env_window, min_periods=1).median()

    # copy meta info (station, component flag) into feat_df rows
    feat_df["station"] = st
    # component_flag extracted from original group (all rows same channel)
    comp_flag = int(g["component_flag"].iloc[0])
    feat_df["component_flag"] = comp_flag

    # attach time-derived columns from original g by matching the window-end timestamps
    tmp = g[[time_col, "year", "month", "day", "hour", "minute","seconde"]].copy()
    tmp = tmp.rename(columns={time_col: "time"})
    feat_df = pd.concat([feat_df,tmp])

    # build sequences (SLIDING) from feat_df BEFORE concat with other groups
    M = len(feat_df)
    if M < seq_length:
        continue

    # choose per-step features (take only existing columns)
    per_step_features = [
        "SE","Kurtosis","std","mean","median","per90","per10","tension",
        "SE_env","Kurtosis_env","std_env","mean_env","median_env","per90_env","per10_env","tension_env"
    ]
    per_step_features = [c for c in per_step_features if c in feat_df.columns]

    per_step_arr = feat_df[per_step_features].to_numpy(dtype=np.float32)

    # for each possible sequence in this channel/station
    for i in range(0, M - seq_length + 1, step_seq):
        seq = per_step_arr[i:i+seq_length]  # (seq_length, feat_dim)
        t_seq_end = feat_df["time"].iloc[i + seq_length - 1]
        delay_h = compute_delay_hours(t_seq_end)
        label_vec = compute_delay_class(delay_h)  # vector length 6, dtype int
        all_seqs.append(seq)
        all_labels.append(label_vec)
        all_times.append(pd.to_datetime(t_seq_end))
        all_stations.append(st)
        all_components.append(comp_flag)

# -----------------------------
# 3) Concat all sequences across all station/channel groups
#    Now sequences built independently per group are concatenated
# -----------------------------
if len(all_seqs) == 0:
    raise RuntimeError("Aucune séquence construite. Vérifier fenêtrage / données.")

X = np.stack(all_seqs).astype(np.float32)      # (N_seq, seq_length, feat_dim)
y = np.stack(all_labels).astype(np.int64)      # (N_seq, 6)
times_arr = np.array(all_times)                # datetime64 array
stations_arr = np.array(all_stations)          # (N_seq,)
components_arr = np.array(all_components).astype(np.int64)  # (N_seq,)

print("Built sequences:", X.shape, "labels:", y.shape)

# -----------------------------
# 4) Temporal split (train/val/test) BEFORE any scaling/encoding
# -----------------------------
order = np.argsort(times_arr)
X = X[order]
y = y[order]
times_arr = times_arr[order]
stations_arr = stations_arr[order]
components_arr = components_arr[order]

N = len(X)
i_train = int(N * train_frac)
i_val = int(N * (train_frac + val_frac))

X_train = X[:i_train]; y_train = y[:i_train]; st_train = stations_arr[:i_train]; comp_train = components_arr[:i_train]
X_val   = X[i_train:i_val]; y_val = y[i_train:i_val]; st_val = stations_arr[i_train:i_val]; comp_val = components_arr[i_train:i_val]
X_test  = X[i_val:]; y_test = y[i_val:]; st_test = stations_arr[i_val:]; comp_test = components_arr[i_val:]

print("Split sizes (train/val/test):", len(X_train), len(X_val), len(X_test))

# -----------------------------
# 5) Fit encoders/scalers on TRAIN only, then transform val/test
#    Station OneHotEncoder fitted on station strings of train set
#    Scaler fitted on flattened per-step numeric features of train set
# -----------------------------

# station OHE
ohe = OneHotEncoder(handle_unknown="infrequent_if_exist", sparse_output=False)
ohe.fit(np.array(st_train).reshape(-1,1))

# scaler fitted on flattened train numeric features

# flatten train for scaler: (N_train*L, F)
Ntr, L, F = X_train.shape
X_train_flat = X_train.reshape(Ntr * L, F)
scaler = StandardScaler().fit(X_train_flat)

def transform_and_concat(X_part, stations_part, components_part, scaler, ohe):

    Np, Lp, Fp = X_part.shape
    X_flat = X_part.reshape(-1, Fp)
    X_flat = scaler.transform(X_flat)
    X_scaled = X_flat.reshape(Np, Lp, Fp)

    st_ohe = ohe.transform(np.array(stations_part).reshape(-1,1))  # (N, S)
    comp_col = np.array(components_part).reshape(-1,1)            # (N, 1)

    st_rep = np.repeat(st_ohe[:, np.newaxis, :], Lp, axis=1)      # (N, L, S)
    comp_rep = np.repeat(comp_col[:, np.newaxis, :], Lp, axis=1)  # (N, L, 1)

    X_out = np.concatenate([X_scaled, comp_rep.astype(np.float32), st_rep.astype(np.float32)], axis=2)
    return X_out

X_train_proc = transform_and_concat(X_train, st_train, comp_train, scaler, ohe)
X_val_proc   = transform_and_concat(X_val,   st_val,   comp_val,   scaler, ohe)
X_test_proc  = transform_and_concat(X_test,  st_test,  comp_test,  scaler, ohe)

print("Final shapes after encoding/scaling:", X_train_proc.shape, X_val_proc.shape, X_test_proc.shape)
# y already shape (N,6)

# -----------------------------
# 6) Save arrays and build DataLoaders
# -----------------------------
np.save("X_train.npy", X_train_proc)
np.save("X_val.npy", X_val_proc)
np.save("X_test.npy", X_test_proc)
np.save("y_train.npy", y_train)
np.save("y_val.npy", y_val)
np.save("y_test.npy", y_test)

# Build torch loaders (labels as float for multi-label BCE or long for multiclass depending model)
batch_size = 64
# here labels are vectors (N,6) -> use float32 if training with BCEWithLogitsLoss, else adapt
train_ds = TensorDataset(torch.from_numpy(X_train_proc).float(), torch.from_numpy(y_train).float())
val_ds   = TensorDataset(torch.from_numpy(X_val_proc).float(),   torch.from_numpy(y_val).float())
test_ds  = TensorDataset(torch.from_numpy(X_test_proc).float(),  torch.from_numpy(y_test).float())

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

print("Preprocessing complete. Loaders ready.")


  FILE_PATH = "D:\Formation_Data_Engineer\Data_FullStack\Data_Engineer_Full_Stack\Projet_groupe\pf_2020-03-30_filtered_downsampled.csv"  # remplacer si besoin


Built sequences: (401629, 60, 16) labels: (401629, 6)
Split sizes (train/val/test): 281140 60244 60245
Final shapes after encoding/scaling: (281140, 60, 19) (60244, 60, 19) (60245, 60, 19)
Preprocessing complete. Loaders ready.
