In [71]:
import os

import pandas as pd
import numpy as np
from scipy.stats import kurtosis
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from torch.utils.data import TensorDataset, DataLoader
import torch

In [72]:
# ============================================================
# GLOBAL CONFIG
# ============================================================

# Input CSV. The default is the original author's local path; set the
# SEISMIC_CSV_PATH environment variable to run the notebook on another
# machine without editing this cell.
file_path = os.environ.get(
    "SEISMIC_CSV_PATH",
    r"D:\Formation_Data_Engineer\Data_FullStack\Data_Engineer_Full_Stack\Projet_groupe\pf_2020-03-30_filtered_downsampled.csv",
)

# Column names (after the lower-casing applied when the CSV is loaded)
time_col = "time"
amp_col = "amplitude"

fs = 1/60      # sampling frequency in Hz: one sample per minute
win = 10       # feature window length, in samples
step = 10      # hop between consecutive windows, in samples (== win -> no overlap)
env_window = 20  # rolling-median ("envelope") length, in feature rows

# Eruption interval used to derive the delay / class labels.
# The trailing "Z" makes both timestamps timezone-aware (UTC).
eruption_start = pd.to_datetime("2020-04-02T08:20:00Z")
eruption_end   = pd.to_datetime("2020-04-06T09:30:00Z")

In [73]:
# ============================================================
# FONCTIONS UTILITAIRES
# ============================================================

def type_component(ch):
    """Classify a channel code as vertical (0) or horizontal (1).

    The argument is converted with ``str`` and upper-cased; a code whose last
    character is "Z" is vertical, anything else is reported as horizontal.
    """
    code = str(ch).upper()
    if code.endswith("Z"):
        return 0   # vertical
    return 1       # horizontal (every non-"Z" suffix)

def shannon_entropy(segment, bins=50):
    """Shannon entropy (in bits) of the amplitude histogram of ``segment``.

    The samples are binned into ``bins`` equal-width bins and the entropy is
    computed from the bin *probabilities* (count / total count).  Using
    probabilities rather than ``density=True`` values keeps the result
    independent of the amplitude scale and bounded by ``[0, log2(bins)]``;
    densities are divided by the bin width, so they neither sum to 1 nor stay
    below 1.

    Parameters
    ----------
    segment : array-like of numbers
        Samples of the window. Non-finite samples (NaN, +/-inf) are ignored.
    bins : int or sequence, default 50
        Passed through to ``numpy.histogram``.

    Returns
    -------
    float
        Entropy in bits; 0.0 when there is no finite sample.
    """
    values = np.asarray(segment, dtype=float).ravel()
    # np.histogram cannot derive a bin range from NaN/inf and would raise.
    values = values[np.isfinite(values)]
    if values.size == 0:
        return 0.0

    counts, _ = np.histogram(values, bins=bins)
    total = counts.sum()
    if total == 0:
        # Only reachable with explicit bin edges that exclude every sample.
        return 0.0

    # Empty bins contribute nothing (limit of p*log2(p) as p -> 0) and are
    # removed to avoid log2(0).
    probs = counts[counts > 0] / total
    return float(-np.sum(probs * np.log2(probs)))

# def frequency_index(segment, fs):
#     N = len(segment)
#     freqs = np.fft.rfftfreq(N, d=1/fs)
#     S = np.abs(np.fft.rfft(segment))**2

#     low = (freqs>=1)&(freqs<=5.5)
#     high = (freqs>=6)&(freqs<=16)

#     E_low = S[low].sum()
#     E_high = S[high].sum()

#     if E_low == 0:
#         return np.nan

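#     return np.log10(E_high/E_low)
#
# TODO: re-enable frequency_index if the Frequency Index (FI) feature is used.
# NOTE: with fs = 1/60 Hz the spectrum returned by rfftfreq stops at
# fs/2 = 1/120 Hz, so the 1-5.5 Hz and 6-16 Hz bands above are empty and the
# function would always hit the E_low == 0 branch and return NaN. The bands
# have to be redefined for this sampling rate before FI is brought back.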

def compute_delay_class(hours):
    """Map a delay in hours to an integer class label.

    Classes, checked from the smallest upper bound to the largest:
      4 -> hours <= 0
      3 -> 0 < hours <= 1
      2 -> 1 < hours <= 12
      1 -> 12 < hours <= 16
      0 -> hours > 16, or a missing value (``pd.isna``)
    """
    if pd.isna(hours):
        return 0

    # (inclusive upper bound in hours, class) in ascending order of bound.
    thresholds = ((0, 4), (1, 3), (12, 2), (16, 1))
    for upper_bound, delay_class in thresholds:
        if hours <= upper_bound:
            return delay_class
    return 0

def compute_delay_hours(t):
    """Return the delay of timestamp ``t`` relative to the eruption window.

    Uses the module-level ``eruption_start`` / ``eruption_end``:
      * ``t`` after ``eruption_end``   -> NaN
      * ``t`` inside the window        -> 0.0
      * ``t`` before ``eruption_start`` -> hours remaining until the start
    """
    if t > eruption_end:
        return np.nan

    # Past this point t is not after the end, so being at or after the start
    # means t lies inside the eruption window.
    if t >= eruption_start:
        return 0.0

    # Before the eruption: time left until it starts, in hours.
    remaining = eruption_start - t
    return remaining.total_seconds() / 3600.0

In [74]:
# ============================================================
# 1. LOAD + LIGHT PREPROCESSING
# ============================================================

# Read the CSV and normalise the header in one go: column names are stripped
# of surrounding whitespace and lower-cased.
df = pd.read_csv(file_path).rename(columns=lambda name: name.strip().lower())

# Parse the time column; values that cannot be parsed become NaT and the
# corresponding rows are removed.
df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
df = df.dropna(subset=[time_col])

# Numeric component flag derived from the channel code (0 = vertical, 1 = horizontal)
df["component_flag"] = df["channel"].map(type_component)

In [75]:
# Channel code -> component flag (0 = vertical, 1 = horizontal)
vertical_channels = ("EHZ", "BHZ", "HHZ")
horizontal_channels = ("EHE", "EHN", "BHE", "BHN", "HHE", "HHN")

component_map = {
    **dict.fromkeys(vertical_channels, 0),
    **dict.fromkeys(horizontal_channels, 1),
}

# Accumulator for the per station/channel feature frames
final_frames = list()

In [76]:
# ---------------------------
# Split by station / channel
# ---------------------------
# One independent, time-sorted copy of the rows per (station, channel) pair.
station_channel_groups = {}
for group_key, group_rows in df.groupby(["station", "channel"]):
    station_channel_groups[group_key] = group_rows.sort_values(time_col).copy()

In [77]:
# Per-window feature names. This single list names the DataFrame columns and
# drives the rolling-median envelope below, so the two cannot drift apart.
# (Append "FI" here, and to each row, to bring the Frequency Index back.)
feature_cols = ["SE", "Kurtosis", "std", "mean", "median", "per90", "per10", "tension"]

# Start from an empty accumulator so that re-running this cell does not append
# a second copy of every station/channel frame.
final_frames = []

for (st, ch), g in station_channel_groups.items():

    sig = g[amp_col].astype(float).values
    times = g[time_col].values
    n = len(sig)

    # Fewer samples than one window: nothing to compute for this group.
    if n < win:
        continue

    rows = []
    for i in range(0, n - win + 1, step):
        seg = sig[i:i+win]
        # Each window is stamped with the time of its last sample.
        t_end = times[i+win-1]

        p10, p90 = np.percentile(seg, [10, 90])

        rows.append([
            t_end,
            shannon_entropy(seg),                            # SE
            float(kurtosis(seg, fisher=True, bias=False)),   # Kurtosis (excess, bias-corrected)
            float(np.std(seg)),                              # std
            float(np.mean(seg)),                             # mean
            float(np.median(seg)),                           # median
            float(p90),                                      # per90
            float(p10),                                      # per10
            float(p90 - p10),                                # tension = per90 - per10
        ])

    feat = pd.DataFrame(rows, columns=["time"] + feature_cols)

    feat["station"] = st
    feat["channel"] = ch

    # -------- envelope: rolling median, computed per station/channel frame --------
    for col in feature_cols:
        feat[col + "_env"] = feat[col].rolling(env_window, min_periods=1).median()

    # -------- delay + class --------
    # utc=True localises naive timestamps as UTC and converts aware ones to
    # UTC, so the comparison with the tz-aware eruption bounds always works.
    feat["time"] = pd.to_datetime(feat["time"], utc=True)
    feat["delay_hours"] = feat["time"].apply(compute_delay_hours)
    feat["label"] = feat["delay_hours"].apply(compute_delay_class).astype(int)

    # -------- component tag --------
    # Normalise the code like type_component does (case-insensitive) before
    # the lookup; codes absent from component_map are tagged -1.
    feat["component_flag"] = component_map.get(str(ch).strip().upper(), -1)

    final_frames.append(feat)

In [78]:
# -----------------------------
# Final concatenation
# -----------------------------
# pd.concat raises a terse "No objects to concatenate" on an empty list; fail
# with a message that points at the actual cause instead.
if not final_frames:
    raise ValueError(
        "final_frames is empty: no station/channel group produced a feature frame "
        "(every group has fewer samples than the window size, or the input is empty)."
    )

final_df = pd.concat(final_frames, ignore_index=True)

In [83]:

# ---------------------------
# Encoding
# ---------------------------
# Numeric columns to keep. The envelope columns are named "<feature>_env",
# so the kurtosis envelope is "Kurtosis_env" (the former "Kurt_env" matched no
# column and was silently discarded by the filter below).
# (Add "FI", "FI_env" if the Frequency Index is brought back.)
numeric_features = ["component_flag","SE","Kurtosis","std","mean","median","per90","per10","tension",
                    "SE_env","Kurtosis_env","std_env",
                    "year","month","day","hour","minute"]

# Keep only the columns that exist, and report the ones that do not so a
# misspelt or never-created feature is visible instead of vanishing silently.
missing_numeric_features = [c for c in numeric_features if c not in final_df.columns]
numeric_features = [c for c in numeric_features if c in final_df.columns]
if missing_numeric_features:
    print(f"Numeric features not found in final_df and ignored: {missing_numeric_features}")

# Categorical columns to encode: station.
# Exact (case-insensitive) name match; `in ("station")` would be a substring
# test against the string "station", not membership in a tuple.
categorical_candidates = [c for c in final_df.columns if c.lower() == "station"]


# Build the preprocessing pipeline
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="infrequent_if_exist", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_candidates)
    ],
    remainder="drop"
)

# Fill numeric NaN with the column median before scaling
final_df[numeric_features] = final_df[numeric_features].fillna(final_df[numeric_features].median())

# Fill missing categorical values with 'unk'
for c in categorical_candidates:
    final_df[c] = final_df[c].fillna("unk")


X_num = final_df[numeric_features]
X_cat = final_df[categorical_candidates]

X_all= pd.concat([X_num,X_cat],axis=1)
# NOTE(review): the scaler/encoder statistics are fitted on every row, i.e.
# before any train/validation/test split; fit on the training rows only if
# the evaluation must be free of leakage.
X_all_transformed = preprocessor.fit_transform(final_df)

In [84]:
# ---------------------------
# 10. Build the (sliding) sequences fed to the model
# ---------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 64
# Number of consecutive feature rows per sequence. One row summarises one
# feature window of the raw signal, so this is a row count, not minutes.
seq_length = 60

step_seq = 1  # 1 -> a sequence starts at every row; seq_length -> non-overlapping sequences
X = np.asarray(X_all_transformed)
y = final_df["label"].values
T = len(X)

# final_df is the concatenation of one frame per (station, channel). Sequences
# are therefore built inside each group, so that no sequence mixes rows coming
# from two different stations/channels.
row_times = final_df["time"].values
window_positions = []   # one array of `seq_length` row positions per sequence

for positions in final_df.groupby(["station", "channel"], sort=False).indices.values():
    positions = np.asarray(positions)
    # Chronological order inside the group (stable: ties keep row order).
    positions = positions[np.argsort(row_times[positions], kind="stable")]
    # "+ 1" so that the last full window of the group is included.
    for start in range(0, len(positions) - seq_length + 1, step_seq):
        window_positions.append(positions[start:start + seq_length])

if not window_positions:
    raise ValueError(
        f"No sequence could be built: no station/channel group has at least "
        f"seq_length={seq_length} feature rows (total rows: {T})."
    )

window_positions = np.stack(window_positions)   # shape (N_seq, seq_length)
end_positions = window_positions[:, -1]         # row that closes each sequence

# Order the sequences by the time of their last row so that the positional
# split below really is a temporal split across all stations/channels.
chronological = np.argsort(row_times[end_positions], kind="stable")
window_positions = window_positions[chronological]
end_positions = end_positions[chronological]

X_seq = X[window_positions].astype(np.float32)   # shape (N_seq, seq_length, n_features)
# Label aligned with the end of the sequence
y_seq = y[end_positions]                         # shape (N_seq,)
# Index label (in final_df) of the row that closes each sequence
times_seq = final_df.index[end_positions].tolist()

# ---------------------------
# 11. Temporal train/val/test split
# ---------------------------
# NOTE(review): with step_seq < seq_length neighbouring sequences share rows,
# so sequences on both sides of a split boundary overlap; leave a gap of
# seq_length rows between the subsets if that matters for the evaluation.
N = len(X_seq)
train_frac = 0.7
val_frac = 0.15
test_frac = 0.15   # informational: the test set is whatever remains after train + val

i_train_end = int(N * train_frac)
i_val_end   = int(N * (train_frac + val_frac))

X_train = X_seq[:i_train_end]
y_train = y_seq[:i_train_end]
X_val   = X_seq[i_train_end:i_val_end]
y_val   = y_seq[i_train_end:i_val_end]
X_test  = X_seq[i_val_end:]
y_test  = y_seq[i_val_end:]

# Convert to torch tensors (moved to `device` up front)
X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.long).to(device)
X_val_t   = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_t   = torch.tensor(y_val, dtype=torch.long).to(device)
X_test_t  = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_t  = torch.tensor(y_test, dtype=torch.long).to(device)

train_ds = TensorDataset(X_train_t, y_train_t)
val_ds   = TensorDataset(X_val_t, y_val_t)
test_ds  = TensorDataset(X_test_t, y_test_t)

# Only the training batches are shuffled; validation/test keep chronological order.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False)