In [22]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import math

In [23]:
# ============================================================
# CONFIG GLOBAL
# ============================================================

file_path = r"C:\Users\ponro\OneDrive\Desktop\DataScience_FullStack\Projets\Projet_Collectifs\dsfs-ft-36-eruption\pf_2020-03-30_filtered_downsampled.csv"

time_col = "time"
amp_col = "amplitude"

fs = 1/60      # échantillonnage 1/min
win = 10       # taille fenêtre (min)
step = 10      # step fenêtres
env_window = 20

eruption_start = pd.to_datetime("2020-04-02T08:20:00Z")
eruption_end   = pd.to_datetime("2020-04-06T09:30:00Z")

In [24]:
# ============================================================
# FONCTIONS UTILITAIRES
# ============================================================

def type_component(ch):
    ch = str(ch).upper()
    return 0 if ch.endswith("Z") else 1   # 0 = vertical, 1 = horizontal

def shannon_entropy(segment, bins=50):
    p, _ = np.histogram(segment, bins=bins, density=True)
    p = p[p > 0]
    if len(p)==0:
        return 0.0
    return -np.sum(p * np.log2(p))

# def frequency_index(segment, fs):
#     N = len(segment)
#     freqs = np.fft.rfftfreq(N, d=1/fs)
#     S = np.abs(np.fft.rfft(segment))**2

#     low = (freqs>=1)&(freqs<=5.5)
#     high = (freqs>=6)&(freqs<=16)

#     E_low = S[low].sum()
#     E_high = S[high].sum()

#     if E_low == 0:
#         return np.nan

#     return np.log10(E_high/E_low) ==> à réintégrer si prise en compte de la Frequency Index

def compute_delay_class(hours):
    if pd.isna(hours): return 0
    if hours <= 0: return 4
    if hours <= 1: return 3
    if hours <= 12: return 2
    if hours <= 16: return 1
    return 0

def compute_delay_hours(t):
    """Return delay relative to eruption window."""
    if t > eruption_end:
        return np.nan
    if eruption_start <= t <= eruption_end:
        return 0.0
    # avant l'éruption → temps restant avant début
    return (eruption_start - t).total_seconds()/3600.0

In [25]:
# ============================================================
# 1. LECTURE + PREPROCESS LÉGER
# ============================================================

df = pd.read_csv(file_path)

# standardisation nom colonnes
df.columns = [c.strip().lower() for c in df.columns]

# conversion time
df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
df = df.dropna(subset=[time_col])

# composante numérique
df["component_flag"] = df["channel"].apply(type_component)

In [26]:
# mapping channel -> composante
component_map = {
    "EHZ": 0, "BHZ": 0, "HHZ": 0,   # Vertical
    "EHE": 1, "EHN": 1, "BHE": 1, "BHN": 1, "HHE": 1, "HHN": 1  # Horizontal
}

final_frames = []

In [27]:
# ---------------------------
# Séparation par station / channel
# ---------------------------
station_channel_groups = {
    (st, ch): g.sort_values(time_col).copy()
    for (st, ch), g in df.groupby(["station", "channel"])
}

In [28]:
for (st, ch), g in station_channel_groups.items():

    sig = g[amp_col].astype(float).values
    times = g[time_col].values
    n = len(sig)

    if n < win:
        continue

    rows = []
    for i in range(0, n - win + 1, step):
        seg = sig[i:i+win]
        t_end = times[i+win-1]

        SE  = shannon_entropy(seg)
        K   = float(kurtosis(seg, fisher=True, bias=False))
        #FI  = frequency_index(seg, fs)
        std = float(np.std(seg))
        mean = float(np.mean(seg))
        med  = float(np.median(seg))
        p90 = float(np.percentile(seg,90))
        p10 = float(np.percentile(seg,10))
        tens = p90 - p10

        rows.append([t_end, SE, K, std, mean, med, p90, p10, tens])

    feat = pd.DataFrame(
        rows,
        columns=[
            "time","SE","Kurtosis","std","mean","median",
            "per90","per10","tension"
        ]
    )

    feat["station"] = st
    feat["channel"] = ch

    # -------- enveloppe (par DF isolé) --------
    for col in ["SE","Kurtosis","std","mean","median","per90","per10","tension"]: #,"FI" à réintégrer si prise en compte de la Frequency Index
        feat[col+"_env"] = feat[col].rolling(env_window, min_periods=1).median()


    # -------- delay + classe --------
    feat["time"] = feat["time"].dt.tz_localize("UTC").dt.tz_convert("UTC")
    feat["delay_hours"] = feat["time"].apply(compute_delay_hours)
    feat["label"] = feat["delay_hours"].apply(compute_delay_class).astype(int)

    # -------- tag composante --------
    feat["component_flag"] = component_map.get(ch, -1)

    final_frames.append(feat)

In [29]:
# -----------------------------
# Concat final
# -----------------------------
final_df = pd.concat(final_frames, ignore_index=True)

In [30]:
final_df=final_df.drop("channel",axis=1)

In [31]:
final_df.columns

Index(['time', 'SE', 'Kurtosis', 'std', 'mean', 'median', 'per90', 'per10',
       'tension', 'station', 'SE_env', 'Kurtosis_env', 'std_env', 'mean_env',
       'median_env', 'per90_env', 'per10_env', 'tension_env', 'delay_hours',
       'label', 'component_flag'],
      dtype='object')

In [32]:
final_df

Unnamed: 0,time,SE,Kurtosis,std,mean,median,per90,per10,tension,station,...,Kurtosis_env,std_env,mean_env,median_env,per90_env,per10_env,tension_env,delay_hours,label,component_flag
0,2020-03-30 00:01:30+00:00,0.663807,0.476227,123.100520,-37.565111,-43.303673,68.307201,-177.854938,246.162139,BON,...,0.476227,123.100520,-37.565111,-43.303673,68.307201,-177.854938,246.162139,80.308333,0,1
1,2020-03-30 00:03:10+00:00,0.600959,-0.576031,161.805842,-55.421499,-51.182505,110.381080,-278.973581,389.354661,BON,...,-0.049902,142.453181,-46.493305,-47.243089,89.344140,-228.414260,317.758400,80.280556,0,1
2,2020-03-30 00:04:50+00:00,0.620373,-1.066237,170.512009,25.619871,30.244040,209.834037,-204.720460,414.554497,BON,...,-0.576031,161.805842,-37.565111,-43.303673,110.381080,-204.720460,389.354661,80.252778,0,1
3,2020-03-30 00:06:30+00:00,0.375461,0.950616,261.956437,7.998256,12.949737,314.964549,-252.770878,567.735428,BON,...,-0.049902,166.158925,-14.783427,-15.176968,160.107559,-228.745669,401.954579,80.225000,0,1
4,2020-03-30 00:08:10+00:00,0.741771,-0.029760,124.305987,75.616784,100.710240,186.933463,-113.865500,300.798963,BON,...,-0.029760,161.805842,7.998256,12.949737,186.933463,-204.720460,389.354661,80.197222,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36538,2020-04-05 02:14:47.758394+00:00,0.002753,2.187842,69780.523874,29143.652972,34258.642648,93453.768729,-22759.210530,116212.979258,FLR,...,-0.163562,29535.114165,3580.894895,3070.843802,35872.897584,-27281.367725,65249.987124,0.000000,4,0
36539,2020-04-05 02:16:27.758394+00:00,0.005049,-0.530139,42305.027506,4658.524839,-2260.855634,43813.493828,-48337.527853,92151.021681,FLR,...,-0.163562,31171.584825,3580.894895,2861.958980,35872.897584,-28764.718119,69699.182783,0.000000,4,0
36540,2020-04-05 02:18:07.758394+00:00,0.005568,-0.382233,36741.916753,10836.654763,7980.286960,47143.691426,-23686.238955,70829.930381,FLR,...,-0.163562,32775.612062,3580.894895,2861.958980,35872.897584,-28764.718119,68116.144827,0.000000,4,0
36541,2020-04-05 02:19:47.758394+00:00,0.004860,-1.489538,51381.296421,-21348.278999,-4537.472479,27276.842897,-86672.218653,113949.061550,FLR,...,-0.339294,33717.597925,2969.230929,2192.649339,32813.803837,-30341.979201,72535.417285,0.000000,4,0


In [33]:
# Nouvelle Cellule de Vérification (après In[2])
print("\n--- Distribution des Labels ---")
print(final_df['label'].value_counts(normalize=True).sort_index())


--- Distribution des Labels ---
label
0    0.423118
1    0.023643
2    0.065019
3    0.005911
4    0.482309
Name: proportion, dtype: float64


In [34]:

# ---------------------------
# Encodage
# ---------------------------
# Colonnes numériques à garder
numeric_features = ["component_flag","SE","Kurtosis","std","mean","median","per90","per10","tension",
                    "SE_env","Kurt_env","std_env",
                    "year","month","day","hour","minute"] #,"FI","FI_env" à réintégrer si prise en compte de la Frequency Index

# garder uniquement celles existantes
numeric_features = [c for c in numeric_features if c in final_df.columns]

# Colonnes catégorielles à encoder : station
categorical_candidates = [c for c in final_df if c.lower() in ("station")]


# Build preprocess pipeline
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="infrequent_if_exist", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_candidates)
    ],
    remainder="drop"
)

# Remplir NaN numériques par median avant scaler
final_df[numeric_features] = final_df[numeric_features].fillna(final_df[numeric_features].median())

# Remplir catégoriques na par 'unk'
for c in categorical_candidates:
    final_df[c] = final_df[c].fillna("unk")


X_num = final_df[numeric_features]
X_cat = final_df[categorical_candidates]

X_all= pd.concat([X_num,X_cat],axis=1)
X_all_transformed = preprocessor.fit_transform(final_df)

In [35]:
# ---------------------------
# 10. Construction des sequences (sliding) pour le modèle
# ---------------------------
# On construit sequences non chevauchantes (ou chevauchantes selon step_sequence)
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 64
seq_length = 60       # longueur des séquences pour le modèle (en minutes)

step_seq = 1  # si 1 -> séquences glissantes à chaque point; si seq_length -> non chevauchées
X = X_all_transformed
y = final_df["label"].values
T = len(X)

seqs = []
labels = []
times_seq = []

for i in range(0, T - seq_length, step_seq):
    seq = X[i:i+seq_length]
    # label associé au temps de fin de séquence (alignement)
    lab = y[i+seq_length-1]
    seqs.append(seq)
    labels.append(lab)
    times_seq.append(final_df.index[i+seq_length-1])

X_seq = np.stack(seqs).astype(np.float32)            # shape (N_seq, seq_length, n_features)
y_seq = np.array(labels)          # shape (N_seq,)

# Conversion immédiate en Tenseur PyTorch (en évitant le risque du dtype('O'))
# On s'assure que tout est converti en Float32 (pour le GPU) et Long (pour les labels)
X_seq_t = torch.from_numpy(np.ascontiguousarray(X_seq)).float().to(device)
y_seq_t = torch.from_numpy(y_seq).to(device).long()

print(f"Shape de X_seq: {X_seq.shape}") 
# **ATTENDU : (N_sequences, 60, N_features)**
# REÇU (Probable) : (N_sequences, N_features)

if len(X_seq.shape) != 3:
    raise ValueError("X_seq n'a pas 3 dimensions. Le séquençage a échoué.")

# ---------------------------
# 11. Train/Val/Test split temporel
# ---------------------------
N = len(X_seq)
train_frac = 0.7
val_frac = 0.15
test_frac = 0.15

i_train_end = int(N * train_frac)
i_val_end   = int(N * (train_frac + val_frac))

X_train = X_seq[:i_train_end]
y_train = y_seq[:i_train_end]
X_val   = X_seq[i_train_end:i_val_end]
y_val   = y_seq[i_train_end:i_val_end]
X_test  = X_seq[i_val_end:]
y_test  = y_seq[i_val_end:]

# Convert to torch tensors
X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_t = torch.tensor(y_train, dtype=torch.long).to(device)
X_val_t   = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_t   = torch.tensor(y_val, dtype=torch.long).to(device)
X_test_t  = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_t  = torch.tensor(y_test, dtype=torch.long).to(device)

train_ds = TensorDataset(X_train_t, y_train_t)
val_ds   = TensorDataset(X_val_t, y_val_t)
test_ds  = TensorDataset(X_test_t, y_test_t)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

Shape de X_seq: (36483, 60, 13)


In [36]:
# Nouvelle Cellule de Vérification (après In[11])
def check_label_distribution(y_array, name):
    counts = pd.Series(y_array).value_counts(normalize=True).sort_index()
    print(f"\n[D] Distribution des labels dans {name}:")
    print(counts)

check_label_distribution(y_train_t, "Train")
check_label_distribution(y_val_t, "Validation")
check_label_distribution(y_test_t, "Test")


[D] Distribution des labels dans Train:
0    0.421764
1    0.022555
2    0.062025
3    0.005639
4    0.488018
Name: proportion, dtype: float64

[D] Distribution des labels dans Validation:
0    0.423246
1    0.026316
2    0.072368
3    0.006579
4    0.471491
Name: proportion, dtype: float64

[D] Distribution des labels dans Test:
0    0.423168
1    0.026311
2    0.072355
3    0.006578
4    0.471588
Name: proportion, dtype: float64


### Application du transformer

In [37]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        
        # Initialisation de la matrice de codage positionnel
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        
        # Calcul de la division pour les fonctions sin/cos
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        # Application des fonctions sin et cos
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        pe = pe.unsqueeze(0).transpose(0, 1) # Shape: (Max_len, 1, d_model)
        self.register_buffer('pe', pe)

        

    def forward(self, x):
        
        tpe = self.pe[:x.size(1), :].squeeze()
        
        # x shape: (Seq_len, Batch_size, d_model)
        # Ajout du codage positionnel aux embeddings
        x = x + tpe
        return x

In [38]:
class TransformerClassifier(nn.Module):
    def __init__(self, n_features_input, n_classes, seq_length, 
                 d_model=128, nhead=4, num_layers=2, dropout=0.1):
        """
        :param n_features_input: Nombre total de colonnes (Numériques + One-Hot) dans X_all.
        :param n_classes: Nombre de classes de label (5 dans votre cas : 0 à 4).
        :param seq_length: Longueur de séquence (60 minutes).
        :param d_model: Dimension de l'embedding interne (souvent 64, 128, 256...).
        :param nhead: Nombre de têtes d'attention.
        :param num_layers: Nombre de blocs encodeurs à empiler.
        """
        super(TransformerClassifier, self).__init__()
        
        self.model_type = 'Transformer'
        self.d_model = d_model
        
        # 1. Feature Embedding (Projection d'entrée)
        # Permet de projeter les N_features dans l'espace d_model
        self.input_projection = nn.Linear(n_features_input, d_model)

        # 2. Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model, max_len=seq_length)
        self.dropout = nn.Dropout(p=dropout)

        # 3. Transformer Encoder Blocks (PyTorch natif)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=nhead, 
            dropout=dropout, 
            batch_first=True # Attention: le format attendu est (Seq_len, Batch_size, d_model)
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        # 4. Output Head (Classification)
        # Utilise un Global Average Pooling sur la séquence, puis une couche linéaire
        self.decoder = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(d_model // 2, n_classes)
        )

    def forward(self, src):
        # src shape: (Batch_size, Seq_len, N_features)
        
        # 0. Transposer pour le format Transformer: (Seq_len, Batch_size, N_features)
        # src = src.transpose(0, 1) 
        
        # 1. Projection (Embedding)
        src = self.input_projection(src) * math.sqrt(self.d_model) # Scaling par sqrt(d_model)
        
        # 2. Positional Encoding
        src = self.pos_encoder(src)
        src = self.dropout(src)
        

        # 3. Transformer Encoder
        output = self.transformer_encoder(src) # output shape: (Seq_len, Batch_size, d_model)
        

        # 4. Global Average Pooling sur la dimension de la séquence (dim=0 après transpose)
        # On prend la moyenne des vecteurs de features de la séquence de 60 minutes
        sequence_avg = output.mean(dim=1) # sequence_avg shape: (Batch_size, d_model)
        

        # 5. Classification Head
        return self.decoder(sequence_avg)

In [39]:
print(X_all_transformed.shape)

(36543, 13)


In [40]:
X_all_transformed.dtype

dtype('float64')

In [41]:
# HYPERPARAMÈTRES
N_FEATURES_INPUT = X_all_transformed.shape[1]  # <--- REMPLACER AVEC LA TAILLE VRAIMENT OBTENUE (ex: 35)
N_CLASSES = 5                      # Classes 0 à 4
SEQ_LENGTH = 60                    # Longueur de séquence définie dans la Cellule 45

# Paramètres du Transformer
D_MODEL = 128
N_HEAD = 4
N_LAYERS = 2

# Instanciation
model = TransformerClassifier(
    n_features_input=13,
    n_classes=N_CLASSES,
    seq_length=SEQ_LENGTH,
    d_model=D_MODEL,
    nhead=N_HEAD,
    num_layers=N_LAYERS
)

# Afficher la structure pour vérification
print(model)

TransformerClassifier(
  (input_projection): Linear(in_features=13, out_features=128, bias=True)
  (pos_encoder): PositionalEncoding()
  (dropout): Dropout(p=0.1, inplace=False)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
 

In [42]:
# =======================================================
# 1. Définition des Hyperparamètres d'Entraînement
# =======================================================
LEARNING_RATE = 1e-4  # Taux d'apprentissage, valeur standard pour les Transformers
NUM_EPOCHS = 5       # Nombre d'époques pour l'entraînement (à ajuster)
# Le DEVICE a été défini dans la Cellule 1 (ex: 'cuda' ou 'cpu')

# =======================================================
# 2. Critère de Perte et Optimiseur
# =======================================================
# Utilisation de la CrossEntropyLoss pour la classification multi-classe (5 classes)
criterion = nn.CrossEntropyLoss()

# Utilisation d'AdamW, une version d'Adam avec régularisation (Weight Decay)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# =======================================================
# 3. Fonctions d'Entraînement et de Validation
# =======================================================

def train_epoch(model, dataloader, criterion, optimizer, device):
    """Effectue une passe d'entraînement sur toutes les données du DataLoader."""
    model.train() # Met le modèle en mode entraînement
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    
    for inputs, labels in dataloader:
        # AJOUTEZ CETTE LIGNE DE VÉRIFICATION :
        if inputs.dim() != 3:
            print(f"ERREUR DIMENSION: Tenseur d'entrée a {inputs.dim()} dimensions. Taille : {inputs.size()}")
            # Arrêtez ici pour diagnostiquer le batch_size
            raise ValueError("Tenseur d'entrée invalide (pas 3 dims).")
        inputs, labels = inputs.to(device), labels.to(device)

        
        # 1. Mise à zéro des gradients
        optimizer.zero_grad()
        
        # 2. Passe avant (Forward pass)
        outputs = model(inputs)
        
        # 3. Calcul de la perte
        loss = criterion(outputs, labels)
        
        # 4. Passe arrière (Backward pass)
        loss.backward()
        
        # 5. Mise à jour des poids
        optimizer.step()
        
        total_loss += loss.item() * inputs.size(0)
        
        # Calcul de l'exactitude (Accuracy)
        _, predicted = torch.max(outputs.data, 1)
        total_samples += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

    avg_loss = total_loss / total_samples
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy


def evaluate(model, dataloader, criterion, device):
    """Évalue le modèle sur un jeu de données (validation ou test)."""
    model.eval() # Met le modèle en mode évaluation (désactive dropout/batchnorm)
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad(): # Désactive le calcul des gradients pour l'évaluation
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item() * inputs.size(0)
            
            # Calcul de l'exactitude (Accuracy)
            _, predicted = torch.max(outputs.data, 1)
            total_samples += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

    avg_loss = total_loss / total_samples
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy

In [43]:
# =======================================================
# 4. Boucle Principale d'Entraînement
# =======================================================
best_val_loss = float('inf')

print(f"Début de l'entraînement sur {NUM_EPOCHS} époques...")

for epoch in range(1, NUM_EPOCHS + 1):
    # --- Entraînement ---
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    
    # --- Validation ---
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    
    print(f"Epoch {epoch}/{NUM_EPOCHS}:")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")
    
    # Sauvegarde du meilleur modèle basé sur la perte de validation
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        
        # Vous pouvez sauvegarder le modèle ici si vous le souhaitez
        # torch.save(model.state_dict(), 'best_transformer_model.pth')
        print(f"  -> Modèle sauvegardé à l'époque {epoch} (Val Loss: {val_loss:.4f})")

Début de l'entraînement sur 5 époques...
Epoch 1/5:
  Train Loss: 0.3616 | Train Acc: 0.9050
  Val Loss:   0.3997 | Val Acc:   0.8905
  -> Modèle sauvegardé à l'époque 1 (Val Loss: 0.3997)
Epoch 2/5:
  Train Loss: 0.2192 | Train Acc: 0.9352
  Val Loss:   0.6441 | Val Acc:   0.8807
Epoch 3/5:
  Train Loss: 0.1935 | Train Acc: 0.9429
  Val Loss:   0.6936 | Val Acc:   0.8810
Epoch 4/5:
  Train Loss: 0.1792 | Train Acc: 0.9471
  Val Loss:   0.7845 | Val Acc:   0.8818
Epoch 5/5:
  Train Loss: 0.1720 | Train Acc: 0.9475
  Val Loss:   0.8019 | Val Acc:   0.8812
