### Import Libraries

In [6]:
import os, re
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision.models import resnet50

import matplotlib.pyplot as plt
import torch.nn.functional as F

from torch_geometric.data import Data as PyGData, Batch as PyGBatch
from torch_geometric.nn import GATConv
from torch_geometric.utils import to_dense_batch

from torch.utils.data import Dataset
import math, random, time
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

### Path: Input and Output

In [3]:
# Input/output locations for the feature-extraction step (video P01_01).
# NOTE(review): these are absolute, machine-specific Windows paths; edit them
# before running on another machine.
RGB_FOLDER = Path(r"D:\Datasets\Datasets\EPIC_Kitchen\RGB\P01_01\Original")
LABEL_CSV   = Path(r"D:\Datasets\Datasets\EPIC\Labels\P01_01.csv")
OUTPUT_FUSED_CSV = Path(r"D:\Datasets\Datasets\EPIC\Features\RGB_Only\P01_01_rgb_only.csv")

# Keep every SAMPLE_RATE-th frame of the sorted frame list (1 = keep all frames).
SAMPLE_RATE = 1
# Size of the per-frame feature vector written to the CSV (feat_0 .. feat_{FEAT_DIM-1}).
FEAT_DIM = 512
# Run the backbone on the GPU when one is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Create the output directory up front so the CSV write at the end cannot fail on it.
OUTPUT_FUSED_CSV.parent.mkdir(parents=True, exist_ok=True)

# Digits that sit immediately before the final ".ext" of a file name.
_frame_number_re = re.compile(r"(\d+)(?=\.[^.]+$)")


def parse_frame_index(fname: str):
    """Return the frame number encoded in a frame file name.

    The digits directly in front of the file extension win
    (``frame_00012.jpg`` -> 12). When there are none, the last run of digits
    anywhere in the name is used, and 0 is returned if the name holds no
    digits at all.
    """
    before_ext = _frame_number_re.search(fname)
    if before_ext is not None:
        return int(before_ext.group(1))

    digit_runs = re.findall(r"\d+", fname)
    if not digit_runs:
        return 0
    return int(digit_runs[-1])

### Feature extractor: ResNet50 backbone + linear projection from 2048-D to 512-D (an untrained `nn.Linear` layer, not a fitted PCA)

In [4]:
from torchvision.models import ResNet50_Weights

# Seed for the projection layer. The projection is an *untrained* random linear
# map; without a fixed seed every kernel restart would produce a different map,
# so feature CSVs extracted in different sessions would not be comparable.
# NOTE: CSVs written before this seed was introduced used an arbitrary
# projection and must be re-extracted to be consistent with new ones.
PROJ_SEED = 0

# ImageNet-pretrained ResNet50. `weights=True` is the deprecated boolean form
# of this same checkpoint; the explicit enum avoids the deprecation warning.
_resnet = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
# Drop the final classification layer so the network ends at global average
# pooling and yields a 2048-channel descriptor per image.
_resnet = nn.Sequential(*list(_resnet.children())[:-1]).to(DEVICE).eval()

# 2048 -> FEAT_DIM random projection, initialised from a private generator so
# the global torch RNG state is left untouched. U(-1/sqrt(in), 1/sqrt(in)) is
# the same range nn.Linear uses for its default initialisation.
_proj = nn.Linear(2048, FEAT_DIM)
_proj_gen = torch.Generator().manual_seed(PROJ_SEED)
_proj_bound = 2048 ** -0.5
with torch.no_grad():
    _proj.weight.uniform_(-_proj_bound, _proj_bound, generator=_proj_gen)
    _proj.bias.uniform_(-_proj_bound, _proj_bound, generator=_proj_gen)
_proj = _proj.to(DEVICE).eval()

# Preprocessing: fixed 224x224 resize, tensor conversion, then per-channel
# normalisation with the mean/std values listed below.
_transform = T.Compose([
    T.Resize((224,224)),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])

@torch.no_grad()
def extract_feature_from_pil(pil_img: Image.Image):
    """Embed one PIL image and return the projected feature as a float32 array.

    The image goes through `_transform`, the truncated `_resnet` backbone and
    the `_proj` linear layer, all on `DEVICE`; gradients are disabled. The
    result is a 1-D NumPy array on the CPU.
    """
    batch = _transform(pil_img).unsqueeze(0).to(DEVICE)   # add a batch axis of size 1
    pooled = _resnet(batch).view(1, -1)                   # flatten to one row per image
    embedding = _proj(pooled).squeeze(0)                  # drop the batch axis again
    return embedding.cpu().numpy().astype(np.float32)



### Extract RGB features and save into CSV

In [76]:
def extract_and_save_rgb_only(csv_labels_path: Path,
                              rgb_folder: Path,
                              out_fused_csv: Path,
                              sample_rate: int = 1):
    """Extract one feature vector per RGB frame and save them, with labels, to CSV.

    Args:
        csv_labels_path: CSV with at least ``StartFrame`` and ``EndFrame``
            columns; ``ActionLabel`` / ``ActionName`` are used when present.
        rgb_folder: Directory holding the frame images (.jpg / .jpeg / .png).
        out_fused_csv: Destination CSV path.
        sample_rate: Keep every ``sample_rate``-th frame of the sorted file list.

    Returns:
        The DataFrame that was written: ``frame_idx``, ``frame_name``,
        ``ActionLabel``, ``ActionName`` followed by ``feat_0 .. feat_{D-1}``.
        Frames not covered by any label segment get ``-1`` / ``"Unknown"``.

    Raises:
        ValueError: ``sample_rate`` is smaller than 1.
        KeyError: the label CSV lacks ``StartFrame`` or ``EndFrame``.
        RuntimeError: no frames were found, or every frame failed to load.
    """
    if sample_rate < 1:
        raise ValueError(f"sample_rate must be >= 1, got {sample_rate}")

    # Load labels and fail early, before the (slow) extraction loop starts.
    labels_df = pd.read_csv(csv_labels_path)
    missing_cols = {"StartFrame", "EndFrame"} - set(labels_df.columns)
    if missing_cols:
        raise KeyError(f"Label CSV is missing required columns: {sorted(missing_cols)}")

    # List RGB frames in name order and subsample.
    image_exts = {".jpg", ".png", ".jpeg"}
    rgb_files = sorted(p for p in rgb_folder.iterdir()
                       if p.suffix.lower() in image_exts)
    sampled = rgb_files[::sample_rate]
    if len(sampled) == 0:
        raise RuntimeError(f"No frames found in {rgb_folder}")

    fused_rows = []
    for fp in tqdm(sampled, desc="Extract RGB-only"):
        fname = fp.name
        frame_idx = parse_frame_index(fname)

        # --- RGB feature ---
        # Best effort: an unreadable frame is reported and skipped. The context
        # manager closes the file handle immediately instead of leaving one
        # open handle per frame until garbage collection.
        try:
            with Image.open(fp) as img:
                rgb_feat = extract_feature_from_pil(img.convert("RGB"))
        except Exception as e:
            print(f"[WARN] RGB skip {fname}: {e}")
            continue

        # --- Label: first segment whose [StartFrame, EndFrame] contains the frame ---
        # Rows with NaN bounds never match because comparisons with NaN are False.
        lr = labels_df[(labels_df["StartFrame"] <= frame_idx) &
                       (labels_df["EndFrame"] >= frame_idx)]
        action_label, action_name = -1, "Unknown"
        if not lr.empty:
            first_match = lr.iloc[0]
            raw_label = first_match.get("ActionLabel", -1)
            # A matching segment with a missing ActionLabel keeps -1 rather
            # than crashing the whole run on int(NaN).
            if not pd.isna(raw_label):
                action_label = int(raw_label)
            action_name = str(first_match.get("ActionName", "Unknown"))

        row = {
            "frame_idx": int(frame_idx),
            "frame_name": fname,
            "ActionLabel": int(action_label),
            "ActionName": action_name
        }
        row.update({f"feat_{i_val}": float(v) for i_val, v in enumerate(rgb_feat)})
        fused_rows.append(row)

    if len(fused_rows) == 0:
        raise RuntimeError("No fused rows extracted; check paths and files.")

    df_fused = pd.DataFrame(fused_rows)
    df_fused.to_csv(out_fused_csv, index=False)
    print(f"[SAVED] RGB-only CSV -> {out_fused_csv}")
    return df_fused

# Run the extraction for the video configured above; this writes OUTPUT_FUSED_CSV.
df_fused = extract_and_save_rgb_only(
    csv_labels_path = LABEL_CSV,
    rgb_folder      = RGB_FOLDER,
    out_fused_csv   = OUTPUT_FUSED_CSV,
    sample_rate     = SAMPLE_RATE
)

# Read the file back from disk to confirm it was written and preview the first rows.
data = pd.read_csv(OUTPUT_FUSED_CSV)
data.head()

## Load the extracted feature CSV and the label CSV

In [1]:
import pandas as pd
from pathlib import Path

def load_fused_csv_by_path(fused_csv_path: str):
    """Read a fused-feature CSV and return it ordered by frame index.

    Raises:
        FileNotFoundError: the path does not exist.
        KeyError: the CSV has no ``frame_idx`` column.

    Returns:
        DataFrame with ``frame_idx`` cast to int, sorted ascending on it and
        re-indexed from 0.
    """
    csv_path = Path(fused_csv_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"Fused features CSV not found: {csv_path}")

    frames = pd.read_csv(csv_path)
    if "frame_idx" not in frames.columns:
        raise KeyError("Fused CSV must contain 'frame_idx' column")

    return (
        frames.assign(frame_idx=frames["frame_idx"].astype(int))
        .sort_values("frame_idx")
        .reset_index(drop=True)
    )

def load_label_csv_by_path(label_csv_path: str):
    """Read the label CSV at ``label_csv_path`` and return it unchanged.

    Raises:
        FileNotFoundError: the path does not exist.
    """
    label_file = Path(label_csv_path)
    if label_file.exists():
        return pd.read_csv(label_file)
    raise FileNotFoundError(f"Label CSV not found: {label_file}")

# ------------------------------------ Paths ---------------------------
# NOTE(review): these paths are relative to the kernel's working directory and
# point at video P01_05, whereas the extraction cell above writes P01_01 to an
# absolute D:\ location — confirm the intended files before running.
fused_df = load_fused_csv_by_path(r"EPIC-Kitchens\Features\RGB_Only\P01_05_rgb_only.csv")
labels_df = load_label_csv_by_path(r"EPIC-Kitchens\Labels\P01_05.csv")

In [18]:
fused_df

Unnamed: 0,frame_idx,frame_name,ActionLabel,ActionName,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,...,feat_502,feat_503,feat_504,feat_505,feat_506,feat_507,feat_508,feat_509,feat_510,feat_511
0,0,frame_00000.jpg,-1,Unknown,-0.077153,-0.060141,-0.063853,-0.175518,-0.600904,0.060846,...,-0.100195,-0.011176,0.040939,-0.115192,0.337024,-0.334822,0.071299,0.363217,0.035090,0.130953
1,1,frame_00001.jpg,-1,Unknown,-0.078960,-0.046701,-0.057949,-0.184592,-0.615166,0.065532,...,-0.097203,-0.003051,0.027492,-0.099771,0.331598,-0.338926,0.076210,0.371555,0.051404,0.124059
2,2,frame_00002.jpg,-1,Unknown,-0.160049,-0.007246,-0.074976,-0.242157,-0.495369,0.244383,...,0.021188,0.076688,0.034153,-0.087643,0.294703,-0.237091,0.121907,0.275383,0.008786,0.021006
3,3,frame_00003.jpg,-1,Unknown,-0.158195,0.001839,-0.073915,-0.236323,-0.492643,0.253376,...,-0.011987,0.039287,0.034120,-0.120837,0.304935,-0.245893,0.105011,0.285998,-0.018592,0.051988
4,4,frame_00004.jpg,-1,Unknown,-0.054177,-0.061776,-0.066674,-0.177514,-0.485966,0.233870,...,0.077171,-0.067824,0.042044,-0.137903,0.280240,-0.247863,0.156662,0.275507,0.035300,-0.094748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76320,76320,frame_76320.jpg,-1,Unknown,-0.391543,0.130388,0.190418,-0.307356,-0.693469,0.317763,...,-0.412232,-0.095521,0.046825,0.204191,0.083198,-0.064733,0.072751,0.159636,-0.103465,0.076448
76321,76321,frame_76321.jpg,-1,Unknown,-0.496509,0.123087,0.151213,-0.352214,-0.785692,0.329649,...,-0.440851,-0.046618,0.003361,0.243825,0.187601,-0.037656,0.004446,0.199940,-0.151751,0.041779
76322,76322,frame_76322.jpg,-1,Unknown,-0.471430,0.085628,0.167869,-0.328093,-0.783837,0.304862,...,-0.440618,-0.022471,-0.008947,0.244638,0.219109,-0.040398,0.009183,0.185624,-0.140589,0.042815
76323,76323,frame_76323.jpg,-1,Unknown,-0.466581,0.063967,0.173334,-0.321778,-0.687768,0.327833,...,-0.429024,0.001221,0.060987,0.165486,0.191083,-0.147650,0.033018,0.160600,-0.213605,0.077080


In [19]:
labels_df

Unnamed: 0,StartFrame,EndFrame,Verb,Verb_class,Noun,Noun_class,ActionLabel,ActionName
0,248.0,355.0,open,2.0,fridge,10.0,0.0,open fridge
1,390.0,484.0,take,0.0,mushroom,110.0,1.0,take mushroom
2,481.0,522.0,move,9.0,container,29.0,2.0,move container
3,524.0,853.0,take,0.0,sausage,84.0,3.0,take sausage
4,849.0,973.0,put,1.0,mushroom,110.0,4.0,put mushroom
...,...,...,...,...,...,...,...,...
256,75526.0,75600.0,put,1.0,rosemary,331.0,121.0,put rosemary
257,75751.0,75824.0,take,0.0,knife,5.0,27.0,take knife
258,75823.0,76139.0,mix,6.0,food,37.0,122.0,mix food
259,,,,,,,,


### Dataset Loader
_We have to create the Dataset Loader for this particular task (Action Anticipation)_

In [7]:
# Target value meaning "no label at this horizon"; such targets are skipped by
# the loss and the metrics.
IGNORE_INDEX = -1


class SingleVideoAnticipationDataset(Dataset):
    """Observation windows with time-based future labels for a single video.

    One sample is created per label row that has a usable ``EndFrame``. The
    sample's observation window is the ``t_obs`` frames ending at that frame,
    and its targets are the verb / noun / action labels active at
    ``obs_end + round(h * fps)`` for every horizon ``h`` in ``horizons_s``.

    Args:
        fused_df_or_path: DataFrame, or CSV path, with a ``frame_idx`` column
            and ``feat_0 .. feat_{feat_dim-1}`` columns.
        labels_df_or_path: DataFrame, or CSV path, with ``StartFrame`` and
            ``EndFrame`` columns plus optional verb / noun / action columns.
        t_obs: Number of frames in the observation window.
        k_fut: Number of future horizons; must equal ``len(horizons_s)``.
        feat_dim: Number of feature columns to read.
        fps: Frame rate used to convert horizons from seconds to frames.
        horizons_s: Anticipation horizons in seconds.

    Raises:
        KeyError: a required column is missing.
        ValueError: the feature table has no rows.
        RuntimeError: no label row has a usable ``EndFrame``.
    """

    # Candidate column names per task, in priority order; the first one that
    # exists in the label table is used.
    _LABEL_COLUMN_CANDIDATES = {
        "verb":   ["Verb_class", "verb", "Verb", "verb_class"],
        "noun":   ["Noun_class", "noun", "Noun", "noun_class"],
        "action": ["Action_class", "action", "Action", "ActionLabel"],
    }

    def __init__(self, fused_df_or_path, labels_df_or_path,
                 t_obs: int, k_fut: int, feat_dim: int,
                 fps: float, horizons_s: list[float]):

        # Accept either in-memory frames (copied, so the caller's objects are
        # never mutated) or CSV paths.
        if isinstance(fused_df_or_path, (str, Path)):
            fused_df = pd.read_csv(fused_df_or_path)
        else:
            fused_df = fused_df_or_path.copy()
        if isinstance(labels_df_or_path, (str, Path)):
            labels_df = pd.read_csv(labels_df_or_path)
        else:
            labels_df = labels_df_or_path.copy()

        if "frame_idx" not in fused_df.columns:
            raise KeyError("fused_df must contain 'frame_idx'")
        if fused_df.empty:
            raise ValueError("fused_df contains no rows")

        fused_df["frame_idx"] = fused_df["frame_idx"].astype(int)
        self.fused_df = fused_df.set_index("frame_idx", drop=False).sort_index()
        self.labels_df = labels_df.reset_index(drop=True)

        if not all(c in self.labels_df.columns for c in ["StartFrame", "EndFrame"]):
            raise KeyError("labels_df must contain StartFrame and EndFrame")

        self.t_obs = int(t_obs)
        self.k_fut = int(k_fut)
        self.feat_dim = int(feat_dim)
        self.feat_cols = [f"feat_{i}" for i in range(self.feat_dim)]

        # Time info used to turn horizons (seconds) into frame offsets.
        self.fps = float(fps)
        assert len(horizons_s) == self.k_fut, "len(horizons_s) must equal k_fut"
        self.horizons_s = list(horizons_s)

        # The frame range of the feature table is fixed after construction, so
        # compute it once instead of on every __getitem__ call.
        self._idx_min = int(self.fused_df.index.min())
        self._idx_max = int(self.fused_df.index.max())

        # Samples: one per label row, using EndFrame as the end of observation.
        # Rows whose EndFrame is missing or not numeric (e.g. trailing all-NaN
        # rows) are skipped; only conversion errors are swallowed.
        self.samples = []
        for ridx, end_frame in self.labels_df["EndFrame"].items():
            try:
                obs_end = int(end_frame)
            except (TypeError, ValueError, OverflowError):
                continue
            self.samples.append({"label_row_idx": int(ridx), "obs_end": obs_end})

        if len(self.samples) == 0:
            raise RuntimeError("No valid label rows found")

    def __len__(self):
        return len(self.samples)

    def _resolve_label_column(self, task: str):
        """Return the first candidate column for ``task`` present in the labels, else None."""
        for name in self._LABEL_COLUMN_CANDIDATES[task]:
            if name in self.labels_df.columns:
                return name
        return None

    def _time_based_future_labels(self, obs_end: int):
        """Look up the verb / noun / action label active at each future horizon.

        For every horizon the target frame is ``obs_end + round(h * fps)``. The
        first label row whose ``[StartFrame, EndFrame]`` contains that frame
        supplies the targets; if no row contains it, or the value is missing,
        the target is ``IGNORE_INDEX``.

        Returns:
            Dict with keys ``"verb"``, ``"noun"``, ``"action"``, each a long
            tensor with one entry per horizon.
        """
        labels_df = self.labels_df
        columns = {task: self._resolve_label_column(task)
                   for task in ("verb", "noun", "action")}
        targets = {task: [] for task in columns}

        for h_sec in self.horizons_s:
            future_frame = obs_end + int(round(h_sec * self.fps))
            seg = labels_df[(labels_df["StartFrame"] <= future_frame) &
                            (labels_df["EndFrame"]   >= future_frame)]
            row = None if seg.empty else seg.iloc[0]
            for task, col in columns.items():
                if row is None or col is None or pd.isna(row[col]):
                    targets[task].append(IGNORE_INDEX)
                else:
                    targets[task].append(int(row[col]))

        return {task: torch.tensor(values, dtype=torch.long)
                for task, values in targets.items()}

    def __getitem__(self, idx):
        """Return ``(F_window, y_multi, meta)`` for sample ``idx``.

        ``F_window`` is a float32 tensor of shape ``(t_obs, feat_dim)``,
        ``y_multi`` the dict from ``_time_based_future_labels`` and ``meta``
        the frame range actually used plus the originating label row index.
        """
        rec = self.samples[idx]
        span = self.t_obs - 1

        # NOTE(review): when the segment ends earlier than `t_obs - 1`, the end
        # of observation is pushed forward to `t_obs - 1`, so the window (and
        # the horizons measured from it) extend past the segment's EndFrame.
        # Kept as in the original; confirm this is intended.
        obs_end = max(rec["obs_end"], span)
        obs_end = min(obs_end, self._idx_max)
        obs_start = max(obs_end - span, self._idx_min)

        # Select only the feature columns for the wanted frames. Frames missing
        # from the table are filled from the previous frame, then the next one,
        # and finally with zeros; feature columns missing altogether end up as
        # zeros as well.
        desired = list(range(obs_start, obs_end + 1))
        sel = (self.fused_df
               .reindex(index=desired, columns=self.feat_cols)
               .ffill()
               .bfill()
               .fillna(0.0))

        if sel.shape[0] == 0:
            # No frame of the window exists in the table at all.
            F_window = torch.zeros(self.t_obs, self.feat_dim)
        else:
            # copy=True yields a fresh writable array that torch can wrap.
            F_window = torch.from_numpy(sel.to_numpy(dtype="float32", copy=True))
            n_pad = self.t_obs - F_window.shape[0]
            if n_pad > 0:
                # Window shorter than t_obs: left-pad by repeating its first frame.
                F_window = torch.cat([F_window[:1].expand(n_pad, -1), F_window], dim=0)

        y_multi = self._time_based_future_labels(obs_end)

        meta = {"obs_start": int(obs_start),
                "obs_end":   int(obs_end),
                "label_row_idx": int(rec["label_row_idx"])}
        return F_window, y_multi, meta

### GRAPH Construction 
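_Each observation window becomes a graph: every frame is a node connected to its k most cosine-similar frames (kNN), and the edges are made bidirectional._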

In [8]:
K = 5       # neighbours per node in the kNN graph
DROP = 0.1  # default dropout rate for the model components below


def build_topk_edge_index(features: torch.Tensor, k=K):
    """Build a bidirectional kNN edge index from cosine similarity.

    Each row of ``features`` is a node. Every node is linked to its ``k`` most
    similar other nodes, and each link is added in both directions. Mutual
    neighbours therefore appear twice; duplicates are not removed.

    Args:
        features: ``(num_nodes, dim)`` tensor.
        k: Neighbours per node. Clamped to ``num_nodes - 1`` so small graphs
            neither raise nor produce self-loops.

    Returns:
        Long tensor of shape ``(2, 2 * num_nodes * k_eff)`` on the device of
        ``features``; ``(2, 0)`` when the graph has fewer than two nodes.
    """
    num_nodes = int(features.size(0))
    k_eff = min(int(k), num_nodes - 1)
    if k_eff <= 0:
        return torch.empty((2, 0), dtype=torch.long, device=features.device)

    x = F.normalize(features, dim=1)
    sim = torch.matmul(x, x.t())   # (T,T) cosine similarities
    # Exclude self-matches with -inf: a finite -1.0 can tie with, or even beat,
    # an exactly antiparallel neighbour after rounding.
    sim.fill_diagonal_(float("-inf"))
    _, idxs = torch.topk(sim, k_eff, dim=1)

    src = (torch.arange(num_nodes, device=features.device)
           .unsqueeze(1).expand(-1, k_eff).reshape(-1))
    dst = idxs.reshape(-1)
    edge = torch.stack([src, dst], dim=0)
    edge_rev = torch.stack([dst, src], dim=0)
    return torch.cat([edge, edge_rev], dim=1).long()

class BatchedGAT(nn.Module):
    """Stack of GAT layers applied to a batch of graphs, returned as a dense tensor.

    Args:
        in_dim: Node feature size; also the output feature size.
        hid_dim: Hidden size of the GAT layers (defaults to ``in_dim``). Must be
            divisible by ``heads`` because each head produces ``hid_dim // heads``
            channels and the heads are concatenated.
        num_layers: Number of ``GATConv`` layers.
        heads: Attention heads per layer.
        dropout: Dropout rate passed to every ``GATConv``.

    Raises:
        ValueError: ``hid_dim`` is not divisible by ``heads``.
    """

    def __init__(self, in_dim, hid_dim=None, num_layers=3, heads=8, dropout=DROP):
        super().__init__()
        hid = hid_dim or in_dim
        # With a non-divisible size the concatenated heads would have fewer
        # than `hid` channels and the model would fail with an opaque shape
        # error on the first forward pass; fail here with a clear message.
        if hid % heads != 0:
            raise ValueError(
                f"hid_dim ({hid}) must be divisible by heads ({heads})"
            )
        self.convs = nn.ModuleList()
        for i in range(num_layers):
            in_ch = in_dim if i == 0 else hid
            self.convs.append(
                GATConv(in_ch, hid // heads, heads=heads, concat=True, dropout=dropout)
            )
        self.proj = nn.Linear(hid, in_dim)
        self.norm = nn.LayerNorm(in_dim)
        self.act = nn.GELU()

    def forward(self, pyg_batch: PyGBatch, T_per_sample: int):
        """Run the GAT stack and return node features as ``(B, T_per_sample, in_dim)``.

        Graphs are densified with ``to_dense_batch``; the node axis is then
        zero-padded or truncated to exactly ``T_per_sample`` before the final
        layer norm.
        """
        edge_index = pyg_batch.edge_index
        h = pyg_batch.x
        for conv in self.convs:
            h = self.act(conv(h, edge_index))
        h = self.proj(h)

        node_feats, _ = to_dense_batch(h, batch=pyg_batch.batch)  # (B, max_nodes, D)
        B, max_nodes, D = node_feats.shape
        if max_nodes < T_per_sample:
            # new_zeros matches the dtype and device of the node features.
            pad = node_feats.new_zeros(B, T_per_sample - max_nodes, D)
            node_feats = torch.cat([node_feats, pad], dim=1)
        elif max_nodes > T_per_sample:
            node_feats = node_feats[:, :T_per_sample, :]
        return self.norm(node_feats)

### Encoder, Decoder, Anticipation Model
_Now here, we have GETR + HATD_

In [9]:
class GETR(nn.Module):
    """Transformer encoder over a frame sequence with a learned positional embedding.

    ``pos_emb`` holds one learned vector per position up to ``max_len``; the
    first ``T`` of them are added to the input before it is encoded.
    """

    def __init__(self, d_model, nhead=8, num_layers=3, dim_feedforward=2048, dropout=DROP, max_len=1000):
        super().__init__()
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.pos_emb = nn.Parameter(torch.randn(1, max_len, d_model))

    def forward(self, x):
        """Encode ``x`` of shape ``(B, T, D)`` and return a tensor of the same shape."""
        _, seq_len, _ = x.shape
        positions = self.pos_emb[:, :seq_len, :].to(x.device)
        return self.encoder(x + positions)

class AnticipationModel(nn.Module):
    """Graph + transformer model predicting verb / noun / action at ``k_fut`` horizons.

    A window of frame features is encoded twice — by a GAT over a kNN frame
    graph and by the `GETR` transformer encoder — and the two encodings are
    summed. A transformer decoder with ``k_fut`` learned queries attends to that
    memory, and three linear heads classify each query.

    Args:
        feat_dim: Per-frame feature size; also the model width.
        num_classes: Dict with integer entries ``"verb"``, ``"noun"``, ``"action"``.
        k_fut: Number of future horizons (learned decoder queries).
        gat_layers, gat_heads: Depth and head count of the GAT branch.
        dec_layers, dec_heads: Depth and head count of the decoder;
            ``dec_heads`` is also used for the `GETR` encoder.
        dropout: Dropout for the GAT branch and the decoder.
    """

    def __init__(self, feat_dim, num_classes: dict,
                 k_fut=5, gat_layers=3, gat_heads=8,
                 dec_layers=3, dec_heads=8, dropout=DROP):
        super().__init__()
        self.feat_dim = feat_dim
        self.k_fut = k_fut

        self.gat = BatchedGAT(
            in_dim=feat_dim,
            hid_dim=feat_dim,
            num_layers=gat_layers,
            heads=gat_heads,
            dropout=dropout,
        )
        self.encoder = GETR(d_model=feat_dim, nhead=dec_heads, num_layers=3)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=feat_dim,
            nhead=dec_heads,
            dim_feedforward=feat_dim*4,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=dec_layers)
        # One learned query per future horizon.
        self.queries = nn.Parameter(torch.randn(1, k_fut, feat_dim))

        assert isinstance(num_classes, dict)
        self.verb_head   = nn.Linear(feat_dim, num_classes["verb"])
        self.noun_head   = nn.Linear(feat_dim, num_classes["noun"])
        self.action_head = nn.Linear(feat_dim, num_classes["action"])

    def forward(self, F_batch):
        """Map ``F_batch`` of shape ``(B, T, D)`` to per-horizon logits.

        Returns:
            Dict with keys ``"verb"``, ``"noun"``, ``"action"``, each of shape
            ``(B, k_fut, num_classes[key])``.
        """
        batch_size, seq_len, _ = F_batch.shape
        device = F_batch.device

        # One kNN graph per sample; the edges are built on the CPU from detached
        # features and then moved to the batch's device.
        graphs = [
            PyGData(
                x=sample,
                edge_index=build_topk_edge_index(sample.detach().cpu(), k=K).to(device),
            )
            for sample in F_batch
        ]
        pyg_batch = PyGBatch.from_data_list(graphs).to(device)

        graph_feats = self.gat(pyg_batch, T_per_sample=seq_len)   # (B,T,D)
        seq_feats = self.encoder(F_batch)                         # (B,T,D)
        memory = seq_feats + graph_feats

        queries = self.queries.expand(batch_size, -1, -1).to(device)
        dec_out = self.decoder(tgt=queries, memory=memory)        # (B, K_fut, D)

        return {
            "verb":   self.verb_head(dec_out),
            "noun":   self.noun_head(dec_out),
            "action": self.action_head(dec_out)
        }

### Cross-Entropy Loss
_Masked cross-entropy loss and top-k accuracy for the verb, noun and action heads; targets equal to `IGNORE_INDEX` are excluded from both._

In [10]:
import torch.nn.functional as F

def masked_cross_entropy(logits, labels, ignore_index=IGNORE_INDEX):
    """Mean cross-entropy over the targets that are not ``ignore_index``.

    Args:
        logits: ``(B, K, C)`` class scores.
        labels: ``(B, K)`` integer targets.
        ignore_index: Target value to leave out of the average.

    Returns:
        Scalar tensor. When every target is ignored the result is a zero that
        is still connected to ``logits``, so ``backward()`` keeps working.
    """
    batch, horizons, n_classes = logits.shape
    flat_logits = logits.view(batch * horizons, n_classes)
    flat_labels = labels.view(batch * horizons)

    keep = (flat_labels != ignore_index).float()
    n_valid = keep.sum()
    per_item = F.cross_entropy(
        flat_logits, flat_labels, reduction='none', ignore_index=ignore_index
    )

    if n_valid == 0:
        return (flat_logits * 0.0).sum()
    return (per_item * keep).sum() / n_valid


def topk_accuracy_per_task(logits, labels, topk=(1,5), ignore_index=IGNORE_INDEX):
    """Top-k accuracy per horizon and overall, skipping ignored targets.

    Args:
        logits: ``(B, K, C)`` class scores.
        labels: ``(B, K)`` integer targets.
        topk: Values of k to evaluate. A k larger than the number of classes is
            evaluated over all classes instead of raising.
        ignore_index: Target value excluded from the counts.

    Returns:
        Dict with ``"per_h{h}_top{k}"`` for every horizon ``h`` (1-based) and
        ``"overall_top{k}"``; a value is ``None`` when no valid target exists
        for it.
    """
    B, K, C = logits.shape
    res = {}
    overall = {k: 0 for k in topk}
    total_cnt = 0

    # Never request more predictions than there are classes.
    max_k = min(max(topk), C)
    preds_topk = logits.topk(max_k, dim=-1)[1]  # (B, K, max_k)

    for h in range(K):
        lab = labels[:, h]
        mask = (lab != ignore_index)
        cnt = int(mask.sum().item())
        total_cnt += cnt
        for k in topk:
            key = f"per_h{h+1}_top{k}"
            if cnt == 0:
                res.setdefault(key, None)
                continue
            # Slicing past max_k simply yields all available columns; the
            # comparison broadcasts the label across them.
            predk = preds_topk[:, h, :k]            # (B, min(k, max_k))
            hits = (predk == lab.unsqueeze(1))
            hit = int(hits[mask].any(dim=1).float().sum().item())
            res[key] = hit / cnt
            overall[k] += hit

    for k in topk:
        res[f"overall_top{k}"] = overall[k] / total_cnt if total_cnt > 0 else None
    return res

### Training and Validation
_Now here we train the model and, after every epoch, evaluate it on the validation split._

In [17]:
from pathlib import Path
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# === EDIT these paths ===
# Feature CSV and label CSV of the video to train on, and where the best
# checkpoint is written.
# NOTE(review): absolute, machine-specific Windows paths.
FUSED_CSV_PATH = r"D:\Datasets\Datasets\EPIC\Features\RGB_Only\P01_05_rgb_only.csv"
LABEL_CSV_PATH = r"D:\Datasets\Datasets\EPIC\Labels\P01_05.csv"
BEST_MODEL_PATH = Path(r"D:\Datasets\Datasets\EPIC\Model\P01_05_model_rgb_only.pth")
# ========================

# Hyperparams
T_OBS = 90          # observation window length in frames (3 s at FPS = 30)
FEAT_DIM = 512      # number of feat_* columns read from the feature CSV
BATCH_SIZE = 8
NUM_EPOCHS = 50
LR = 1e-4           # optimiser learning rate
WD = 1e-4           # optimiser weight decay
NUM_WORKERS = 0     # DataLoader worker processes (0 = load in the main process)

# === Time-based anticipation config ===
FPS = 30.0          # frames per second, used to convert horizons to frame offsets
HORIZONS_S = [0.25, 0.5, 0.75, 1.0, 1.25, 1.50, 1.75, 2.0]   # seconds into the future
K_FUT = len(HORIZONS_S)               # model will output one label per horizon
# =====================================

# NOTE: DEVICE, FEAT_DIM and IGNORE_INDEX are also defined in earlier cells;
# the values here take effect for everything that runs after this cell.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
IGNORE_INDEX = -1  # must match your dataset + masked_cross_entropy


def detect_num_classes_from_labels_df(labels_df):
    """Infer the number of verb / noun / action classes from a label table.

    For each task the first candidate column present in ``labels_df`` is used;
    the class count is the largest non-missing id in it plus one. A task with
    no candidate column, or with only missing values, is reported as 1 class.

    Returns:
        Dict with integer entries ``"verb"``, ``"noun"`` and ``"action"``.
    """
    candidates = {
        "verb":   ["Verb_class", "verb", "Verb", "verb_class"],
        "noun":   ["Noun_class", "noun", "Noun", "noun_class"],
        "action": ["Action_class", "action", "Action", "ActionLabel"],
    }

    def _count(names):
        # Only the first matching column is considered, even if it is empty.
        column = next((c for c in names if c in labels_df.columns), None)
        if column is None:
            return 1
        ids = labels_df[column].dropna().astype(int)
        if len(ids) == 0:
            return 1
        return int(ids.max()) + 1

    return {task: _count(names) for task, names in candidates.items()}


def topk_counts(logits, labels, k, ignore_index=None):
    """Count top-k hits and valid targets over all horizons.

    Args:
        logits: ``(B, K_fut, C)`` class scores.
        labels: ``(B, K_fut)`` integer targets.
        k: Number of top predictions that count as a hit. When ``k`` exceeds
            the number of classes, all classes are used instead of raising.
        ignore_index: Target value to skip; defaults to the module-level
            ``IGNORE_INDEX``.

    Returns:
        ``(hits, total)`` as Python ints, where ``total`` is the number of
        targets different from ``ignore_index``.
    """
    if ignore_index is None:
        ignore_index = IGNORE_INDEX
    with torch.no_grad():
        B, K, C = logits.shape
        k_eff = min(int(k), C)
        topk_preds = logits.topk(k_eff, dim=-1)[1]               # (B, K, k_eff)
        valid = (labels != ignore_index)                         # (B, K)
        # A target is hit when it appears anywhere among its top-k predictions.
        hit = (topk_preds == labels.unsqueeze(-1)).any(dim=-1)   # (B, K)
        hits = int((hit & valid).sum().item())
        total = int(valid.sum().item())
        return hits, total


# Seed used for the train/val split and for torch (weight initialisation,
# DataLoader shuffling, dropout), so that a Restart & Run All reproduces the run.
SEED = 42
torch.manual_seed(SEED)

# Make sure the checkpoint directory exists before training starts; otherwise
# the first torch.save fails after a full epoch of work.
BEST_MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Load fused and labels
fused_df = pd.read_csv(FUSED_CSV_PATH)
labels_df = pd.read_csv(LABEL_CSV_PATH)

# Dataset
dataset = SingleVideoAnticipationDataset(
    fused_df,
    labels_df,
    t_obs=T_OBS,
    k_fut=K_FUT,        # must equal len(HORIZONS_S)
    feat_dim=FEAT_DIM,
    fps=FPS,
    horizons_s=HORIZONS_S
)

# split indices for train/val (60/40)
# NOTE(review): samples come from a single video and are split at random, so
# train and validation windows can overlap in time.
indices = list(range(len(dataset)))
random.seed(SEED)
random.shuffle(indices)
split_at = int(0.6 * len(indices))
train_idx = indices[:split_at]
val_idx = indices[split_at:]

train_ds = Subset(dataset, train_idx)
val_ds = Subset(dataset, val_idx)

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=(DEVICE == "cuda")
)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=(DEVICE == "cuda")
)

# detect number of classes and instantiate model
num_classes = detect_num_classes_from_labels_df(labels_df)
print("Detected num_classes:", num_classes)
model = AnticipationModel(
    feat_dim=FEAT_DIM,
    num_classes=num_classes,
    k_fut=K_FUT
).to(DEVICE)

opt = optim.Adam(model.parameters(), lr=LR, weight_decay=WD)
# Halve the learning rate when the validation loss has not improved for 3 epochs.
sched = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=3)

# Lowest validation loss seen so far; used to decide when to checkpoint.
best_val_loss = float("inf")

for epoch in range(1, NUM_EPOCHS + 1):
    t0 = time.time()

    # ------------- TRAIN -------------
    model.train()
    train_loss_sum = 0.0
    train_samples = 0
    train_counts = {
        "verb_top1": [0, 0], "verb_top5": [0, 0],
        "noun_top1": [0, 0], "noun_top5": [0, 0],
        "action_top1": [0, 0], "action_top5": [0, 0]
    }

    pbar = tqdm(train_loader, desc=f"Epoch {epoch} Train", leave=False)
    for F_batch, y_multi, meta in pbar:
        F_batch = F_batch.to(DEVICE)               # (B, T, D)
        y_v = y_multi["verb"].to(DEVICE)           # (B, K_fut)
        y_n = y_multi["noun"].to(DEVICE)
        y_a = y_multi["action"].to(DEVICE)

        opt.zero_grad()
        logits = model(F_batch)   # dict: "verb"/"noun"/"action" -> (B, K_fut, C)

        loss_v = masked_cross_entropy(logits["verb"], y_v)
        loss_n = masked_cross_entropy(logits["noun"], y_n)
        loss_a = masked_cross_entropy(logits["action"], y_a)
        loss = loss_a + 0.5 * loss_v + 0.5 * loss_n

        loss.backward()
        opt.step()

        b = F_batch.size(0)
        train_loss_sum += float(loss.item()) * b
        train_samples += b

        for (task, lab, lg) in [
            ("verb", y_v, logits["verb"]),
            ("noun", y_n, logits["noun"]),
            ("action", y_a, logits["action"])
        ]:
            h1, t1 = topk_counts(lg.detach().cpu(), lab.detach().cpu(), k=1)
            h5, t5 = topk_counts(lg.detach().cpu(), lab.detach().cpu(), k=5)
            train_counts[f"{task}_top1"][0] += h1
            train_counts[f"{task}_top1"][1] += t1
            train_counts[f"{task}_top5"][0] += h5
            train_counts[f"{task}_top5"][1] += t5

    train_loss = train_loss_sum / max(1, train_samples)
    train_metrics = {}
    for task in ["verb", "noun", "action"]:
        h1, t1 = train_counts[f"{task}_top1"]
        h5, t5 = train_counts[f"{task}_top5"]
        train_metrics[f"{task}_top1"] = (h1 / t1) if t1 > 0 else None
        train_metrics[f"{task}_top5"] = (h5 / t5) if t5 > 0 else None

    # ------------- VALIDATION -------------
    model.eval()
    val_loss_sum = 0.0
    val_samples = 0
    val_counts = {
        "verb_top1": [0, 0], "verb_top5": [0, 0],
        "noun_top1": [0, 0], "noun_top5": [0, 0],
        "action_top1": [0, 0], "action_top5": [0, 0]
    }

    # store logits/labels for per-horizon + P/R/F1 metrics
    val_logits_store = {"verb": [], "noun": [], "action": []}
    val_labels_store = {"verb": [], "noun": [], "action": []}

    with torch.no_grad():
        pbar = tqdm(val_loader, desc=f"Epoch {epoch} Val", leave=False)
        for F_batch, y_multi, meta in pbar:
            F_batch = F_batch.to(DEVICE)
            y_v = y_multi["verb"].to(DEVICE)
            y_n = y_multi["noun"].to(DEVICE)
            y_a = y_multi["action"].to(DEVICE)

            logits = model(F_batch)
            loss_v = masked_cross_entropy(logits["verb"], y_v)
            loss_n = masked_cross_entropy(logits["noun"], y_n)
            loss_a = masked_cross_entropy(logits["action"], y_a)
            loss = loss_a + 0.5 * loss_v + 0.5 * loss_n

            b = F_batch.size(0)
            val_loss_sum += float(loss.item()) * b
            val_samples += b

            for (task, lab, lg) in [
                ("verb", y_v, logits["verb"]),
                ("noun", y_n, logits["noun"]),
                ("action", y_a, logits["action"])
            ]:
                h1, t1 = topk_counts(lg.detach().cpu(), lab.detach().cpu(), k=1)
                h5, t5 = topk_counts(lg.detach().cpu(), lab.detach().cpu(), k=5)
                val_counts[f"{task}_top1"][0] += h1
                val_counts[f"{task}_top1"][1] += t1
                val_counts[f"{task}_top5"][0] += h5
                val_counts[f"{task}_top5"][1] += t5

            # store for per-horizon + P/R/F1 metrics
            val_logits_store["verb"].append(logits["verb"].detach().cpu())
            val_logits_store["noun"].append(logits["noun"].detach().cpu())
            val_logits_store["action"].append(logits["action"].detach().cpu())
            val_labels_store["verb"].append(y_v.detach().cpu())
            val_labels_store["noun"].append(y_n.detach().cpu())
            val_labels_store["action"].append(y_a.detach().cpu())

    val_loss = val_loss_sum / max(1, val_samples)

    # overall val metrics (top-1/top-5 over all horizons)
    val_metrics = {}
    for task in ["verb", "noun", "action"]:
        h1, t1 = val_counts[f"{task}_top1"]
        h5, t5 = val_counts[f"{task}_top5"]
        val_metrics[f"{task}_top1"] = (h1 / t1) if t1 > 0 else None
        val_metrics[f"{task}_top5"] = (h5 / t5) if t5 > 0 else None

    # per-horizon metrics (time-based)
    per_horizon_metrics = {"verb": {}, "noun": {}, "action": {}}
    for task in ["verb", "noun", "action"]:
        if len(val_logits_store[task]) == 0:
            continue
        logits_all = torch.cat(val_logits_store[task], dim=0)  # (N, K_fut, C)
        labels_all = torch.cat(val_labels_store[task], dim=0)  # (N, K_fut)
        m = topk_accuracy_per_task(
            logits_all,
            labels_all,
            topk=(1, 5),
            ignore_index=IGNORE_INDEX
        )
        per_horizon_metrics[task] = m

    # macro precision / recall / F1 over all horizons (validation)
    prf_metrics = {"verb": {}, "noun": {}, "action": {}}
    for task in ["verb", "noun", "action"]:
        if len(val_logits_store[task]) == 0:
            continue

        logits_all = torch.cat(val_logits_store[task], dim=0)
        labels_all = torch.cat(val_labels_store[task], dim=0)

        preds_all = logits_all.argmax(dim=-1)  # (N, K_fut)
        mask = (labels_all != IGNORE_INDEX)
        if mask.sum().item() == 0:
            continue

        y_true = labels_all[mask].numpy()
        y_pred = preds_all[mask].numpy()

        p, r, f1, _ = precision_recall_fscore_support(
            y_true,
            y_pred,
            average="macro",
            zero_division=0
        )
        prf_metrics[task]["precision"] = p
        prf_metrics[task]["recall"] = r
        prf_metrics[task]["f1"] = f1

    # mean Top-5 recall across horizons for each task
    mean_top5_recall = {}
    for task in ["verb", "noun", "action"]:
        mh = per_horizon_metrics[task]
        if not mh:
            mean_top5_recall[task] = None
            continue

        vals = []
        for h_idx in range(K_FUT):
            key = f"per_h{h_idx+1}_top5"
            if key in mh and mh[key] is not None:
                vals.append(mh[key])
        mean_top5_recall[task] = float(np.mean(vals)) if len(vals) > 0 else None

    # scheduler + logging
    sched.step(val_loss)
    elapsed = time.time() - t0

    print(f"Epoch {epoch}/{NUM_EPOCHS} | Time {elapsed:.1f}s")
    print(f"  Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    for task in ["verb", "noun", "action"]:
        print(
            f"  {task.upper():6s} Train Top1: {train_metrics[f'{task}_top1']}, "
            f"Top5: {train_metrics[f'{task}_top5']}; "
            f"Val Top1: {val_metrics[f'{task}_top1']}, "
            f"Top5: {val_metrics[f'{task}_top5']}"
        )

    # print macro precision / recall / F1 (validation)
    for task in ["verb", "noun", "action"]:
        if prf_metrics[task]:
            p = prf_metrics[task]["precision"]
            r = prf_metrics[task]["recall"]
            f1 = prf_metrics[task]["f1"]
            print(
                f"  {task.upper():6s} Val Precision: {p:.4f}, "
                f"Recall: {r:.4f}, F1: {f1:.4f}"
            )

    # print mean Top-5 recall
    print("  ---- Mean Top-5 Recall (validation) ----")
    for task in ["verb", "noun", "action"]:
        print(
            f"     {task.upper():6s}  Mean Top-5 Recall: {mean_top5_recall[task]}"
        )

    # print per-horizon by seconds
    for task in ["verb", "noun", "action"]:
        mh = per_horizon_metrics[task]
        if not mh:
            continue
        print(f"  {task.upper():6s} per-horizon (time-based):")
        for h_idx, t_sec in enumerate(HORIZONS_S):
            key1 = f"per_h{h_idx+1}_top1"
            key5 = f"per_h{h_idx+1}_top5"
            v1 = mh.get(key1, None)
            v5 = mh.get(key5, None)
            print(f"    @ {t_sec:4.2f}s  Top1: {v1}  Top5: {v5}")
        print(
            f"    overall_top1: {mh.get('overall_top1', None)}, "
            f"overall_top5: {mh.get('overall_top5', None)}"
        )

    # Checkpoint the model whenever validation loss strictly improves on the
    # best value seen so far. Every save targets the same BEST_MODEL_PATH, so
    # only the most recent best checkpoint is kept.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # The checkpoint bundles the epoch value, the state dicts of `model`
        # and `opt`, and the validation loss that triggered the save.
        torch.save(
            {
                'epoch': epoch,
                'model_state': model.state_dict(),
                'opt_state': opt.state_dict(),
                'val_loss': val_loss
            },
            BEST_MODEL_PATH
        )
        print(f"[SAVED BEST] -> {BEST_MODEL_PATH}")

# Final status line; it sits at top level, so it runs once after the indented
# per-epoch block above has finished.
print("Training finished.")

Detected num_classes: {'verb': 81, 'noun': 332, 'action': 123}


Epoch 1 Train:   0%|          | 0/20 [00:00<?, ?it/s]

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch 1 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 1/50 | Time 5.8s
  Train Loss: 9.0768 | Val Loss: 9.0472
  VERB   Train Top1: 0.16693418940609953, Top5: 0.5537720706260032; Val Top1: 0.24934383202099739, Top5: 0.6745406824146981
  NOUN   Train Top1: 0.08828250401284109, Top5: 0.2680577849117175; Val Top1: 0.015748031496062992, Top5: 0.15223097112860892
  ACTION Train Top1: 0.027287319422150885, Top5: 0.1476725521669342; Val Top1: 0.015748031496062992, Top5: 0.015748031496062992
  VERB   Val Precision: 0.0608, Recall: 0.0730, F1: 0.0374
  NOUN   Val Precision: 0.0005, Recall: 0.0323, F1: 0.0010
  ACTION Val Precision: 0.0003, Recall: 0.0196, F1: 0.0006
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6676052832077639
     NOUN    Mean Top-5 Recall: 0.1551121641819037
     ACTION  Mean Top-5 Recall: 0.015830903523067503
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6176470588235294
    @ 0.50s  Top1: 0.1891891891891892  Top5: 0.6216216216216216
    @ 0.75s  Top1: 0

Epoch 2 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 2 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 2/50 | Time 4.9s
  Train Loss: 7.6611 | Val Loss: 8.5579
  VERB   Train Top1: 0.10754414125200643, Top5: 0.7207062600321027; Val Top1: 0.28346456692913385, Top5: 0.7532808398950132
  NOUN   Train Top1: 0.12841091492776885, Top5: 0.42375601926163725; Val Top1: 0.015748031496062992, Top5: 0.2887139107611549
  ACTION Train Top1: 0.060995184590690206, Top5: 0.24879614767255218; Val Top1: 0.0, Top5: 0.09186351706036745
  VERB   Val Precision: 0.0830, Recall: 0.0804, F1: 0.0592
  NOUN   Val Precision: 0.0005, Recall: 0.0323, F1: 0.0010
  ACTION Val Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7498405352310685
     NOUN    Mean Top-5 Recall: 0.28911830109769016
     ACTION  Mean Top-5 Recall: 0.09402043552185005
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.29411764705882354  Top5: 0.7352941176470589
    @ 0.50s  Top1: 0.32432432432432434  Top5: 0.7297297297297297
    @ 0.75s  Top1: 0.272727272727272

Epoch 3 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 3 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 3/50 | Time 5.1s
  Train Loss: 6.9528 | Val Loss: 8.7282
  VERB   Train Top1: 0.22792937399678972, Top5: 0.7142857142857143; Val Top1: 0.05511811023622047, Top5: 0.6666666666666666
  NOUN   Train Top1: 0.1476725521669342, Top5: 0.4510433386837881; Val Top1: 0.09186351706036745, Top5: 0.3228346456692913
  ACTION Train Top1: 0.09149277688603531, Top5: 0.2680577849117175; Val Top1: 0.0, Top5: 0.031496062992125984
  VERB   Val Precision: 0.0037, Recall: 0.0667, F1: 0.0070
  NOUN   Val Precision: 0.0333, Recall: 0.0628, F1: 0.0265
  ACTION Val Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6592641705087658
     NOUN    Mean Top-5 Recall: 0.3257712915294051
     ACTION  Mean Top-5 Recall: 0.029072667808388956
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.058823529411764705  Top5: 0.5882352941176471
    @ 0.50s  Top1: 0.05405405405405406  Top5: 0.6216216216216216
    @ 0.75s  Top1: 0.045454545454545456

Epoch 4 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 4 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 4/50 | Time 5.4s
  Train Loss: 6.3083 | Val Loss: 8.2591
  VERB   Train Top1: 0.2568218298555377, Top5: 0.7672552166934189; Val Top1: 0.27296587926509186, Top5: 0.7349081364829396
  NOUN   Train Top1: 0.30658105939004815, Top5: 0.5457463884430177; Val Top1: 0.16010498687664043, Top5: 0.29133858267716534
  ACTION Train Top1: 0.1926163723916533, Top5: 0.4301765650080257; Val Top1: 0.10761154855643044, Top5: 0.27296587926509186
  VERB   Val Precision: 0.1420, Recall: 0.1922, F1: 0.1512
  NOUN   Val Precision: 0.0595, Recall: 0.0999, F1: 0.0590
  ACTION Val Precision: 0.0349, Recall: 0.0584, F1: 0.0413
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7312920349142926
     NOUN    Mean Top-5 Recall: 0.2879417106692715
     ACTION  Mean Top-5 Recall: 0.2762992813927945
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.3235294117647059  Top5: 0.7058823529411765
    @ 0.50s  Top1: 0.3783783783783784  Top5: 0.7027027027027027
    @ 0.75s  Top1: 0.11363636

Epoch 5 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 5 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 5/50 | Time 5.4s
  Train Loss: 5.2263 | Val Loss: 8.1828
  VERB   Train Top1: 0.32263242375601925, Top5: 0.869983948635634; Val Top1: 0.1994750656167979, Top5: 0.7611548556430446
  NOUN   Train Top1: 0.39325842696629215, Top5: 0.6837881219903692; Val Top1: 0.16272965879265092, Top5: 0.32808398950131235
  ACTION Train Top1: 0.3434991974317817, Top5: 0.6934189406099518; Val Top1: 0.01837270341207349, Top5: 0.2755905511811024
  VERB   Val Precision: 0.1525, Recall: 0.1396, F1: 0.1233
  NOUN   Val Precision: 0.0917, Recall: 0.0824, F1: 0.0758
  ACTION Val Precision: 0.0231, Recall: 0.0260, F1: 0.0164
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7564614173737292
     NOUN    Mean Top-5 Recall: 0.328298978792233
     ACTION  Mean Top-5 Recall: 0.27588980516445283
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.7352941176470589
    @ 0.50s  Top1: 0.21621621621621623  Top5: 0.7297297297297297
    @ 0.75s  Top1: 0.22727272

Epoch 6 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 6 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 6/50 | Time 5.5s
  Train Loss: 4.3080 | Val Loss: 8.1204
  VERB   Train Top1: 0.4189406099518459, Top5: 0.9197431781701445; Val Top1: 0.1784776902887139, Top5: 0.800524934383202
  NOUN   Train Top1: 0.45585874799357945, Top5: 0.7945425361155698; Val Top1: 0.2125984251968504, Top5: 0.42782152230971127
  ACTION Train Top1: 0.43980738362760835, Top5: 0.8041733547351525; Val Top1: 0.12335958005249344, Top5: 0.27296587926509186
  VERB   Val Precision: 0.1362, Recall: 0.1821, F1: 0.1218
  NOUN   Val Precision: 0.1552, Recall: 0.1588, F1: 0.1366
  ACTION Val Precision: 0.0736, Recall: 0.0921, F1: 0.0767
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7949319430713553
     NOUN    Mean Top-5 Recall: 0.4265372346509917
     ACTION  Mean Top-5 Recall: 0.2742342919957818
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.20588235294117646  Top5: 0.7647058823529411
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7567567567567568
    @ 0.75s  Top1: 0.15909090

Epoch 7 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 7 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 7/50 | Time 5.6s
  Train Loss: 3.5170 | Val Loss: 7.9324
  VERB   Train Top1: 0.5473515248796148, Top5: 0.9630818619582665; Val Top1: 0.2020997375328084, Top5: 0.7690288713910761
  NOUN   Train Top1: 0.6131621187800963, Top5: 0.9036918138041734; Val Top1: 0.2283464566929134, Top5: 0.4540682414698163
  ACTION Train Top1: 0.6597110754414125, Top5: 0.9406099518459069; Val Top1: 0.14698162729658792, Top5: 0.29658792650918636
  VERB   Val Precision: 0.1798, Recall: 0.1841, F1: 0.1381
  NOUN   Val Precision: 0.1540, Recall: 0.1644, F1: 0.1257
  ACTION Val Precision: 0.0819, Recall: 0.1121, F1: 0.0887
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7615988113842951
     NOUN    Mean Top-5 Recall: 0.4551181247692271
     ACTION  Mean Top-5 Recall: 0.29609184781715336
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.7058823529411765
    @ 0.50s  Top1: 0.1891891891891892  Top5: 0.7297297297297297
    @ 0.75s  Top1: 0.1363636363

Epoch 8 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 8 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 8/50 | Time 5.5s
  Train Loss: 2.8194 | Val Loss: 7.9447
  VERB   Train Top1: 0.6035313001605136, Top5: 0.9839486356340289; Val Top1: 0.1942257217847769, Top5: 0.7375328083989501
  NOUN   Train Top1: 0.6837881219903692, Top5: 0.9454253611556982; Val Top1: 0.2440944881889764, Top5: 0.49606299212598426
  ACTION Train Top1: 0.7191011235955056, Top5: 0.9678972712680578; Val Top1: 0.2020997375328084, Top5: 0.29133858267716534
  VERB   Val Precision: 0.1572, Recall: 0.1772, F1: 0.1427
  NOUN   Val Precision: 0.2209, Recall: 0.1662, F1: 0.1523
  ACTION Val Precision: 0.1248, Recall: 0.1404, F1: 0.1223
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7292273897914043
     NOUN    Mean Top-5 Recall: 0.49953870764696007
     ACTION  Mean Top-5 Recall: 0.2915085832293516
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.17647058823529413  Top5: 0.5882352941176471
    @ 0.50s  Top1: 0.21621621621621623  Top5: 0.7297297297297297
    @ 0.75s  Top1: 0.181818181

Epoch 9 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 9 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 9/50 | Time 5.0s
  Train Loss: 2.4049 | Val Loss: 7.8179
  VERB   Train Top1: 0.6886035313001605, Top5: 0.9743178170144462; Val Top1: 0.2545931758530184, Top5: 0.8320209973753281
  NOUN   Train Top1: 0.7431781701444623, Top5: 0.9807383627608347; Val Top1: 0.28083989501312334, Top5: 0.5013123359580053
  ACTION Train Top1: 0.8282504012841091, Top5: 0.9839486356340289; Val Top1: 0.1679790026246719, Top5: 0.31758530183727035
  VERB   Val Precision: 0.1801, Recall: 0.1803, F1: 0.1676
  NOUN   Val Precision: 0.2009, Recall: 0.1977, F1: 0.1797
  ACTION Val Precision: 0.0967, Recall: 0.1164, F1: 0.0932
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.8266807940919849
     NOUN    Mean Top-5 Recall: 0.5028943034119118
     ACTION  Mean Top-5 Recall: 0.3166269625701674
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.2647058823529412  Top5: 0.7647058823529411
    @ 0.50s  Top1: 0.2702702702702703  Top5: 0.8108108108108109
    @ 0.75s  Top1: 0.204545454545

Epoch 10 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 10 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 10/50 | Time 5.4s
  Train Loss: 1.9528 | Val Loss: 8.0750
  VERB   Train Top1: 0.7576243980738363, Top5: 0.9887640449438202; Val Top1: 0.1889763779527559, Top5: 0.7112860892388452
  NOUN   Train Top1: 0.8186195826645265, Top5: 0.9887640449438202; Val Top1: 0.2545931758530184, Top5: 0.4671916010498688
  ACTION Train Top1: 0.8475120385232745, Top5: 0.9919743178170144; Val Top1: 0.19160104986876642, Top5: 0.30183727034120733
  VERB   Val Precision: 0.1845, Recall: 0.1768, F1: 0.1605
  NOUN   Val Precision: 0.2220, Recall: 0.1798, F1: 0.1821
  ACTION Val Precision: 0.1152, Recall: 0.1470, F1: 0.1230
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.702861525646302
     NOUN    Mean Top-5 Recall: 0.46853682286635073
     ACTION  Mean Top-5 Recall: 0.30028463778580544
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.17647058823529413  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.16216216216216217  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.20454545

Epoch 11 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 11 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 11/50 | Time 5.5s
  Train Loss: 1.4108 | Val Loss: 7.9064
  VERB   Train Top1: 0.8539325842696629, Top5: 0.9967897271268058; Val Top1: 0.1968503937007874, Top5: 0.7375328083989501
  NOUN   Train Top1: 0.8651685393258427, Top5: 0.9871589085072231; Val Top1: 0.2650918635170604, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9229534510433387, Top5: 0.9903691813804173; Val Top1: 0.18635170603674542, Top5: 0.29133858267716534
  VERB   Val Precision: 0.1771, Recall: 0.1822, F1: 0.1620
  NOUN   Val Precision: 0.2133, Recall: 0.1777, F1: 0.1786
  ACTION Val Precision: 0.1284, Recall: 0.1383, F1: 0.1216
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7292159659016042
     NOUN    Mean Top-5 Recall: 0.4786827180706624
     ACTION  Mean Top-5 Recall: 0.2915085832293516
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.20588235294117646  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.1891891891891892  Top5: 0.7297297297297297
    @ 0.75s  Top1: 0.2045454545

Epoch 12 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 12 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 12/50 | Time 5.5s
  Train Loss: 1.1276 | Val Loss: 7.9025
  VERB   Train Top1: 0.9181380417335474, Top5: 0.9951845906902087; Val Top1: 0.2572178477690289, Top5: 0.7952755905511811
  NOUN   Train Top1: 0.9181380417335474, Top5: 0.9935794542536116; Val Top1: 0.29396325459317585, Top5: 0.43832020997375326
  ACTION Train Top1: 0.942215088282504, Top5: 0.9935794542536116; Val Top1: 0.17585301837270342, Top5: 0.28608923884514437
  VERB   Val Precision: 0.1770, Recall: 0.1957, F1: 0.1733
  NOUN   Val Precision: 0.2435, Recall: 0.1992, F1: 0.2010
  ACTION Val Precision: 0.0981, Recall: 0.1034, F1: 0.0947
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7939762189752182
     NOUN    Mean Top-5 Recall: 0.44144325297909687
     ACTION  Mean Top-5 Recall: 0.2866316892899577
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.7941176470588235
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7837837837837838
    @ 0.75s  Top1: 0.2045454

Epoch 13 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 13 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 13/50 | Time 5.5s
  Train Loss: 0.8846 | Val Loss: 7.9926
  VERB   Train Top1: 0.9309791332263242, Top5: 0.9983948635634029; Val Top1: 0.23622047244094488, Top5: 0.7847769028871391
  NOUN   Train Top1: 0.9390048154093098, Top5: 0.9983948635634029; Val Top1: 0.30708661417322836, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9582664526484751, Top5: 0.9951845906902087; Val Top1: 0.17060367454068243, Top5: 0.2887139107611549
  VERB   Val Precision: 0.1821, Recall: 0.1913, F1: 0.1691
  NOUN   Val Precision: 0.2378, Recall: 0.2209, F1: 0.1993
  ACTION Val Precision: 0.1036, Recall: 0.1178, F1: 0.1027
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7787869179625144
     NOUN    Mean Top-5 Recall: 0.4799935234933099
     ACTION  Mean Top-5 Recall: 0.2883291372328684
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.20588235294117646  Top5: 0.7647058823529411
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7297297297297297
    @ 0.75s  Top1: 0.22727272

Epoch 14 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 14 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 14/50 | Time 5.2s
  Train Loss: 0.7115 | Val Loss: 7.9633
  VERB   Train Top1: 0.9309791332263242, Top5: 0.9967897271268058; Val Top1: 0.23097112860892388, Top5: 0.7664041994750657
  NOUN   Train Top1: 0.971107544141252, Top5: 0.9983948635634029; Val Top1: 0.3123359580052493, Top5: 0.44881889763779526
  ACTION Train Top1: 0.9791332263242376, Top5: 0.9967897271268058; Val Top1: 0.2020997375328084, Top5: 0.29658792650918636
  VERB   Val Precision: 0.2429, Recall: 0.1934, F1: 0.1749
  NOUN   Val Precision: 0.2373, Recall: 0.2082, F1: 0.2040
  ACTION Val Precision: 0.1090, Recall: 0.1276, F1: 0.1107
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7582973476706129
     NOUN    Mean Top-5 Recall: 0.45284625681564405
     ACTION  Mean Top-5 Recall: 0.297421603754341
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6764705882352942
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7297297297297297
    @ 0.75s  Top1: 0.25  Top5:

Epoch 15 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 15 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 15/50 | Time 5.2s
  Train Loss: 0.5699 | Val Loss: 8.0101
  VERB   Train Top1: 0.9582664526484751, Top5: 0.9983948635634029; Val Top1: 0.2335958005249344, Top5: 0.7664041994750657
  NOUN   Train Top1: 0.9759229534510433, Top5: 0.9983948635634029; Val Top1: 0.31758530183727035, Top5: 0.4540682414698163
  ACTION Train Top1: 0.9791332263242376, Top5: 0.9983948635634029; Val Top1: 0.1968503937007874, Top5: 0.30446194225721784
  VERB   Val Precision: 0.2429, Recall: 0.2062, F1: 0.1981
  NOUN   Val Precision: 0.2475, Recall: 0.2172, F1: 0.2139
  ACTION Val Precision: 0.1057, Recall: 0.1279, F1: 0.1109
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7604246377055841
     NOUN    Mean Top-5 Recall: 0.4570681266086345
     ACTION  Mean Top-5 Recall: 0.30285099441012486
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.7058823529411765
    @ 0.50s  Top1: 0.21621621621621623  Top5: 0.7297297297297297
    @ 0.75s  Top1: 0.22727272

Epoch 16 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 16 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 16/50 | Time 5.6s
  Train Loss: 0.5618 | Val Loss: 8.0582
  VERB   Train Top1: 0.9743178170144462, Top5: 0.9967897271268058; Val Top1: 0.2204724409448819, Top5: 0.7244094488188977
  NOUN   Train Top1: 0.9759229534510433, Top5: 1.0; Val Top1: 0.30708661417322836, Top5: 0.4566929133858268
  ACTION Train Top1: 0.9807383627608347, Top5: 1.0; Val Top1: 0.1942257217847769, Top5: 0.32020997375328086
  VERB   Val Precision: 0.2594, Recall: 0.2093, F1: 0.1960
  NOUN   Val Precision: 0.2458, Recall: 0.2185, F1: 0.2095
  ACTION Val Precision: 0.1102, Recall: 0.1286, F1: 0.1142
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.718414146793197
     NOUN    Mean Top-5 Recall: 0.4579704577499362
     ACTION  Mean Top-5 Recall: 0.31755892901937527
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6764705882352942
    @ 0.50s  Top1: 0.21621621621621623  Top5: 0.7027027027027027
    @ 0.75s  Top1: 0.22727272727272727  Top5: 0.727272727272

Epoch 17 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 17 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 17/50 | Time 5.5s
  Train Loss: 0.4660 | Val Loss: 8.0556
  VERB   Train Top1: 0.9727126805778491, Top5: 0.9983948635634029; Val Top1: 0.23097112860892388, Top5: 0.7191601049868767
  NOUN   Train Top1: 0.9775280898876404, Top5: 1.0; Val Top1: 0.31758530183727035, Top5: 0.46981627296587924
  ACTION Train Top1: 0.9791332263242376, Top5: 1.0; Val Top1: 0.17585301837270342, Top5: 0.32020997375328086
  VERB   Val Precision: 0.2044, Recall: 0.2100, F1: 0.1856
  NOUN   Val Precision: 0.2604, Recall: 0.2324, F1: 0.2182
  ACTION Val Precision: 0.1212, Recall: 0.1224, F1: 0.1069
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7128805960010255
     NOUN    Mean Top-5 Recall: 0.47189361607681907
     ACTION  Mean Top-5 Recall: 0.31837095077343425
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6764705882352942
    @ 0.50s  Top1: 0.21621621621621623  Top5: 0.6756756756756757
    @ 0.75s  Top1: 0.22727272727272727  Top5: 0.7272727

Epoch 18 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 18 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 18/50 | Time 5.2s
  Train Loss: 0.4351 | Val Loss: 7.9673
  VERB   Train Top1: 0.9727126805778491, Top5: 0.9983948635634029; Val Top1: 0.2283464566929134, Top5: 0.7454068241469817
  NOUN   Train Top1: 0.9823434991974318, Top5: 1.0; Val Top1: 0.31496062992125984, Top5: 0.45931758530183725
  ACTION Train Top1: 0.9839486356340289, Top5: 1.0; Val Top1: 0.1784776902887139, Top5: 0.32545931758530183
  VERB   Val Precision: 0.1693, Recall: 0.2076, F1: 0.1663
  NOUN   Val Precision: 0.2360, Recall: 0.2230, F1: 0.2106
  ACTION Val Precision: 0.1133, Recall: 0.1234, F1: 0.1063
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7396149704859805
     NOUN    Mean Top-5 Recall: 0.4634049689368561
     ACTION  Mean Top-5 Recall: 0.32410343394750085
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.7058823529411765
    @ 0.50s  Top1: 0.21621621621621623  Top5: 0.7027027027027027
    @ 0.75s  Top1: 0.22727272727272727  Top5: 0.7272727272

Epoch 19 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 19 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 19/50 | Time 9.1s
  Train Loss: 0.4154 | Val Loss: 7.9917
  VERB   Train Top1: 0.9823434991974318, Top5: 0.9983948635634029; Val Top1: 0.23622047244094488, Top5: 0.7322834645669292
  NOUN   Train Top1: 0.9791332263242376, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.44881889763779526
  ACTION Train Top1: 0.985553772070626, Top5: 1.0; Val Top1: 0.2047244094488189, Top5: 0.33070866141732286
  VERB   Val Precision: 0.2375, Recall: 0.2160, F1: 0.1832
  NOUN   Val Precision: 0.2370, Recall: 0.2217, F1: 0.2069
  ACTION Val Precision: 0.1269, Recall: 0.1392, F1: 0.1236
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7268877665042393
     NOUN    Mean Top-5 Recall: 0.450225529470652
     ACTION  Mean Top-5 Recall: 0.3290750248565918
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.7058823529411765
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7027027027027027
    @ 0.75s  Top1: 0.22727272727272727  Top5: 0.7272727272727

Epoch 20 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 20 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 20/50 | Time 5.5s
  Train Loss: 0.4050 | Val Loss: 8.0156
  VERB   Train Top1: 0.9871589085072231, Top5: 0.9983948635634029; Val Top1: 0.2440944881889764, Top5: 0.7375328083989501
  NOUN   Train Top1: 0.9807383627608347, Top5: 1.0; Val Top1: 0.31496062992125984, Top5: 0.45931758530183725
  ACTION Train Top1: 0.9871589085072231, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.32808398950131235
  VERB   Val Precision: 0.2412, Recall: 0.2211, F1: 0.1936
  NOUN   Val Precision: 0.2429, Recall: 0.2227, F1: 0.2111
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7318041031923546
     NOUN    Mean Top-5 Recall: 0.4626641044092269
     ACTION  Mean Top-5 Recall: 0.3260596739793988
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.7058823529411765
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7027027027027027
    @ 0.75s  Top1: 0.22727272727272727  Top5: 0.75
    @ 1.

Epoch 21 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 21 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 21/50 | Time 5.6s
  Train Loss: 0.3939 | Val Loss: 8.0070
  VERB   Train Top1: 0.985553772070626, Top5: 0.9983948635634029; Val Top1: 0.2545931758530184, Top5: 0.7244094488188977
  NOUN   Train Top1: 0.9823434991974318, Top5: 1.0; Val Top1: 0.31758530183727035, Top5: 0.46981627296587924
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3333333333333333
  VERB   Val Precision: 0.2424, Recall: 0.2302, F1: 0.2052
  NOUN   Val Precision: 0.2382, Recall: 0.2240, F1: 0.2093
  ACTION Val Precision: 0.1149, Recall: 0.1300, F1: 0.1153
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7187036514128966
     NOUN    Mean Top-5 Recall: 0.4745958473152345
     ACTION  Mean Top-5 Recall: 0.3311417222357892
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6764705882352942
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7027027027027027
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s

Epoch 22 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 22 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 22/50 | Time 5.5s
  Train Loss: 0.3549 | Val Loss: 8.0277
  VERB   Train Top1: 0.9887640449438202, Top5: 0.9983948635634029; Val Top1: 0.2545931758530184, Top5: 0.7244094488188977
  NOUN   Train Top1: 0.9871589085072231, Top5: 1.0; Val Top1: 0.31758530183727035, Top5: 0.46194225721784776
  ACTION Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3333333333333333
  VERB   Val Precision: 0.2440, Recall: 0.2302, F1: 0.2054
  NOUN   Val Precision: 0.2400, Recall: 0.2240, F1: 0.2098
  ACTION Val Precision: 0.1177, Recall: 0.1320, F1: 0.1175
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.718254657160023
     NOUN    Mean Top-5 Recall: 0.46559795139299653
     ACTION  Mean Top-5 Recall: 0.33039909863132344
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6764705882352942
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7027027027027027
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.0

Epoch 23 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 23 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 23/50 | Time 5.6s
  Train Loss: 0.3498 | Val Loss: 8.0105
  VERB   Train Top1: 0.9919743178170144, Top5: 0.9983948635634029; Val Top1: 0.2545931758530184, Top5: 0.7244094488188977
  NOUN   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.31496062992125984, Top5: 0.4671916010498688
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3333333333333333
  VERB   Val Precision: 0.2417, Recall: 0.2302, F1: 0.2051
  NOUN   Val Precision: 0.2410, Recall: 0.2227, F1: 0.2087
  ACTION Val Precision: 0.1144, Recall: 0.1300, F1: 0.1138
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7167062337365997
     NOUN    Mean Top-5 Recall: 0.4699523281980312
     ACTION  Mean Top-5 Recall: 0.3311417222357892
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6764705882352942
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s

Epoch 24 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 24 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 24/50 | Time 5.7s
  Train Loss: 0.3468 | Val Loss: 8.0324
  VERB   Train Top1: 0.9903691813804173, Top5: 0.9983948635634029; Val Top1: 0.24671916010498687, Top5: 0.7191601049868767
  NOUN   Train Top1: 0.985553772070626, Top5: 1.0; Val Top1: 0.31496062992125984, Top5: 0.4645669291338583
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.33070866141732286
  VERB   Val Precision: 0.2263, Recall: 0.2120, F1: 0.1889
  NOUN   Val Precision: 0.2484, Recall: 0.2227, F1: 0.2127
  ACTION Val Precision: 0.1176, Recall: 0.1320, F1: 0.1163
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.712834198368521
     NOUN    Mean Top-5 Recall: 0.46677997971318275
     ACTION  Mean Top-5 Recall: 0.3293117672808342
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7027027027027027
    @ 0.75s  Top1: 0.22727272727272727  Top5: 0.727272727272

Epoch 25 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 25 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 25/50 | Time 5.9s
  Train Loss: 0.3392 | Val Loss: 8.0140
  VERB   Train Top1: 0.9919743178170144, Top5: 0.9983948635634029; Val Top1: 0.2545931758530184, Top5: 0.7244094488188977
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.31496062992125984, Top5: 0.4671916010498688
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3333333333333333
  VERB   Val Precision: 0.2385, Recall: 0.2302, F1: 0.2030
  NOUN   Val Precision: 0.2416, Recall: 0.2227, F1: 0.2102
  ACTION Val Precision: 0.1151, Recall: 0.1320, F1: 0.1148
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7160767021328034
     NOUN    Mean Top-5 Recall: 0.47080628472632985
     ACTION  Mean Top-5 Recall: 0.3321526763717433
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6756756756756757
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00

Epoch 26 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 26 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 26/50 | Time 5.5s
  Train Loss: 0.3212 | Val Loss: 8.0352
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7112860892388452
  NOUN   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.31496062992125984, Top5: 0.46981627296587924
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2422, Recall: 0.2302, F1: 0.2035
  NOUN   Val Precision: 0.2421, Recall: 0.2227, F1: 0.2096
  ACTION Val Precision: 0.1136, Recall: 0.1320, F1: 0.1154
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7045789260507487
     NOUN    Mean Top-5 Recall: 0.47222505547075844
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6756756756756757
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.22

Epoch 27 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 27 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 27/50 | Time 5.4s
  Train Loss: 0.3201 | Val Loss: 8.0294
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7112860892388452
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.31496062992125984, Top5: 0.46981627296587924
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.32808398950131235
  VERB   Val Precision: 0.2411, Recall: 0.2302, F1: 0.2050
  NOUN   Val Precision: 0.2421, Recall: 0.2227, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7056512299723173
     NOUN    Mean Top-5 Recall: 0.47082131215525047
     ACTION  Mean Top-5 Recall: 0.3259333889024558
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6764705882352942
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6756756756756757
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.22

Epoch 28 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 28 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 28/50 | Time 5.1s
  Train Loss: 0.3208 | Val Loss: 8.0482
  VERB   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.25196850393700787, Top5: 0.7217847769028871
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.31496062992125984, Top5: 0.4645669291338583
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.33070866141732286
  VERB   Val Precision: 0.2269, Recall: 0.2160, F1: 0.1893
  NOUN   Val Precision: 0.2428, Recall: 0.2227, F1: 0.2103
  ACTION Val Precision: 0.1176, Recall: 0.1320, F1: 0.1163
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7180319671311985
     NOUN    Mean Top-5 Recall: 0.4656129788219171
     ACTION  Mean Top-5 Recall: 0.3293117672808342
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.7058823529411765
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.7027027027027027
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.229

Epoch 29 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 29 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 29/50 | Time 5.0s
  Train Loss: 0.3247 | Val Loss: 8.0375
  VERB   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7139107611548556
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.31496062992125984, Top5: 0.47244094488188976
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3333333333333333
  VERB   Val Precision: 0.2396, Recall: 0.2302, F1: 0.2032
  NOUN   Val Precision: 0.2446, Recall: 0.2231, F1: 0.2115
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7087043908918575
     NOUN    Mean Top-5 Recall: 0.47530534163927995
     ACTION  Mean Top-5 Recall: 0.3315844945535615
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6764705882352942
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6756756756756757
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.229

Epoch 30 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 30 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 30/50 | Time 5.5s
  Train Loss: 0.3045 | Val Loss: 8.0403
  VERB   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7112860892388452
  NOUN   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47506561679790027
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2388, Recall: 0.2302, F1: 0.2031
  NOUN   Val Precision: 0.2423, Recall: 0.2214, F1: 0.2091
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7050279203036223
     NOUN    Mean Top-5 Recall: 0.4775780689120072
     ACTION  Mean Top-5 Recall: 0.3344254036444706
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6756756756756757
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.22916

Epoch 31 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 31 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 31/50 | Time 5.7s
  Train Loss: 0.2978 | Val Loss: 8.0535
  VERB   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7112860892388452
  NOUN   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47244094488188976
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.32808398950131235
  VERB   Val Precision: 0.2411, Recall: 0.2302, F1: 0.2050
  NOUN   Val Precision: 0.2451, Recall: 0.2214, F1: 0.2097
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7045789260507487
     NOUN    Mean Top-5 Recall: 0.4741996905336288
     ACTION  Mean Top-5 Recall: 0.3259333889024558
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6756756756756757
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 32 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 32 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 32/50 | Time 5.9s
  Train Loss: 0.3104 | Val Loss: 8.0475
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7086614173228346
  NOUN   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.46981627296587924
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3333333333333333
  VERB   Val Precision: 0.2275, Recall: 0.2166, F1: 0.1932
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7016495419252439
     NOUN    Mean Top-5 Recall: 0.47159552386696213
     ACTION  Mean Top-5 Recall: 0.3315844945535615
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 33 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 33 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 33/50 | Time 5.5s
  Train Loss: 0.3149 | Val Loss: 8.0429
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.25196850393700787, Top5: 0.7086614173228346
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47506561679790027
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.33858267716535434
  VERB   Val Precision: 0.2197, Recall: 0.2132, F1: 0.1861
  NOUN   Val Precision: 0.2425, Recall: 0.2214, F1: 0.2092
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7016495419252439
     NOUN    Mean Top-5 Recall: 0.4774649769113378
     ACTION  Mean Top-5 Recall: 0.33661838610061096
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.22

Epoch 34 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 34 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 34/50 | Time 5.4s
  Train Loss: 0.3005 | Val Loss: 8.0455
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.25196850393700787, Top5: 0.7086614173228346
  NOUN   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47506561679790027
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2333, Recall: 0.2265, F1: 0.1976
  NOUN   Val Precision: 0.2425, Recall: 0.2214, F1: 0.2092
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.7016495419252439
     NOUN    Mean Top-5 Recall: 0.4774649769113378
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.229

Epoch 35 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 35 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 35/50 | Time 5.4s
  Train Loss: 0.2982 | Val Loss: 8.0507
  VERB   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7034120734908137
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2371, Recall: 0.2302, F1: 0.2013
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6964412085919105
     NOUN    Mean Top-5 Recall: 0.48006914357800445
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 36 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 36 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 36/50 | Time 5.4s
  Train Loss: 0.2978 | Val Loss: 8.0509
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.48031496062992124
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2389, Recall: 0.2302, F1: 0.2015
  NOUN   Val Precision: 0.2425, Recall: 0.2214, F1: 0.2092
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6985963810057036
     NOUN    Mean Top-5 Recall: 0.48344752195638285
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.229

Epoch 37 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 37 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 37/50 | Time 5.4s
  Train Loss: 0.3031 | Val Loss: 8.0521
  VERB   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.33858267716535434
  VERB   Val Precision: 0.2415, Recall: 0.2302, F1: 0.2034
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1145, Recall: 0.1320, F1: 0.1150
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6985963810057036
     NOUN    Mean Top-5 Recall: 0.48006914357800445
     ACTION  Mean Top-5 Recall: 0.33638164367636847
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.229

Epoch 38 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 38 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 38/50 | Time 5.7s
  Train Loss: 0.3115 | Val Loss: 8.0508
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47506561679790027
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.33858267716535434
  VERB   Val Precision: 0.2404, Recall: 0.2302, F1: 0.2033
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1145, Recall: 0.1320, F1: 0.1150
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6985963810057036
     NOUN    Mean Top-5 Recall: 0.4763926729897692
     ACTION  Mean Top-5 Recall: 0.33638164367636847
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.229

Epoch 39 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 39 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 39/50 | Time 5.4s
  Train Loss: 0.2973 | Val Loss: 8.0502
  VERB   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47506561679790027
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2395, Recall: 0.2302, F1: 0.2032
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.4763926729897692
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 40 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 40 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 40/50 | Time 5.6s
  Train Loss: 0.3079 | Val Loss: 8.0486
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7034120734908137
  NOUN   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47244094488188976
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3333333333333333
  VERB   Val Precision: 0.2373, Recall: 0.2302, F1: 0.2013
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6964412085919105
     NOUN    Mean Top-5 Recall: 0.4730142946113908
     ACTION  Mean Top-5 Recall: 0.3315844945535615
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.22916

Epoch 41 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 41 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 41/50 | Time 5.3s
  Train Loss: 0.2982 | Val Loss: 8.0515
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47506561679790027
  ACTION Train Top1: 0.9951845906902087, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3333333333333333
  VERB   Val Precision: 0.2373, Recall: 0.2302, F1: 0.2013
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.4763926729897692
     ACTION  Mean Top-5 Recall: 0.3315844945535615
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.22916

Epoch 42 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 42 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 42/50 | Time 5.4s
  Train Loss: 0.2799 | Val Loss: 8.0518
  VERB   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47506561679790027
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2374, Recall: 0.2302, F1: 0.2013
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.4763926729897692
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 43 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 43 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 43/50 | Time 5.5s
  Train Loss: 0.3014 | Val Loss: 8.0503
  VERB   Train Top1: 0.9887640449438202, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.47506561679790027
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2374, Recall: 0.2302, F1: 0.2013
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.4763926729897692
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 44 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 44 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 44/50 | Time 5.6s
  Train Loss: 0.3121 | Val Loss: 8.0499
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2375, Recall: 0.2302, F1: 0.2014
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.47977105136814757
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 45 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 45 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 45/50 | Time 5.5s
  Train Loss: 0.2849 | Val Loss: 8.0479
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2374, Recall: 0.2302, F1: 0.2013
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1162, Recall: 0.1320, F1: 0.1155
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.47977105136814757
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 46 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 46 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 46/50 | Time 5.8s
  Train Loss: 0.3041 | Val Loss: 8.0483
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2375, Recall: 0.2302, F1: 0.2014
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1145, Recall: 0.1320, F1: 0.1150
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.47977105136814757
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 47 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 47 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 47/50 | Time 5.3s
  Train Loss: 0.2872 | Val Loss: 8.0482
  VERB   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2375, Recall: 0.2302, F1: 0.2014
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1145, Recall: 0.1320, F1: 0.1150
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.47977105136814757
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 48 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 48 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 48/50 | Time 5.4s
  Train Loss: 0.3029 | Val Loss: 8.0481
  VERB   Train Top1: 0.9903691813804173, Top5: 0.9983948635634029; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2375, Recall: 0.2302, F1: 0.2014
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1145, Recall: 0.1320, F1: 0.1150
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.47977105136814757
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00

Epoch 49 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 49 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 49/50 | Time 5.5s
  Train Loss: 0.3027 | Val Loss: 8.0497
  VERB   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2375, Recall: 0.2302, F1: 0.2014
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1145, Recall: 0.1320, F1: 0.1150
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.47977105136814757
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291

Epoch 50 Train:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 50 Val:   0%|          | 0/13 [00:00<?, ?it/s]

Epoch 50/50 | Time 5.2s
  Train Loss: 0.3054 | Val Loss: 8.0508
  VERB   Train Top1: 0.9919743178170144, Top5: 1.0; Val Top1: 0.2545931758530184, Top5: 0.7060367454068242
  NOUN   Train Top1: 0.9903691813804173, Top5: 1.0; Val Top1: 0.3123359580052493, Top5: 0.4776902887139108
  ACTION Train Top1: 0.9935794542536116, Top5: 1.0; Val Top1: 0.1968503937007874, Top5: 0.3359580052493438
  VERB   Val Precision: 0.2375, Recall: 0.2302, F1: 0.2014
  NOUN   Val Precision: 0.2450, Recall: 0.2214, F1: 0.2096
  ACTION Val Precision: 0.1145, Recall: 0.1320, F1: 0.1150
  ---- Mean Top-5 Recall (validation) ----
     VERB    Mean Top-5 Recall: 0.6990453752585772
     NOUN    Mean Top-5 Recall: 0.47977105136814757
     ACTION  Mean Top-5 Recall: 0.33377747700970184
  VERB   per-horizon (time-based):
    @ 0.25s  Top1: 0.23529411764705882  Top5: 0.6470588235294118
    @ 0.50s  Top1: 0.24324324324324326  Top5: 0.6486486486486487
    @ 0.75s  Top1: 0.25  Top5: 0.7272727272727273
    @ 1.00s  Top1: 0.2291