In [None]:
import ast
import math
import os
import pickle as pkl

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torchvision
from einops import rearrange
from sklearn.metrics.pairwise import cosine_similarity
from torchmetrics.functional.classification import (
    multilabel_f1_score,
    multilabel_precision,
    multilabel_recall,
)

from data_utils import results2df

# Slowfast imports
from slowfast.models import build_model
from slowfast.utils.parser import load_config, alt_parse_args

In [None]:
# Root of the local SlowFast checkout. Export SLOWFAST_ROOT to run the notebook on
# another machine; the default reproduces the paths originally hard-coded here.
SLOWFAST_ROOT = os.environ.get(
    "SLOWFAST_ROOT", "/home/dl18206/Desktop/phd/code/personal/facebook/slowfast"
)
RESULTS_DIR = f"{SLOWFAST_ROOT}/dataset/results"

# Pickled result files (loaded with pickle further down) and the metadata CSV.
train_path = f"{RESULTS_DIR}/model=slow_r50_ds=panaf_seq_fd_only_feats=train_feats.pkl"
orig_val_path = (
    f"{RESULTS_DIR}/model=slow_r50_ds=panaf_seq_fd_only_e=200_feats=val_feats.pkl"
)
val_sub_path = f"{RESULTS_DIR}/model=slow_r50_ds=panaf_seq_fg_minus_bg_lambda_e=200_split=val_feats.pkl"
metadata_path = (
    f"{SLOWFAST_ROOT}/dataset/metadata/with_negative_pairing/new_metadata.csv"
)
# Resolved relative to the notebook's working directory.
segments_file = "../dataset/metadata/segments.txt"

In [None]:
# NOTE: unpickling can execute arbitrary code - only load result files you trust.
with open(train_path, "rb") as f:
    train_data = pkl.load(f)
with open(orig_val_path, "rb") as f:
    orig_val_data = pkl.load(f)
with open(val_sub_path, "rb") as f:
    sub_val_data = pkl.load(f)

metadata = pd.read_csv(metadata_path)

# Both text files are read as bytes, decoded as UTF-8 and whitespace-stripped,
# giving one list entry per line.
with open("../dataset/metadata/behaviours.txt", "rb") as f:
    behaviours = [raw_line.decode("utf-8").strip() for raw_line in f]
with open(segments_file, "rb") as f:
    segments = [raw_line.decode("utf-8").strip() for raw_line in f]

In [None]:
def calculate_metrics(preds, labels, num_labels=None):
    """Compute per-label F1, precision and recall for multilabel predictions.

    Args:
        preds: Sequence of equally shaped per-sample score vectors; stacked into
            a float32 tensor.
        labels: Sequence of per-sample label vectors; stacked into a tensor with
            the dtype numpy infers.
        num_labels: Number of labels. Defaults to the size of dimension 1 of the
            stacked labels (previously hard-coded to 14).

    Returns:
        Tuple ``(f1, precision, recall)``; each is the unreduced per-label result
        of the corresponding torchmetrics function (``average="none"``).
    """
    preds = torch.tensor(np.stack(preds), dtype=torch.float32)
    labels = torch.tensor(np.stack(labels))
    if num_labels is None:
        # Stacked labels are (samples, labels); derive the count instead of
        # assuming a fixed number of behaviours.
        num_labels = labels.shape[1]

    f1 = multilabel_f1_score(preds, labels, num_labels=num_labels, average="none")
    precision = multilabel_precision(
        preds, labels, num_labels=num_labels, average="none"
    )
    recall = multilabel_recall(preds, labels, num_labels=num_labels, average="none")
    return f1, precision, recall

In [None]:
def measure_domain_shift(train_df, val_df, pred_column, behaviours):
    """Compare validation metrics on `utm` values seen vs. unseen during training.

    For each behaviour index, the train and validation rows whose ``label`` has
    a 1 at that index are selected. The validation rows are split into
    "mutual" (their ``utm`` also occurs in the selected train rows) and
    "exclusive" (it does not), and metrics are computed on all, mutual and
    exclusive rows via ``calculate_metrics``.

    Args:
        train_df: DataFrame with ``label`` and ``utm`` columns.
        val_df: DataFrame with ``label``, ``utm`` and ``pred_column`` columns.
        pred_column: Name of the column in ``val_df`` holding the predictions.
        behaviours: Behaviour names; position ``i`` corresponds to label index ``i``.

    Returns:
        Tuple ``(mutual_df, exclusive_df, df)``. ``mutual_df``/``exclusive_df``
        are the subsets of the LAST behaviour processed (empty when
        ``behaviours`` is empty); ``df`` has one row per behaviour with all
        numeric columns rounded to 4 decimal places. Metrics of a subset
        without samples are reported as NaN.

    NOTE(review): every subset only contains rows that are positive for the
    behaviour being scored, so precision for that behaviour cannot see false
    positives - confirm this is intended.
    """
    nan = float("nan")

    def _scores(subset, idx):
        # calculate_metrics stacks the rows with np.stack, which raises on an
        # empty sequence; report NaN instead of aborting the whole analysis.
        if len(subset) == 0:
            return nan, nan, nan
        f1, precision, recall = calculate_metrics(
            subset[pred_column].values, subset["label"].values
        )
        return f1[idx].item(), precision[idx].item(), recall[idx].item()

    store = []
    # Bound up-front so the return statement is valid for empty `behaviours`.
    mutual_df = exclusive_df = val_df.iloc[0:0]

    for idx, behaviour in enumerate(behaviours):
        val_agg_df = val_df[val_df.label.apply(lambda x: x[idx] == 1)]
        train_agg_df = train_df[train_df.label.apply(lambda x: x[idx] == 1)]

        seen_in_train = val_agg_df["utm"].isin(train_agg_df["utm"])
        mutual_df = val_agg_df[seen_in_train]
        exclusive_df = val_agg_df[~seen_in_train]
        mutual_videos = len(mutual_df["utm"].unique())
        exclusive_videos = len(exclusive_df["utm"].unique())
        total_videos = mutual_videos + exclusive_videos

        overall_f1, overall_precision, overall_recall = _scores(val_agg_df, idx)
        mutual_f1, mutual_precision, mutual_recall = _scores(mutual_df, idx)
        exclusive_f1, exclusive_precision, exclusive_recall = _scores(
            exclusive_df, idx
        )

        store.append(
            {
                "behaviour": behaviour,
                "mutual_ct_loc": mutual_videos,
                "exclusive_ct_loc": exclusive_videos,
                "mutual_loc_prop": (
                    round(mutual_videos / total_videos, 2) if total_videos else nan
                ),
                "overall_recall": overall_recall,
                "overall_precision": overall_precision,
                "overall_f1": overall_f1,
                "mutual_recall": mutual_recall,
                "mutual_precision": mutual_precision,
                "mutual_f1": mutual_f1,
                "exclusive_recall": exclusive_recall,
                "exclusive_precision": exclusive_precision,
                "exclusive_f1": exclusive_f1,
            }
        )

    # Build the summary once, after the loop; round numeric columns to 4 decimals.
    df = pd.DataFrame(store).round(4)

    return mutual_df, exclusive_df, df

In [None]:
def results2df(df, split):
    """Turn a results mapping into a DataFrame with one row per sample.

    NOTE: this definition shadows the ``results2df`` imported from
    ``data_utils`` at the top of the notebook.

    Args:
        df: Mapping with equally long ``"names"``, ``"preds"`` and ``"labels"``
            entries (preds/labels must be torch tensors) and, optionally,
            ``"feats"``. ``"feats"`` is required when ``split == "train"``.
        split: Split name stored in the ``split`` column.

    Returns:
        DataFrame with columns ``name, split, pred, [feat,] negative, label``.
        ``pred`` is the sigmoid of the stored prediction as a CPU tensor,
        ``label`` a numpy array, ``negative`` is True when the label sums to 0.
        ``feat`` is included for the train split and for any other split whose
        results contain ``"feats"`` (later cells read ``val_df.feat``); it is
        stored unchanged.
    """
    # Train results must carry features (KeyError otherwise, as before);
    # other splits keep them whenever they were saved.
    with_feats = split == "train" or "feats" in df

    columns = ["name", "split", "pred", "negative", "label"]
    if with_feats:
        columns.insert(3, "feat")
    feats = df["feats"] if with_feats else [None] * len(df["names"])

    subclips = []
    for name, pred, feat, label in zip(df["names"], df["preds"], feats, df["labels"]):
        record = {
            "name": name,
            "split": split,
            # Sigmoid applied on the CPU copy; no numpy round trip needed.
            "pred": torch.sigmoid(pred.detach().cpu()),
            "negative": bool(sum(label) == 0),
            "label": label.detach().cpu().numpy(),
        }
        if with_feats:
            record["feat"] = feat
        subclips.append(record)

    return pd.DataFrame(subclips, columns=columns)

**Baseline Results**

In [None]:
# Per-sample frames for the baseline model's train and validation results.
train_df = results2df(train_data, "train")
val_df = results2df(orig_val_data, "val")

# Attach `utm` and `value` from the metadata by matching `name` to `subject_id_fg`.
# DataFrame.merge defaults to an inner join, so samples without metadata are dropped.
meta_subset = metadata[["subject_id_fg", "utm", "value"]]
join_keys = {"left_on": "name", "right_on": "subject_id_fg"}
train_df = train_df.merge(meta_subset, **join_keys)
val_df = val_df.merge(meta_subset, **join_keys)

In [None]:
# Per-behaviour domain-shift summary for the baseline validation predictions.
mutual_df, exclusive_df, original_df = measure_domain_shift(
    train_df=train_df,
    val_df=val_df,
    pred_column="pred",
    behaviours=behaviours,
)

In [None]:
# F1 columns of the baseline summary, highest mutual-location proportion first.
f1_view_cols = ["behaviour", "mutual_loc_prop", "overall_f1", "mutual_f1", "exclusive_f1"]
original_df[f1_view_cols].sort_values(by="mutual_loc_prop", ascending=False)

In [None]:
# Mean over behaviours of the baseline overall / mutual / exclusive F1 (in that order).
for f1_col in ("overall_f1", "mutual_f1", "exclusive_f1"):
    print(original_df[f1_col].mean())

In [None]:
# Validation results loaded from `val_sub_path`, joined with `utm` from the metadata.
sub_val_df = results2df(sub_val_data, "val").merge(
    metadata[["subject_id_fg", "utm"]],
    left_on="name",
    right_on="subject_id_fg",
)

In [None]:
# Same domain-shift summary for the `sub_val_df` predictions.
# Overwrites `mutual_df` / `exclusive_df` from the baseline run.
mutual_df, exclusive_df, sub_df = measure_domain_shift(
    train_df=train_df,
    val_df=sub_val_df,
    pred_column="pred",
    behaviours=behaviours,
)

In [None]:
# F1 columns of the `sub_df` summary, highest mutual-location proportion first.
f1_view_cols = ["behaviour", "mutual_loc_prop", "overall_f1", "mutual_f1", "exclusive_f1"]
sub_df[f1_view_cols].sort_values(by="mutual_loc_prop", ascending=False)

In [None]:
# Mean over behaviours of each F1 variant in `sub_df`.
sub_overall_mean, sub_mutual_mean, sub_exclusive_mean = (
    sub_df[f1_col].mean() for f1_col in ("overall_f1", "mutual_f1", "exclusive_f1")
)
print(f"Overall F1: {sub_overall_mean}")
print(f"Mutual F1: {sub_mutual_mean}")
print(f"Exclusive F1: {sub_exclusive_mean}")

In [None]:
# Side-by-side F1 table: `original_df` columns keep their names, `sub_df`
# columns get a `sub_` prefix. Rows are aligned on the default integer index.
f1_names = ("overall_f1", "mutual_f1", "exclusive_f1")
res_columns = {"behaviours": behaviours, "segments": segments}
res_columns.update({f1_name: original_df[f1_name] for f1_name in f1_names})
res_columns.update({f"sub_{f1_name}": sub_df[f1_name] for f1_name in f1_names})
res_df = pd.DataFrame(res_columns)

In [None]:
# Reorder the columns so each `original_df` F1 sits next to its `sub_` counterpart.
paired_cols = ["behaviours", "segments"]
for f1_name in ("overall_f1", "mutual_f1", "exclusive_f1"):
    paired_cols += [f1_name, f"sub_{f1_name}"]
res_df[paired_cols]

In [None]:
# Per-segment mean of overall_f1 and sub_overall_f1.
overall_pair = ["overall_f1", "sub_overall_f1"]
for s in res_df.segments.unique():
    segment_rows = res_df[res_df.segments == s]
    print(f"{s}\n", segment_rows[overall_pair].mean())

In [None]:
# Per-segment mean of mutual_f1 and sub_mutual_f1.
mutual_pair = ["mutual_f1", "sub_mutual_f1"]
for s in res_df.segments.unique():
    segment_rows = res_df[res_df.segments == s]
    print(f"{s}\n", segment_rows[mutual_pair].mean())

In [None]:
# Per-segment mean of exclusive_f1 and sub_exclusive_f1.
exclusive_pair = ["exclusive_f1", "sub_exclusive_f1"]
for s in res_df.segments.unique():
    segment_rows = res_df[res_df.segments == s]
    print(f"{s}\n", segment_rows[exclusive_pair].mean())

In [None]:
# Correlation (DataFrame.corr default) between mutual_loc_prop and each baseline
# F1 variant; `.iloc[0, 1]` picks the off-diagonal entry of the 2x2 matrix.
overall_corr, mutual_corr, exclusive_corr = (
    original_df[["mutual_loc_prop", f1_col]].corr().iloc[0, 1]
    for f1_col in ("overall_f1", "mutual_f1", "exclusive_f1")
)

print(f"Overall correlation: {overall_corr}")
print(f"Mutual correlation: {mutual_corr}")
print(f"Exclusive correlation: {exclusive_corr}")

In [None]:
# Detach the stored feature tensors and move them to the CPU, for both splits.
for split_df in (train_df, val_df):
    split_df["feat"] = split_df["feat"].apply(lambda t: t.detach().cpu())

**Pairwise cosine similarity**

In [None]:
# Mean training feature vector for every distinct `value`.
value_means = train_df.groupby("value")["feat"].apply(
    lambda feats: np.stack(feats.values).mean(axis=0)
)
grouped_df = value_means.reset_index()

# One row per group, then all-pairs cosine similarity labelled by `value`.
embeddings = np.stack(grouped_df["feat"].values)
similarity_matrix = cosine_similarity(embeddings)
similarity_df = pd.DataFrame(
    similarity_matrix, index=grouped_df["value"], columns=grouped_df["value"]
)

fig, ax = plt.subplots(figsize=(10, 8), dpi=300)
sns.heatmap(similarity_df, annot=False, cmap="viridis", cbar=True, ax=ax)
ax.set_title("Pairwise Cosine Similarity Heatmap")
fig.tight_layout()
plt.show()

In [None]:
# Rank the group pairs of the similarity matrix built in the previous cell.
values = similarity_df.values
index = similarity_df.index.tolist()

# Strict upper triangle (k=1): every unordered pair once, no self-similarity.
row_idx, col_idx = np.triu_indices(len(index), k=1)
similarity_pairs = [
    {"value1": index[i], "value2": index[j], "similarity": values[i, j]}
    for i, j in zip(row_idx, col_idx)
]

# Kept under its own name so `similarity_df` still holds the matrix and this
# cell can be re-run. Explicit columns keep the frame valid with zero pairs.
similarity_pairs_df = pd.DataFrame(
    similarity_pairs, columns=["value1", "value2", "similarity"]
).sort_values("similarity", ascending=False)

# Take the top N pairs; `assign` returns a new frame, so no SettingWithCopyWarning.
top_n = 20
top_pairs = similarity_pairs_df.head(top_n).assign(
    pair=lambda pairs: pairs["value1"] + " - " + pairs["value2"]
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x="similarity", y="pair", data=top_pairs, orient="h", ax=ax)
ax.set_title(
    f"Top {top_n} Value Pairs by Cosine Similarity (Excluding Self-Similarity)",
    fontsize=16,
)
ax.set_xlabel("Cosine Similarity", fontsize=12)
ax.set_ylabel("Value Pairs", fontsize=12)

fig.tight_layout()
plt.show()

In [None]:
def get_segment(x, behaviours, segments):
    """Map a comma-separated behaviour string to its sorted segment names.

    Args:
        x: String of behaviour names separated by "," (no whitespace trimming).
        behaviours: Behaviour names; ``behaviours[i]`` belongs to ``segments[i]``.
        segments: Segment name for each behaviour position.

    Returns:
        The segments of all behaviours present in ``x``, sorted and joined with
        ","; duplicates are kept. Empty string when nothing matches.
    """
    present = x.split(",")
    matched = [
        segment
        for position, segment in enumerate(segments)
        if behaviours[position] in present
    ]
    matched.sort()
    return ",".join(matched)


def get_seg_code(x, unique=True):
    """Abbreviate a comma-separated segment string to its capitalised initials.

    Args:
        x: String of segment names separated by ",".
        unique: When True each initial appears once; otherwise one initial per
            entry is kept.

    Returns:
        The initials, sorted and concatenated. ``"N"`` when ``x`` is not a
        string (e.g. None/NaN) or contains an empty entry (including the empty
        string) - the cases the former bare ``except:`` caught.
    """
    # Explicit guards instead of a bare `except:` so unrelated errors
    # (KeyboardInterrupt, typos, ...) are no longer swallowed.
    if not isinstance(x, str):
        return "N"
    entries = x.split(",")
    if not all(entries):
        return "N"

    initials = [entry[0].capitalize() for entry in entries]
    if unique:
        initials = set(initials)
    return "".join(sorted(initials))

In [None]:
# Inspect the available columns before the segment-code columns are added.
train_df.columns

In [None]:
# Derive three columns on both frames (mutated in place):
#   label_combination - sorted segment names for the behaviours in `value`
#   full_code         - one initial per segment entry (duplicates kept)
#   unique_code       - de-duplicated initials
for split_df in (train_df, val_df):
    split_df["label_combination"] = split_df["value"].apply(
        lambda value: get_segment(value, behaviours, segments)
    )
    split_df["full_code"] = split_df["label_combination"].apply(
        get_seg_code, unique=False
    )
    split_df["unique_code"] = split_df["label_combination"].apply(
        get_seg_code, unique=True
    )

In [None]:
# Mean training feature vector per `unique_code`.
code_means = train_df.groupby("unique_code")["feat"].apply(
    lambda feats: np.stack(feats.values).mean(axis=0)
)
grouped_df = code_means.reset_index()

# Stack the group means and compute all-pairs cosine similarity.
embeddings = np.stack(grouped_df["feat"].values)
similarity_matrix = cosine_similarity(embeddings)
code_labels = grouped_df["unique_code"]
similarity_df = pd.DataFrame(similarity_matrix, index=code_labels, columns=code_labels)

# Annotated heatmap (cell values printed).
fig, ax = plt.subplots(figsize=(12, 12), dpi=100)
sns.heatmap(similarity_df, annot=True, cmap="viridis", cbar=True, ax=ax)
ax.set_title("Pairwise Cosine Similarity Heatmap")
fig.tight_layout()
plt.show()

In [None]:
# Mean training feature vector per `full_code`.
code_means = train_df.groupby("full_code")["feat"].apply(
    lambda feats: np.stack(feats.values).mean(axis=0)
)
grouped_df = code_means.reset_index()

# Stack the group means and compute all-pairs cosine similarity.
embeddings = np.stack(grouped_df["feat"].values)
similarity_matrix = cosine_similarity(embeddings)
code_labels = grouped_df["full_code"]
similarity_df = pd.DataFrame(similarity_matrix, index=code_labels, columns=code_labels)

# Heatmap without per-cell annotations.
fig, ax = plt.subplots(figsize=(12, 12), dpi=100)
sns.heatmap(similarity_df, annot=False, cmap="viridis", cbar=True, ax=ax)
ax.set_title("Pairwise Cosine Similarity Heatmap")
fig.tight_layout()
plt.show()

In [None]:
# Mean validation feature vector per `unique_code`.
code_means = val_df.groupby("unique_code")["feat"].apply(
    lambda feats: np.stack(feats.values).mean(axis=0)
)
grouped_df = code_means.reset_index()

# Stack the group means and compute all-pairs cosine similarity.
embeddings = np.stack(grouped_df["feat"].values)
similarity_matrix = cosine_similarity(embeddings)
code_labels = grouped_df["unique_code"]
similarity_df = pd.DataFrame(similarity_matrix, index=code_labels, columns=code_labels)

# Annotated heatmap (cell values printed).
fig, ax = plt.subplots(figsize=(12, 12), dpi=100)
sns.heatmap(similarity_df, annot=True, cmap="viridis", cbar=True, ax=ax)
ax.set_title("Pairwise Cosine Similarity Heatmap")
fig.tight_layout()
plt.show()

In [None]:
# Mean validation feature vector per `full_code`.
code_means = val_df.groupby("full_code")["feat"].apply(
    lambda feats: np.stack(feats.values).mean(axis=0)
)
grouped_df = code_means.reset_index()

# Stack the group means and compute all-pairs cosine similarity.
embeddings = np.stack(grouped_df["feat"].values)
similarity_matrix = cosine_similarity(embeddings)
code_labels = grouped_df["full_code"]
similarity_df = pd.DataFrame(similarity_matrix, index=code_labels, columns=code_labels)

# Heatmap without per-cell annotations.
fig, ax = plt.subplots(figsize=(12, 12), dpi=100)
sns.heatmap(similarity_df, annot=False, cmap="viridis", cbar=True, ax=ax)
ax.set_title("Pairwise Cosine Similarity Heatmap")
fig.tight_layout()
plt.show()

**Classifying segment averages**

In [None]:
path_to_config = "/home/dl18206/Desktop/phd/code/personal/facebook/slowfast/configs/SLOW_8x8_R50_TEST.yaml"
path_to_ckpt = "/home/dl18206/Desktop/phd/code/personal/facebook/slowfast/checkpoint_epoch_00200.pyth"

# alt_parse_args is a project helper; everything but its last element is kept and
# the first of those is handed to load_config.
args = alt_parse_args()[:-1]
cfg = load_config(
    args[0],
    path_to_config=path_to_config,
)

# map_location="cpu" lets a checkpoint saved from GPU tensors load on a machine
# without CUDA; only the CPU copy of the projection layer is used afterwards.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
checkpoint = torch.load(path_to_ckpt, map_location="cpu")

model = build_model(cfg)
model.load_state_dict(checkpoint["model_state"])

# The projection layer of the model head is the only part applied to the features.
projection = model.head.projection
projection.cpu()

In [None]:
# Mean validation feature per `unique_code` ...
code_means = val_df.groupby("unique_code")["feat"].apply(
    lambda feats: np.stack(feats.values).mean(axis=0)
)
grouped_df = code_means.reset_index()

# ... passed through the projection layer and a sigmoid, stored as numpy arrays.
grouped_df["preds"] = grouped_df["feat"].apply(
    lambda mean_feat: torch.sigmoid(
        projection(torch.tensor(mean_feat)).detach().cpu()
    ).numpy()
)

In [None]:
# Split column of lists into multiple columns
grouped_df = pd.concat(
    [grouped_df.drop(["preds"], axis=1), grouped_df["preds"].apply(pd.Series)],
    axis=1,
)

# Add behaviour names to the columns
grouped_df.columns = ["unique_code", "pred"] + behaviours

In [None]:
# Show the per-code table with one prediction column per behaviour.
grouped_df

In [None]:
# NOTE(review): duplicate of the previous display cell; can be removed.
grouped_df

In [None]:
# One colour per behaviour. `plt.cm.get_cmap` is deprecated and removed in recent
# Matplotlib releases; `plt.get_cmap` accepts the same (name, lut) arguments.
colour_map = plt.get_cmap("tab20", len(behaviours))
ax = grouped_df[behaviours].plot(kind="bar", figsize=(12, 8), colormap=colour_map)

# Plot unique_code on the x-axis
ax.set_xticks(range(len(grouped_df)))
ax.set_xticklabels(grouped_df["unique_code"], rotation=45)

# Move legend outside of plot
ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Distribution of all per-behaviour prediction values. The original `preds`
# column no longer exists once it has been expanded into behaviour columns, so
# the values are read from those columns and flattened into one array.
plt.figure(figsize=(12, 8))
sns.histplot(grouped_df[behaviours].to_numpy().ravel(), bins=50)
plt.title("Predictions Distribution")
plt.show()

In [None]:
# Convert the 'feat' column to a list of numpy arrays
embeddings = np.stack(grouped_df["feat"].values)

# Compute pairwise cosine similarity
similarity_matrix = cosine_similarity(embeddings)

# Create a dataframe with the similarity matrix
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=grouped_df["unique_code"],
    columns=grouped_df["unique_code"],
)

# Create a heatmap
plt.figure(figsize=(12, 12), dpi=100)
sns.heatmap(similarity_df, annot=True, cmap="viridis", cbar=True)
plt.title("Pairwise Cosine Similarity Heatmap")
plt.tight_layout()
plt.show()