In [14]:
import ast
import pandas as pd
import numpy as np
from skmultilearn.model_selection import iterative_train_test_split

In [3]:
# Names of the 18 behaviour attributes, each prefixed with "p_".
# This list is not referenced by any of the cells shown below.
# NOTE(review): presumably the order matches the positions of the per-video
# label vector (which appears to carry one extra trailing entry) — confirm.
behaviour_attr = [
    "p_camera_reaction",
    "p_tool_use",
    "p_object_carrying",
    "p_bipedal",
    "p_feeding",
    "p_chimp_carrying",
    "p_vocalisation",
    "p_climbing",
    "p_aggression",
    "p_travel",
    "p_sex",
    "p_piloerection",
    "p_social_interaction",
    "p_grooming",
    "p_display",
    "p_cross_species_interaction",
    "p_resting",
    "p_playing",
]

In [5]:
# Load the full annotation table. The cells below read its "video_id" and
# "label" columns; "label" is stored as text and is parsed with
# ast.literal_eval before use.
df = pd.read_csv("full_video_w_descriptors.csv")

**Generic video-level splits**

In [17]:
# Row-level ids and parsed label vectors taken straight from the full table
# (one entry per row of df, duplicates included).
id_label_cols = df[["video_id", "label"]]
X = id_label_cols["video_id"].values
y = np.array(id_label_cols["label"].apply(ast.literal_eval).values)

In [None]:
import ast
import numpy as np

# Fixed seed so the stratified split can be regenerated.
# iterative_train_test_split is called without any random_state argument, so
# the global NumPy RNG is seeded instead.
# NOTE(review): assumes skmultilearn draws its random tie-breaks from
# np.random — confirm against the installed version.
SPLIT_SEED = 42


def _label_without_last(raw):
    """Return a label vector as a plain list with its final entry removed.

    Labels read from the CSV arrive as strings such as "[0, 1, 0]". They are
    parsed with ast.literal_eval *before* slicing so that the slice removes
    the last element of the vector rather than the closing-bracket character
    of the string. Values that are already sequences are sliced directly.
    """
    parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
    return list(parsed)[:-1]


# One row per distinct (video_id, label) pair. Lists are unhashable, so the
# trimmed label is stringified for drop_duplicates and parsed back afterwards.
# .copy() keeps the assignments below from writing into a slice of df.
vdf = df[["video_id", "label"]].copy()
vdf["label"] = vdf["label"].apply(
    lambda raw: str(_label_without_last(raw))  # let 0 vector represent no behaviour
)
vdf = vdf.drop_duplicates()

vdf["label"] = vdf["label"].apply(ast.literal_eval)
labels = np.array(list(vdf["label"].values))
# Column vector of ids (the splitter is handed a 2-D X); -1 lets NumPy infer
# the row count instead of hard-coding the current dataset size.
X = vdf["video_id"].to_numpy().reshape(-1, 1)

assert len(X) == len(labels)

np.random.seed(SPLIT_SEED)
# First call: hold out 30 % of the videos. Second call: split that hold-out
# again, keeping the larger part as test and 33 % of it as validation
# (roughly 70 / 20 / 10 overall).
X_train, y_train, X_test, y_test = iterative_train_test_split(X, labels, test_size=0.30)
X_test, y_test, X_val, y_val = iterative_train_test_split(
    X_test, y_test, test_size=0.33
)
# Serialise each label row as "[0, 1, ...]". .tolist() converts NumPy scalars
# to plain Python numbers first; str(list(row)) would emit "np.int64(0)"
# entries under NumPy >= 2, which cannot be parsed back.
y_train, y_test, y_val = (
    [str(row.tolist()) for row in y_train],
    [str(row.tolist()) for row in y_test],
    [str(row.tolist()) for row in y_val],
)

In [None]:
# Build one two-column frame (video id, stringified label) per split.
train_vdf, val_vdf, test_vdf = (
    pd.DataFrame({"video": split_ids[:, 0], "label": split_labels})
    for split_ids, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Write the video-only annotation files, without the index column.
for split_frame, csv_path in (
    (train_vdf, "data/annotations/video_only/train.csv"),
    (val_vdf, "data/annotations/video_only/val.csv"),
    (test_vdf, "data/annotations/video_only/test.csv"),
):
    split_frame.to_csv(csv_path, index=False)

In [None]:
def _with_text_rows(split_vdf):
    """Left-join one video-level split onto ``tdf`` and keep tdf's label.

    The split's "video" column is matched against tdf's "video_id". The code
    relies on both frames having a "label" column, so the merge produces
    "label_x" (from the split) and "label_y" (from tdf); "label_y" is renamed
    back to "label" and "label_x" is dropped.

    NOTE(review): ``tdf`` is not defined in the cells shown here — it must
    already exist in the kernel when this cell runs.
    """
    joined = split_vdf.merge(tdf, left_on="video", right_on="video_id", how="left")
    return joined.rename(columns={"label_y": "label"}).drop(columns=["label_x"])


train_tdf = _with_text_rows(train_vdf)
val_tdf = _with_text_rows(val_vdf)
test_tdf = _with_text_rows(test_vdf)

# Write the text-only annotation files with the same three columns each.
text_only_columns = ["video_id", "descriptor", "label"]
for text_frame, csv_path in (
    (train_tdf, "data/annotations/text_only/train_text_only.csv"),
    (val_tdf, "data/annotations/text_only/val_text_only.csv"),
    (test_tdf, "data/annotations/text_only/test_text_only.csv"),
):
    text_frame[text_only_columns].to_csv(csv_path, index=False)