In [None]:
import ast
import inflect
import calendar
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from skmultilearn.model_selection import iterative_train_test_split

### **Assign temporally aligned parent behaviours to clips**

In [None]:
# Load the clip-level annotation table that every later cell filters and
# reshapes (presumably one row per clip of a video - TODO confirm schema).
df = pd.read_csv("assign_full_video_multilabel.csv")

In [None]:
# Drop unusable rows: videos listed in videos_to_remove.csv, one manually
# excluded video, and rows without a month value. All rules are combined into
# a single boolean mask so the frame is filtered once.
videos_to_remove = pd.read_csv("videos_to_remove.csv")
flagged_ids = videos_to_remove.videos_to_remove.unique()
manually_excluded = ["djo_cam09_0698421_0598444_20130109_pict0017"]
keep_rows = (
    ~df.video_id.isin(flagged_ids)
    & ~df.video_id.isin(manually_excluded)
    & ~df.month.isna()
)
df = df[keep_rows]

In [None]:
def _split_and_index(column):
    """Lower-case a comma-separated string column and index its parts.

    Each cell becomes a list of ``(position, part)`` tuples. Missing cells
    become an empty list; this is handled here explicitly instead of via a
    chained ``fillna(..., inplace=True)``, which does not reliably update the
    parent frame (and is a no-op under pandas copy-on-write).
    """
    parts = column.str.lower().str.split(",")
    return parts.apply(
        lambda items: list(enumerate(items)) if isinstance(items, list) else []
    )


df["sp"] = _split_and_index(df.tags)
df["sb"] = _split_and_index(df.behavior)

# Concatenate both lists per row and order the entries by position; the sort
# is stable, so for equal positions the `tags` entry precedes the `behavior` one.
df["split_tags"] = (df.sp + df.sb).apply(
    lambda pairs: sorted(pairs, key=lambda pair: pair[0])
)

In [None]:
# DON'T RUN THIS CELL - run the one below instead, which reloads the saved
# checkpoint (the mapping step in this cell is slow; see the timing note
# next to the to_csv call further down).
# Loads the lookup table that get_new_tags searches by its `Tags` column.
tag_df = pd.read_csv("data/internal/all_tags_behaviours.csv")


def get_new_tags(x, tag_df):
    """Map each indexed raw tag to the string values of its row in ``tag_df``.

    Args:
        x: iterable of entries whose first item is a position and whose last
            item is the raw tag text.
        tag_df: DataFrame with a ``Tags`` column; for the first row matching a
            tag, every string value from the second column onwards is kept.

    Returns:
        List of ``(position, [mapped strings])`` tuples. Tags without a
        matching row, or whose row holds no string values, are skipped.
    """
    aligned = []
    for entry in x:
        matches = tag_df[tag_df.Tags == entry[-1]].drop_duplicates()
        if not len(matches.index):
            continue
        # Non-string cells (e.g. NaN padding) are not behaviours.
        mapped = [value for value in matches.values[0, 1:] if isinstance(value, str)]
        if mapped:
            aligned.append((entry[0], mapped))
    return aligned


def _to_list_literal(value):
    """Return ``value`` rendered as text, unwrapping array-likes first.

    ``get_new_tags`` returns plain lists, which have no ``.tolist()``;
    calling it unconditionally raised AttributeError. Objects that do expose
    ``.tolist()`` (e.g. numpy arrays) are still converted before ``str``.
    """
    if hasattr(value, "tolist"):
        value = value.tolist()
    return str(value)


df["new_tags"] = df.split_tags.apply(lambda x: get_new_tags(x, tag_df))
# Serialise list-valued columns so they survive the CSV round trip
# (they are parsed back with ast.literal_eval after reloading).
df.new_tags = df.new_tags.apply(_to_list_literal)
df.label = df.label.apply(_to_list_literal)

# The above takes approx. 45 minutes hence saving the intermediate file
df.to_csv("clips_w_temporally_aligned_behaviours_tmp.csv", index=False)

In [None]:
# Reload from the checkpoint to avoid recalculating the tag mapping, then
# turn the serialised list columns back into Python objects.
df = pd.read_csv("clips_w_temporally_aligned_behaviours_tmp.csv")
for literal_column in ("label", "new_tags"):
    df[literal_column] = df[literal_column].apply(ast.literal_eval)
# Keep each row's tags ordered by their position index.
df["new_tags"] = df["new_tags"].apply(
    lambda tags: sorted(tags, key=lambda tag: tag[0])
)

In [None]:
# Parent-behaviour column names. decode_label indexes into this list, so the
# order must match the positions of the multi-hot label vector.
behaviours = [
    f"p_{name}"
    for name in (
        "camera_reaction",
        "tool_use",
        "object_carrying",
        "bipedal",
        "feeding",
        "chimp_carrying",
        "vocalisation",
        "climbing",
        "aggression",
        "travel",
        "sex",
        "piloerection",
        "social_interaction",
        "grooming",
        "display",
        "cross_species_interaction",
        "resting",
        "playing",
    )
]

In [None]:
# Parent behaviour -> "|"-separated expression of the child tags it covers.
conditions_dict = {
    "camera_reaction": "camera_reaction",
    "tool_use": "(tool_use) | (termite_fishing) | (nut_cracking)",
    "object_carrying": "object_carry",
    "bipedal": "bipedal",
    "feeding": "(feeding) | (wood_eating)",
    "chimp_carrying": "chimp_carrying",
    "vocalisation": "(vocalisation) | (hoot) | (grunt)",
    "climbing": "climbing",
    "aggression": "(aggression) | (charge) | (fight)",
    "travel": "(travel) | (running) | (walking)",
    "sex": "(sex) | (mounting)",
    "piloerection": "piloerection",
    "social_interaction": "(social_interaction) | (nursing)",
    "grooming": "grooming",
    "display": "(display) | (branch_shaking) | (stone_throw) | (drumming)",
    "cross_species_interaction": "cross_species_interaction",
    "resting": "resting",
    "playing": "playing",
    "no_behaviour": "(label_indicator == False) | (no_behaviour)",
}

# Invert the mapping: every (still parenthesised) child condition -> parent.
reversed_dict = {
    condition.strip(): parent
    for parent, expression in conditions_dict.items()
    for condition in expression.split("|")
}

# Same mapping with the parentheses stripped from the child names.
modified_dict = {
    raw_condition.replace("(", "").replace(")", "").strip(): parent
    for raw_condition, parent in reversed_dict.items()
}

# Remove 'label_indicator' and 'no_behaviour' keys from modified_dict
modified_dict.pop("label_indicator == False")
modified_dict.pop("no_behaviour")

In [None]:
def get_parent_tags(x, modified_dict):
    """Replace the leading tag of each entry with its parent behaviour.

    Args:
        x: iterable of entries whose last item is a non-empty list of tags.
        modified_dict: mapping from child tag name to parent behaviour name.

    Returns:
        New list with one entry per input entry whose first tag is a key of
        ``modified_dict``; that first tag is replaced by the mapped parent.
        Entries with an unknown first tag or an empty tag list are dropped.

    The input is left untouched. The previous version rewrote the tag lists
    in place, so re-running the calling cell fed already-converted names
    back through the mapping and silently dropped those that are not keys.
    """
    parents = []
    for entry in x:
        tags = entry[-1]
        if not tags or tags[0] not in modified_dict:
            continue
        rebuilt = list(entry[:-1]) + [[modified_dict[tags[0]], *tags[1:]]]
        # Keep the container type of the original entry (tuple stays tuple).
        parents.append(tuple(rebuilt) if isinstance(entry, tuple) else rebuilt)
    return parents


def order_tags(x):
    """Return the distinct leading tags of ``x`` in first-seen order.

    Each entry is indexed as ``entry[1][0]``: the first element of the tag
    list stored at position 1 of the entry.
    """
    unique_leading = []
    for entry in x:
        leading = entry[1][0]
        if leading in unique_leading:
            continue
        unique_leading.append(leading)
    return unique_leading


# Decode multi hot binary labels to class labels
def decode_label(x, behaviours):
    """Return the behaviour names whose position in ``x`` is non-zero.

    Args:
        x: multi-hot sequence aligned with ``behaviours``.
        behaviours: column names, normally carrying a leading ``"p_"``.

    Returns:
        List of names with only the leading ``"p_"`` removed. The previous
        ``split("p_")[-1]`` also split on an inner ``"p_"``, so
        ``"p_chimp_carrying"`` decoded to ``"carrying"``.
    """
    prefix = "p_"
    decoded_behaviours = []
    for idx in np.where(x)[0]:
        name = behaviours[idx]
        decoded_behaviours.append(
            name[len(prefix):] if name.startswith(prefix) else name
        )
    return decoded_behaviours


# Derive the per-clip parent tags, their de-duplicated order, and the decoded
# names of the multi-hot label; extra arguments are passed through `args`.
df["parent_new_tags"] = df.new_tags.apply(get_parent_tags, args=(modified_dict,))
df["ordered_tags"] = df.parent_new_tags.apply(order_tags)
df["decoded_labels"] = df.label.apply(decode_label, args=(behaviours,))

In [None]:
# Keep only the columns used downstream: clip metadata, the per-behaviour
# indicator columns (same names and order as `behaviours`), and the derived
# tag / label columns.
clip_metadata_columns = [
    "video_id",
    "subject_id",
    "start.time",
    "age_groups",
    "sex_groups",
    "country",
    "research_site",
    "genus",
    "species",
    "location_metadata",
    "habitat",
    "min",
    "max",
    "day",
    "month",
    "year",
    "time_hr",
    "time_min",
    "behavioral_context",
]
derived_columns = ["ordered_tags", "label", "decoded_labels"]
df = df[clip_metadata_columns + behaviours + derived_columns]

In [None]:
# Store video ids as strings, then print the frame summary.
df["video_id"] = df["video_id"].astype(str)
df.info()

In [None]:
# Render the list-valued columns as text (lists are unhashable, which
# drop_duplicates and CSV export cannot handle).
for list_column in ("label", "ordered_tags", "decoded_labels"):
    df[list_column] = df[list_column].map(str)

In [None]:
# Widen column display so the long list-valued cells are readable.
pd.set_option("max_colwidth", 400)
# Reassign instead of mutating in place: `df` was produced by a column
# selection, and in-place edits on such frames are discouraged in pandas.
df = df.drop_duplicates()

In [120]:
# Spot-check all clip rows of one example video.
df.loc[df["video_id"] == "baf_vid16_0340989_1432398_20151114_12010009"]

Unnamed: 0,video_id,subject_id,start.time,age_groups,sex_groups,country,research_site,genus,species,location_metadata,...,p_piloerection,p_social_interaction,p_grooming,p_display,p_cross_species_interaction,p_resting,p_playing,ordered_tags,label,decoded_labels
4,baf_vid16_0340989_1432398_20151114_12010009,60886063,0.0,"adult,adult","male,male",mali,bafing,Pan,troglodytes verus,nesting site/water source,...,False,False,False,False,False,False,False,['travel'],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1]","['camera_reaction', 'travel', 'resting', 'playing']"
6,baf_vid16_0340989_1432398_20151114_12010009,60886065,15.0,"adult,adult","male,male",mali,bafing,Pan,troglodytes verus,nesting site/water source,...,False,False,False,False,False,False,False,[],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1]","['camera_reaction', 'travel', 'resting', 'playing']"
8,baf_vid16_0340989_1432398_20151114_12010009,60886064,30.0,"adult,adult","male,male",mali,bafing,Pan,troglodytes verus,nesting site/water source,...,False,False,False,False,False,True,True,"['camera_reaction', 'playing', 'resting', 'travel']","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1]","['camera_reaction', 'travel', 'resting', 'playing']"
10,baf_vid16_0340989_1432398_20151114_12010009,60886066,45.0,"adult,adult","male,male",mali,bafing,Pan,troglodytes verus,nesting site/water source,...,False,False,False,False,False,False,False,[],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1]","['camera_reaction', 'travel', 'resting', 'playing']"


In [None]:
# Persist the de-duplicated clip table; the index is not written.
df.to_csv("clips_w_temporally_aligned_behaviours.csv", index=False)

**Create meta-text dataset**

In [None]:
composition = ["age_groups", "sex_groups", "min", "max"]
location = ["country", "research_site", "location_metadata", "habitat"]
time = ["day", "month", "year", "time_hr", "time_min"]

# .copy() so the column assignments below modify tdf itself rather than a
# slice of df.
tdf = df[["video_id"] + composition + location + time + ["label"]].copy()

# let 0 vector represent no behaviour
# `label` was stringified in an earlier cell, so it is parsed back to a list
# before the last element is dropped; slicing the string only removed the
# closing "]" and list() then split the text into single characters.
# NOTE(review): the rendered table above shows 18-element labels matching the
# 18 `behaviours`; confirm the trailing element really is "no behaviour".
tdf["label"] = tdf["label"].apply(
    lambda x: str(list((ast.literal_eval(x) if isinstance(x, str) else x)[:-1]))
)
tdf = tdf.drop_duplicates()

**Age cats: 'unidentifiable', 'infant', 'juvenile', 'adolescent', 'adult'**


In [None]:
def count_individual_age(x, age):
    """Return how many times ``age`` occurs in ``x`` (0 when absent)."""
    # count() already yields 0 for a missing value, so no membership test
    # is needed.
    return x.count(age)

In [None]:
ages = ["unidentifiable", "infant", "juvenile", "adolescent", "adult"]

# Assign the filled column back: a chained `fillna(..., inplace=True)` does
# not reliably update the parent frame, and any NaN left behind would break
# the `.split` calls below.
tdf["age_groups"] = tdf["age_groups"].fillna("")

# One "<age>_count" column per category, created in the order of `ages`.
for age in ages:
    tdf[f"{age}_count"] = tdf.age_groups.apply(
        lambda x, age=age: count_individual_age(x.split(","), age)
    )

**Sex cats: 'unclear', 'unidentifiable', 'male', 'female'**

In [None]:
def count_individual_sex(x, sex):
    """Return the number of occurrences of ``sex`` in ``x``, or 0 if absent."""
    return x.count(sex) if sex in x else 0

In [None]:
sexes = ["unclear", "unidentifiable", "male", "female"]

# Assign the filled column back: a chained `fillna(..., inplace=True)` does
# not reliably update the parent frame.
tdf["sex_groups"] = tdf["sex_groups"].fillna("")

# NOTE(review): "unidentifiable_count" is also the name written by the age
# count cell, so the sex count below overwrites the age count. The names are
# kept as-is because the description cell selects columns by position;
# renaming needs a coordinated change there.
sex_count_columns = ["unclear_count", "unidentifiable_count", "male", "female"]
for sex, count_column in zip(sexes, sex_count_columns):
    tdf[count_column] = tdf.sex_groups.apply(
        lambda x, sex=sex: count_individual_sex(x.split(","), sex)
    )

In [None]:
def group_age_sex(age, sex):
    """Pair up comma-separated age and sex lists position by position.

    Args:
        age: comma-separated age categories, e.g. ``"adult,infant"``.
        sex: comma-separated sex categories, e.g. ``"male,female"``.

    Returns:
        Comma-separated ``"<age> <sex>"`` pairs. Two empty strings give
        ``" "`` (a single pair of empty parts).

    Raises:
        AssertionError: if the two lists differ in length. The original
            ``assert a, b`` only checked that the first length was truthy
            and used the second as the message, so it could never fail.
    """
    age_parts = age.split(",")
    sex_parts = sex.split(",")
    assert len(age_parts) == len(sex_parts), (
        f"age/sex length mismatch: {age!r} vs {sex!r}"
    )
    return ",".join(
        f"{age_part} {sex_part}" for age_part, sex_part in zip(age_parts, sex_parts)
    )


# Build the paired "<age> <sex>" string for every row.
tdf["age_sex_group"] = [
    group_age_sex(age_value, sex_value)
    for age_value, sex_value in zip(tdf.age_groups, tdf.sex_groups)
]

In [None]:
# Collect every distinct "<age> <sex>" pair across all rows, dropping the
# blank pair " " produced by rows without age/sex information.
# Sorted so the order is the same on every run: this list drives the order
# of the per-pair count columns and of the generated description text, and
# plain set iteration order for strings varies between interpreter sessions.
as_group = sorted(
    {
        pair
        for group in tdf["age_sex_group"].unique()
        for pair in group.split(",")
    }
    - {" "}
)
print(as_group)

In [None]:
def count_age_sex_pairs(x, g):
    """Count exact occurrences of pair ``g`` in the comma-separated string ``x``."""
    return x.split(",").count(g)


# Add one count column per age/sex pair, named after the pair itself.
for pair_name in as_group:
    tdf[str(pair_name)] = tdf.age_sex_group.apply(
        count_age_sex_pairs, args=(pair_name,)
    )

In [None]:
# Cast year to int and replace the numeric month with its English name.
tdf["year"] = tdf["year"].astype(int)
tdf["month"] = (
    tdf["month"].astype(int).apply(lambda number: calendar.month_name[number])
)

In [None]:
def desc_composition(x, pairings, engine=None):
    """Describe the group composition of one row as an English phrase.

    Args:
        x: row-like mapping holding ``"max"`` (number of individuals) and one
            count per name in ``pairings``.
        pairings: iterable of "<age> <sex>" column names to consider.
        engine: object exposing ``number_to_words``; defaults to the
            notebook-level inflect engine ``p``.

    Returns:
        ``"A video of one <pair> chimpanzee"`` when ``max`` is 1, otherwise
        ``"A video of <n> chimpanzees, composed of <list of pairs>"``. A pair
        name gets a trailing "s" only when its count exceeds one, and the
        list is joined with commas and a final "and".

    Raises:
        ValueError: if ``max`` is below 1 or missing. The original left its
            result variable unbound in that case (UnboundLocalError).
    """
    if engine is None:
        engine = p  # notebook-level inflect engine

    def words(count):
        # int() so numpy integers/floats from the frame are accepted.
        return engine.number_to_words(int(count))

    def join_phrases(phrases):
        if len(phrases) <= 1:
            return "".join(phrases)
        return ", ".join(phrases[:-1]) + " and " + phrases[-1]

    if x["max"] == 1:
        singles = [f"{words(x[pair])} {pair}" for pair in pairings if x[pair] == 1]
        # Fall back to a bare "one" so the sentence stays well formed when no
        # pair column matches (previously "A video ofchimpanzee").
        subject = join_phrases(singles) if singles else words(1)
        return f"A video of {subject} chimpanzee"

    if x["max"] > 1:
        phrases = [
            f"{words(x[pair])} {pair}{'s' if x[pair] > 1 else ''}"
            for pair in pairings
            if x[pair] >= 1
        ]
        start = f"A video of {words(x['max'])} chimpanzees"
        if phrases:
            start += f", composed of {join_phrases(phrases)}"
        return start

    raise ValueError(f"cannot describe composition for max={x['max']!r}")


def desc_location(x):
    """Return the sentence fragment naming the row's country and research site."""
    country = x["country"]
    site = x["research_site"]
    return "It was filmed in {} at the {} research site".format(country, site)


def desc_habitat(x):
    """Return the fragment describing the row's camera location and habitat."""
    return "at a {} in {}".format(x["location_metadata"], x["habitat"])


def desc_time(x):
    """Return the fragment giving the row's date and time, ending in a period.

    Values are interpolated as-is (no zero padding is applied).
    """
    date_part = "on {} {} {}".format(x["day"], x["month"], x["year"])
    clock_part = "at {}:{}.".format(x["time_hr"], x["time_min"])
    return date_part + " " + clock_part

In [None]:
# inflect engine that desc_composition uses (as the global `p`) to spell out
# counts; it was never created anywhere in the notebook.
p = inflect.engine()

# Take the pair names from `as_group` (the names used to create the per-pair
# count columns) instead of the positional slice `tdf.columns[24:-1]`, which
# dropped the last pair on a fresh run and breaks whenever a column is added
# or removed upstream.
pairings = list(as_group)
tdf["desc"] = tdf.apply(
    lambda x: f"{desc_composition(x, pairings)}. {desc_location(x)} {desc_habitat(x)} {desc_time(x)}",
    axis=1,
)

In [None]:
# Show the generated description of the first row with five individuals.
tdf.loc[tdf["max"] == 5, "desc"].iloc[0]

In [None]:
def _attach_text_metadata(split_vdf):
    """Left-join one video split with tdf and keep tdf's label column."""
    merged = split_vdf.merge(tdf, left_on="video", right_on="video_id", how="left")
    return merged.rename(columns={"label_y": "label"}).drop(columns=["label_x"])


# NOTE: train_vdf / val_vdf / test_vdf are created in the "Create video
# dataset" cell further down, which therefore has to be run before this one.
train_tdf = _attach_text_metadata(train_vdf)
val_tdf = _attach_text_metadata(val_vdf)
test_tdf = _attach_text_metadata(test_vdf)

In [None]:
def _write_text_only(split_df, path):
    """Write video id, description text and label of one split to ``path``.

    The description is built in a column called "desc", while this export
    asked for a "descriptor" column that is never created (KeyError).
    Whichever of the two exists is used, and it is written under the header
    "descriptor" so the output file layout is unchanged.
    """
    text_column = "descriptor" if "descriptor" in split_df.columns else "desc"
    (
        split_df[["video_id", text_column, "label"]]
        .rename(columns={text_column: "descriptor"})
        .to_csv(path, index=False)
    )


for split_df, split_name in (
    (train_tdf, "train"),
    (val_tdf, "val"),
    (test_tdf, "test"),
):
    _write_text_only(
        split_df, f"data/annotations/text_only/{split_name}_text_only.csv"
    )

In [None]:
# Inspect the raw split tags of one video.
# NOTE(review): `split_tags` is not among the columns kept by the column
# selection cell above, so this only works on a `df` from before that step.
clip_rows = df.loc[
    df["video_id"] == "tair_cam22_688836_647457_20131025_ek000246",
    ["video_id", "split_tags"],
]
clip_rows.groupby("video_id").apply(lambda x: x.split_tags.values).iloc[0]

**Create video dataset**

In [None]:
import ast
import numpy as np

# .copy() so the assignments below modify vdf itself rather than a slice of df.
vdf = df[["video_id", "label"]].copy()

# let 0 vector represent no behaviour
# `label` was stringified in an earlier cell, so it is parsed back to a list
# before the last element is dropped; slicing the string only removed the
# closing "]" and list() then split the text into single characters.
vdf["label"] = vdf["label"].apply(
    lambda x: str(list((ast.literal_eval(x) if isinstance(x, str) else x)[:-1]))
)
# Labels are kept as text here because lists are unhashable.
vdf = vdf.drop_duplicates()

vdf["label"] = vdf["label"].apply(ast.literal_eval)
labels = np.array(list(vdf.label.values))
# One column of ids; -1 infers the row count instead of hard-coding 6675.
X = vdf.video_id.to_numpy().reshape(-1, 1)

assert len(X) == len(labels)

# First split: 70% train / 30% held out. Second split: the held-out part is
# divided again, with 33% of it going to validation and the rest to test.
X_train, y_train, X_test, y_test = iterative_train_test_split(X, labels, test_size=0.30)
X_test, y_test, X_val, y_val = iterative_train_test_split(
    X_test, y_test, test_size=0.33
)
y_train, y_test, y_val = (
    [str(list(x)) for x in y_train],
    [str(list(x)) for x in y_test],
    [str(list(x)) for x in y_val],
)

train_vdf = pd.DataFrame({"video": X_train[:, 0], "label": y_train})
val_vdf = pd.DataFrame({"video": X_val[:, 0], "label": y_val})
test_vdf = pd.DataFrame({"video": X_test[:, 0], "label": y_test})

train_vdf.to_csv("data/annotations/video_only/train.csv", index=False)
val_vdf.to_csv("data/annotations/video_only/val.csv", index=False)
test_vdf.to_csv("data/annotations/video_only/test.csv", index=False)

**Test embedding metadata**

In [None]:
from transformers import CLIPTokenizer, CLIPTextModel

In [None]:
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
text_model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")

# Freeze the parameters of every encoder layer except the last one.
for param in text_model.text_model.encoder.layers[:-1].parameters():
    param.requires_grad = False

# The description column is created as "desc"; `tdf.descriptor` does not
# exist unless it was added elsewhere, so fall back to "desc".
descriptions = tdf["descriptor"] if "descriptor" in tdf.columns else tdf["desc"]
tokenized_text = descriptions.apply(
    lambda x: tokenizer(
        x, padding="max_length", max_length=77, truncation=True, return_tensors="pt"
    )
)

In [None]:
# Embed the first tokenized description. Positional access is used because
# tokenized_text keeps tdf's index, which has been filtered and de-duplicated
# and so need not contain the label 0.
text_features = text_model(**tokenized_text.iloc[0])

In [None]:
#### This is for temporal processing of the dataset #####

In [None]:
# Show the id, context and clip start time alongside the last 19 columns.
# The original list contained `"camera_" "start.time"`, which Python joins
# into the single name "camera_start.time"; "start.time" is the column name
# used elsewhere in this notebook.
# NOTE(review): confirm no separate camera-related column was intended here.
test_df[
    ["video_id", "behavioral_context", "start.time"]
    + list(test_df.columns[-19:])
]

In [None]:
# Build one record per video with its composition metadata and clip start times.
collection = []
for video_name in test_df.video_id.unique():
    item = {}
    tmp = test_df[test_df.video_id == video_name]
    item["video_name"] = video_name
    # Bracket access for "max"/"min": attribute access (tmp.max / tmp.min)
    # returns the DataFrame methods, not the columns of that name.
    item["metadata"] = dict(
        age_groups=tmp.age_groups,
        sex_groups=tmp.sex_groups,
        max=tmp["max"],
        min=tmp["min"],
    )
    # NOTE(review): this stores the start times under "behaviour" - confirm
    # that is the intended content.
    item["behaviour"] = tmp["start.time"].to_dict()
    collection.append(item)

In [None]:
# Peek at the age-group metadata stored for the first collected video.
collection[0]["metadata"]["age_groups"]