In [1]:
import json
import pandas as pd
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

**Load relevant data**

In [2]:
# Load acp-prefixed videos with multi-label annotations
cs_df = pd.read_csv("C&S_annotated_acp_videos.csv")
print(f"Total videos: {len(cs_df)}\nUnique videos: {cs_df.video.unique().__len__()}")

Total videos: 14492
Unique videos: 14492


In [8]:
# Pre-process MM's annotations
csv_files = glob(
    "/home/dl18206/Desktop/phd/data/panaf/maureen_annotations/data/sites/csv/**/*.csv",
    recursive=True,
)
sorted_csv_files = sorted(csv_files, key=lambda x: x.split("/")[-1])
initialiser, remainder = sorted_csv_files[0], sorted_csv_files[1:]
mm_df = pd.read_csv(initialiser, encoding="ISO-8859-1")

for file in remainder:
    site = pd.read_csv(file, encoding="ISO-8859-1")
    mm_df = pd.concat([mm_df, site])

In [9]:
mm_df

Unnamed: 0,new_row_id,country,research_site,genus,species,cam_coverage_area,location_metadata,habitat,utm_zone,utm_long,...,age_class,sex,tool_use,vocalization,bipedal,camera_reaction,behavioral_context,other_species,additional_comments,record_type
0,1,mali,bafing,Pan,troglodytes verus,9.87,trail,forest on rock,29n,342661.0,...,adult,male,no,no,no,no,travel,,Not very clear screen,60s_video
1,2,mali,bafing,Pan,troglodytes verus,14.00,nesting site,savannah - wooded,29n,344126.0,...,adult,female,no,no,no,no,feeding,,"Gray back, disappears at the left side",60s_video
2,3,mali,bafing,Pan,troglodytes verus,14.00,nesting site,savannah - wooded,29n,344126.0,...,infant,unclear,no,no,no,no,feeding,,"Follows the above chimp, probably child of her",60s_video
3,4,mali,bafing,Pan,troglodytes verus,14.00,nesting site,savannah - wooded,29n,344126.0,...,infant,unclear,no,no,no,no,feeding,,"On the ground, stands near next chimp, and the...",60s_video
4,5,mali,bafing,Pan,troglodytes verus,14.00,nesting site,savannah - wooded,29n,344126.0,...,adult,female,no,no,no,no,feeding,,"Behind the tree, picks a baobab´s fruit and th...",60s_video
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,1337,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,forest - colonising,36s,230040.0,...,,,,,,,,,,photo
1337,1338,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,forest - colonising,36s,230040.0,...,,,,,,,,,,photo
1338,1339,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,forest - colonising,36s,230040.0,...,,,,,,,,,,photo
1339,1340,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,forest - colonising,36s,230040.0,...,,,,,,,,,,photo


In [3]:
# Pre-process MM's annotations
csv_files = glob(
    "/home/dl18206/Desktop/phd/data/panaf/maureen_annotations/data/sites/csv/**/*.csv",
    recursive=True,
)
sorted_csv_files = sorted(csv_files, key=lambda x: x.split("/")[-1])
initialiser, remainder = sorted_csv_files[0], sorted_csv_files[1:]
mm_df = pd.read_csv(initialiser, encoding="ISO-8859-1")
for file in remainder:
    site = pd.read_csv(file, encoding="ISO-8859-1")
    mm_df = pd.concat([mm_df, site])
mm_df["subdir_video"] = (
    mm_df.subfolder.astype(str) + "_" + mm_df.video_file_name.astype(str)
)
mm_df["subdir_video"] = mm_df.subdir_video.str.lower()
mm_df["subdir_video"] = mm_df.subdir_video.str.split(".").str[0]
mm_df.drop(
    mm_df.columns.difference(
        [
            "subdir_video",
            "behavioral_context",
            "tool_use",
            "vocalization",
            "bipedal",
            "camera_reaction",
        ]
    ),
    1,
    inplace=True,
)

mm_df.tool_use.replace({"yes": 1, "off_camera": 1, "unclear": 0, "no": 0}, inplace=True)
mm_df.vocalization.replace(
    {"yes": 1, "off_camera": 1, "offscreen": 1, "no": 0}, inplace=True
)
mm_df.bipedal.replace(
    {"yes": 1, "off_camera": 1, "offscreen": 1, "no": 0}, inplace=True
)
mm_df.camera_reaction.replace(
    {"yes": 1, "off_camera": 1, "ues": 1, "no": 0}, inplace=True
)

mm_df.tool_use.fillna(0, inplace=True)
mm_df.vocalization.fillna(0, inplace=True)
mm_df.bipedal.fillna(0, inplace=True)
mm_df.camera_reaction.fillna(0, inplace=True)

mm_df.behavioral_context.replace(
    {
        "camera reaction": "camera_reaction",
        "tool use": "tool_use",
        "tool use ants": "tool_use_ants",
        "tool use algae": "tool_use_algae",
        "tool use honey": "tool_use_honey",
        "tool use nuts": "tool_use_nuts",
        "tool use stone throwing": "tool_use_stone_throwing",
        "tool use termites": "tool_use_termites",
        "tool use unknown": "tool_use_unknown",
    },
    inplace=True,
)
mm_df = pd.get_dummies(mm_df, columns=["behavioral_context"])

  mm_df.drop(


In [4]:
# Merge MM's annotations with C&S annotations
df = cs_df.merge(mm_df, on="subdir_video")

In [368]:
# Convert the columns to int64
float64_columns = df.select_dtypes(include="float64").columns
uint8_columns = df.select_dtypes(include="uint8").columns
df[uint8_columns] = df[uint8_columns].astype("int64")
df[float64_columns] = df[float64_columns].astype("int64")

# Move columns not relating to behaviour
df.insert(7, "label", df.pop("label"))
df.insert(8, "label_indicator", df.pop("label_indicator"))

In [506]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30840 entries, 0 to 30839
Data columns (total 57 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   video                                       30840 non-null  object
 1   site                                        30840 non-null  object
 2   subdir_video                                30840 non-null  object
 3   ID                                          30840 non-null  object
 4   Tags                                        30840 non-null  object
 5   Unnamed: 2                                  30840 non-null  object
 6   split_tags                                  30840 non-null  object
 7   label                                       30840 non-null  object
 8   label_indicator                             30840 non-null  bool  
 9   camera_reaction_x                           30840 non-null  int64 
 10  tool_use_x            

**Make replica (test_df) for further processing**

In [497]:
# Subsample main df
test_df = df.sample(len(df))

# Convert the columns to int64
float64_columns = test_df.select_dtypes(include="float64").columns
uint8_columns = test_df.select_dtypes(include="uint8").columns
test_df[uint8_columns] = test_df[uint8_columns].astype("int64")
test_df[float64_columns] = test_df[float64_columns].astype("int64")

# Move columns not relating to behaviour
test_df.insert(7, "label", test_df.pop("label"))
test_df.insert(8, "label_indicator", test_df.pop("label_indicator"))

In [373]:
# Get annotation groupings (i.e., C&S and MM)
cs_annotations = test_df.columns[9:25].values
mm_annotations = test_df.columns[30:57].values

In [499]:
# Create an empty list to store the column headings
column_list, cs_anns, mm_anns = [], [], []
# Iterate over each row in the dataframe
for index, row in test_df.iterrows():
    # Create a list comprehension to get the column headings where the value is 1
    columns_with_ones = [column for column in test_df.columns if row[column] == 1]
    # Append the list of column headings to the column_list
    column_list.append(columns_with_ones)
# Add the new column to the dataframe
test_df["behaviours"] = column_list

In [501]:
test_df = test_df.groupby("video")["behaviours"].apply(list).reset_index()
test_df.behaviours = test_df.behaviours.apply(lambda x: x[0])
test_df["count"] = test_df.behaviours.apply(lambda x: len(x))

In [503]:
test_df

Unnamed: 0,video,behaviours,count
0,acp000002j,"[label_indicator, camera_reaction_x, behaviora...",3
1,acp000002l,"[label_indicator, camera_reaction_x, behaviora...",3
2,acp00000ep,[behavioral_context_travel],1
3,acp00000g6,[behavioral_context_climbing],1
4,acp00000ga,[behavioral_context_travel],1
...,...,...,...
12370,acp000dzp0,"[label_indicator, camera_reaction_x, camera_re...",4
12371,acp000dzp1,"[camera_reaction_y, behavioral_context_travel]",2
12372,acp000e0cs,"[label_indicator, tool_use_x, vocalization, be...",4
12373,acp000e0cu,"[vocalization, behavioral_context_travel]",2


In [505]:
# Create new columns to indicate CS and MM annotations
cs_ann, mm_ann = [], []
for index, row in test_df.iterrows():
    mm_ann.append(any([x for x in row["behaviours"] if x in mm_annotations]))
    cs_ann.append(any([x for x in row["behaviours"] if x in cs_annotations]))
# Add the new column to the dataframe
test_df["mm"] = mm_ann
test_df["cs"] = cs_ann

In [379]:
# Open the behaviour groupings file
with open("behaviour_groupings.json", "r") as f:
    behaviour_groups = json.load(f)

In [507]:
action_list = test_df.behaviours.values.tolist()
videos = test_df.video.values.tolist()

store = []

for k, v in behaviour_groups.items():
    for video, actions in zip(
        videos, action_list
    ):  # where each set of actions corresponds to 1 video
        agreement = []
        for a in actions:
            for i, b in enumerate(v["behaviors"]):
                if a == b:
                    agreement.append(v["original_list"][i])
        if not agreement:
            continue
        else:
            if ("list_1") in agreement and ("list_2") in agreement:
                agree_dict = dict(video=video, behaviour=k, agreement="Agree")
            elif ("list_1") in agreement and ("list_2") not in agreement:
                agree_dict = dict(video=video, behaviour=k, agreement="MM-only")
            elif ("list_1") not in agreement and ("list_2") in agreement:
                agree_dict = dict(video=video, behaviour=k, agreement="CS-only")
        store.append(agree_dict)

In [508]:
agree_df = pd.DataFrame(store)
agree_df.head()

Unnamed: 0,video,behaviour,agreement
0,acp00001qd,tool_use,CS-only
1,acp000051c,tool_use,CS-only
2,acp000051e,tool_use,CS-only
3,acp000086j,tool_use,Agree
4,acp000086l,tool_use,Agree


**Exploring annotation dist**

In [509]:
agreement_group_df = agree_df.groupby("video")["agreement"].apply(list).reset_index()
behaviour_group_df = agree_df.groupby("video")["behaviour"].apply(list).reset_index()
dist_df = agreement_group_df.merge(behaviour_group_df, on="video")

In [510]:
dist_df.agreement.astype(str)
dist_df.agreement = dist_df.agreement.apply("_".join)
dist_df.agreement.value_counts()

MM-only                                          5076
Agree                                            2144
CS-only_MM-only                                  1219
MM-only_Agree                                     939
MM-only_MM-only                                   657
                                                 ... 
CS-only_CS-only_CS-only_CS-only_Agree_CS-only       1
CS-only_MM-only_CS-only_Agree                       1
MM-only_MM-only_MM-only_Agree                       1
Agree_MM-only_CS-only_MM-only                       1
CS-only_MM-only_MM-only_Agree                       1
Name: agreement, Length: 87, dtype: int64

**Videos requiring verification**

In [468]:
augmented_df = dist_df[
    (dist_df.agreement != "Agree") & (dist_df.agreement != "CS-only")
]

In [480]:
# MM-only
mm_only_df = dist_df[dist_df.agreement == "MM-only"]
mm_only_df.behaviour.value_counts()

[travel]                2104
[feeding]               1659
[climbing]               441
[resting]                361
[unclear]                188
[tool_use]               130
[camera_reaction]        100
[social_interaction]      38
[display]                 20
[grooming]                12
[sexual]                   7
[aggression]               1
Name: behaviour, dtype: int64

In [491]:
mm_only_df.sort_values(by="video", ascending=True, inplace=True)
mm_only_df.to_csv("mm_only.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mm_only_df.sort_values(by='video', ascending=True, inplace=True)


**Move videos to new directory for annotation**

In [484]:
path = "/home/dl18206/Desktop/phd/data/panaf/acp/videos/all"
local_videos = glob(f"{path}/**/*.mp4", recursive=True)
mm_only_videos = [
    x
    for x in local_videos
    if x.split("/")[-1].split(".")[0] in mm_only_df.video.values.tolist()
]

In [488]:
import shutil
import os


def copy_files(file_list, destination_dir):
    # Create the destination directory if it doesn't exist
    os.makedirs(destination_dir, exist_ok=True)

    for file_path in tqdm(file_list):
        # Get the filename from the original file path
        file_name = os.path.basename(file_path)

        # Construct the destination file path
        destination_path = os.path.join(destination_dir, file_name)

        # Copy the file to the destination directory
        shutil.copy(file_path, destination_path)
        print(f"File '{file_name}' copied to '{destination_dir}'")