In [1]:
import os
import json
import shutil
import random, math
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load Maureens annotations
csv_files = glob("data/sites/csv/**/*.csv", recursive=True)
sorted_csv_files = sorted(csv_files, key=lambda x: x.split("/")[-1])
initialiser, remainder = sorted_csv_files[0], sorted_csv_files[1:]
df = pd.read_csv(initialiser, encoding="ISO-8859-1")
for file in remainder:
    site = pd.read_csv(file, encoding="ISO-8859-1")
    df = pd.concat([df, site])
df.camera_reaction.replace({"ues": "yes"}, inplace=True)
df["subdir_video"] = df.subfolder.astype(str) + "_" + df.video_file_name.astype(str)
df.subdir_video = df.subdir_video.apply(lambda x: x.lower())
df.subdir_video = df.subdir_video.str.split(".").str[0]
df["prepend_zero"] = df.subdir_video.apply(
    lambda x: f'{"_".join(x.split("_")[:-1])}_{"0" + x.split("_")[-1]}'
)

In [5]:
# Load filepaths from Jade2
jf = pd.read_csv("data/jade2/chimp_videos.csv", index_col=False)
jf.drop(columns=["Unnamed: 0"], inplace=True)
jf.files = jf.files.apply(lambda x: x.split(".")[0].lower())

In [6]:
df.columns

Index(['new_row_id', 'country', 'research_site', 'genus', 'species',
       'cam_coverage_area', 'location_metadata', 'habitat', 'utm_zone',
       'utm_long', 'utm_lat', 'cam_id', 'vid_res', 'vid_duration',
       'panaf_datasheet_comments', 'year', 'month', 'day', 'time_hr',
       'time_min', 'cell_id', 'subfolder', 'video_file_name', 'event_id',
       'min_number_chimps_per_video', 'max_number_chimps_per_video',
       'age_class', 'sex', 'tool_use', 'vocalization', 'bipedal',
       'camera_reaction', 'behavioral_context', 'other_species',
       'additional_comments', 'record_type', 'subdir_video', 'prepend_zero'],
      dtype='object')

In [7]:
x1 = df[(df.camera_reaction == "yes")].subdir_video.unique().tolist()
x2 = df[df.behavioral_context == "camera reaction"].subdir_video.unique().tolist()
cam_reaction_videos = list(set(x1 + x2))

In [8]:
reaction_df = pd.DataFrame(
    df[df.subdir_video.isin(cam_reaction_videos)].subdir_video.unique(),
    columns=["video"],
)
no_reaction_df = pd.DataFrame(
    df[~df.subdir_video.isin(cam_reaction_videos)].subdir_video.unique(),
    columns=["video"],
)
reaction_df["p_camera_reaction"] = True
no_reaction_df["p_camera_reaction"] = False
cr_df = pd.concat([reaction_df, no_reaction_df])
cr_df = cr_df[cr_df.video.isin(jf.files.values)]

In [94]:
cr_df.p_camera_reaction.value_counts()

False    9286
True     4174
Name: p_camera_reaction, dtype: int64

In [9]:
cr_df

Unnamed: 0,video,p_camera_reaction
0,baf_vid10_0346467_1436892_20151112_11280025,True
3,baf_vid11_0343264_1434832_20151019_12100066,True
4,baf_vid12_0345234_1438195_20151123_11280008,True
8,baf_vid15_0342480_1431181_20141110_11150010,True
9,baf_vid15_0342480_1431181_20141110_11150011,True
...,...,...
14765,uga_vidba4_231634_9390066_20141027_ek000017,False
14766,uga_vidba4_231634_9390066_20141027_ek000018,False
14767,uga_vidba4_231634_9390066_20141027_ek000029,False
14768,uga_vidba4_231634_9390066_20141027_ek000044,False


In [127]:
cr_df.p_camera_reaction.value_counts() / len(cr_df)

False    0.689896
True     0.310104
Name: p_camera_reaction, dtype: float64

In [106]:
X = cr_df["video"]
y = cr_df["p_camera_reaction"]

In [119]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3, shuffle=True, stratify=y
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, random_state=42, test_size=0.3, shuffle=True, stratify=y_test
)

In [142]:
# Make df from X_train and y_train
train_df = pd.DataFrame({"video": X_train, "p_camera_reaction": y_train})
val_df = pd.DataFrame({"video": X_val, "p_camera_reaction": y_val})
test_df = pd.DataFrame({"video": X_test, "p_camera_reaction": y_test})

# Add label column
train_df["label"] = train_df.p_camera_reaction.apply(lambda x: str([int(x)]))
val_df["label"] = val_df.p_camera_reaction.apply(lambda x: str([int(x)]))
test_df["label"] = test_df.p_camera_reaction.apply(lambda x: str([int(x)]))

In [143]:
train_df.to_csv("data/splits/train.csv", index=False)
val_df.to_csv("data/splits/val.csv", index=False)
test_df.to_csv("data/splits/test.csv", index=False)