In [1]:
import os
import json
import shutil
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import random, math

In [2]:
csv_files = glob("data/sites/csv/**/*.csv", recursive=True)
sorted_csv_files = sorted(csv_files, key=lambda x: x.split("/")[-1])
initialiser, remainder = sorted_csv_files[0], sorted_csv_files[1:]
df = pd.read_csv(initialiser, encoding="ISO-8859-1")
for file in remainder:
    site = pd.read_csv(file, encoding="ISO-8859-1")
    df = pd.concat([df, site])
df["subdir_video"] = df.subfolder.astype(str) + "_" + df.video_file_name.astype(str)

**Assigning metadata to PanAf20K videos**

In [3]:
p20k = [
    x.split("/")[-1].split(".")[0]
    for x in glob(
        "/home/dl18206/Desktop/phd/data/panaf/acp/videos/all/**/*.mp4", recursive=True
    )
]

In [4]:
cns = pd.read_csv(
    "/home/dl18206/Downloads/Re_ getting back on track _)/all_cs_clip_information.txt",
    sep="\t",
)
cns.subject_id = cns.subject_id.str.lower()

In [5]:
cns = cns[cns.subject_id.isin(p20k)]
cns["video"] = cns["video.id"].str.split("/").str[-1].str.split(".").str[0]
cns["dir"] = cns["video.id"].str.split("/").str[-2]
cns["subdir_video"] = cns["dir"] + "_" + cns["video"]
cns.subdir_video = cns.subdir_video.str.lower()

In [6]:
# merge df and cns on subdir_video
merged_df = cns.merge(df, on="subdir_video", how="left")

**Processing ages**

In [7]:
videos = df.groupby("subdir_video")["age_class"].apply(lambda x: list(x)).index.values
age_groups = (
    df.groupby("subdir_video")["age_class"]
    .apply(
        lambda x: ",".join(list([str(i) for i in x]))
        if len(list(x)) > 1
        else list(x)[0]
    )
    .values
)
sex_groups = (
    df.groupby("subdir_video")["sex"]
    .apply(
        lambda x: ",".join(list([str(i) for i in x]))
        if len(list(x)) > 1
        else list(x)[0]
    )
    .values
)
age_df = pd.DataFrame(
    {"subdir_video": videos, "age_groups": age_groups, "sex_groups": sex_groups}
)
merged_df = merged_df.merge(age_df, on="subdir_video")
merged_df.drop(columns=["age_class", "sex"], inplace=True)

In [8]:
merged_df.columns

Index(['subject_id', 'start.time', 'site', 'video.id', 'tags',
       'classifications', 'behavior', 'video', 'dir', 'subdir_video',
       'new_row_id', 'country', 'research_site', 'genus', 'species',
       'cam_coverage_area', 'location_metadata', 'habitat', 'utm_zone',
       'utm_long', 'utm_lat', 'cam_id', 'vid_res', 'vid_duration',
       'panaf_datasheet_comments', 'year', 'month', 'day', 'time_hr',
       'time_min', 'cell_id', 'subfolder', 'video_file_name', 'event_id',
       'min_number_chimps_per_video', 'max_number_chimps_per_video',
       'tool_use', 'vocalization', 'bipedal', 'camera_reaction',
       'behavioral_context', 'other_species', 'additional_comments',
       'record_type', 'age_groups', 'sex_groups'],
      dtype='object')

In [9]:
all_attr = [
    "subject_id",
    "start.time",
    "country",
    "research_site",
    "site",
    "genus",
    "species",
    "location_metadata",
    "habitat",
    "min",
    "max",
    "day",
    "month",
    "year",
    "time_hr",
    "time_min",
    "tool_use",
    "vocalization",
    "bipedal",
    "camera_reaction",
    "behavioral_context",
    "video.id",
    "tags",
    "classifications",
    "behavior",
]

meta_attr = [
    "subject_id",
    "start.time",
    "country",
    "research_site",
    "site",
    "genus",
    "species",
    "location_metadata",
    "habitat",
    "min",
    "max",
    "age_groups",
    "sex_groups",
    "day",
    "month",
    "year",
    "time_hr",
    "time_min",
]

behaviour_attr = [
    "tool_use",
    "vocalization",
    "bipedal",
    "camera_reaction",
    "behavioral_context",
    "video.id",
    "tags",
    "classifications",
    "behavior",
]

merged_df.rename(
    columns={
        "min_number_chimps_per_video": "min",
        "max_number_chimps_per_video": "max",
    },
    inplace=True,
)
merged_df[meta_attr]

Unnamed: 0,subject_id,start.time,country,research_site,site,genus,species,location_metadata,habitat,min,max,age_groups,sex_groups,day,month,year,time_hr,time_min
0,acp00000ga,0.0,drc,bili,bili,Pan,troglodytes schweinfurthii,trail,"forest - mixed, open understorey",1.0,1.0,adult,male,7.0,9.0,2012.0,13.0,52.0
1,acp00000gd,15.0,drc,bili,bili,Pan,troglodytes schweinfurthii,trail,"forest - mixed, open understorey",1.0,1.0,adult,male,7.0,9.0,2012.0,13.0,52.0
2,acp00000gf,30.0,drc,bili,bili,Pan,troglodytes schweinfurthii,trail,"forest - mixed, open understorey",1.0,1.0,adult,male,7.0,9.0,2012.0,13.0,52.0
3,acp00000gi,45.0,drc,bili,bili,Pan,troglodytes schweinfurthii,trail,"forest - mixed, open understorey",1.0,1.0,adult,male,7.0,9.0,2012.0,13.0,52.0
4,acp00000ep,0.0,drc,bili,bili,Pan,troglodytes schweinfurthii,trail,"forest - mixed, open understorey",2.0,2.0,"adult,infant","female,unclear",24.0,9.0,2012.0,16.0,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33704,acp0004tfa,45.0,cotedivoire,tair,tair,Pan,troglodytes verus,nut cracking site,"forest - mixed, closed understorey",,,,,30.0,1.0,2014.0,9.0,11.0
33705,acp0004tfv,0.0,cotedivoire,tair,tair,Pan,troglodytes verus,nut cracking site,"forest - mixed, closed understorey",1.0,1.0,infant,unclear,30.0,1.0,2014.0,9.0,14.0
33706,acp0004tfw,15.0,cotedivoire,tair,tair,Pan,troglodytes verus,nut cracking site,"forest - mixed, closed understorey",1.0,1.0,infant,unclear,30.0,1.0,2014.0,9.0,14.0
33707,acp0004tfx,30.0,cotedivoire,tair,tair,Pan,troglodytes verus,nut cracking site,"forest - mixed, closed understorey",1.0,1.0,infant,unclear,30.0,1.0,2014.0,9.0,14.0


In [10]:
train_df = pd.read_csv(
    "/home/dl18206/Desktop/phd/data/panaf/chimp_and_see/data/internal/splits/final/train.csv"
)
val_df = pd.read_csv(
    "/home/dl18206/Desktop/phd/data/panaf/chimp_and_see/data/internal/splits/final/val.csv"
)
test_df = pd.read_csv(
    "/home/dl18206/Desktop/phd/data/panaf/chimp_and_see/data/internal/splits/final/test.csv"
)

In [11]:
len(train_df), len(val_df), len(test_df)

(10943, 3143, 1548)

In [12]:
train_df.rename(columns={"video": "subject_id"}, inplace=True)
train_df = train_df.merge(merged_df[meta_attr], on="subject_id", how="left")
train_df.drop_duplicates(inplace=True)
meta_train_df = train_df[train_df.columns[20:]]
meta_train_df["video"] = train_df["subject_id"]


val_df.rename(columns={"video": "subject_id"}, inplace=True)
val_df = val_df.merge(merged_df[meta_attr], on="subject_id", how="left")
val_df.drop_duplicates(inplace=True)
meta_val_df = val_df[val_df.columns[20:]]
meta_val_df["video"] = val_df["subject_id"]

test_df.rename(columns={"video": "subject_id"}, inplace=True)
test_df = test_df.merge(merged_df[meta_attr], on="subject_id", how="left")
test_df.drop_duplicates(inplace=True)
meta_test_df = test_df[test_df.columns[20:]]
meta_test_df["video"] = test_df["subject_id"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_train_df['video'] = train_df['subject_id']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_val_df['video'] = val_df['subject_id']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_test_df['video'] = test_df['subject_id']


In [13]:
# move video column to front
def front_column(df):
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    return df

In [14]:
meta_train_df = front_column(meta_train_df)
meta_val_df = front_column(meta_val_df)
meta_test_df = front_column(meta_test_df)

In [16]:
meta_train_df.to_csv("train_metadata.csv", index=False)
meta_test_df.to_csv("test_metadata.csv", index=False)
meta_val_df.to_csv("val_metadata.csv", index=False)