In [1]:
import os
import json
import shutil
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import random, math

In [2]:
# Load Maureens annotations
csv_files = glob('data/sites/csv/**/*.csv', recursive=True)
sorted_csv_files = sorted(csv_files, key=lambda x: x.split('/')[-1])
initialiser, remainder = sorted_csv_files[0], sorted_csv_files[1:]
df = pd.read_csv(initialiser, encoding="ISO-8859-1")
for file in remainder:
    site = pd.read_csv(file, encoding="ISO-8859-1")
    df = pd.concat([df, site])
df['subdir_video'] = df.subfolder.astype(str) + '_' + df.video_file_name.astype(str)

# Make df single apes only - is this correct with regards to accuracy?
df = df[df.max_number_chimps_per_video==1]
df.subdir_video = df.subdir_video.apply(lambda x: x.lower())
df.subdir_video = df.subdir_video.str.split('.').str[0]
df['prepend_zero'] = df.subdir_video.apply(lambda x: f'{"_".join(x.split("_")[:-1])}_{"0" + x.split("_")[-1]}')

In [3]:
# Load filepaths from Jade2
jf = pd.read_csv('data/jade2/chimp_videos.csv', index_col=False)
jf.drop(columns=['Unnamed: 0'], inplace=True)
jf.files = jf.files.apply(lambda x: x.split('.')[0].lower())

**Matching video-annotations**

In [4]:
matching1 = df[df.subdir_video.isin(jf.files.values)]
matching1

Unnamed: 0,new_row_id,country,research_site,genus,species,cam_coverage_area,location_metadata,habitat,utm_zone,utm_long,...,tool_use,vocalization,bipedal,camera_reaction,behavioral_context,other_species,additional_comments,record_type,subdir_video,prepend_zero
0,1,mali,bafing,Pan,troglodytes verus,9.87,trail,forest on rock,29n,342661.0,...,no,no,no,no,travel,,Not very clear screen,60s_video,baf_vid1_0342661_1432807_20141103_11160014,baf_vid1_0342661_1432807_20141103_011160014
5,6,mali,bafing,Pan,troglodytes verus,14.00,feeding site/fruit tree,gallery forest,29n,346467.0,...,no,no,no,yes,feeding,,"Near the camera, with a fruit in his mouth",60s_video,baf_vid10_0346467_1436892_20151112_11280025,baf_vid10_0346467_1436892_20151112_011280025
38,39,mali,bafing,Pan,troglodytes verus,15.12,trail,rocks,29n,343048.0,...,no,no,yes,no,unclear,,"Penis erect, seems to be just moments befora a...",60s_video,baf_vid13_0343048_1434623_20141208_12170200,baf_vid13_0343048_1434623_20141208_012170200
40,41,mali,bafing,Pan,troglodytes verus,12.30,feeding site/nesting site,forest on rock,29n,343242.0,...,no,no,no,no,travel,,,60s_video,baf_vid13_0343242_1434854_20150910_10070021,baf_vid13_0343242_1434854_20150910_010070021
41,42,mali,bafing,Pan,troglodytes verus,12.30,feeding site/nesting site,forest on rock,29n,343242.0,...,no,no,no,no,displaying,,"He makes a display, but does not make any voca...",60s_video,baf_vid13_0343242_1434854_20150910_10080025,baf_vid13_0343242_1434854_20150910_010080025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1236,1237,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,7.76,termite site / clearing,savannah - wooded,36s,231665.0,...,no,no,no,no,resting,,same female as last video,60s_video,uga_vid20_231665_9391062_20141027_pict0086,uga_vid20_231665_9391062_20141027_0pict0086
1273,1274,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,gallery forest,36s,231634.0,...,no,no,no,yes,unclear,,presumed female because no penis visible when ...,60s_video,uga_vidba4_231634_9390066_20140820_ek000014,uga_vidba4_231634_9390066_20140820_0ek000014
1319,1320,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,gallery forest,36s,231634.0,...,no,no,no,no,resting,,,60s_video,uga_vidba4_231634_9390066_20141027_ek000029,uga_vidba4_231634_9390066_20141027_0ek000029
1322,1323,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,gallery forest,36s,231634.0,...,no,no,no,no,unclear,,not clearly visible; black and white video in ...,60s_video,uga_vidba4_231634_9390066_20141027_ek000050,uga_vidba4_231634_9390066_20141027_0ek000050


**Dealing with missing videos**

In [5]:
non_matching = df[~df.subdir_video.isin(jf.files.values)]

In [6]:
# Handling missing zeros
matching2 = df[df.prepend_zero.isin(jf.files.values)]
matching2.drop(columns=['subdir_video'], inplace=True)
matching2.rename(columns={"prepend_zero": "subdir_video"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching2.drop(columns=['subdir_video'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching2.rename(columns={"prepend_zero": "subdir_video"}, inplace=True)


**Concatentate all matched DataFrames**

In [7]:
single_ape = pd.concat([matching1, matching2])
single_ape.drop(columns=['prepend_zero'], inplace=True)
single_ape[single_ape.subdir_video.isin(jf.files)]

Unnamed: 0,new_row_id,country,research_site,genus,species,cam_coverage_area,location_metadata,habitat,utm_zone,utm_long,...,sex,tool_use,vocalization,bipedal,camera_reaction,behavioral_context,other_species,additional_comments,record_type,subdir_video
0,1,mali,bafing,Pan,troglodytes verus,9.87,trail,forest on rock,29n,342661.0,...,male,no,no,no,no,travel,,Not very clear screen,60s_video,baf_vid1_0342661_1432807_20141103_11160014
5,6,mali,bafing,Pan,troglodytes verus,14.00,feeding site/fruit tree,gallery forest,29n,346467.0,...,male,no,no,no,yes,feeding,,"Near the camera, with a fruit in his mouth",60s_video,baf_vid10_0346467_1436892_20151112_11280025
38,39,mali,bafing,Pan,troglodytes verus,15.12,trail,rocks,29n,343048.0,...,male,no,no,yes,no,unclear,,"Penis erect, seems to be just moments befora a...",60s_video,baf_vid13_0343048_1434623_20141208_12170200
40,41,mali,bafing,Pan,troglodytes verus,12.30,feeding site/nesting site,forest on rock,29n,343242.0,...,unclear,no,no,no,no,travel,,,60s_video,baf_vid13_0343242_1434854_20150910_10070021
41,42,mali,bafing,Pan,troglodytes verus,12.30,feeding site/nesting site,forest on rock,29n,343242.0,...,male,no,no,no,no,displaying,,"He makes a display, but does not make any voca...",60s_video,baf_vid13_0343242_1434854_20150910_10080025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,895,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,male,no,no,no,no,travel,,,non_60s_video,sob_vid05_0255619_1305651_20160301_03090137
895,896,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,male,no,no,no,no,travel,,,60s_video,sob_vid05_0255619_1305651_20160301_03100138
903,904,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,male,no,no,no,no,travel,,,60s_video,sob_vid05_0255619_1305651_20160301_03120144
950,951,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,male,no,no,no,no,travel,,,60s_video,sob_vid05_0255619_1305651_20160319_03220016


**Processing data and preparing annotation file**

In [8]:
# Remove behaviours which only have 1 or 2 examples
single_ape = single_ape[(single_ape.behavioral_context!='aggression')&(single_ape.behavioral_context!='playing')&(single_ape.behavioral_context!='grooming')&(single_ape.behavioral_context!='greeting')&(single_ape.behavioral_context!='displaying ')&(single_ape.behavioral_context!='no')]

In [9]:
# Cast behavioral context column to category + index behaviors
single_ape.behavioral_context = single_ape.behavioral_context.astype('category')
single_ape['label_index'] = single_ape.behavioral_context.cat.codes

# Create subdir columns to reconstruct paths online
single_ape['research_site_code'] = single_ape.subdir_video.str.split('_').str[0].str.lower()

# Create full path - should be prefix + full_path.mp4
single_ape['full_path'] = single_ape.country + str('/') +  single_ape.research_site_code + str('/') +  single_ape.subdir_video + str('.mp4')

In [None]:
single_ape.behavioral_context.cat.codes.max()

In [25]:
x = dict(single_ape[['behavioral_context', 'label_index']].value_counts())

In [28]:
new_dict = {}
for k,v in x.items():
    new_dict[k[0]] = k[1]

In [11]:
# Samples per behaviour
single_ape.behavioral_context.value_counts()

travel                     2608
tool use termites          1226
feeding                     746
resting                     502
unclear                     414
tool use                    401
tool use unknown            370
climbing                    281
camera reaction             234
displaying                   81
tool use nuts                77
tool use stone throwing      54
tool use ants                48
tool use algae               26
Name: behavioral_context, dtype: int64

In [12]:
# Proportion of samples per behaviour
single_ape.behavioral_context.value_counts() / single_ape.behavioral_context.value_counts().sum() * 100

travel                     36.898698
tool use termites          17.345784
feeding                    10.554612
resting                     7.102434
unclear                     5.857385
tool use                    5.673458
tool use unknown            5.234861
climbing                    3.975665
camera reaction             3.310696
displaying                  1.146010
tool use nuts               1.089417
tool use stone throwing     0.764007
tool use ants               0.679117
tool use algae              0.367855
Name: behavioral_context, dtype: float64

**JADE: Generating train test splits**

In [13]:
from sklearn.model_selection import train_test_split
single_ape['remote_prefix'] = '/jmain02/home/J2AD001/wwp02/oxb63-wwp02/data/chimp_videos'
single_ape['full_remote_path'] = single_ape['remote_prefix'].astype(str) + str('/') + single_ape['full_path'].astype(str)
train, test = train_test_split(single_ape, test_size=0.35, random_state=42)

In [14]:
train.behavioral_context.value_counts()

travel                     1695
tool use termites           798
feeding                     486
resting                     336
tool use                    262
unclear                     258
tool use unknown            250
climbing                    170
camera reaction             148
displaying                   57
tool use nuts                51
tool use stone throwing      34
tool use ants                28
tool use algae               21
Name: behavioral_context, dtype: int64

In [15]:
test.behavioral_context.value_counts()

travel                     913
tool use termites          428
feeding                    260
resting                    166
unclear                    156
tool use                   139
tool use unknown           120
climbing                   111
camera reaction             86
tool use nuts               26
displaying                  24
tool use ants               20
tool use stone throwing     20
tool use algae               5
Name: behavioral_context, dtype: int64

**CSD3**

In [13]:
from sklearn.model_selection import train_test_split
single_ape['remote_prefix'] = '/rds/project/rds-oqQ0697DY5c/panaf/full/data'
single_ape['full_remote_path'] = single_ape['remote_prefix'].astype(str) + str('/') + single_ape['full_path'].astype(str)
train, test = train_test_split(single_ape, test_size=0.35, random_state=42)

**Writing to file**

In [16]:
# Further processing for Kinetics-style dataset/loader
kinetics_train = train[['full_remote_path', 'label_index']]
kinetics_test = test[['full_remote_path', 'label_index']]

In [17]:
kinetics_train.to_csv('kinetics_train.csv', sep=' ', header=False, index=False)
kinetics_test.to_csv('kinetics_test.csv', sep=' ', header=False, index=False)

In [None]:
train.to_csv('foo.csv', index=False)
test.to_csv('bar.csv', index=False)

**Initial results**

In [28]:
# Epoch 200
{"top1_acc": 74.88045, "top5_acc": 95.37642}

{'top1_acc': 74.88045, 'top5_acc': 95.37642}

**Add dummy classifier**

In [23]:
import numpy as np
from sklearn.dummy import DummyClassifier

# Add dummy data column to train + test
kinetics_train['dummy_data'] = int(0)
kinetics_test['dummy_data'] = int(0)

X_train = kinetics_train.dummy_data.values
y_train = kinetics_train.label_index.values

X_test = kinetics_test.dummy_data.values
y_test = kinetics_test.label_index.values

for strategy in ['most_frequent', 'prior', 'stratified', 'uniform', 'constant']:
    # Fit dummy clf
    if strategy == 'constant':
        dummy_clf = DummyClassifier(strategy=strategy, constant=12)
    else:
        dummy_clf = DummyClassifier(strategy=strategy)
    dummy_clf.fit(X_train, y_train)
    # Make preds
    dummy_clf.predict(X_test)
    print(f"Score for {strategy} strategy is: {dummy_clf.score(X_test, y_test)}")

Score for most_frequent strategy is: 0.3690379951495554
Score for prior strategy is: 0.3690379951495554
Score for stratified strategy is: 0.19684721099434116
Score for uniform strategy is: 0.06426839126919967
Score for constant strategy is: 0.3690379951495554


**DataFrame with videos not stored locally**

In [None]:
remaining = df[~(df.subdir_video.isin(jf.files)) & ~(df.prepend_zero.isin(jf.files))]