In [1]:
import os
import json
import shutil
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import random, math

In [2]:
# Load Maureens annotations
csv_files = glob('data/sites/csv/**/*.csv', recursive=True)
sorted_csv_files = sorted(csv_files, key=lambda x: x.split('/')[-1])
initialiser, remainder = sorted_csv_files[0], sorted_csv_files[1:]
df = pd.read_csv(initialiser, encoding="ISO-8859-1")
for file in remainder:
    site = pd.read_csv(file, encoding="ISO-8859-1")
    df = pd.concat([df, site])
df['subdir_video'] = df.subfolder.astype(str) + '_' + df.video_file_name.astype(str)

df.subdir_video = df.subdir_video.apply(lambda x: x.lower())
df.subdir_video = df.subdir_video.str.split('.').str[0]
df['prepend_zero'] = df.subdir_video.apply(lambda x: f'{"_".join(x.split("_")[:-1])}_{"0" + x.split("_")[-1]}')

In [3]:
# Load filepaths from Jade2
jf = pd.read_csv('data/jade2/chimp_videos.csv', index_col=False)
jf.drop(columns=['Unnamed: 0'], inplace=True)
jf.files = jf.files.apply(lambda x: x.split('.')[0].lower())

In [4]:
df = df[df['behavioral_context'].notna()]

**Matching video-annotations**

In [5]:
matching1 = df[df.subdir_video.isin(jf.files.values)]
matching1

Unnamed: 0,new_row_id,country,research_site,genus,species,cam_coverage_area,location_metadata,habitat,utm_zone,utm_long,...,tool_use,vocalization,bipedal,camera_reaction,behavioral_context,other_species,additional_comments,record_type,subdir_video,prepend_zero
0,1,mali,bafing,Pan,troglodytes verus,9.87,trail,forest on rock,29n,342661.0,...,no,no,no,no,travel,,Not very clear screen,60s_video,baf_vid1_0342661_1432807_20141103_11160014,baf_vid1_0342661_1432807_20141103_011160014
5,6,mali,bafing,Pan,troglodytes verus,14.00,feeding site/fruit tree,gallery forest,29n,346467.0,...,no,no,no,yes,feeding,,"Near the camera, with a fruit in his mouth",60s_video,baf_vid10_0346467_1436892_20151112_11280025,baf_vid10_0346467_1436892_20151112_011280025
10,11,mali,bafing,Pan,troglodytes verus,7.00,water source,forest on rock,29n,343264.0,...,no,no,no,no,resting,,Carries an infant ventrally,60s_video,baf_vid11_0343264_1434832_20151019_12100066,baf_vid11_0343264_1434832_20151019_012100066
11,12,mali,bafing,Pan,troglodytes verus,7.00,water source,forest on rock,29n,343264.0,...,no,no,no,no,resting,,Rides ventrally on the above chimp,60s_video,baf_vid11_0343264_1434832_20151019_12100066,baf_vid11_0343264_1434832_20151019_012100066
12,13,mali,bafing,Pan,troglodytes verus,7.00,water source,forest on rock,29n,343264.0,...,no,yes,no,yes,resting,,At the top right side of the screen. Seems to ...,60s_video,baf_vid11_0343264_1434832_20151019_12100066,baf_vid11_0343264_1434832_20151019_012100066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1327,1328,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,gallery forest,36s,231634.0,...,no,offscreen,no,no,travel,,walks past in last second of video; not fully ...,60s_video,uga_vidba4_231634_9390066_20141027_ek000060,uga_vidba4_231634_9390066_20141027_0ek000060
1328,1329,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,gallery forest,36s,231634.0,...,no,no,no,yes,travel,,only partially visible,60s_video,uga_vidba4_231634_9390066_20141027_ek000069,uga_vidba4_231634_9390066_20141027_0ek000069
1329,1330,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,gallery forest,36s,231634.0,...,no,no,no,no,travel,,only partially visible,60s_video,uga_vidba4_231634_9390066_20141027_ek000069,uga_vidba4_231634_9390066_20141027_0ek000069
1330,1331,tanzania,ugalla_issa,Pan,troglodytes schweinfurthii,,trail,gallery forest,36s,231634.0,...,no,no,no,no,travel,,,60s_video,uga_vidba4_231634_9390066_20141027_ek000069,uga_vidba4_231634_9390066_20141027_0ek000069


**Dealing with missing videos**

In [6]:
non_matching = df[~df.subdir_video.isin(jf.files.values)]

In [7]:
# Handling missing zeros
matching2 = df[df.prepend_zero.isin(jf.files.values)]
matching2.drop(columns=['subdir_video'], inplace=True)
matching2.rename(columns={"prepend_zero": "subdir_video"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching2.drop(columns=['subdir_video'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching2.rename(columns={"prepend_zero": "subdir_video"}, inplace=True)


**Concatentate all matched DataFrames**

In [8]:
all_apes = pd.concat([matching1, matching2])
all_apes.drop(columns=['prepend_zero'], inplace=True)
all_apes[all_apes.subdir_video.isin(jf.files)]

Unnamed: 0,new_row_id,country,research_site,genus,species,cam_coverage_area,location_metadata,habitat,utm_zone,utm_long,...,sex,tool_use,vocalization,bipedal,camera_reaction,behavioral_context,other_species,additional_comments,record_type,subdir_video
0,1,mali,bafing,Pan,troglodytes verus,9.87,trail,forest on rock,29n,342661.0,...,male,no,no,no,no,travel,,Not very clear screen,60s_video,baf_vid1_0342661_1432807_20141103_11160014
5,6,mali,bafing,Pan,troglodytes verus,14.00,feeding site/fruit tree,gallery forest,29n,346467.0,...,male,no,no,no,yes,feeding,,"Near the camera, with a fruit in his mouth",60s_video,baf_vid10_0346467_1436892_20151112_11280025
10,11,mali,bafing,Pan,troglodytes verus,7.00,water source,forest on rock,29n,343264.0,...,female,no,no,no,no,resting,,Carries an infant ventrally,60s_video,baf_vid11_0343264_1434832_20151019_12100066
11,12,mali,bafing,Pan,troglodytes verus,7.00,water source,forest on rock,29n,343264.0,...,unclear,no,no,no,no,resting,,Rides ventrally on the above chimp,60s_video,baf_vid11_0343264_1434832_20151019_12100066
12,13,mali,bafing,Pan,troglodytes verus,7.00,water source,forest on rock,29n,343264.0,...,female,no,yes,no,yes,resting,,At the top right side of the screen. Seems to ...,60s_video,baf_vid11_0343264_1434832_20151019_12100066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,964,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,male,no,no,no,no,travel,,,60s_video,sob_vid05_0255619_1305651_20160319_03220020
964,965,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,unclear,no,no,no,yes,travel,,,60s_video,sob_vid05_0255619_1305651_20160319_03220020
965,966,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,male,no,no,no,yes,travel,,,60s_video,sob_vid05_0255619_1305651_20160319_03220020
966,967,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,male,no,no,no,yes,travel,,,60s_video,sob_vid05_0255619_1305651_20160319_03220020


In [9]:
all_apes = all_apes.drop_duplicates(subset=['subdir_video'])

In [10]:
all_apes.behavioral_context.value_counts()

travel                     6686
tool use termites          2411
feeding                    1972
resting                    1148
unclear                    1070
tool use                    996
climbing                    656
camera reaction             518
tool use unknown            461
displaying                  213
tool use nuts               169
tool use algae              140
playing                     136
grooming                    125
tool use stone throwing      87
tool use ants                69
aggression                   47
sexual                       28
greeting                     21
reassurance                  11
displaying                    1
no                            1
tool use honey                1
Name: behavioral_context, dtype: int64

**Processing data and preparing annotation file**

In [11]:
# Remove behaviours which only have 1 or 2 examples
all_apes = all_apes[(all_apes.behavioral_context!='displaying ')&(all_apes.behavioral_context!='no')&(all_apes.behavioral_context!='tool use honey')]

In [12]:
# Cast behavioral context column to category + index behaviors
all_apes.behavioral_context = all_apes.behavioral_context.astype('category')
all_apes['label_index'] = all_apes.behavioral_context.cat.codes

# Create subdir columns to reconstruct paths online
all_apes['research_site_code'] = all_apes.subdir_video.str.split('_').str[0].str.lower()

# Create full path - should be prefix + full_path.mp4
all_apes['full_path'] = all_apes.country + str('/') +  all_apes.research_site_code + str('/') +  all_apes.subdir_video + str('.mp4')

In [13]:
all_apes.behavioral_context.cat.codes.max()

19

In [14]:
all_apes

Unnamed: 0,new_row_id,country,research_site,genus,species,cam_coverage_area,location_metadata,habitat,utm_zone,utm_long,...,bipedal,camera_reaction,behavioral_context,other_species,additional_comments,record_type,subdir_video,label_index,research_site_code,full_path
0,1,mali,bafing,Pan,troglodytes verus,9.87,trail,forest on rock,29n,342661.0,...,no,no,travel,,Not very clear screen,60s_video,baf_vid1_0342661_1432807_20141103_11160014,18,baf,mali/baf/baf_vid1_0342661_1432807_20141103_111...
5,6,mali,bafing,Pan,troglodytes verus,14.00,feeding site/fruit tree,gallery forest,29n,346467.0,...,no,yes,feeding,,"Near the camera, with a fruit in his mouth",60s_video,baf_vid10_0346467_1436892_20151112_11280025,4,baf,mali/baf/baf_vid10_0346467_1436892_20151112_11...
10,11,mali,bafing,Pan,troglodytes verus,7.00,water source,forest on rock,29n,343264.0,...,no,no,resting,,Carries an infant ventrally,60s_video,baf_vid11_0343264_1434832_20151019_12100066,9,baf,mali/baf/baf_vid11_0343264_1434832_20151019_12...
25,26,mali,bafing,Pan,troglodytes verus,12.25,nesting site,forest - bamboo,29n,345234.0,...,no,yes,travel,,"First secs, looking the camera, disappears",60s_video,baf_vid12_0345234_1438195_20151123_11280008,18,baf,mali/baf/baf_vid12_0345234_1438195_20151123_11...
38,39,mali,bafing,Pan,troglodytes verus,15.12,trail,rocks,29n,343048.0,...,yes,no,unclear,,"Penis erect, seems to be just moments befora a...",60s_video,baf_vid13_0343048_1434623_20141208_12170200,19,baf,mali/baf/baf_vid13_0343048_1434623_20141208_12...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,949,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,no,no,travel,,,60s_video,sob_vid05_0255619_1305651_20160319_03210013,18,sob,guinea/sob/sob_vid05_0255619_1305651_20160319_...
950,951,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,no,no,travel,,,60s_video,sob_vid05_0255619_1305651_20160319_03220016,18,sob,guinea/sob/sob_vid05_0255619_1305651_20160319_...
951,952,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,no,no,travel,,,60s_video,sob_vid05_0255619_1305651_20160319_03220018,18,sob,guinea/sob/sob_vid05_0255619_1305651_20160319_...
952,953,guinea,sobory,Pan,troglodytes verus,80.00,trail,gallery forest,29n,255619.0,...,no,no,travel,,,60s_video,sob_vid05_0255619_1305651_20160319_03220019,18,sob,guinea/sob/sob_vid05_0255619_1305651_20160319_...


In [16]:
new_dict = {}
for k, v in dict(all_apes[['behavioral_context', 'label_index']].value_counts()).items():
    new_dict[k[0]] = k[1]

In [18]:
import json
print(json.dumps(new_dict))

{"travel": 18, "tool use termites": 16, "feeding": 4, "resting": 9, "unclear": 19, "tool use": 11, "climbing": 2, "camera reaction": 1, "tool use unknown": 17, "displaying": 3, "tool use nuts": 14, "tool use algae": 12, "playing": 7, "grooming": 6, "tool use stone throwing": 15, "tool use ants": 13, "aggression": 0, "sexual": 10, "greeting": 5, "reassurance": 8}


In [None]:
{'travel': 18,
 'tool use termites': 16,
 'feeding': 4,
 'resting': 9,
 'unclear': 19,
 'tool use': 11,
 'climbing': 2,
 'camera reaction': 1,
 'tool use unknown': 17,
 'displaying': 3,
 'tool use nuts': 14,
 'tool use algae': 12,
 'playing': 7,
 'grooming': 6,
 'tool use stone throwing': 15,
 'tool use ants': 13,
 'aggression': 0,
 'sexual': 10,
 'greeting': 5,
 'reassurance': 8}

In [83]:
# Proportion of samples per behaviour
all_apes.behavioral_context.value_counts() / all_apes.behavioral_context.value_counts().sum() * 100

travel                     39.412874
tool use termites          14.212450
feeding                    11.624617
resting                     6.767272
unclear                     6.307475
tool use                    5.871257
climbing                    3.867012
camera reaction             3.053525
tool use unknown            2.717519
displaying                  1.255600
tool use nuts               0.996227
tool use algae              0.825277
playing                     0.801698
grooming                    0.736855
tool use stone throwing     0.512851
tool use ants               0.406744
aggression                  0.277057
sexual                      0.165055
greeting                    0.123792
reassurance                 0.064843
Name: behavioral_context, dtype: float64

**CSD3: Generating train test splits**

In [84]:
from sklearn.model_selection import train_test_split
all_apes['remote_prefix'] = '/rds/project/rds-oqQ0697DY5c/panaf/full/data'
all_apes['full_remote_path'] = all_apes['remote_prefix'].astype(str) + str('/') + all_apes['full_path'].astype(str)
train, test = train_test_split(all_apes, test_size=0.35, random_state=42)

In [85]:
train.behavioral_context.value_counts()

travel                     4330
tool use termites          1575
feeding                    1293
resting                     759
unclear                     721
tool use                    647
climbing                    402
camera reaction             330
tool use unknown            290
displaying                  139
tool use nuts               116
grooming                     91
tool use algae               87
playing                      85
tool use stone throwing      51
tool use ants                44
aggression                   24
sexual                       20
greeting                     13
reassurance                   9
Name: behavioral_context, dtype: int64

In [87]:
test.behavioral_context.value_counts()

travel                     2356
tool use termites           836
feeding                     679
resting                     389
unclear                     349
tool use                    349
climbing                    254
camera reaction             188
tool use unknown            171
displaying                   74
tool use algae               53
tool use nuts                53
playing                      51
tool use stone throwing      36
grooming                     34
tool use ants                25
aggression                   23
greeting                      8
sexual                        8
reassurance                   2
Name: behavioral_context, dtype: int64

**Writing to file**

In [88]:
# Further processing for Kinetics-style dataset/loader
kinetics_train = train[['full_remote_path', 'label_index']]
kinetics_test = test[['full_remote_path', 'label_index']]

In [90]:
kinetics_train.to_csv('kinetics_full-ape_train.csv', sep=' ', header=False, index=False)
kinetics_test.to_csv('kinetics_full-ape_test.csv', sep=' ', header=False, index=False)

**Full SSL**

In [49]:
ssl_all_apes = all_apes[['full_remote_path', 'label_index']]
ssl_all_apes['label_index'] = 9

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ssl_all_apes['label_index'] = 9


In [51]:
ssl_all_apes.to_csv('kinetics_full-ape-ssl_train.csv', sep=' ', header=False, index=False)

**Initial results**

In [28]:
# Epoch 200
{"top1_acc": 74.88045, "top5_acc": 95.37642}

{'top1_acc': 74.88045, 'top5_acc': 95.37642}

**Add dummy classifier**

In [23]:
import numpy as np
from sklearn.dummy import DummyClassifier

# Add dummy data column to train + test
kinetics_train['dummy_data'] = int(0)
kinetics_test['dummy_data'] = int(0)

X_train = kinetics_train.dummy_data.values
y_train = kinetics_train.label_index.values

X_test = kinetics_test.dummy_data.values
y_test = kinetics_test.label_index.values

for strategy in ['most_frequent', 'prior', 'stratified', 'uniform', 'constant']:
    # Fit dummy clf
    if strategy == 'constant':
        dummy_clf = DummyClassifier(strategy=strategy, constant=12)
    else:
        dummy_clf = DummyClassifier(strategy=strategy)
    dummy_clf.fit(X_train, y_train)
    # Make preds
    dummy_clf.predict(X_test)
    print(f"Score for {strategy} strategy is: {dummy_clf.score(X_test, y_test)}")

Score for most_frequent strategy is: 0.3690379951495554
Score for prior strategy is: 0.3690379951495554
Score for stratified strategy is: 0.19684721099434116
Score for uniform strategy is: 0.06426839126919967
Score for constant strategy is: 0.3690379951495554


**DataFrame with videos not stored locally**

In [None]:
remaining = df[~(df.subdir_video.isin(jf.files)) & ~(df.prepend_zero.isin(jf.files))]