<img align="left" src="https://panoptes-uploads.zooniverse.org/project_avatar/86c23ca7-bbaa-4e84-8d8a-876819551431.png" type="image/png" height=100 width=100>
</img>


<h1 align="right">KSO Tutorials #15: Process Spyfish Aotearoa videos</h1>
<h3 align="right">Written by @jannesgg and @vykanton</h3>
<h5 align="right">Last updated: Sep 15th, 2021</h5>

# Set up and requirements

Import Python packages

In [1]:
# Set the directory of the libraries
import sys
sys.path.append('..')

# Import required modules
import utils.t15_utils as t15
import utils.t12_utils as t12

print("Packages loaded successfully")

import os
import pandas as pd
import sqlite3
import numpy as np
from tqdm import tqdm
import subprocess
import ipywidgets as widgets
from ipywidgets import interact, Layout

import utils.db_utils as db_utils
import utils.server_utils as server_utils
import utils.spyfish_utils as spyfish_utils

Broken libmagic installation detected. The python-magic module is installed but can't be imported. Please check that both python-magic and the libmagic shared library are installed correctly. Uploading media other than images may not work.


Packages loaded successfully


# Get list of surveys from S3

In [2]:
aws_access_key_id, aws_secret_access_key = server_utils.aws_credentials()

Enter the key id for the aws server········
Enter the secret access key for the aws server········


In [3]:
client = server_utils.connect_s3(aws_access_key_id, aws_secret_access_key)

In [4]:
bucket_i = "marine-buv"

In [5]:
# Retrieve info from the bucket
contents_s3_pd = server_utils.retrieve_s3_buckets_info(client, bucket_i)

# Specify the filename of the objects (videos)        
contents_s3_pd['raw_filename'] = contents_s3_pd['Key'].str.split('/').str[-1]

# Specify the prefix (directory) of the objects        
contents_s3_pd['prefix'] = contents_s3_pd['Key'].str.rsplit('/',1).str[0]

In [6]:
# Select only files within the buv-zooniverse-uploads bucket
zoo_contents_s3_pd = contents_s3_pd[contents_s3_pd['prefix'].str.contains("buv-zooniverse-uploads")].reset_index(drop = True)

# Specify the formats of the movies to select
movie_formats = tuple(['wmv', 'mpg', 'mov', 'avi', 'mp4', 'MOV', 'MP4'])

# Select only files of interest (movies)
zoo_contents_s3_pd_movies = zoo_contents_s3_pd[zoo_contents_s3_pd['raw_filename'].str.endswith(movie_formats)]

# Get list of surveys from movies csv

In [11]:
# Get the location of the csv files with initial info to populate the db
sites_csv, movies_csv, species_csv = server_utils.get_sites_movies_species()

# Load the csv with movies information
movies_df = pd.read_csv(movies_csv) 

# Combine the entire movies and go_pro_files
movies_df['files'] = np.where(movies_df['go_pro_files'].isna(), movies_df['filename'], movies_df['filename'] + ";" + movies_df['go_pro_files'])

# Transform the df to have a row per filename
movies_df = movies_df.assign(raw_filename=movies_df['files'].str.split(';')).explode('raw_filename')

# Specify the key (path in S3 of the object)
movies_df["Key"] = movies_df["prefix"] + "/" + movies_df["raw_filename"]

# Find surveys missing from s3

In [8]:
# Missing info for files in the "buv-zooniverse-uploads"
missing_info = zoo_contents_s3_pd_movies.merge(movies_df, 
                                        on=['Key'], 
                                        how='outer', 
                                        indicator=True)

#missing_info[missing_info["_merge"]=="both"]
#missing_info

In [9]:
# Find out about those files missing from the S3
missing_from_s3 = missing_info[missing_info["_merge"]=="right_only"]
missing_bad_deployment = missing_from_s3[missing_from_s3["IsBadDeployment"]]
missing_no_bucket_info = missing_from_s3[~(missing_from_s3["IsBadDeployment"])&(missing_from_s3["bucket"].isna())]

In [10]:
print("There are", len(missing_from_s3.index), "movies missing from the S3")
print(len(missing_bad_deployment.index), "movies are bad deployments. Their filenames are:")
print(*missing_bad_deployment.filename.unique(), sep = "\n")
print(len(missing_no_bucket_info.index), "movies are good deployments but don't have bucket info. Their filenames are:")
print(*missing_no_bucket_info.filename.unique(), sep = "\n")

There are 44 movies missing from the S3
12 movies are bad deployments. Their filenames are:
bad_deployment_TUH_017_NA
bad_deployment_TUH_027_NA
bad_deployment_TUH_038_NA
bad_deployment_TUH_001_44096
bad_deployment_TUH_009_44096
bad_deployment_TUH_013_44096
bad_deployment_TUH_028_44096
bad_deployment_TUH_033_44096
bad_deployment_TUH_010_44074
bad_deployment_TUH_022_44074
bad_deployment_TUH_020_44075
bad_deployment_TUH_024_NA
32 movies are good deployments but don't have bucket info. Their filenames are:
Te Oneroa BayKAP01_2021.MP4
Hole in the wall bayKAP16_2021.MP4
Onepoto pointKAP20_2020.MP4
Rangatira pointKAP23_2021.MP4
Green canKAP24_2021.MP4
EE11_2011.MP4
EE13_2011.MP4
EE14_2011.MP4
EE16_2011.MP4
WP59.MP4
WP45.MP4
WP58.MP4
DD2_2011.MP4
DD12_2011.MP4
DD5_2011.MP4
DD6_2011.MP4
dummy_21


### Find surveys missing from csv

In [13]:
missing_from_csv = missing_info[missing_info["_merge"]=="left_only"].reset_index(drop=True)

In [14]:
print("There are", len(missing_from_csv.index), "movies missing from the csv")

There are 99 movies missing from the csv


In [15]:
# Combine the location of the file and the filename
missing_from_csv["location_and_filename"] = "S3_prefix: " + missing_from_csv['prefix_x'].str.rsplit('/',1).str[1] + " | Filename: " + missing_from_csv['raw_filename_x']

In [16]:
missing_from_csv

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass,raw_filename_x,prefix_x,movie_id,filename,siteName,...,survey_start,survey_end,go_pro_files,bucket,prefix_y,IsBadDeployment,files,raw_filename_y,_merge,location_and_filename
0,buv-zooniverse-uploads/tapuae-buv-2011/CON14_2...,2021-09-17 05:17:15+00:00,"""c6b6bca4dab4ea799fb7e947c403df6f""",2.298003e+09,STANDARD,CON14_2012.mpg,buv-zooniverse-uploads/tapuae-buv-2011,,,,...,,,,,,,,,left_only,S3_prefix: tapuae-buv-2011 | Filename: CON14_2...
1,buv-zooniverse-uploads/tapuae-buv-2011/CON14_s...,2021-09-17 05:38:22+00:00,"""54e26fc5dea638e276d0680111040553""",1.187840e+07,STANDARD,CON14_sitedata_2012.mpg,buv-zooniverse-uploads/tapuae-buv-2011,,,,...,,,,,,,,,left_only,S3_prefix: tapuae-buv-2011 | Filename: CON14_s...
2,buv-zooniverse-uploads/tapuae-buv-2011/CON15_2...,2021-09-17 05:57:54+00:00,"""0007b589c8b7839b3c90ebad3f67bf9c""",2.720862e+09,STANDARD,CON15_2012.mpg,buv-zooniverse-uploads/tapuae-buv-2011,,,,...,,,,,,,,,left_only,S3_prefix: tapuae-buv-2011 | Filename: CON15_2...
3,buv-zooniverse-uploads/tapuae-buv-2011/CON16_8...,2021-09-17 05:55:00+00:00,"""063404cdfe0e14506c28887235debff0""",2.360805e+09,STANDARD,CON16_8.8m_23Apr2014_1356hrs.mpg,buv-zooniverse-uploads/tapuae-buv-2011,,,,...,,,,,,,,,left_only,S3_prefix: tapuae-buv-2011 | Filename: CON16_8...
4,buv-zooniverse-uploads/tapuae-buv-2011/CON17_2...,2021-09-17 06:04:05+00:00,"""3e9e126821433d7f1c6331e40ba4250a""",2.615038e+09,STANDARD,CON17_2012.mpg,buv-zooniverse-uploads/tapuae-buv-2011,,,,...,,,,,,,,,left_only,S3_prefix: tapuae-buv-2011 | Filename: CON17_2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,buv-zooniverse-uploads/tuhua-buv-2021/WP10/GH0...,2021-09-19 22:38:29+00:00,"""6bc59fd61eb15e7615fc896bba8b8343""",4.005076e+09,STANDARD,GH010030.MP4,buv-zooniverse-uploads/tuhua-buv-2021/WP10,,,,...,,,,,,,,,left_only,S3_prefix: WP10 | Filename: GH010030.MP4
95,buv-zooniverse-uploads/tuhua-buv-2021/WP10/GH0...,2021-09-19 22:38:59+00:00,"""644c2b6fadd8e7df62a1adee9e2499ca""",4.004767e+09,STANDARD,GH020030.MP4,buv-zooniverse-uploads/tuhua-buv-2021/WP10,,,,...,,,,,,,,,left_only,S3_prefix: WP10 | Filename: GH020030.MP4
96,buv-zooniverse-uploads/tuhua-buv-2021/WP10/GH0...,2021-09-19 22:39:12+00:00,"""804cdafb960c60e4df82db9324c46e57""",4.004563e+09,STANDARD,GH030030.MP4,buv-zooniverse-uploads/tuhua-buv-2021/WP10,,,,...,,,,,,,,,left_only,S3_prefix: WP10 | Filename: GH030030.MP4
97,buv-zooniverse-uploads/tuhua-buv-2021/WP10/GH0...,2021-09-19 22:39:29+00:00,"""87616b40ab713121121ca04f1f2dd972""",4.004806e+09,STANDARD,GH040030.MP4,buv-zooniverse-uploads/tuhua-buv-2021/WP10,,,,...,,,,,,,,,left_only,S3_prefix: WP10 | Filename: GH040030.MP4


In [117]:
# Specify the go pro movies to update
go_pro_mov_to_update = go_pro_movies_to_update(missing_from_csv)

SelectMultiple(description='GO pro movies:', layout=Layout(width='80%'), options=('s3_prefix: tapuae-buv-2011 …

In [118]:
# Specify the full movies to update
full_mov_to_update = full_movie_to_update(missing_from_csv)

Dropdown(description='Full movie:', layout=Layout(width='80%'), options=('s3_prefix: tapuae-buv-2011 | filenam…

In [108]:
mov_to_update.value

('loc:tapuae-buv-2011 file:CON14_2012.mpg',
 'loc:tapuae-buv-2011 file:CON14_sitedata_2012.mpg',
 'loc:tapuae-buv-2011 file:CON15_2012.mpg',
 'loc:tapuae-buv-2011 file:CON16_8.8m_23Apr2014_1356hrs.mpg',
 'loc:tapuae-buv-2011 file:CON17_2012.mpg',
 'loc:tapuae-buv-2011 file:CON18_2012.mpg',
 'loc:tapuae-buv-2011 file:CON19_2012.mpg',
 'loc:tapuae-buv-2011 file:CON20_2012.mpg',
 'loc:tapuae-buv-2011 file:CON22_2012.mpg')

In [111]:
# Select multiple movies to include information of
def go_pro_movies_to_update(df):
    
    # Save the filenames of the movies missing
    filename_missing_csv = df.location_and_filename.unique()
    
    # Display the project options
    movie_to_update = widgets.SelectMultiple(
        options=filename_missing_csv,
        rows=15,
        layout=Layout(width='80%'),
        description="GO pro movies:",
        disabled=False,
        
    )
    
    display(movie_to_update)
    return movie_to_update

# Select one movie to include information of
def full_movie_to_update(df):
    
    # Save the filenames of the movies missing
    filename_missing_csv = df.location_and_filename.unique()
    
    # Display the project options
    movie_to_update = widgets.Dropdown(
        options=filename_missing_csv,
        rows=15,
        layout=Layout(width='80%'),
        description="Full movie:",
        disabled=False,
        
    )
    
    display(movie_to_update)
    return movie_to_update


In [None]:
info_csv = info_to_csv(missing_from_csv, mov_to_update.value)

In [None]:
# Select the info to add to the csv
def info_to_csv(df, movies):
    
    # Save the filenames of the movies missing
    filename_missing_csv = df.location_and_filename.unique()
    
    # Display the project options
    movie_to_update = widgets.SelectMultiple(
        options=filename_missing_csv,
        rows=15,
        layout=Layout(width='80%'),
        description="Movie:",
        disabled=False,
        
    )
    
    display(movie_to_update)
    return movie_to_update

# Concatenate go pros and update csv

In [None]:
zoo_contents_s3_pd_movies[zoo_contents_s3_pd_movies['Key'].str.contains("TUH_032")].Key.unique()

In [20]:
movies_df[movies_df['siteName']=="TUH_037"].go_pro_files.unique()

array(['Site 37 video 1.MP4;Site 37 video 2.MP4;Site 37 video 3.MP4;Site 37 video 4.MP4'],
      dtype=object)

In [None]:
missing_info[(missing_info["_merge"]=="right_only")]

In [None]:
# Select only those deployments that are valid
movies_df = movies_df[~movies_df["IsBadDeployment"]].reset_index(drop=True)

In [33]:
# Set the directory of the libraries
import sys
sys.path.append('..')

# Import required modules
import utils.t15_utils as t15
import utils.t12_utils as t12

print("Packages loaded successfully")

import os
import pandas as pd
import sqlite3
import pandas as pd
from tqdm import tqdm
import subprocess

import utils.db_utils as db_utils
import utils.server_utils as server_utils
import utils.spyfish_utils as spyfish_utils

# Get the location of the csv files with initial info to populate the db
sites_csv, movies_csv, species_csv = server_utils.get_sites_movies_species()

# Load the csv with movies information
movies_df = pd.read_csv(movies_csv)

# Select only those deployments that are valid
movies_df = movies_df[~movies_df["IsBadDeployment"]].reset_index(drop=True)    

Packages loaded successfully


In [34]:
# Select only surveys that are missing filenames
unprocessed_movies_df = movies_df[movies_df["filename"].isna()].reset_index(drop=True)

# Write the filename of the concatenated movie
unprocessed_movies_df["filename"] = unprocessed_movies_df["siteName"] + "_" + unprocessed_movies_df["created_on"].str.replace('/','_')+ ".MP4"

In [32]:
zoo_contents_s3_pd_movies[zoo_contents_s3_pd_movies['raw_filename'].str.contains("Site 20")]

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass,raw_filename,prefix
115,buv-zooniverse-uploads/tuhua-buv-2020/TUH_020/...,2021-09-19 22:22:37+00:00,"""3e177dc45d15404f28e7b4fc10c7c39b""",4004981319,STANDARD,Site 20 video 1.MP4,buv-zooniverse-uploads/tuhua-buv-2020/TUH_020
116,buv-zooniverse-uploads/tuhua-buv-2020/TUH_020/...,2021-09-19 22:22:56+00:00,"""82ea5941d68d96f95f6f7fb5b34b813b""",4004794456,STANDARD,Site 20 video 2.MP4,buv-zooniverse-uploads/tuhua-buv-2020/TUH_020
117,buv-zooniverse-uploads/tuhua-buv-2020/TUH_020/...,2021-09-19 22:23:01+00:00,"""ecd102578f3d135f6f24059408a6f55e""",4004810401,STANDARD,Site 20 video 3.MP4,buv-zooniverse-uploads/tuhua-buv-2020/TUH_020
118,buv-zooniverse-uploads/tuhua-buv-2020/TUH_020/...,2021-09-19 22:23:41+00:00,"""be9604f9a790af3a2e4acfb1027fe4df""",4004858508,STANDARD,Site 20 video 4.MP4,buv-zooniverse-uploads/tuhua-buv-2020/TUH_020


In [35]:
unprocessed_movies_df

Unnamed: 0,movie_id,filename,siteName,created_on,Author,fps,duration,survey_start,survey_end,go_pro_files,bucket,prefix,IsBadDeployment


In [19]:
unprocessed_movies_df

Unnamed: 0,movie_id,filename,siteName,created_on,Author,fps,duration,survey_start,survey_end,go_pro_files,bucket,prefix,IsBadDeployment
0,49,TUH_037_1_09_2020.MP4,TUH_037,1/09/2020,MoniqueLadds,99,7200,395,2195,Site 37 video 1.MP4;Site 37 video 2.MP4;Site 3...,,,False


In [27]:
spyfish_utils.concatenate_videos(unprocessed_movies_df, client)

  0%|          | 0/1 [00:00<?, ?it/s]


TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [None]:
# Check that all videos have filenames
if movies_df["filename"].isna().any():
    
    #####Get info from bucket#####
    # Your acess key for the s3 bucket. 
    #aws_access_key_id, aws_secret_access_key = server_utils.aws_credentials()

    # Specify the bucket where the BUV files are
    bucket_i = movies_df['bucket'].str.split('/').str[0].dropna().unique()[1]

In [None]:
# Retrieve info from the bucket
contents_s3_pd = server_utils.retrieve_s3_buckets_info(client, bucket_i)

In [None]:
contents_s3_pd

In [None]:
contents_s3_pd[contents_s3_pd['Key'].str.contains("buv-zooniverse-uploads/tuhua-buv-2020/TUH_021/TUH_021__22_09_2020.MP4")]['Key'].unique()

In [None]:
#client.delete_object(Bucket=bucket_i, Key="buv-zooniverse-uploads/tuhua-buv-2020/TUH_021/TUH_021__22_09_2020.MP4")

In [None]:
# Specify the filename of the raw videos        
contents_s3_pd['raw_filename'] = contents_s3_pd['Key'].str.split('/').str[-1]

# Specify the filename of the raw videos        
contents_s3_pd['bucket'] = contents_s3_pd['Key'].str.rsplit('/',1).str[0]

In [None]:
session = server_utils.connect_s3(aws_access_key_id, aws_secret_access_key)

In [None]:
!pip install boto3

In [None]:
# Check that filenames info from movies csv exists in S3
import boto3
from botocore.errorfactory import ClientError

s3 = boto3.client('s3')
try:
    s3.head_object(Bucket='bucket_name', Key='file_path')
except ClientError:
    # Not found
    pass
