<img align="left" src="https://panoptes-uploads.zooniverse.org/project_avatar/86c23ca7-bbaa-4e84-8d8a-876819551431.png" type="image/png" height=100 width=100>
</img>


<h1 align="right">KSO Tutorials #15: Process Spyfish Aotearoa videos</h1>
<h3 align="right">Written by @jannesgg and @vykanton</h3>
<h5 align="right">Last updated: Sep 15th, 2021</h5>

# Set up and requirements

Import Python packages

In [1]:
# Set the directory of the libraries
import sys
sys.path.append('..')

# Import standard-library and third-party packages
import os
import sqlite3
import subprocess

import numpy as np
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact, Layout
from tqdm import tqdm

# Set to display dataframes as interactive tables
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

# Import required project modules
# (presumably found through the '..' entry added to sys.path above)
import utils.t15_utils as t15
import utils.t12_utils as t12
import utils.db_utils as db_utils
import utils.server_utils as server_utils
import utils.spyfish_utils as spyfish_utils

# Report success only once every import above has actually run
print("Packages loaded successfully")

<IPython.core.display.Javascript object>



Packages loaded successfully


# Get list of surveys from S3

In [7]:
# Ask for the AWS key id and secret access key (the cell output shows two masked prompts)
aws_access_key_id, aws_secret_access_key = server_utils.aws_credentials()

Enter the key id for the aws server········
Enter the secret access key for the aws server········


In [8]:
# Connect to S3 with the credentials entered above; `client` is reused by the later cells
client = server_utils.connect_s3(aws_access_key_id, aws_secret_access_key)

In [None]:
# See list of buckets available
#client.list_buckets()

In [9]:
# Name of the S3 bucket whose contents are listed below
bucket_i = "marine-buv"

In [10]:
# Retrieve info from the bucket
contents_s3_pd = server_utils.retrieve_s3_buckets_info(client, bucket_i)

# Specify the filename of the objects (videos): the last "/"-separated part of the key
contents_s3_pd['raw_filename'] = contents_s3_pd['Key'].str.split('/').str[-1]

# Specify the prefix (directory) of the objects: everything before the last "/".
# `n` is passed by keyword because pandas >= 2.0 no longer accepts it positionally.
contents_s3_pd['prefix'] = contents_s3_pd['Key'].str.rsplit('/', n=1).str[0]

ClientError: An error occurred (SignatureDoesNotMatch) when calling the ListObjects operation: The request signature we calculated does not match the signature you provided. Check your key and signing method.

In [None]:
# Upload csv to s3
# The bare `*` only belongs in a function signature (it marks keyword-only
# parameters); in a call it is a SyntaxError, so the values are passed by keyword.
# NOTE(review): `key` (destination path inside the bucket) and `filename` (local
# csv path) must be defined before running this cell, and upload_file_to_s3 is
# presumably provided by utils.server_utils - confirm and qualify the call if so.
upload_file_to_s3(client, bucket=bucket_i, key=key, filename=filename)

In [6]:
# List the distinct prefixes (directories) found in the bucket
contents_s3_pd['prefix'].unique()
# Disabled alternative: show only the objects whose prefix does not contain "ROV"
#no_rov_contents_s3_pd = contents_s3_pd[~contents_s3_pd['prefix'].str.contains("ROV")].reset_index(drop = True)
#no_rov_contents_s3_pd

array(['ROV/2002 Auckland Islands',
       'ROV/2002_01 Auckland Is Chambres Inlet',
       'ROV/2005 Bounty Islands/DISC2-Title1-Chapter1-6-Bounty',
       'ROV/2005 Bounty Islands/DISC3-Title2-Chapter1-5-Tunnel',
       'ROV/2005 Bounty Islands',
       'ROV/2008 Whananaki/2008_Benthic video and still_DV and ROV',
       'ROV/2008 Whananaki/2008_Benthic video and still_DV and ROV/Still Vid Cam',
       'ROV/2008 Whananaki/2008_Benthic video and still_DV and ROV/Stills panasonic still camera',
       "ROV/2008 Whananaki/2008_Benthic video and still_DV and ROV/Vince's video",
       'ROV/2008_04 Northland', 'ROV/2008_04 Northland/Still Vid Cam',
       'ROV/2008_04 Northland/Stills panasonic still camera',
       "ROV/2008_04 Northland/Vince's video", 'ROV/2008_10 Poor Knights',
       'ROV/2008_10 Poor Knights/Short clips', 'ROV/2008_12 Tasman Bay',
       'ROV/2008_12 Tory Channel', 'ROV/2015_03 Poor Knights Islands',
       'Tonga Island BUV/WP1', 'Tonga Island BUV/WP10',
       'To

In [None]:
# Keep only the objects whose prefix mentions "buv-zooniverse-uploads"
is_zoo_upload = contents_s3_pd['prefix'].str.contains("buv-zooniverse-uploads")
zoo_contents_s3_pd = contents_s3_pd.loc[is_zoo_upload].reset_index(drop = True)

# File endings that identify a movie (the match below is case-sensitive)
movie_formats = ('wmv', 'mpg', 'mov', 'avi', 'mp4', 'MOV', 'MP4')

# Keep only the files of interest (movies)
is_movie = zoo_contents_s3_pd['raw_filename'].str.endswith(movie_formats)
zoo_contents_s3_pd_movies = zoo_contents_s3_pd.loc[is_movie]

# Get list of surveys from movies csv

In [None]:
# Locate the csv files with the initial info used to populate the db
sites_csv, movies_csv, species_csv = server_utils.get_sites_movies_species()

# Read the movies information
movies_df = pd.read_csv(movies_csv)

# List every file of a deployment in one ";"-separated string:
# the full movie alone, or the full movie followed by its go pro files
has_no_go_pro = movies_df['go_pro_files'].isna()
movie_and_go_pro = movies_df['filename'] + ";" + movies_df['go_pro_files']
movies_df['files'] = np.where(has_no_go_pro, movies_df['filename'], movie_and_go_pro)

# Reshape so that every file gets its own row
files_per_movie = movies_df['files'].str.split(';')
movies_df = movies_df.assign(raw_filename=files_per_movie).explode('raw_filename')

# Build the key (path of the object in S3) from the prefix and the filename
movies_df["Key"] = movies_df["prefix"] + "/" + movies_df["raw_filename"]

# Find surveys missing from S3

In [None]:
# Outer-join the "buv-zooniverse-uploads" movie listing (left) with the movies
# csv (right) on the S3 key. The "_merge" column then tells where each file was
# found: "left_only" = S3 only, "right_only" = csv only, "both" = in both.
missing_info = pd.merge(
    zoo_contents_s3_pd_movies,
    movies_df,
    how='outer',
    on=['Key'],
    indicator=True,
)

#missing_info[missing_info["_merge"]=="both"]
#missing_info

In [None]:
# Find out about those files missing from the S3 (rows found only in the movies csv)
missing_from_s3 = missing_info[missing_info["_merge"]=="right_only"]

# The outer merge pads the csv columns with NaN for S3-only rows, which can turn
# "IsBadDeployment" into an object column. On such a column `~` is applied to each
# Python bool (~True == -2) instead of negating it, so build a real boolean mask
# first. Missing values count as "not a bad deployment".
is_bad_deployment = missing_from_s3["IsBadDeployment"].eq(True)

# Bad deployments, and good deployments that have no bucket info
missing_bad_deployment = missing_from_s3[is_bad_deployment]
missing_no_bucket_info = missing_from_s3[~is_bad_deployment & missing_from_s3["bucket"].isna()]

In [None]:
# Summarise how many movies listed in the csv could not be found in S3
n_missing_from_s3 = len(missing_from_s3.index)
print("There are", n_missing_from_s3, "movies missing from the S3")

# Bad deployments: count followed by one filename per line
bad_deployment_names = missing_bad_deployment.filename.unique()
print(len(missing_bad_deployment.index), "movies are bad deployments. Their filenames are:")
print(*bad_deployment_names, sep = "\n")

# Good deployments without bucket info: count followed by one filename per line
no_bucket_names = missing_no_bucket_info.filename.unique()
print(len(missing_no_bucket_info.index), "movies are good deployments but don't have bucket info. Their filenames are:")
print(*no_bucket_names, sep = "\n")

# Find surveys missing from csv

In [None]:
# Files found only in the S3 listing ("left_only"), i.e. absent from the movies csv
missing_from_csv = missing_info[missing_info["_merge"]=="left_only"].reset_index(drop=True)

In [None]:
# Report how many S3 movies have no matching row in the movies csv
print("There are", len(missing_from_csv.index), "movies missing from the csv")

In [None]:
# Combine the location of the file and the filename into one readable label.
# The location is the last folder of the S3 prefix. `n` is passed by keyword
# because pandas >= 2.0 no longer accepts it positionally, and `.str[-1]` keeps
# a prefix that contains no "/" instead of turning it into NaN.
missing_from_csv["location_and_filename"] = (
    "S3_prefix: " + missing_from_csv['prefix_x'].str.rsplit('/', n=1).str[-1]
    + " | Filename: " + missing_from_csv['raw_filename_x']
)

In [None]:
missing_from_csv

In [None]:
# Specify the go pro movies to update
# NOTE: go_pro_movies_to_update is defined in a later cell of this notebook; run that cell first
go_pro_mov_to_update = go_pro_movies_to_update(missing_from_csv)

In [None]:
# Specify the full movies to update
# NOTE: full_movie_to_update is defined in a later cell of this notebook; run that cell first
full_mov_to_update = full_movie_to_update(missing_from_csv)

In [None]:
# Show the current selections. `mov_to_update` is not defined in any visible cell;
# the widgets created above are go_pro_mov_to_update and full_mov_to_update.
# NOTE(review): confirm which selection this cell was meant to inspect
go_pro_mov_to_update.value, full_mov_to_update.value

In [None]:
# Select multiple movies to include information of
def go_pro_movies_to_update(df):
    """Show a multi-select list of the movies missing from the csv.

    Args:
        df: dataframe with a `location_and_filename` column.

    Returns:
        The displayed `widgets.SelectMultiple`; read its `.value` to get the
        current selection.
    """
    # One option per distinct location/filename label
    missing_labels = df.location_and_filename.unique()

    selector_settings = dict(
        options=missing_labels,
        rows=15,
        layout=Layout(width='80%'),
        description="GO pro movies:",
        disabled=False,
    )
    selector = widgets.SelectMultiple(**selector_settings)

    # Render the widget in the cell output and hand it back to the caller
    display(selector)
    return selector

# Select one movie to include information of
def full_movie_to_update(df):
    """Show a dropdown with the movies missing from the csv.

    Args:
        df: dataframe with a `location_and_filename` column.

    Returns:
        The displayed `widgets.Dropdown`; read its `.value` to get the
        selected movie.
    """
    # Save the filenames of the movies missing
    filename_missing_csv = df.location_and_filename.unique()

    # Display the movie options.
    # `rows` is deliberately not passed: it is a trait of Select/SelectMultiple,
    # not of Dropdown, where it is an unrecognised argument.
    movie_to_update = widgets.Dropdown(
        options=filename_missing_csv,
        layout=Layout(width='80%'),
        description="Full movie:",
        disabled=False,
    )

    display(movie_to_update)
    return movie_to_update


In [None]:
# `mov_to_update` is not defined in any visible cell; the multi-select widget
# created earlier is go_pro_mov_to_update.
# NOTE(review): confirm this is the intended selection. info_to_csv is defined
# in a later cell of this notebook; run that cell first.
info_csv = info_to_csv(missing_from_csv, go_pro_mov_to_update.value)

In [None]:
# Select the info to add to the csv
def info_to_csv(df, movies):
    """Show a multi-select list of the movies missing from the csv.

    Args:
        df: dataframe with a `location_and_filename` column.
        movies: accepted but currently not used by the body.

    Returns:
        The displayed `widgets.SelectMultiple`.
    """
    # Distinct location/filename labels become the selectable options
    available_labels = df.location_and_filename.unique()

    # Build the selector, show it in the cell output, then return it
    picker = widgets.SelectMultiple(
        description="Movie:",
        options=available_labels,
        disabled=False,
        rows=15,
        layout=Layout(width='80%'),
    )
    display(picker)

    return picker

# Concatenate GoPro videos and update csv

In [None]:
zoo_contents_s3_pd_movies[zoo_contents_s3_pd_movies['Key'].str.contains("TUH_032")].Key.unique()

In [None]:
movies_df[movies_df['siteName']=="TUH_037"].go_pro_files.unique()

In [None]:
missing_info[(missing_info["_merge"]=="right_only")]

In [None]:
# Select only those deployments that are valid
# NOTE: this overwrites movies_df, so the bad deployments are no longer available in later cells
movies_df = movies_df[~movies_df["IsBadDeployment"]].reset_index(drop=True)

In [None]:
# Set the directory of the libraries
import sys
sys.path.append('..')

# Import standard-library and third-party packages.
# numpy replaces the second, duplicated `import pandas as pd`, so this cell
# provides the same `np`/`pd` names as the first setup cell of the notebook.
import os
import sqlite3
import subprocess

import numpy as np
import pandas as pd
from tqdm import tqdm

# Import required project modules
import utils.t15_utils as t15
import utils.t12_utils as t12
import utils.db_utils as db_utils
import utils.server_utils as server_utils
import utils.spyfish_utils as spyfish_utils

# Report success only once every import above has actually run
print("Packages loaded successfully")

# Get the location of the csv files with initial info to populate the db
sites_csv, movies_csv, species_csv = server_utils.get_sites_movies_species()

# Load the csv with movies information.
# NOTE: this replaces any movies_df built by earlier cells with a fresh copy of the csv
movies_df = pd.read_csv(movies_csv)

# Select only those deployments that are valid
movies_df = movies_df[~movies_df["IsBadDeployment"]].reset_index(drop=True)

In [None]:
# Keep only the surveys whose filename is missing
lacks_filename = movies_df["filename"].isna()
unprocessed_movies_df = movies_df.loc[lacks_filename].reset_index(drop=True)

# Name the concatenated movie "<siteName>_<created_on with '/' as '_'>.MP4"
created_on_label = unprocessed_movies_df["created_on"].str.replace('/','_')
unprocessed_movies_df["filename"] = unprocessed_movies_df["siteName"] + "_" + created_on_label + ".MP4"

In [None]:
zoo_contents_s3_pd_movies[zoo_contents_s3_pd_movies['raw_filename'].str.contains("Site 20")]

In [None]:
unprocessed_movies_df

In [None]:
unprocessed_movies_df

In [None]:
# Hand the surveys without a filename and the S3 client to the concatenation helper
# NOTE(review): presumably joins the go pro files of each survey into the movie named above - confirm in utils.spyfish_utils
spyfish_utils.concatenate_videos(unprocessed_movies_df, client)

In [None]:
# Check that all videos have filenames
# (the bucket below is only looked up when at least one filename is missing)
if movies_df["filename"].isna().any():
    
    #####Get info from bucket#####
    # Your access key for the s3 bucket. 
    #aws_access_key_id, aws_secret_access_key = server_utils.aws_credentials()

    # Specify the bucket where the BUV files are: the text before the first "/"
    # of the "bucket" column, ignoring missing values.
    # NOTE(review): `[1]` picks the SECOND distinct value and raises IndexError
    # when there is only one - confirm why the first value is skipped
    bucket_i = movies_df['bucket'].str.split('/').str[0].dropna().unique()[1]

In [None]:
# Retrieve info from the bucket
# NOTE: bucket_i is reassigned by the previous cell only when some filename is
# missing; otherwise the value set earlier in the notebook is used here
contents_s3_pd = server_utils.retrieve_s3_buckets_info(client, bucket_i)

In [None]:
contents_s3_pd

In [None]:
contents_s3_pd[contents_s3_pd['Key'].str.contains("buv-zooniverse-uploads/tuhua-buv-2020/TUH_021/TUH_021__22_09_2020.MP4")]['Key'].unique()

In [None]:
#client.delete_object(Bucket=bucket_i, Key="buv-zooniverse-uploads/tuhua-buv-2020/TUH_021/TUH_021__22_09_2020.MP4")

In [None]:
# Specify the filename of the raw videos (last "/"-separated part of the key)
contents_s3_pd['raw_filename'] = contents_s3_pd['Key'].str.split('/').str[-1]

# Specify the location of the raw videos (everything before the last "/").
# `n` is passed by keyword because pandas >= 2.0 no longer accepts it positionally.
contents_s3_pd['bucket'] = contents_s3_pd['Key'].str.rsplit('/', n=1).str[0]

In [None]:
# Open another S3 connection with the same credentials as `client`
# NOTE(review): `session` is not used by any of the visible cells - confirm it is still needed
session = server_utils.connect_s3(aws_access_key_id, aws_secret_access_key)

In [None]:
!pip install boto3

In [None]:
# Check that filenames info from movies csv exists in S3
import boto3
from botocore.exceptions import ClientError

s3 = boto3.client('s3')
try:
    # 'bucket_name' and 'file_path' are placeholders for the bucket and object key to test
    s3.head_object(Bucket='bucket_name', Key='file_path')
except ClientError as error:
    # Only a missing object means "not found"; permission problems, bad
    # credentials, throttling, etc. must not be silently swallowed
    if error.response["Error"]["Code"] not in ("404", "NoSuchKey", "NotFound"):
        raise


# Check movies that can't be mapped

In [None]:
# Select only those deployments that are valid
# (the statements are at top level: the cell body must not be indented)
movies_df = movies_df[~movies_df["IsBadDeployment"]].reset_index(drop=True)

# Report on unmapped movies
# NOTE(review): no visible cell creates the "exists" column - it has to be added
# to movies_df before this cell can run; confirm where it should come from
unmapped_movies_df = movies_df[~movies_df["exists"]].reset_index(drop=True)
if not unmapped_movies_df.empty:
    print("The following", len(unmapped_movies_df.index), "movies are missing from the S3 and are not bad deployments")
    print(*unmapped_movies_df.filename.unique(), sep = "\n")