<img align="left" src="https://panoptes-uploads.zooniverse.org/project_avatar/86c23ca7-bbaa-4e84-8d8a-876819551431.png" alt="Project avatar" height="100" width="100">


<h1 align="right">KSO Tutorials #19: Process Spyfish Aotearoa videos</h1>
<h3 align="right">Written by @jannesgg and @vykanton</h3>
<h5 align="right">Last updated: Nov 14th, 2021</h5>

# Set up and requirements

Import Python packages

In [None]:
# Set the directory of the libraries
import sys
sys.path.append('..')

# Set to display dataframes as interactive tables
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

# Import required modules
import os

import ipywidgets as widgets
import numpy as np
import pandas as pd
from ipywidgets import Layout

import utils.tutorials_utils as t_utils
import utils.t19_utils as t19
import utils.server_utils as server_utils
#import utils.server_utils as serv_utils

print("Packages loaded successfully")

### Choose your project

In [None]:
project = t_utils.choose_project()

# Select the folder where the SD card is

In [None]:
go_pro_folder = t19.select_go_pro_folder()

In [None]:
go_pro_movies_i = t19.select_go_pro_movies(go_pro_folder.selected)
print("The movies selected are:", go_pro_movies_i)

### Select the site

In [None]:
site_i = t19.select_site()

### Select the date

In [None]:
date_i = t19.select_date()

### Concatenate the GoPro files

In [None]:
# Pass the selected site, the ISO-formatted date, the selected folder and the
# selected movies to the t19 helper, and keep the two values it returns
# (stored as the frame rate and the duration of the survey movie).
# NOTE(review): the site is read here as `site_i.result.value`, while the next
# cell reads `site_i.value` — confirm which attribute `t19.select_site()`
# actually exposes; one of the two is likely wrong.
fps_i, duration_i = t19.concatenate_go_pro_videos(site_i.result.value,
                              date_i.value.isoformat(),
                              go_pro_folder.selected,
                              go_pro_movies_i)

In [None]:
# Save the site selected
# NOTE(review): the concatenation cell reads `site_i.result.value` instead —
# confirm which attribute `t19.select_site()` exposes.
siteName_i = site_i.value

# Save the date selected (as an ISO-formatted string)
created_on_i = date_i.value.isoformat()

# Specify the name of the survey: "<site>_<date>"
unique_survey_name = siteName_i + "_" + created_on_i

# Specify the filename and path for the concatenated movie.
# The folder chooser created earlier is `go_pro_folder` (there is no `fc` in
# this notebook); os.path.join adds the separator only when it is missing.
filename_i = unique_survey_name + ".MP4"
temp_path_filename_i = os.path.join(go_pro_folder.selected, filename_i)

### Add movie_id

In [None]:
# Get the location of the csv files with initial info (same helper call the
# later cells of this notebook use), so `movies_csv` exists at this point
sites_csv, movies_csv, species_csv = server_utils.get_sites_movies_species()

# Read csv as pd
movies_df = pd.read_csv(movies_csv)

# The new id is one more than the highest id already used. Using max() rather
# than the last row keeps ids unique even if the csv is not sorted by
# movie_id; an empty table starts numbering at 1.
last_movie_id = movies_df.movie_id.max()
movie_id_i = 1 if pd.isna(last_movie_id) else int(last_movie_id) + 1

### Select Author

In [None]:
from IPython.display import display 
from ipywidgets import interactive

# Existing authors
exisiting_authors = movies_df.Author.unique()

def f(Existing_or_new):
    """Build, display and return the widget used to pick the movie author.

    Args:
        Existing_or_new: 'Existing' to choose among the authors already in
            the movies table (a dropdown), or 'New author' to type a new
            name (a free-text box).

    Returns:
        The widget that was displayed; its ``value`` holds the author.

    Raises:
        ValueError: if ``Existing_or_new`` is neither of the two options.
    """
    if Existing_or_new == 'Existing':
        author_widget = widgets.Dropdown(
            options = exisiting_authors,
            description = 'Author:',
            disabled = False
        )
    elif Existing_or_new == 'New author':
        author_widget = widgets.Text(
            placeholder='Type Author',
            description='Author:',
            disabled=False
        )
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `author_widget`.
        raise ValueError(
            "Existing_or_new must be 'Existing' or 'New author', got "
            + repr(Existing_or_new)
        )

    display(author_widget)

    return author_widget

w = interactive(f, Existing_or_new=['Existing','New author'])

display(w)

In [None]:
# Save Author
Author_i = w.result.value

### Set survey start

In [None]:
import datetime

# Display in hours, minutes and seconds
def to_hhmmss(seconds):
    """Print ``seconds`` as a ``datetime.timedelta`` and return it unchanged.

    Used as the callback of the survey start/end sliders: the printed line
    gives a human-readable time, the return value stays in seconds.
    """
    elapsed = datetime.timedelta(seconds=seconds)
    print("Time selected:", elapsed)
    return seconds



# Select the start of the survey 
# The slider runs from 0 to `duration_i` in 1-second steps and starts at 0;
# `to_hhmmss` prints the chosen time and its return value (seconds) becomes
# `surv_start.result`.
# NOTE(review): IntSlider bounds are integers — confirm `duration_i` is an int.
surv_start = interactive(to_hhmmss, seconds=widgets.IntSlider(
    value=0,
    min=0,
    max=duration_i,
    step=1,
    description='Survey starts (seconds):',))

display(surv_start)

### Set survey end

In [None]:
# Select the end of the survey 
# The slider runs from 0 to `duration_i` in 1-second steps and starts at
# `duration_i`; `to_hhmmss` prints the chosen time and its return value
# (seconds) becomes `surv_end.result`.
# NOTE(review): IntSlider bounds are integers — confirm `duration_i` is an int.
surv_end = interactive(to_hhmmss, seconds=widgets.IntSlider(
    value=duration_i,
    min=0,
    max=duration_i,
    step=1,
    description='Survey ends (seconds):',))

display(surv_end)

### Specify bucket

In [None]:
bucket_i = 'marine-buv'

### Specify if bad deployment

In [None]:
def select_bad_deployment():
    """Display and return a dropdown asking whether the deployment is bad.

    The dropdown offers two answers and defaults to the "great video" one.
    Read the answer from the returned widget's ``value``.
    """
    bad_answer = 'Yes, unfortunately it is marine crap'
    good_answer = 'No, it is a great video'

    deployment_dropdown = widgets.Dropdown(
        description='Is it a bad deployment?',
        options=[bad_answer, good_answer],
        value=good_answer,
        disabled=False,
    )

    display(deployment_dropdown)
    return deployment_dropdown

def deployment_to_true_false(deploy_value):
    """Convert the deployment answer into a boolean.

    Returns False only for the exact answer 'No, it is a great video';
    every other value is reported as a bad deployment (True).
    """
    return deploy_value != 'No, it is a great video'

In [None]:
deployment_i = select_bad_deployment()

### Save the s3 prefix

In [None]:
# Connect to s3 to get the list of folders available
# `aws_credentials()` returns the access-key id and the secret key, which are
# then handed to `connect_s3()`; the resulting `client` is reused by the
# listing and upload cells below.
aws_access_key_id, aws_secret_access_key = server_utils.aws_credentials()
client = server_utils.connect_s3(aws_access_key_id, aws_secret_access_key)

In [None]:
# Retrieve info from the bucket
contents_s3_pd = server_utils.retrieve_s3_buckets_info(client, bucket_i, "")

# Extract the first path component (top-level "folder") of each object.
# The original `.str.rsplit('/',0)` passed `n` positionally, which pandas >= 2.0
# rejects; pandas treats n=0 as "split everywhere", so a plain split on '/'
# followed by taking element 0 is the same operation.
# NOTE(review): this reads column `0`, while later cells read column 'Key' from
# the same helper — confirm the column name the helper returns.
s3_folders_available = contents_s3_pd[0].str.split('/').str[0]

# Convert folders available df to tuple
s3_folders_available_tuple = tuple(s3_folders_available.unique())

In [None]:
# Select s3 folder to upload the video
def select_s3_folder(s3_folders_available):
    """Display and return a combobox for choosing the destination S3 folder.

    Args:
        s3_folders_available: the folder names offered as options. Because
            ``ensure_option=True`` the final value must be one of them.

    Returns:
        The combobox widget; read the chosen folder from its ``value``.
    """
    folder_combobox = widgets.Combobox(
        description="S3 folder:",
        options=s3_folders_available,
        ensure_option=True,
        disabled=False,
    )

    display(folder_combobox)
    return folder_combobox

In [None]:
# Select the S3 "folder" to upload the video
s3_folder = select_s3_folder(s3_folders_available_tuple)

### Add any comment related to the movie

In [None]:
# Write a comment about the video
def write_comment():
    """Display and return a free-text box for a comment about the movie.

    Returns:
        The text widget; read the comment from its ``value``.
    """
    comment_box = widgets.Text(
        description='Comment:',
        placeholder='Type comment',
        disabled=False,
    )

    display(comment_box)
    return comment_box

In [None]:
comment_movie = write_comment()

### Save all the values selected

In [None]:
# Comment typed for this movie
comment_i = comment_movie.value

# Destination "folder" in S3: <selected folder>/<survey name>
prefix_i = s3_folder.value + "/" + unique_survey_name

# Bad-deployment answer converted to True/False
IsBadDeployment_i = deployment_to_true_false(deployment_i.value)

# Survey boundaries in seconds, as returned by the two slider callbacks
survey_end_i = surv_end.result
survey_start_i = surv_start.result

### Review the survey/movie details

In [None]:
# Build a one-row dataframe with the details of the new movie, using the same
# columns (and therefore the same column order) as the movies table, and
# display it for review.
# NOTE(review): `go_pro_files_i` is not assigned anywhere in the visible
# notebook — it presumably should be derived from `go_pro_movies_i`; confirm
# the expected format (later cells split the `go_pro_files` column on ';').
row_i = [[movie_id_i, filename_i, siteName_i, created_on_i, Author_i, fps_i, 
         duration_i, survey_start_i, survey_end_i, go_pro_files_i, bucket_i,
         prefix_i, IsBadDeployment_i, comment_i]]
new_row = pd.DataFrame(row_i, columns = movies_df.columns)
new_row

# !!!Only pass this point if movie details are correct!!!

### Update movies csv and upload video to s3

In [None]:
# Specify the location of the movie in the s3.
# The values saved earlier carry an `_i` suffix (`prefix_i`, `filename_i`,
# `temp_path_filename_i`); the unsuffixed names used before were never defined.
key_filename = prefix_i + "/" + filename_i

# Upload movie to the s3 bucket.
# Keyword arguments follow the helper signature noted further down in this
# notebook: upload_file_to_s3(client, *, bucket, key, filename).
server_utils.upload_file_to_s3(
    client,
    bucket=bucket_i,
    key=key_filename,
    filename=temp_path_filename_i,
)

# Add row to movies_df (DataFrame.append was removed in pandas 2.0)
movies_df = pd.concat([movies_df, new_row], ignore_index=True)

# Temporarily save the movies df as csv
path_movies_csv = "movies_buv_doc.csv"
movies_df.to_csv(path_movies_csv, index=False)

try:
    # Upload the updated csv to the s3 bucket
    # NOTE(review): `key_movies_csv` (the S3 key of the movies csv) is not
    # defined anywhere in the visible notebook — set it before running this cell.
    server_utils.upload_file_to_s3(
        client,
        bucket=bucket_i,
        key=key_movies_csv,
        filename=path_movies_csv,
    )
finally:
    # Remove temporary csv, even if the upload failed
    os.remove(path_movies_csv)

# Remove temporary movie
# TODO(review): the original cell left this step empty; deleting
# `temp_path_filename_i` is presumably intended — confirm before adding it.


In [None]:
# Upload the file to the s3

# Get list of surveys from S3

In [None]:
aws_access_key_id, aws_secret_access_key = server_utils.aws_credentials()

In [None]:
client = server_utils.connect_s3(aws_access_key_id, aws_secret_access_key)

In [None]:
# See list of buckets available
#client.list_buckets()

In [None]:
bucket_i = "marine-buv"

In [None]:
contents_s3_pd

In [None]:
# Retrieve info from the bucket
contents_s3_pd = server_utils.retrieve_s3_buckets_info(client, bucket_i,"")

# Specify the filename of the objects (videos): last component of the key
contents_s3_pd['raw_filename'] = contents_s3_pd['Key'].str.split('/').str[-1]

# Specify the prefix (directory) of the objects: everything before the last '/'.
# `n` is passed by keyword because pandas >= 2.0 no longer accepts it positionally.
contents_s3_pd['prefix'] = contents_s3_pd['Key'].str.rsplit('/', n=1).str[0]

In [None]:
# Upload csv to s3 
#upload_file_to_s3(client, *, bucket, key, filename)

In [None]:
contents_s3_pd

In [None]:
# File extensions that identify a movie
movie_formats = ('wmv', 'mpg', 'mov', 'avi', 'mp4', 'MOV', 'MP4')

# Objects in the bucket whose key ends with one of those extensions
foo = get_matching_s3_objects(client, bucket_i, suffix=movie_formats)


In [None]:
# File extensions that identify a movie
movie_formats = ('wmv', 'mpg', 'mov', 'avi', 'mp4', 'MOV', 'MP4')

# Collect every key the helper yields for those extensions into a list
s3_keys = list(get_matching_s3_keys(client=client, bucket=bucket_i, suffix=movie_formats))

In [None]:
pd.DataFrame(s3_keys).shape

In [None]:
# Same listing as a dataframe. The helper lives in `server_utils` (every other
# call in this notebook is qualified); the bare name was never imported.
foo_pd = server_utils.retrieve_s3_buckets_info(client=client, bucket=bucket_i, suffix=movie_formats)

In [None]:
foo_pd.shape

In [None]:
# Keep only objects whose prefix mentions "buv-zooniverse-uploads"
in_zoo_uploads = contents_s3_pd['prefix'].str.contains("buv-zooniverse-uploads")
zoo_contents_s3_pd = contents_s3_pd[in_zoo_uploads].reset_index(drop = True)

# File extensions that identify a movie
movie_formats = ('wmv', 'mpg', 'mov', 'avi', 'mp4', 'MOV', 'MP4')

# Of those objects, keep only the ones whose filename ends with a movie extension
is_movie = zoo_contents_s3_pd['raw_filename'].str.endswith(movie_formats)
zoo_contents_s3_pd_movies = zoo_contents_s3_pd[is_movie]

# Get list of surveys from movies csv

In [None]:
# Locate the csv files with the initial info and load the movies table
sites_csv, movies_csv, species_csv = server_utils.get_sites_movies_species()
movies_df = pd.read_csv(movies_csv)

# List of files per survey: the movie itself, followed by its go pro files
# (';'-separated) when the survey has any
without_go_pro = movies_df['go_pro_files'].isna()
movie_and_go_pro = movies_df['filename'] + ";" + movies_df['go_pro_files']
movies_df['files'] = np.where(without_go_pro, movies_df['filename'], movie_and_go_pro)

# One row per individual file
files_per_survey = movies_df['files'].str.split(';')
movies_df = movies_df.assign(raw_filename=files_per_survey).explode('raw_filename')

# Key (path of the object in S3) = prefix + "/" + filename
movies_df["Key"] = movies_df["prefix"] + "/" + movies_df["raw_filename"]

# Find surveys missing from s3

In [None]:
# Outer-join the movies found in "buv-zooniverse-uploads" (left) with the
# movies listed in the csv (right) on the S3 key. The `_merge` indicator column
# tells whether each key was found on the left only, the right only, or both.
missing_info = pd.merge(
    zoo_contents_s3_pd_movies,
    movies_df,
    how='outer',
    on=['Key'],
    indicator=True,
)

#missing_info[missing_info["_merge"]=="both"]
#missing_info

In [None]:
# Find out about those files missing from the S3: rows that exist only on the
# csv (right) side of the merge
missing_from_s3 = missing_info[missing_info["_merge"]=="right_only"]

# The outer merge fills "IsBadDeployment" with NaN for S3-only rows, which turns
# the column into object dtype. `~` on an object column negates each Python bool
# as an integer (~True == -2), so cast back to a real boolean column first.
is_bad_deployment = missing_from_s3["IsBadDeployment"].astype(bool)

missing_bad_deployment = missing_from_s3[is_bad_deployment]
missing_no_bucket_info = missing_from_s3[~is_bad_deployment & missing_from_s3["bucket"].isna()]

In [None]:
# Summary of the csv entries that have no matching object in the S3
n_missing = len(missing_from_s3.index)
print("There are", n_missing, "movies missing from the S3")

# Bad deployments among them
n_bad = len(missing_bad_deployment.index)
bad_filenames = missing_bad_deployment.filename.unique()
print(n_bad, "movies are bad deployments. Their filenames are:")
print(*bad_filenames, sep = "\n")

# Good deployments without bucket information
n_no_bucket = len(missing_no_bucket_info.index)
no_bucket_filenames = missing_no_bucket_info.filename.unique()
print(n_no_bucket, "movies are good deployments but don't have bucket info. Their filenames are:")
print(*no_bucket_filenames, sep = "\n")

### Find surveys missing from csv

In [None]:
missing_from_csv = missing_info[missing_info["_merge"]=="left_only"].reset_index(drop=True)

In [None]:
print("There are", len(missing_from_csv.index), "movies missing from the csv")

In [None]:
# Combine the location of the file and the filename into one readable label:
# the last component of the S3 prefix plus the raw filename.
# `n` is passed by keyword because pandas >= 2.0 no longer accepts it positionally.
missing_from_csv["location_and_filename"] = "S3_prefix: " + missing_from_csv['prefix_x'].str.rsplit('/', n=1).str[1] + " | Filename: " + missing_from_csv['raw_filename_x']

In [None]:
missing_from_csv

In [None]:
# Specify the go pro movies to update
go_pro_mov_to_update = go_pro_movies_to_update(missing_from_csv)

In [None]:
# Specify the full movies to update
full_mov_to_update = full_movie_to_update(missing_from_csv)

In [None]:
# NOTE(review): `mov_to_update` is not assigned anywhere in the visible
# notebook; the cells above create `go_pro_mov_to_update` and
# `full_mov_to_update` — confirm which selection is meant here.
mov_to_update.value

In [None]:
# Select multiple movies to include information of
def go_pro_movies_to_update(df):
    """Display and return a multi-select list of the movies missing from the csv.

    Args:
        df: dataframe with a ``location_and_filename`` column; its unique
            values become the options.

    Returns:
        The multi-select widget; read the chosen entries from its ``value``.
    """
    missing_labels = df.location_and_filename.unique()

    selector = widgets.SelectMultiple(
        description="GO pro movies:",
        options=missing_labels,
        layout=Layout(width='80%'),
        rows=15,
        disabled=False,
    )

    display(selector)
    return selector

# Select one movie to include information of
def full_movie_to_update(df):
    """Display and return a dropdown of the movies missing from the csv.

    Args:
        df: dataframe with a ``location_and_filename`` column; its unique
            values become the options.

    Returns:
        The dropdown widget; read the chosen entry from its ``value``.
    """
    # Save the filenames of the movies missing
    filename_missing_csv = df.location_and_filename.unique()

    # Display the movie options. `rows` is a trait of Select/SelectMultiple,
    # not of Dropdown, so it is no longer passed here (it was an unrecognised
    # argument for this widget).
    movie_to_update = widgets.Dropdown(
        options=filename_missing_csv,
        layout=widgets.Layout(width='80%'),
        description="Full movie:",
        disabled=False,
    )

    display(movie_to_update)
    return movie_to_update


In [None]:
# NOTE(review): `mov_to_update` is not assigned anywhere in the visible
# notebook, and `info_to_csv` is defined in the cell below — run that cell
# first and confirm which selection widget should be passed here.
info_csv = info_to_csv(missing_from_csv, mov_to_update.value)

In [None]:
# Select the info to add to the csv
def info_to_csv(df, movies):
    """Display and return a multi-select list of the movies missing from the csv.

    Args:
        df: dataframe with a ``location_and_filename`` column; its unique
            values become the options.
        movies: accepted but not read by this function.

    Returns:
        The multi-select widget; read the chosen entries from its ``value``.
    """
    missing_labels = df.location_and_filename.unique()

    selector = widgets.SelectMultiple(
        description="Movie:",
        options=missing_labels,
        layout=Layout(width='80%'),
        rows=15,
        disabled=False,
    )

    display(selector)
    return selector

# Concatenate go pros and update csv

In [None]:
zoo_contents_s3_pd_movies[zoo_contents_s3_pd_movies['Key'].str.contains("TUH_032")].Key.unique()

In [None]:
movies_df[movies_df['siteName']=="TUH_037"].go_pro_files.unique()

In [None]:
missing_info[(missing_info["_merge"]=="right_only")]

In [None]:
# Select only those deployments that are valid
movies_df = movies_df[~movies_df["IsBadDeployment"]].reset_index(drop=True)

In [None]:
# Set the directory of the libraries
import sys
sys.path.append('..')

# Import required modules
import utils.t15_utils as t15
import utils.t12_utils as t12

print("Packages loaded successfully")

import os
import pandas as pd
import sqlite3
import pandas as pd
from tqdm import tqdm
import subprocess

import utils.db_utils as db_utils
import utils.server_utils as server_utils
import utils.spyfish_utils as spyfish_utils

# Locate the csv files with the initial info and load the movies table
sites_csv, movies_csv, species_csv = server_utils.get_sites_movies_species()
movies_df = pd.read_csv(movies_csv)

# Drop the bad deployments and renumber the remaining rows from 0
is_valid_deployment = ~movies_df["IsBadDeployment"]
movies_df = movies_df[is_valid_deployment].reset_index(drop=True)

In [None]:
# Surveys without a filename have not been concatenated yet
has_no_filename = movies_df["filename"].isna()
unprocessed_movies_df = movies_df[has_no_filename].reset_index(drop=True)

# Name of the concatenated movie: "<siteName>_<created_on>.MP4", with the
# slashes of the date replaced by underscores
date_part = unprocessed_movies_df["created_on"].str.replace('/','_')
unprocessed_movies_df["filename"] = unprocessed_movies_df["siteName"] + "_" + date_part + ".MP4"

In [None]:
zoo_contents_s3_pd_movies[zoo_contents_s3_pd_movies['raw_filename'].str.contains("Site 20")]

In [None]:
unprocessed_movies_df

In [None]:
unprocessed_movies_df

In [None]:
spyfish_utils.concatenate_videos(unprocessed_movies_df, client)

In [None]:
# Check that all videos have filenames; the bucket is only (re)selected when at
# least one movie is missing its filename
if movies_df["filename"].isna().any():
    
    ##### Get info from bucket #####
    # Your access key for the s3 bucket (currently disabled).
    #aws_access_key_id, aws_secret_access_key = server_utils.aws_credentials()

    # Specify the bucket where the BUV files are: the first path component of
    # the 'bucket' column, ignoring missing values.
    # NOTE(review): index [1] picks the SECOND distinct bucket name and raises
    # IndexError when only one exists — confirm this is intended rather than [0].
    bucket_i = movies_df['bucket'].str.split('/').str[0].dropna().unique()[1]

In [None]:
# Retrieve info from the bucket
contents_s3_pd = server_utils.retrieve_s3_buckets_info(client, bucket_i)

In [None]:
contents_s3_pd

In [None]:
contents_s3_pd[contents_s3_pd['Key'].str.contains("buv-zooniverse-uploads/tuhua-buv-2020/TUH_021/TUH_021__22_09_2020.MP4")]['Key'].unique()

In [None]:
#client.delete_object(Bucket=bucket_i, Key="buv-zooniverse-uploads/tuhua-buv-2020/TUH_021/TUH_021__22_09_2020.MP4")

In [None]:
# Specify the filename of the raw videos: last component of the key
contents_s3_pd['raw_filename'] = contents_s3_pd['Key'].str.split('/').str[-1]

# Specify the location ("bucket" column) of the raw videos: everything before
# the last '/'. `n` is passed by keyword because pandas >= 2.0 no longer
# accepts it positionally.
contents_s3_pd['bucket'] = contents_s3_pd['Key'].str.rsplit('/', n=1).str[0]

In [None]:
session = server_utils.connect_s3(aws_access_key_id, aws_secret_access_key)

In [None]:
!pip install boto3

In [None]:
# Check that filenames info from movies csv exists in S3
# NOTE(review): this cell is still a stub — 'bucket_name' and 'file_path' are
# literal placeholder strings, a new default boto3 client is created instead of
# reusing `client`, and nothing is recorded when the object is missing (a later
# cell expects an "exists" column that no visible cell creates).
import boto3
# NOTE(review): botocore documents ClientError under `botocore.exceptions`;
# consider importing it from there.
from botocore.errorfactory import ClientError

s3 = boto3.client('s3')
try:
    # head_object requests only the object's metadata
    s3.head_object(Bucket='bucket_name', Key='file_path')
except ClientError:
    # Not found
    # NOTE(review): every ClientError is silently ignored here, not only
    # "not found" responses — confirm that is acceptable.
    pass


# Check movies that can't be mapped

In [None]:
# Select only those deployments that are valid
# (the original cell body was indented without an enclosing block, which is an
# IndentationError at the top level of a cell)
movies_df = movies_df[~movies_df["IsBadDeployment"]].reset_index(drop=True)

# Report on unmapped movies
# NOTE(review): the "exists" column is not created by any visible cell —
# presumably it should come from the S3 existence check above; confirm.
unmapped_movies_df = movies_df[~movies_df["exists"]].reset_index(drop=True)
if not unmapped_movies_df.empty:
    print("The following", len(unmapped_movies_df.index), "movies are missing from the S3 and are not bad deployments")
    print(*unmapped_movies_df.filename.unique(), sep = "\n")