<img align="left" src="https://panoptes-uploads.zooniverse.org/project_avatar/86c23ca7-bbaa-4e84-8d8a-876819551431.png" type="image/png" height=100 width=100>
</img>
<h1 align="right">KSO Tutorials #4: Add new clips to a Zooniverse workflow</h1>
<h3 align="right">Written by @jannesgg and @vykanton</h3>
<h5 align="right">Last updated: Nov 8th, 2021</h5>

# Set up and requirements

### Import Python packages

In [None]:
# Set the directory of the libraries
import sys
sys.path.append('..')

# Set to display dataframes as interactive tables
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

# Import required modules
import utils.tutorials_utils as t_utils
import utils.server_utils as serv_utils
import utils.t4_utils as t4
import utils.zooniverse_utils as zoo

print("Packages loaded successfully")

### Choose your project

In [None]:
# Let the user pick the project; later cells read the selection through
# `project.value` (presumably a selection widget - confirm in tutorials_utils)
project = t_utils.choose_project()

### Initiate SQL database and populate sites, movies and species

In [None]:
# Specify the path of the movies
movies_path = "/uploads"

# Specify the path of the sql database
db_path = "koster_lab.db"

# Initiate the SQL database by running the starter script inside this
# notebook's namespace (`-i`), passing the two paths above and the name of
# the selected project as command-line arguments
%run -i "../db_starter/starter.py" --movies_path $movies_path --db_path $db_path --project_name $project.value

# Retrieve info about Zooniverse clips

In [None]:
# Save your Zooniverse user name and password.
# Both values are passed to zoo.retrieve_zoo_info in the next cell.
zoo_user, zoo_pass = zoo.zoo_credentials()

In [None]:
# Specify the Zooniverse information required throughout the tutorial
# (only "subjects" is requested here)
zoo_info = ["subjects"]

# Retrieve and store the Zooniverse information required throughout the tutorial in a dictionary
# (later cells read it by key, e.g. zoo_info_dict["subjects"])
zoo_info_dict = zoo.retrieve_zoo_info(zoo_user, zoo_pass, project.value, zoo_info)

In [None]:
# Populate the sql database at `db_path` with the subjects uploaded to
# Zooniverse (the "subjects" entry retrieved in the previous cell)
zoo.populate_subjects(zoo_info_dict["subjects"], project.value, db_path)

# Retrieve info about movies hosted in the server

In [None]:
# Choose the server where the movies are hosted.
# The helper is reached through the `t_utils` alias because the notebook only
# imports `utils.tutorials_utils as t_utils`; a bare `choose_server()` is not
# defined anywhere in this notebook.
server_i = t_utils.choose_server()

In [None]:
# Connect to the selected server; the returned dictionary is read later by
# key (e.g. server_i_dict["client"] when listing the movies)
server_i_dict = serv_utils.connect_to_server(server_i.value)

In [None]:
# pandas is needed from this cell onwards; import it here instead of relying
# on the import that only appears in a later cell
import pandas as pd

# Get the location of the csv files with initial info to populate the db
sites_csv, movies_csv, species_csv = serv_utils.get_sites_movies_species()

# Read the movies csv into a pandas dataframe
movies_df = pd.read_csv(movies_csv)

In [None]:
# Name of the bucket that hosts the movies
bucket_i = 'marine-buv'

# Specify the formats of the movies to select
movie_formats = ('wmv', 'mpg', 'mov', 'avi', 'mp4', 'MOV', 'MP4')

# List the objects of the bucket whose key ends with one of the movie formats.
# The helper is called through `serv_utils` because no bare
# `get_matching_s3_keys` is imported in this notebook.
# NOTE(review): confirm the helper lives in utils.server_utils.
movies_s3_pd = serv_utils.get_matching_s3_keys(
    server_i_dict["client"], bucket_i, suffix=movie_formats
)

In [None]:
# Specify the key of the movies (path in S3 of the object)
movies_df["Key"] = movies_df["prefix"] + "/" + movies_df["filename"]

# Missing info for files in the "buv-zooniverse-uploads"
# Left-merge the keys found on the server; `indicator=True` adds a `_merge`
# column telling whether each movie key was also found on the server
movies_df = movies_df.merge(
    movies_s3_pd["Key"],
    on=['Key'],
    how='left',
    indicator=True,
)

# Check that movies can be mapped: a movie exists on the server unless its
# key was only present on the left (csv) side of the merge
movies_df['exists'] = movies_df["_merge"] != "left_only"

# Drop _merge column to match the sql schema
movies_df = movies_df.drop("_merge", axis=1)

In [None]:
# Keep only the movies flagged as existing on the server
mapped_mask = movies_df['exists']
available_movies_df = movies_df[mapped_mask]

# Report how many movies could be mapped and from which server
print(f"{available_movies_df.shape[0]} movies are mapped from {server_i.value}")

### Select the movie you want to upload to Zooniverse

In [None]:
import pandas as pd
import ipywidgets as widgets

# Select the movie you want to upload to Zooniverse
def movie_to_upload(available_movies_df):
    """Display a combobox listing the available movies and return it.

    Args:
        available_movies_df: dataframe with a ``filename`` column; its unique
            values become the options of the combobox.

    Returns:
        The displayed ``widgets.Combobox``; the chosen filename is read from
        its ``value`` attribute.
    """
    # Unique filenames offered as options
    unique_filenames = tuple(available_movies_df.filename.unique())

    movie_selector = widgets.Combobox(
        options=unique_filenames,
        description="Movie to upload:",
        ensure_option=True,
        disabled=False,
    )

    display(movie_selector)
    return movie_selector

In [None]:
# Show the movie selector; the chosen filename is read later via `movie_i.value`
movie_i = movie_to_upload(available_movies_df)

### Check if movie is already in Zooniverse

Remember to query Zooniverse anew to get the most up-to-date list of uploaded clips.

In [None]:
# Show the filename of the movie currently selected in the widget
# (the actual check against Zooniverse happens in the following cells)
movie_i.value

In [None]:
import utils.db_utils as db_utils

# Open a connection to the sql database
conn = db_utils.create_connection(db_path)

# Query info about the clip subjects uploaded to Zooniverse
clip_subjects_query = "SELECT id, subject_type, filename, clip_start_time, clip_end_time, movie_id FROM subjects WHERE subject_type='clip'"
subjects_df = pd.read_sql_query(clip_subjects_query, conn)

# Save the video filenames of the clips uploaded to Zooniverse
videos_uploaded = subjects_df["filename"].unique()

In [None]:
# Check whether the filename of any uploaded subject is contained in the
# filename of the selected movie
selected_movie = movie_i.value

# Subjects without a filename (non-string values) are skipped, because the
# `in` test on a string only accepts string operands
already_uploaded = any(
    isinstance(mv, str) and mv in selected_movie for mv in videos_uploaded
)

if already_uploaded:
    # Drop everything after the first dot to compare without the extension
    movie_no_ext = selected_movie.split(".", 1)[0]

    # Match the name literally (regex=False) so characters such as "(" or "+"
    # in a filename are not interpreted as a pattern, and treat missing
    # filenames as "no match" (na=False) so the mask stays boolean
    same_movie = subjects_df["filename"].str.contains(
        movie_no_ext, regex=False, na=False
    )
    clips_uploaded = subjects_df[same_movie]

    print(selected_movie, "has clips already uploaded. The clips start and finish at:")
    print(clips_uploaded[["clip_start_time", "clip_end_time"]])
else:
    print(selected_movie, "has not been uploaded to Zooniverse yet")

### Specify the number of clips and clip length

In [None]:
import datetime
from IPython.display import display 
from ipywidgets import interactive

# Display in hours, minutes and seconds
def to_hhmmss(clip_length, clips_to_upl):
    """Print the total footage to upload and return it in seconds.

    Args:
        clip_length: length of each clip, in seconds.
        clips_to_upl: number of clips to upload.

    Returns:
        The product of both arguments, i.e. the total number of seconds.
    """
    total_seconds = clip_length * clips_to_upl

    # timedelta renders the amount as hours:minutes:seconds
    footage = datetime.timedelta(seconds=total_seconds)
    print("Movie time to upload:", footage)

    return total_seconds


# Select the length and the number of clips to upload
clip_length_dropdown = widgets.Dropdown(
    options=[10,5],
    value=10,
    description="Length of clips:",
    ensure_option=True,
    disabled=False,
)

clips_number_slider = widgets.IntSlider(
    value=180,
    min=0,
    max=720,
    step=1,
    description='Number of clips to upload:',
)

# Re-run to_hhmmss every time one of the two widgets changes; the current
# selections are read later from `clip_length_number.kwargs`
clip_length_number = interactive(
    to_hhmmss,
    clip_length=clip_length_dropdown,
    clips_to_upl=clips_number_slider,
)

display(clip_length_number)

### Review the clips that will be created

In [None]:
# Review the clips that will be created


In [None]:
# Func to expand seconds
def expand_list(df, list_column, new_column):
    """Return a copy of ``df`` with one row per item of ``list_column``.

    Args:
        df: dataframe whose ``list_column`` holds a sized iterable per row.
        list_column: name of the column with the lists; it is left out of
            the result.
        new_column: name of the column that receives the individual items.

    Returns:
        A new dataframe with a fresh 0..n-1 index in which every original row
        is repeated once per item of its list.
    """
    list_values = df[list_column]

    # Positions of the source rows, each repeated once per item in its list
    repeated_rows = np.repeat(range(df.shape[0]), list_values.apply(len))

    # Positions of every column except the one holding the lists
    kept_columns = []
    for position, column_name in enumerate(df.columns):
        if column_name != list_column:
            kept_columns.append(position)

    # All the items, flattened in row order
    flattened_items = []
    for items in list_values:
        flattened_items.extend(items)

    expanded_df = df.iloc[repeated_rows, kept_columns].copy()
    expanded_df[new_column] = flattened_items
    return expanded_df.reset_index(drop=True)

In [None]:
import math

# Select movie of interest
movie_i_df = available_movies_df[available_movies_df['filename']==movie_i.value].reset_index(drop=True)

# Fail with a clear message instead of an IndexError further down when no
# movie has been selected in the widget
if movie_i_df.empty:
    raise ValueError("No movie selected: choose a movie in the widget above and run this cell again")

# Save the filename of the movie_i
sitecode_i = movie_i_df.filename.unique()[0]

# Read the clip length and the number of clips selected in the widgets
clip_length = clip_length_number.kwargs['clip_length']
clip_numbers = clip_length_number.kwargs['clips_to_upl']

# Calculate all the potential seconds for the new clips to start.
# `.values` on two columns yields a single dtype, so an integer survey_start
# becomes a float as soon as duration is a float; range() only accepts
# integers, hence the explicit int() casts.
# NOTE(review): the upper bound is derived from `duration` alone; confirm
# whether `duration` is measured from second 0 or from `survey_start`.
movie_i_df["seconds"] = [
    list(
        range(
            int(start),
            int(math.floor(duration / clip_length) * clip_length),
            clip_length,
        )
    )
    for start, duration in movie_i_df[["survey_start", "duration"]].values
]

# Reshape the dataframe of potential seconds for the new clips to start
potential_start_df = expand_list(movie_i_df, "seconds", "pot_seconds")

if clip_numbers>potential_start_df.shape[0]:
    # `videos_upload` is not created in this case, so the following cells
    # cannot run until a smaller number of clips is selected
    print("The number of clips and duration selected is higher than the duration of the survey")

else:
    # Select the number of clips to create, earliest start first
    videos_upload = potential_start_df.sort_values("pot_seconds").head(clip_numbers)

    print(clip_numbers, "clips of", clip_length, "seconds each, will be created from", sitecode_i)
    

### Create the clips

In [None]:
# Specify the temp folder to host the clips
# NOTE(review): the name is concatenated to the clip filenames below without
# a path separator, so it acts as a filename prefix; the later
# `glob.glob(clips_folder + '*')` relies on that - confirm this is intended.
clips_folder = sitecode_i+"clips_folder"

project_name = project.value

if project_name == "Spyfish_Aotearoa":
    # Download the movie of interest.
    # The client is the one stored in `server_i_dict` when connecting to the
    # server; there is no standalone `client` variable in this notebook.
    serv_utils.download_object_from_s3(
        server_i_dict["client"],
        bucket=bucket_i,
        key=movie_i_df.Key.unique()[0],
        filename=sitecode_i,
    )

# Set the filename of the clips: <movie>_clip_<start second>_<clip length>.mp4
videos_upload["clip_filename"] = videos_upload["filename"].astype(str) + "_clip_" + videos_upload["pot_seconds"].astype(str) + "_" + str(clip_length) + ".mp4"

# Set the path to uncompressed clips
videos_upload["clip_path"] = clips_folder + videos_upload["clip_filename"]

In [None]:
# Function to extract the videos
def extract_clips(df, clip_length):
    """Cut one clip per row of ``df`` with ffmpeg, skipping existing files.

    Args:
        df: dataframe read row by row; the columns ``clip_path``,
            ``upl_seconds`` and ``concat_video`` are used.
            NOTE(review): the cells above create ``pot_seconds`` rather than
            ``upl_seconds`` and no ``concat_video`` column - confirm the
            dataframe passed in really carries these columns.
        clip_length: duration passed to ffmpeg's ``-t`` option.

    NOTE(review): ``tqdm``, ``os`` and ``subprocess`` must already be in the
    notebook namespace when this function is called.
    """
    # Read each movie and extract the clips (printing a progress bar)
    for _, clip_row in tqdm(df.iterrows(), total=df.shape[0]):
        output_path = clip_row['clip_path']

        # Clips that already exist are left untouched
        if os.path.exists(output_path):
            continue

        ffmpeg_command = [
            "ffmpeg",
            "-ss", str(clip_row['upl_seconds']),
            "-t", str(clip_length),
            "-i", str(clip_row['concat_video']),
            "-c", "copy",
            "-an",  # removes the audio
            "-force_key_frames", "1",
            str(output_path),
        ]
        subprocess.call(ffmpeg_command)

    print("clips extracted successfully")

In [None]:
# Create the folder to store the videos unless something already exists
# at that path
clips_folder_missing = not os.path.exists(clips_folder)
if clips_folder_missing:
    os.mkdir(clips_folder)

# Extract the clips listed in videos_upload
extract_clips(videos_upload, clip_length)

Make sure your workflows in Zooniverse have different names to avoid issues while selecting the workflow ID.

### Preview of clips

In [None]:
from IPython.display import HTML
import os
import pandas as pd
import numpy as np
import json, io
from ast import literal_eval
from utils.zooniverse_utils import auth_session
import utils.db_utils as db_utils
from utils.koster_utils import filter_bboxes, process_clips_koster
from utils.spyfish_utils import process_clips_spyfish
from utils import db_utils
from collections import OrderedDict, Counter
from IPython.display import HTML, display, update_display, clear_output
import ipywidgets as widgets
from ipywidgets import interact
import asyncio
from itables import show
from IPython.display import Video


def select_video(movies_list: list):
    """Display a combobox of movies and preview the selected one below it.

    Args:
        movies_list: paths of the movies to choose from. Any iterable of
            strings is accepted; it is converted to a tuple (as done in
            ``movie_to_upload``) so that arrays such as the result of
            ``Series.unique()`` can be passed directly.
    """
    movie_widget = widgets.Combobox(
                    options=tuple(movies_list),
                    description="Movie:",
                    ensure_option=True,
                    disabled=False,
                )

    # Output area that hosts the preview of the selected movie
    main_out = widgets.Output()
    display(movie_widget, main_out)

    # Display the selected movie every time the selection changes
    def on_change(change):
        with main_out:
            # Build the new preview first, then replace the previous one
            video = view_movie(change["new"])
            clear_output()
            display(video)

    movie_widget.observe(on_change, names='value')
    
    
# View movie with ipython display
def view_movie(movie_path):
    """Return an embedded IPython ``Video`` for a supported movie file.

    Args:
        movie_path: path of the movie to display.

    Returns:
        ``Video(movie_path, embed=True)``.

    Raises:
        ValueError: if ``movie_path`` does not end with one of the supported
            movie formats.
    """
    # Specify the formats of the movies to select
    movie_formats = ('wmv', 'mpg', 'mov', 'avi', 'mp4', 'MOV', 'MP4')

    # The path has to be compared with the formats (not with itself), and the
    # error has to be raised rather than merely instantiated
    if not movie_path.endswith(movie_formats):
        raise ValueError("Movie file not supported.")

    return Video(movie_path, embed=True)
        
# # View movie with html
# def view_movie(movie_path):
#     # Specify the formats of the movies to select
#     movie_formats = tuple(['wmv', 'mpg', 'mov', 'avi', 'mp4', 'MOV', 'MP4'])
   
#     if movie_path.endswith(movie_path):
#         html_code = f"""
#         <video alt="test" controls>
#         <source src={movie_path} type="video/MP4">
#         </video><html>"""
    
#     else:
#         Exception("Movie file not supported.")
#     return HTML(html_code)        

In [None]:
# Preview the extracted clips: choose among the unique clip paths
select_video(videos_upload.clip_path.unique())

In [None]:
import glob
import os

# Get a list of files (file paths) whose path starts with the clips folder name
list_of_files = [
    candidate
    for candidate in glob.glob(clips_folder + '*')
    if os.path.isfile(candidate)
]

# Pair every file path with its size in bytes
files_with_size = []
for file_path in list_of_files:
    size_in_bytes = os.stat(file_path).st_size
    files_with_size.append((file_path, size_in_bytes))

# Column 0 holds the path and column 1 the size in bytes
df = pd.DataFrame(files_with_size)

# Change bytes to MB
df['size'] = df[1] / 1_000_000

### Video modifications

In [None]:
# Function to colour-correct and re-encode the clips
def color_correction(df):
    """Write a modified copy of every clip with ffmpeg, skipping existing files.

    Args:
        df: dataframe read row by row; ``clip_path`` is the input file and
            ``clip_comp_path`` the output file of each ffmpeg call.

    NOTE(review): ``tqdm``, ``os`` and ``subprocess`` must already be in the
    notebook namespace when this function is called.
    """
    # Process each clip (printing a progress bar)
    for _, clip_row in tqdm(df.iterrows(), total=df.shape[0]):
        output_path = clip_row['clip_comp_path']

        # Clips that were already processed are left untouched
        if os.path.exists(output_path):
            continue

        ffmpeg_command = [
            "ffmpeg",
            "-i", str(clip_row['clip_path']),
            "-c:v", "libx264",
            # curves borrowed from https://www.element84.com/blog/color-correction-in-space-and-at-sea
            "-vf", "curves=red=0/0 0.396/0.67 1/1:green=0/0 0.525/0.451 1/1:blue=0/0 0.459/0.517 1/1,scale=1280:-1",
            "-crf", "18",  # Change to get ~5MB videos
            "-preset", "veryfast",
            str(output_path),
        ]
        subprocess.call(ffmpeg_command)

    print("Clips color corrected successfully")

    #ffmpeg -i "C:\your-path-here\GH010054.MP4" -c:v libx265 -crf 28 -preset veryfast -vf scale=1920:1080 GH010054_reduced.mp4 -hide_banner

In [None]:
# Choose to reduce the size of the videos, remove audio or blur sensitive portions

# Specify the temp folder to host the modified clips
mod_clips_folder = f"modified_{sitecode_i}_clips_folder"

# Set the path to modified clips (folder name followed directly by the clip filename)
modified_clip_paths = mod_clips_folder + videos_upload["clip_filename"]
videos_upload["clip_comp_path"] = modified_clip_paths

# Create the folder to store the videos unless something already exists
# at that path
mod_folder_missing = not os.path.exists(mod_clips_folder)
if mod_folder_missing:
    os.mkdir(mod_clips_folder)

# Colour-correct and compress the videos
color_correction(videos_upload)


### Preview of modified clips

### Set Zooniverse metadata

In [None]:
# Convert datetime to string to avoid JSON serialization issues
videos_upload['EventDate'] = videos_upload['Date'].astype(str)

# Select the photo_path and other columns that will be used as metadata
# (fields that begin with “#” or “//” will never be shown to volunteers)
# (fields that begin with "!" will only be available for volunteers on the Talk section, after classification)

upload_to_zoo = videos_upload.rename(columns={
    "Marine Reserve": "!LinkToMarineReserve",
    "EventDate": "#EventDate",
    "VideoFilename": "#VideoFilename",
    "Waypoint": "#SiteID",
    "Protection Status": "ProtectionStatus",
    "Depth (m)": "Depth"
    })

# The site code is the filename of the video
upload_to_zoo["#SiteCode"] = upload_to_zoo["#VideoFilename"]

# Columns kept as subject metadata
# NOTE(review): "upl_seconds" is not created by the visible cells (they
# create "pot_seconds") - confirm the column exists in videos_upload.
metadata_columns = [
    "clip_comp_path",
    "Year",
    "upl_seconds",
    "Depth",
    "ProtectionStatus",
    "!LinkToMarineReserve",
    "#EventDate",
    "#VideoFilename",
    "#SiteID",
    "#SiteCode",
]

# Work on an explicit copy so the columns added below are written to an
# independent dataframe rather than to a slice of the renamed one (which
# triggers pandas' SettingWithCopyWarning)
upload_to_zoo = upload_to_zoo[metadata_columns].copy()

# Add information about the type of subject and length
upload_to_zoo["Subject_type"] = "clip"
upload_to_zoo["#clip_length"] = clip_length

### Upload clips to Zooniverse

You may receive an error message related to file size if clips exceed the recommended limit for Zooniverse uploads. In this case, we recommend shortening the clip length to achieve a suitable file size.

In [None]:
# `date` is needed to stamp the name of the subject set
from datetime import date

# Create a new subject set to host the clips
# NOTE(review): `SubjectSet` and `Subject` are not imported in the visible
# cells (presumably they come from panoptes_client) - make sure they are in
# the namespace before running this cell.
subject_set = SubjectSet()

# Name the set after the number of clips selected in the widget
# (`clip_numbers`), the movie and today's date
subject_set_name = str(int(clip_numbers)) + "_clips" + "_" + sitecode_i + date.today().strftime("_%Y_%m_%d")

# NOTE(review): `project` is the object returned by t_utils.choose_project()
# at the top of the notebook; confirm that it is what `links.project`
# expects here and in the loop below.
subject_set.links.project = project
subject_set.display_name = subject_set_name

subject_set.save()

print(subject_set_name, "subject set created")

# Save the df as the subject metadata: one entry per clip path, holding the
# remaining columns of the clip as a dictionary
subject_metadata = upload_to_zoo[upload_to_zoo["#SiteCode"]==sitecode_i].set_index('clip_comp_path').to_dict('index')

# Upload the clips to Zooniverse (with metadata)
new_subjects = []

print("uploading subjects to Zooniverse")
for clip_path, metadata in tqdm(subject_metadata.items(), total=len(subject_metadata)):
    subject = Subject()

    subject.links.project = project
    subject.add_location(clip_path)

    subject.metadata.update(metadata)

    subject.save()
    new_subjects.append(subject)

# Link the saved subjects to the new subject set
subject_set.add(new_subjects)

print("Subjects uploaded to Zooniverse")

In [None]:
#END