<a href="https://colab.research.google.com/github/ocean-data-factory-sweden/kso-data-management/blob/main/tutorials/08_Analyse_Aggregate_Zooniverse_Annotations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img align="left" src="https://panoptes-uploads.zooniverse.org/project_avatar/86c23ca7-bbaa-4e84-8d8a-876819551431.png" type="image/png" alt="KSO project avatar" height="100" width="100">


<h1 align="right">KSO Tutorials #8: Analyse / Aggregate Zooniverse classifications</h1>
<h3 align="right">Written by the KSO team</h3>

# Set up KSO requirements

In [None]:
# @title <font size="5"><i>Install kso_data_management and its requirements</font> { vertical-output: true }

from IPython.display import clear_output

# Decide where we are running from the import probe alone. Only a failed import
# of google.colab means "not in Colab"; an error raised later during setup must
# not silently switch the notebook onto the local installation path.
try:
    import google.colab  # noqa: F401

    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    import os

    print("Running in Colab...")

    try:
        # Clone kso-data-management repo
        !git clone --quiet --recurse-submodules -b main https://github.com/ocean-data-factory-sweden/kso-data-management.git
        # %pip (rather than !pip) installs into the environment of the running kernel
        %pip install -q --upgrade pip
        %pip install -q -r kso-data-management/requirements.txt

        # Fix libmagic issue
        !apt-get -qq update && apt-get -qq install -y libmagic-dev > /dev/null

        # Enable external widgets
        from google.colab import output

        output.enable_custom_widget_manager()

        # The remaining cells use paths relative to the tutorials folder
        os.chdir("kso-data-management/tutorials")
    except Exception:
        # Shell/magic lines above do not raise when the command itself fails,
        # so this mainly catches Python-level problems (e.g. the chdir failing
        # because the clone did not succeed). Report it and stop the cell
        # instead of pretending the setup worked.
        clear_output()
        print("There have been some issues installing the packages!")
        raise
    else:
        clear_output()
        print("All packages are installed and ready to go!")
else:
    # Install requirements
    %pip install -q --no-warn-script-location --upgrade pip
    %pip install -qr ../requirements.txt

    # Install and enable the notebook extensions needed by the widgets
    !jupyter nbextension install --user --py widgetsnbextension
    !jupyter nbextension enable --user --py widgetsnbextension
    !jupyter nbextension install --user --py jupyter_bbox_widget
    !jupyter nbextension enable --user --py jupyter_bbox_widget

    clear_output()
    print("Running locally... you're good to go!")

# Import Python packages

# Standard-library helpers used for path handling
import sys, os
from pathlib import Path

# Reload imported modules automatically before each cell runs, so that edits
# made to the utils are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Add the parent directory to the import search path
# NOTE(review): presumably this is what makes `kso_utils` importable from the
# tutorials folder - confirm against the repository layout
sys.path.append("..")

# Import required modules
import kso_utils.tutorials_utils as t_utils
import kso_utils.server_utils as s_utils
import kso_utils.project_utils as p_utils
import kso_utils.t3_utils as t3
import kso_utils.t4_utils as t4
import kso_utils.t5_utils as t5
import kso_utils.t8_utils as t8
from kso_utils.yolo_utils import frame_aggregation
from kso_utils.zooniverse_utils import populate_agg_annotations

print("Packages loaded successfully")

In [None]:
# @title <font size="5"><i>Choose your project</font> { vertical-output: true }
# The returned object's `.value` is read by the next cell, so make the
# selection before running it
project_name = t_utils.choose_project()

In [None]:
# @title <font size="5"><i>Initiate project's database</font> { vertical-output: true }
# Look up the project that matches the name chosen in the previous cell
project = p_utils.find_project(project_name=project_name.value)

# Initiate db
# `db_info_dict` is reused by later cells (for example its "db_path" entry)
db_info_dict = t_utils.initiate_db(project)

In [None]:
# @title <font size="5"><i>Connect to Zooniverse</font> { vertical-output: true }
# `zoo_project` is handed to the retrieval cell further down
# NOTE(review): presumably this asks for Zooniverse credentials - confirm
zoo_project = t_utils.connect_zoo_project(project)

In [None]:
# @title <font size="5"><i>Select the information to retrieve from Zooniverse</font> { vertical-output: true }

# The selection is read as `retrieve_info.result` by the retrieval cell below
retrieve_info = t_utils.select_retrieve_info()

In [None]:
# @title <font size="5"><i>Retrieve the information from Zooniverse</font> { vertical-output: true }

# Request the subjects, workflows and classifications of the project.
# Later cells read the "workflows" and "classifications" entries of
# `zoo_info_dict`; `generate_export` is taken from the choice made in the
# previous cell.
zoo_info_dict = t_utils.retrieve__populate_zoo_info(
    project=project,
    db_info_dict=db_info_dict,
    zoo_project=zoo_project,
    zoo_info=["subjects", "workflows", "classifications"],
    generate_export=retrieve_info.result,
)

# Specify Zooniverse workflow of interest

In [None]:
# @title <font size="5"><i>Select Zooniverse workflow id and version of interest</font> { vertical-output: true }

# Note: A manual export in Zooniverse is required to get the most up-to-date classifications here

# Make sure your workflows in Zooniverse have different names to avoid issues while selecting the workflow id

# Display a selectable list of workflow names and a list of versions of the workflow of interest
workflows_df = zoo_info_dict["workflows"]
wm = t8.WidgetMaker(workflows_df)
# Leave `wm` as the last expression so the notebook renders it; the choices are
# read back from `wm.checks` in the following cells
wm

In [None]:
# @title <font size="5"><i>Retrieve classifications from the workflow of interest</font> { vertical-output: true }

# Retrieve classifications from the workflow of interest
# Arguments are positional: the workflow selections, the workflows table, the
# subject type chosen for the first workflow ("Subject type: #0"), the
# classifications table, the database path and the project
class_df = t8.get_classifications(
    wm.checks,
    workflows_df,
    wm.checks["Subject type: #0"],
    zoo_info_dict["classifications"],
    db_info_dict["db_path"],
    project,
)

# Aggregate classifications received on the workflow of interest

In [None]:
# @title <font size="5"><i>Specify agreement threshold among citizen scientists</font> { vertical-output: true }

# The available parameters depend on the subject type of the selected workflow;
# `agg_params` is consumed by the aggregation cell below
agg_params = t8.choose_agg_parameters(wm.checks["Subject type: #0"])

In [None]:
# @title <font size="5"><i>Aggregate classifications based on threshold</font> { vertical-output: true }

# Returns two tables: the aggregated classifications (`agg_class_df`) and the
# individual ones (`raw_class_df`), both explored in the cells that follow.
# NOTE(review): "aggregrate" is the spelling of the helper as it is called
# here - confirm against kso_utils.t8_utils before renaming anything
agg_class_df, raw_class_df = t8.aggregrate_classifications(
    df=class_df,
    subj_type=wm.checks["Subject type: #0"],
    project=project,
    agg_params=agg_params,
)

# Explore the aggregated classifications

In [None]:
# @title <font size="5"><i>Summarise the number of aggregated classifications</font> { vertical-output: true }

# For every label, count the non-missing subject ids in the aggregated table
agg_class_df.groupby("label")["subject_ids"].count()

In [None]:
# @title <font size="5"><i>Display all the aggregated classifications in a table</font> { vertical-output: true }

# The subject type of the selected workflow is passed alongside the table
t8.launch_table(agg_class_df, wm.checks["Subject type: #0"])

In [None]:
# @title <font size="5"><i>Display a subject and its aggregated classifications</font> { vertical-output: true }

# Uses the aggregated table together with the subject type of the workflow
t8.launch_viewer(agg_class_df, wm.checks["Subject type: #0"])

In [None]:
# @title <font size="5"><i>Display the individual/non-aggregated classifications of a subject</font> { vertical-output: true }

# Works on `raw_class_df`, the non-aggregated table returned by the
# aggregation cell
t8.explore_classifications_per_subject(raw_class_df, wm.checks["Subject type: #0"])

# OPTIONAL - Export aggregated classifications in YOLO format (For ML purposes)

## Prepare the labelled frames

In [None]:
# @title <font size="5"><i>Choose species of interest for model training</font> { vertical-output: true }

# Guard: this optional section only applies when the subjects are frames
if wm.checks["Subject type: #0"] != "frame":
    raise ValueError("The subject types are not frames.")

# Pick the species to be used for model training
species_i = t4.choose_species(db_info_dict)

In [None]:
# @title <font size="5"><i>Store selected classes of interest</font> { vertical-output: true }
# Store selected classes of interest
# `cl` is a plain list copy of the current selection; it is reused below to
# filter the aggregated table and in the frame preparation and viewer cells
cl = list(species_i.value)
print("The select species are", cl)

In [None]:
# @title <font size="5"><i>Specify path to store the labelled frames and annotations</font> { vertical-output: true }
# Specify path to store the labelled frames and annotations
# The chosen location is read later as `output_folder.selected`
output_folder = t_utils.choose_folder(".", "output")

In [None]:
# @title <font size="5"><i>Select only relevant species frames</font> { vertical-output: true }
# Keep the aggregated rows whose label is one of the selected species
agg_class_selected = agg_class_df.loc[agg_class_df["label"].isin(cl)]

In [None]:
# @title <font size="5"><i>Preview aggregated frame information</font> { vertical-output: true }
# Preview aggregated frame information
# Only the first rows are shown to keep the output short
agg_class_selected.head()

## Process labelled frames

In [None]:
# @title <font size="5"><i>Add annotations to db</font> { vertical-output: true }
# Add annotations to db
# Only the rows for the selected species are passed, with "frame" as the
# subject type
populate_agg_annotations(agg_class_selected, "frame", project)

In [None]:
# @title <font size="5"><i>Determine your training parameters</font> { vertical-output: true }
# Determine your training parameters
# The chosen proportion is read as `percentage_test.value` by the next cell
percentage_test = t5.choose_test_prop()

In [None]:
# @title <font size="5"><i>Run the preparation script</font> { vertical-output: true }
# Run the preparation script
# Positional arguments, in order: project, database info, the folder picked
# above, the chosen test proportion and the list of selected species.
# NOTE(review): (720, 540) is presumably the size of the exported frames in
# pixels, and n_tracked_frames presumably has no effect while
# track_frames=False - confirm against kso_utils.yolo_utils.frame_aggregation
frame_aggregation(
    project,
    db_info_dict,
    output_folder.selected,
    percentage_test.value,
    cl,
    (720, 540),
    remove_nulls=True,
    track_frames=False,
    n_tracked_frames=10,
    agg_df=agg_class_selected,
)


## Preview and adjust aggregated annotations

In [None]:
# @title <font size="5"><i>Preview and adjust annotations</font> { vertical-output: true }
# Open the viewer on the folder chosen earlier, limited to the selected species
t8.get_annotations_viewer(output_folder.selected, species_list=cl)


# OPTIONAL - Export observations in GBIF/OBIS format (For biodiversity purposes)

In [None]:
# @title <font size="5"><i>Format the classifications to Darwin Core Standard occurrences</font> { vertical-output: true }
# Uses the full aggregated table (`agg_class_df`), not the species-filtered one
# built in the optional YOLO section
occurrence_df = t8.format_to_gbif_occurence(
    df=agg_class_df,
    classified_by="citizen_scientists",
    subject_type=wm.checks["Subject type: #0"],
    db_info_dict=db_info_dict,
    project=project,
    zoo_info_dict=zoo_info_dict,
)

In [None]:
# @title <font size="5"><i>Save the occurrence df locally</font> { vertical-output: true }

# Write the table to the current working directory without the index column
occurrence_df.to_csv("occurrence_for_gbif.csv", index=False)
print("The observations are now saved in occurrence_for_gbif.csv")

In [None]:
# END