<img align="left" src="https://panoptes-uploads.zooniverse.org/project_avatar/86c23ca7-bbaa-4e84-8d8a-876819551431.png" type="image/png" height=100 width=100>
<h1 align="right">KSO Tutorials #6: Train machine learning models</h1>
<h3 align="right">Written by @jannesgg and @vykanton</h3>
<h5 align="right">Last updated: Feb 14th, 2022</h5>

# Set up and requirements

### Import Python packages

In [None]:
# Enable IPython's autoreload extension in mode 2, which IPython documents as
# reloading all modules before executing code, so edits to the helper
# libraries are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# Set the directory of the libraries
# The parent directory is appended to the module search path before the
# project imports below (presumably where kso_utils and src live - confirm),
# so this statement has to stay ahead of them
import sys, os
from pathlib import Path
sys.path.append('..')

# Set to display dataframes as interactive tables
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
from ipyfilechooser import FileChooser

# Import required modules
# NOTE(review): FileChooser, s_utils, retrieve_zoo_info and populate_subjects
# are not referenced in the cells visible in this notebook - confirm before
# removing them
import kso_utils.tutorials_utils as t_utils
import kso_utils.server_utils as s_utils
import kso_utils.t6_utils as t6
import kso_utils.t12_utils as t12
from src.prepare_zooniverse import frame_aggregation
from kso_utils.zooniverse_utils import retrieve_zoo_info, populate_subjects, populate_agg_annotations

print("Packages loaded successfully")

### Choose a project

In [None]:
# Choose the project to work with; the following cells read the selection
# through project.value
project = t_utils.choose_project()

### Initiate SQL database and populate sites, movies and species

In [None]:
# Initiate db for the selected project
# The returned dictionary is reused below (later cells read its "db_path" entry)
db_info_dict = t_utils.initiate_db(project.value)

In [None]:
# Connect to Zooniverse project
# The returned object is handed to retrieve__populate_zoo_info in the next step
zoo_project = t_utils.connect_zoo_project(project.value)

### Retrieve Zooniverse information

In [None]:
# Retrieve the subjects, workflows and classifications of the Zooniverse project
# Later cells index the result by those names, e.g. zoo_info_dict["workflows"]
zoo_info_dict = t_utils.retrieve__populate_zoo_info(project_name = project.value, 
                                                    db_info_dict = db_info_dict,
                                                    zoo_project = zoo_project,
                                                    zoo_info = ["subjects", "workflows", "classifications"])

### Specify Zooniverse parameters for frame aggregation

In [None]:
# Display a selectable list of workflow names and a list of versions of the workflow of interest
workflows_df = zoo_info_dict["workflows"]
wm = t12.WidgetMaker(workflows_df)
# Leaving wm as the last expression makes the notebook display it;
# the selection is read in the next cell through wm.checks
wm

In [None]:
# Retrieve classifications from the workflow of interest
# NOTE(review): 'frame' presumably selects frame (rather than clip) subjects -
# confirm against t12.get_classifications
class_df = t12.get_classifications(wm.checks,
                                     workflows_df, 
                                     'frame', 
                                     zoo_info_dict["classifications"], 
                                     db_info_dict["db_path"])

In [None]:
# Specify the agreement threshold required among citizen scientists
# The chosen parameters are passed to the aggregation step in the next cell
agg_params = t12.choose_agg_parameters("frame")

In [None]:
# Aggregate the frame classifications with the parameters chosen above;
# the call returns the aggregated and the raw classifications
agg_class_df, raw_class_df = t12.aggregrate_classifications(
                                    class_df, 'frame', project.value, agg_params)

In [None]:
# Add the aggregated frame annotations to the db of the selected project
populate_agg_annotations(agg_class_df, 'frame', project.value)

### Step 0: Specify important paths and training parameters

In [None]:
# Specify output path where processed data will be stored
# NOTE: pick a folder before running the next cell, which reads fc.selected
fc = t6.choose_folder(".", "output")

In [None]:
# Store selected output path
# (read again by the data preparation step and when locating the .yaml files)
output_folder = fc.selected

In [None]:
# Choose species of interest for model training
# The options are built from the database at db_info_dict["db_path"]
class_list = t6.choose_classes(db_info_dict["db_path"])

In [None]:
# Store selected classes of interest
# Copied into a plain list, which is what frame_aggregation receives below
cl = list(class_list.value)

In [None]:
# Determine your training parameters
# Each returned object exposes its chosen setting through .value, which the
# preparation, training and evaluation cells read later
percentage_test, batch_size, epochs, conf_thres = t6.choose_test_prop()

### Step 1: Prepare the aggregated data

In [None]:
# Run the preparation script
# Uses the selected project, output folder, test percentage and classes.
# NOTE(review): (720, 540) is presumably the target frame size in pixels -
# confirm against src.prepare_zooniverse.frame_aggregation
frame_aggregation(project.value, db_info_dict, output_folder, percentage_test.value, cl, (720, 540))

### Step 2: Train the model with selected parameters

In [None]:
# Choose weights path
# The training command reads the choice through weights.selected
weights = t6.choose_folder("/usr/src/app/data_dir/weights", "weights")
display(weights)

In [None]:
# Fix important paths
# Candidate dataset config files: every .yaml in the output folder except the
# hyperparameter file. os.listdir returns entries in arbitrary order, so the
# names are sorted to make the choice of the last one reproducible.
yaml_candidates = sorted(
    name
    for name in os.listdir(output_folder)
    if name.endswith(".yaml") and "hyp" not in name
)
if not yaml_candidates:
    # Fail with an explicit message instead of an IndexError on an empty list
    raise FileNotFoundError(
        f"No dataset .yaml file found in {output_folder}. "
        "Run the data preparation step (Step 1) first."
    )
data_path = str(Path(output_folder, yaml_candidates[-1]))
hyps_path = str(Path(output_folder, "hyp.yaml"))

# Temp placeholder experiment grouping (to be removed)
entity = "koster"
exp_name = "sgu_bm_model"

# Choose folder that will contain the different model runs
project_path = t6.choose_folder('/cephyr/NOBACKUP/groups/snic2021-6-9/models/koster-ml', "project")
display(project_path)

In [None]:
# Train YOLO model
# %run -i executes the script in the notebook's namespace, and IPython expands
# the $name / $name.attr references from the variables defined in earlier cells
# NOTE(review): --single-cls is passed although several classes can be
# selected above - confirm this is intended
%run -i "/usr/src/app/train.py" --entity $entity --data $data_path --hyp $hyps_path --weights $weights.selected  \
            --project $project_path.selected --name $exp_name --batch $batch_size.value --epochs $epochs.value \
            --single-cls --workers 4

### Step 3: Evaluate model performance on test set

In [None]:
# Locate the weights produced by the training run above
# NOTE(review): assumes train.py writes best.pt directly into
# <project>/<exp_name>; some YOLO training scripts use a "weights"
# subfolder instead - confirm against /usr/src/app/train.py
tuned_weights = str(Path(project_path.selected, exp_name, 'best.pt'))

In [None]:
# Evaluate YOLOv3 Model on Unseen Test data for mAP metric
# Uses the same dataset config as training, the tuned weights located above
# and the confidence threshold chosen in Step 0
%run -i "/usr/src/app/test.py" --data $data_path --weights $tuned_weights --conf-thres $conf_thres.value

### Transfer model to web app server (for API use)

In [None]:
# Ask for the server credentials without echoing them in the notebook.
# getpass is imported here because the import cell at the top of the notebook
# does not provide it; without this import the calls raise a NameError.
import getpass

server_user = getpass.getpass('Enter your server user')
server_pass = getpass.getpass('Enter your server password')

In [None]:
# Transfer the model to the server using the credentials entered above
# NOTE(review): "test_model" and "koster/koster-ml" are hard-coded; presumably
# the model name and the remote project path - confirm in t6.transfer_model
t6.transfer_model("test_model", "koster/koster-ml", server_user, server_pass)

In [None]:
#END