<img align="left" src="https://panoptes-uploads.zooniverse.org/project_avatar/86c23ca7-bbaa-4e84-8d8a-876819551431.png" type="image/png" height=100 width=100>
</img>
<h1 align="right">KSO Tutorials #6: Train machine learning models</h1>
<h3 align="right">Written by @jannesgg and @vykanton</h3>
<h5 align="right">Last updated: Feb 3rd, 2022</h5>

# Set up and requirements

### Import Python packages

In [1]:
# Load IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the kso_utils sources are picked up live.
%load_ext autoreload
%autoreload 2

In [112]:
# Set the directory of the libraries
# getpass is needed by the server-credentials cell at the end of the notebook.
import getpass
import sys, os
from pathlib import Path
sys.path.append('..')

# Set to display dataframes as interactive tables
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
from ipyfilechooser import FileChooser

# Import required modules
import kso_utils.tutorials_utils as t_utils
import kso_utils.server_utils as s_utils
import kso_utils.t6_utils as t6
import kso_utils.t12_utils as t12
from src.prepare_zooniverse import frame_aggregation
from kso_utils.zooniverse_utils import retrieve_zoo_info, populate_subjects, populate_agg_annotations

print("Packages loaded successfully")

<IPython.core.display.Javascript object>

Packages loaded successfully


### Choose a project

In [113]:
# Show a dropdown of the available projects; the selection is read later via project.value
project = t_utils.choose_project()

Dropdown(description='Project:', options=('Koster_Seafloor_Obs', 'Spyfish_Aotearoa', 'SGU'), value='Koster_Sea…

### Initiate SQL database and populate sites, movies and species

In [114]:
# Initiate db
# The returned dict is passed to later steps and is indexed there with "db_path".
db_info_dict = t_utils.initiate_db(project.value)

{'local_sites_csv': PosixPath('../db_starter/db_csv_info/sites_sgu.csv'), 'local_species_csv': PosixPath('../db_starter/db_csv_info/species_sgu.csv'), 'local_photos_csv': PosixPath('../db_starter/db_csv_info/photos_sgu.csv'), 'local_surveys_csv': PosixPath('../db_starter/db_csv_info/surveys_sgu.csv')}
   site_id         siteName  decimalLatitude  decimalLongitude  \
0        1   nmi18_11002 0        6228074.31         631688.14   
1        2  nmi18_11002 18        6228074.29         631687.55   
2        3   nmi18_11002 3        6228074.29         631687.55   
3        4   nmi18_11002 6        6228074.28         631687.31   
4        5   nmi18_11002 9        6228074.29         631687.55   

           geodeticDatum countryCode  
0  WGS 84 / UTM zone 33N      Sweden  
1  WGS 84 / UTM zone 33N      Sweden  
2  WGS 84 / UTM zone 33N      Sweden  
3  WGS 84 / UTM zone 33N      Sweden  
4  WGS 84 / UTM zone 33N      Sweden  
Updated sites
no such table: photos
Updated photos
Updated species

In [115]:
# Connect to Zooniverse project
# Prompts interactively for the Zooniverse user name and password.
zoo_project = t_utils.connect_zoo_project(project.value)

Enter your Zooniverse user········
Enter your Zooniverse password········


In [None]:
# Default weights path
# Must be defined: the training cell in Step 2 passes it to train.py as `--weights $weights`.
# NOTE(review): adjust the path if the pre-trained checkpoint lives elsewhere on your system.
weights = "/usr/src/app/data_dir/weights/yolov5m.pt"

### Retrieve Zooniverse information

In [116]:
# Retrieve the subjects, workflows and classifications of the selected project from Zooniverse
zoo_info_dict = t_utils.retrieve__populate_zoo_info(
    project_name=project.value,
    db_info_dict=db_info_dict,
    zoo_project=zoo_project,
    zoo_info=["subjects", "workflows", "classifications"],
)

Retrieving subjects from Zooniverse
subjects were retrieved successfully
Retrieving workflows from Zooniverse
workflows were retrieved successfully
Retrieving classifications from Zooniverse
classifications were retrieved successfully
Updated subjects
The database has a total of 2518 frame subjects and 2160 clip subjects have been updated


### Specify Zooniverse parameters for frame aggregation

In [23]:
# Display a selectable list of workflow names and a list of versions of the workflow of interest
workflows_df = zoo_info_dict["workflows"]
wm = t12.WidgetMaker(workflows_df)
# Bare expression so the notebook renders the widget as this cell's output
wm

WidgetMaker(children=(IntText(value=0, description='Number of workflows:', style=DescriptionStyle(description_…

Output()

In [117]:
# Retrieve the frame classifications for the workflows selected in the widget (wm.checks)
class_df = t12.get_classifications(
    wm.checks,
    workflows_df,
    'frame',
    zoo_info_dict["classifications"],
    db_info_dict["db_path"],
)

Zooniverse classifications have been retrieved


In [99]:
# Specify the agreement threshold required among citizen scientists
# Displays sliders for the aggregation settings used for "frame" classifications.
agg_params = t12.choose_agg_parameters("frame")

FloatSlider(value=0.8, continuous_update=False, description='Aggregation threshold:', max=1.0, readout_format=…

IntSlider(value=3, continuous_update=False, description='Min numbers of users:', max=15, min=1, style=SliderSt…

FloatSlider(value=0.8, continuous_update=False, description='Object threshold:', max=1.0, readout_format='.1f'…

FloatSlider(value=0.5, continuous_update=False, description='IOU Epsilon:', max=1.0, readout_format='.1f', sty…

FloatSlider(value=0.8, continuous_update=False, description='Inter user agreement:', max=1.0, readout_format='…

In [118]:
# Aggregate the frame classifications using the parameters chosen with the sliders
agg_class_df, raw_class_df = t12.aggregrate_classifications(
    class_df,
    'frame',
    project.value,
    agg_params,
)

Aggregrating the classifications
257 classifications aggregated out of 161 unique subjects available


In [128]:
# Add annotations to db
# NOTE(review): the saved output shows a UNIQUE constraint failure on agg_annotations_frame,
# which suggests this cell was re-run after the same rows were already inserted — confirm.
populate_agg_annotations(agg_class_df, 'frame', project.value)

   frame_exp_sp_id      x      y      w     h  subject_ids
0              1.0  482.0   85.0   56.0  76.0     72169386
1              1.0    2.0  403.0   81.0  86.0     72169388
2              1.0  253.0  168.0  106.0  87.0     72169390
3              1.0  243.0   69.5   82.0  53.5     72169391
4              1.0   19.0   26.0   77.0  48.0     72169392
UNIQUE constraint failed: agg_annotations_frame.species_id, agg_annotations_frame.x_position, agg_annotations_frame.y_position, agg_annotations_frame.width, agg_annotations_frame.height, agg_annotations_frame.subject_id
Updated agg_annotations_frame


### Step 0: Specify important paths and training parameters

In [121]:
# Specify output path where processed data will be stored
# Opens a file chooser starting at the current directory; the choice is read in the next cell.
fc = t6.choose_output_folder(".")

FileChooser(path='.', filename='', title='HTML(value='', layout=Layout(display='none'))', show_hidden='False',…

In [122]:
# Store selected output path
# Run this only after picking a folder in the chooser above; later cells read output_folder.
output_folder = fc.selected

In [123]:
# Choose species of interest for model training
# Shows a multi-select widget; the selection is read in the next cell via class_list.value.
class_list = t6.choose_classes(db_info_dict["db_path"])

SelectMultiple(description='Species', index=(0,), options=('Blue mussels',), value=('Blue mussels',))

In [124]:
# Store selected classes of interest
# Copy the widget's current selection into a plain list for the preparation script.
cl = list(class_list.value)

In [125]:
# Determine your training parameters
# The call returns four objects shown as sliders; percentage_test, epochs and
# conf_thres are read in later cells through their .value attribute.
percentage_test, batch_size, epochs, conf_thres = t6.choose_test_prop()

HBox(children=(FloatSlider(value=0.2, continuous_update=False, description='Test proportion:', max=1.0, readou…

### Step 1: Prepare the aggregated data

In [155]:
# Run the preparation script
# NOTE(review): (720, 576) is presumably the target frame size in pixels — confirm against frame_aggregation.
frame_aggregation(project.value, db_info_dict, output_folder, percentage_test.value, cl, (720, 576))

INFO:kso_utils.t4_utils:No movies to be linked. If you do not have any movie files, please use Tutorial 5 instead.
100%|██████████| 102/102 [00:00<00:00, 397.80it/s]


### Step 2: Train the model with selected parameters

In [157]:
# Fix important paths
# Resolve both YAML files to full paths inside the output folder so that the
# training script does not depend on the notebook's current working directory.
output_dir = Path(output_folder)
hyps_path = str(output_dir / "hyps.yaml")

# Pick the dataset config (presumably written by Step 1). hyps.yaml sits in the
# same folder and also ends in ".yaml", so it is excluded explicitly; sorting
# makes the choice deterministic because directory listing order is arbitrary.
data_yamls = sorted(
    p for p in output_dir.iterdir()
    if p.name.endswith(".yaml") and p.name != "hyps.yaml"
)
if not data_yamls:
    raise FileNotFoundError(
        f"No dataset .yaml file found in {output_dir}; run Step 1 first."
    )
data_path = str(data_yamls[-1])

# Choose folder that will contain the different model runs
project_path = FileChooser('/cephyr/NOBACKUP/groups/snic2021-6-9/models/koster-ml')

# Temp placeholder experiment grouping (to be removed)
entity = "koster"
exp_name = "test_model"
display(project_path)

# Choose a directory to store WANDB logs locally
logging_path = FileChooser('/cephyr/NOBACKUP/groups/snic2021-6-9/models/koster-ml')
display(logging_path)

FileChooser(path='.', filename='', title='HTML(value='', layout=Layout(display='none'))', show_hidden='False',…

FileChooser(path='.', filename='', title='HTML(value='', layout=Layout(display='none'))', show_hidden='False',…

In [159]:
# Train YOLO model
%run -i "/usr/src/app/train.py" --entity $entity --data $data_path --hyp $hyps_path --weights $weights  \
            --project $project_path.selected --name $exp_name --batch $batch_size --epochs $epochs.value \
            --single-cls --workers 4

Exception: File `'/usr/src/app/train.py'` not found.

### Step 3: Evaluate model performance on test set

In [None]:
# Locate the checkpoint produced by the training run
# NOTE(review): assumes train.py writes best.pt directly under <project>/<name> — confirm.
tuned_weights = str(Path(project_path.selected) / exp_name / "best.pt")

In [None]:
# Evaluate the trained YOLO model on unseen test data (mAP metric),
# using the confidence threshold chosen with the slider in Step 0.
%run -i "/usr/src/app/test.py" --data $data_path --weights $tuned_weights --conf-thres $conf_thres.value

### Transfer model to web app server (for API use)

In [None]:
# Prompt for the server credentials; getpass does not echo the typed input
server_user = getpass.getpass('Enter your server user')
server_pass = getpass.getpass('Enter your server password')

In [None]:
# Transfer the trained model to the web app server using the credentials entered above
# NOTE(review): "test_model" duplicates the exp_name value set in Step 2 — confirm whether exp_name should be passed here.
t6.transfer_model("test_model", "koster/koster-ml", server_user, server_pass)

In [None]:
#END