In [32]:
import ast
import glob
import json
import os
import re
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from IPython.core.interactiveshell import InteractiveShell
from PIL import Image
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, train_test_split
from tqdm import tqdm

InteractiveShell.ast_node_interactivity = "all"
import seaborn as sns
import torch
import torchvision
from IPython.display import clear_output

In [3]:
with open('/app/_data/sequences.json', 'r') as f:
    seq_dict = json.load(f)

In [4]:
TRAIN_DF_PART = "/app/_data/tensorflow-great-barrier-reef/train.csv"
IMAGE_FOLDER = "images"
LABEL_FOLDER = "labels"
SEED = 37

In [5]:
kfold_dict = {
    22643: ["water_reef", "train"],
    60754: ["water_reef", "train"],
    53708: ["water", "train"],
    8503: ["reef", "train"],
    18048: ["water_reef", "all"],
    26651: ["water_reef", "all"],
    15827: ["water_reef", "all"],
    29859: ["water_reef", "all"],
    59337: ["water", "all"],
    8399: ["water", "all"],
    45518: ["water", "all"],
    35305: ["water", "all"],
    45015: ["water", "all"],
    17665: ["water", "all"],
    40258: ["water", "all"],
    996: ["water", "all"],
    60510: ["reef", "all"],
    29424: ["water_reef", "all"],
    37114: ["reef", "all"],
    44160: ["water", "all"],
}

In [27]:
df = pd.read_csv(TRAIN_DF_PART)
df["img_path"] = (
    "/app/_data/tensorflow-great-barrier-reef/train_images/video_"
    + df.video_id.astype("str")
    + "/"
    + df.video_frame.astype("str")
    + ".jpg"
)
df["annotations"] = df["annotations"].apply(lambda x: ast.literal_eval(x))
df["len_annotation"] = df["annotations"].str.len()
df["image_id"] = df["image_id"].str.replace("-", "_", regex=True)
df["new_img_path"] = f"/app/_data/{IMAGE_FOLDER}/" + df["image_id"] + ".jpg"
df["label"] = df["len_annotation"].apply(lambda x: 0 if x == 0 else 1)
df["no_label"] = df["len_annotation"].apply(lambda x: True if x == 0 else False)
df["type"] = df["sequence"].apply(lambda x: kfold_dict[x][0])
df["train"] = df["sequence"].apply(lambda x: 1 if kfold_dict[x][1] == "train" else 0)

In [28]:
df["label_change"] = df["label"] & df["no_label"].shift(1) & df["no_label"].shift(
    2
) | df["no_label"] & df["label"].shift(1) & df["label"].shift(2)
df["sequense_change"] = df["sequence"] != df["sequence"].shift(1)
df["start_subseq"] = df["sequense_change"] | df["label_change"]
df.loc[df.index[-1], "start_subseq"] = True
df["start_subseq"].sum()

138

In [29]:
start_idx = 0
for subsequence_id, end_idx in enumerate(df[df["start_subseq"]].index):
    df.loc[start_idx:end_idx, "subsequence_id"] = subsequence_id
    start_idx = end_idx

df["subsequence_id"] = df["subsequence_id"].astype(int)
df["subsequence_id"].nunique()

137

## KFold split

In [94]:
kf = GroupKFold(n_splits=10)
list_train_ids = []
list_val_ids= []
for train_idx, val_idx in (
    kf.split(df, y=df.len_annotation, groups=df.subsequence_id)
):
    list_train_ids.append(train_idx)
    list_val_ids.append(val_idx)
    print(df.loc[train_idx, ["len_annotation", 'label']].sum()/ df.loc[val_idx, ["len_annotation", 'label']].sum())

len_annotation    inf
label             inf
dtype: float64
len_annotation    25.207048
label             19.326446
dtype: float64
len_annotation    27.878641
label             12.187668
dtype: float64
len_annotation    12.459276
label              7.351443
dtype: float64
len_annotation    3.791784
label             6.319940
dtype: float64
len_annotation    3.416481
label             4.759953
dtype: float64
len_annotation    3.408299
label             6.047278
dtype: float64
len_annotation    17.707547
label             13.094556
dtype: float64
len_annotation    9.895604
label             6.087896
dtype: float64
len_annotation    20.871324
label              9.979911
dtype: float64


In [47]:
# n_splits = 10
# skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# for fold_id, (train_idx, val_idx) in enumerate(
#     skf.split(df, y=df[['subsequence_id','type', 'label']])
# ):
#     subseq_val_idx = df["subsequence_id"].iloc[val_idx]
# #     df.loc[df["subsequence_id"].isin(subseq_val_idx), "fold"] = fold_id

# # df["fold"] = df["fold"].astype(int)
# # for fold in range(10):
# #     print(f"\nFold {fold}")
# #     df.query("fold != @fold")[["len_annotation", "label"]].sum() / df.query(
# #         "fold == @fold"
# #     )[["len_annotation", "label"]].sum()

In [8]:
pd.pivot_table(df, index=["type", "train"], values=["len_annotation"], aggfunc=["sum"])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,len_annotation
type,train,Unnamed: 2_level_2
reef,0,113
reef,1,3195
water,0,2174
water,1,1146
water_reef,0,289
water_reef,1,4981


In [9]:
train_sequences = df.query("train == 1")['sequence'].unique().tolist()
val_sequences = df.query('train != 1 and type in ["water_reef", "reef"] and len_annotation != 0')['sequence'].unique().tolist()
train_sequences, val_sequences

([53708, 8503, 60754, 22643], [60510, 15827, 18048, 26651, 29859])

In [10]:
water_df = df.query('type=="water" and train==0').reset_index(drop=True)
seqs = water_df.query("len_annotation != 0")["sequence"].unique().tolist()
len(seqs)

8

In [11]:
val_seqs = []
for seq1 in seqs:
    labels_ratio = (
        water_df.query("sequence != @seq1")["label"].sum()
        / water_df.query("sequence == @seq1")["label"].sum()
    )
    sum_ratio = (
        water_df.query("sequence != @seq1")["len_annotation"].sum()
        / water_df.query("sequence == @seq1")["len_annotation"].sum()
    )
    if 5 <= labels_ratio <= 11 and 5 <= sum_ratio <= 11:
        val_seqs.append([seq1])
for seq1 in seqs:
    for seq2 in seqs:
        if seq1 != seq2:
            labels_ratio = (
                water_df.query("sequence not in [@seq1, @seq2]")["label"].sum()
                / water_df.query("sequence in [@seq1, @seq2]")["label"].sum()
            )
            sum_ratio = (
                water_df.query("sequence not in [@seq1, @seq2]")["len_annotation"].sum()
                / water_df.query("sequence in [@seq1, @seq2]")["len_annotation"].sum()
            )
            if 5 <= labels_ratio <= 11 and 5 <= sum_ratio <= 11:
                if [seq2, seq1] not in val_seqs:
                    val_seqs.append([seq1, seq2])

In [12]:
train_sequences.extend(list(set(seqs) - set(val_seqs[-1])))
val_sequences.extend(val_seqs[-1])

In [13]:
df.query('sequence in @train_sequences and len_annotation!=0')["label"].sum()
df.query('sequence in @train_sequences and len_annotation!=0')["len_annotation"].sum()


4394

11152

In [14]:
df.query('sequence in @val_sequences').shape[0]
df.query('sequence in @val_sequences and len_annotation!=0')["label"].sum()
df.query('sequence in @val_sequences and len_annotation!=0')["len_annotation"].sum()

7461

525

746

In [15]:
train_ids = df.query('sequence in @train_sequences and len_annotation!=0').index.tolist()
train_ids.extend(df.query('sequence in @train_sequences and len_annotation==0').sample(400).index.tolist())
train_ids.extend(df.query('sequence in [@zero_seqs[0], @zero_seqs[1]]').index.tolist())

val_ids = df.query('sequence in @val_sequences').index.tolist()
val_ids.extend(df.query('sequence == @zero_seqs[2]').sample(400).index.tolist())

len(train_ids), len(val_ids)

(5129, 7861)

In [16]:
train_img_path = df.loc[train_ids, "new_img_path"].tolist()
val_img_path = df.loc[val_ids, "new_img_path"].tolist()
np.savetxt(
    f"/app/_data/train_seq_0.txt",
    train_img_path,
    fmt="%s",
)
np.savetxt(f"/app/_data/val_seq_0.txt", val_img_path, fmt="%s")

## Custimize parameters

In [17]:
from IPython.core.magic import register_line_cell_magic


@register_line_cell_magic
def writetemplate(line, cell):
    with open(line, "w") as f:
        f.write(cell.format(**globals()))

In [18]:
%%writetemplate /app/_data/yolov5/data/reef_seq_data.yaml

train: /app/_data/train_seq_0.txt # training directory
val: /app/_data/val_seq_0.txt # validation directory

# Classes
nc: 1  # number of classes
names: ['starfish']  # class names

In [19]:
!cat /app/_data/yolov5/data/hyps/hyp.scratch.yaml

# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
# Hyperparameters for COCO training from scratch
# python train.py --batch 40 --cfg yolov5m.yaml --weights '' --data coco.yaml --img 640 --epochs 300
# See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials

lr0: 0.01  # initial learning rate (SGD=1E-2, Adam=1E-3)
lrf: 0.1  # final OneCycleLR learning rate (lr0 * lrf)
momentum: 0.937  # SGD momentum/Adam beta1
weight_decay: 0.0005  # optimizer weight decay 5e-4
warmup_epochs: 3.0  # warmup epochs (fractions ok)
warmup_momentum: 0.8  # warmup initial momentum
warmup_bias_lr: 0.1  # warmup initial bias lr
box: 0.05  # box loss gain
cls: 0.5  # cls loss gain
cls_pw: 1.0  # cls BCELoss positive_weight
obj: 1.0  # obj loss gain (scale with pixels)
obj_pw: 1.0  # obj BCELoss positive_weight
iou_t: 0.20  # IoU training threshold
anchor_t: 4.0  # anchor-multiple threshold
# anchors: 3  # anchors per output layer (0 to ignore)
fl_gamma: 0.0  # focal loss gamma (effic

In [20]:
%%writetemplate /app/_data/yolov5/data/hyps/hyp.custom.seq.yaml
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
# Hyperparameters for COCO training from scratch
# python train.py --batch 40 --cfg yolov5m.yaml --weights '' --data coco.yaml --img 640 --epochs 300
# See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials

lr0: 0.01  # initial learning rate (SGD=1E-2, Adam=1E-3)
lrf: 0.1  # final OneCycleLR learning rate (lr0 * lrf)
momentum: 0.937  # SGD momentum/Adam beta1
weight_decay: 0.0005  # optimizer weight decay 5e-4
warmup_epochs: 3.0  # warmup epochs (fractions ok)
warmup_momentum: 0.8  # warmup initial momentum
warmup_bias_lr: 0.1  # warmup initial bias lr
box: 0.05  # box loss gain
cls: 0.5  # cls loss gain
cls_pw: 1.0  # cls BCELoss positive_weight
obj: 1.0  # obj loss gain (scale with pixels)
obj_pw: 1.0  # obj BCELoss positive_weight
iou_t: 0.20  # IoU training threshold
anchor_t: 4.0  # anchor-multiple threshold
# anchors: 3  # anchors per output layer (0 to ignore)
fl_gamma: 0.0  # focal loss gamma (efficientDet default gamma=1.5)
hsv_h: 0.015  # image HSV-Hue augmentation (fraction)
hsv_s: 0.7  # image HSV-Saturation augmentation (fraction)
hsv_v: 0.4  # image HSV-Value augmentation (fraction)
degrees: 2.0  # image rotation (+/- deg)
translate: 0.1  # image translation (+/- fraction)
scale: 0.5  # image scale (+/- gain)
shear: 0.1  # image shear (+/- deg)
perspective: 0.0  # image perspective (+/- fraction), range 0-0.001
flipud: 0.1  # image flip up-down (probability)
fliplr: 0.5  # image flip left-right (probability)
mosaic: 1.0  # image mosaic (probability)
mixup: 0.5  # image mixup (probability)
copy_paste: 0.1  # segment copy-paste (probability)

In [21]:
!cat /app/_data/yolov5/data/reef_seq_data.yaml


train: /app/_data/train_seq_0.txt # training directory
val: /app/_data/val_seq_0.txt # validation directory

# Classes
nc: 1  # number of classes
names: ['starfish']  # class names


In [22]:
!pip install --upgrade wandb
clear_output()
import wandb

wandb.login()

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [23]:
%cd /app/_data/yolov5/
!pip install -r requirements.txt
clear_output()

In [23]:
# for path in glob.glob("/app/_data/yolov5/runs/train/*_seq_*"):
#     shutil.rmtree(path)

In [24]:
!python train.py --img 3008 \
                --batch 2\
                --epochs 60\
                --data reef_seq_data.yaml \
                --weights yolov5m6.pt \
                --name yolov5m6_seq_3008_0 \
                --hyp data/hyps/hyp.custom.seq.yaml \
                --single-cls \
                --patience 10

Downloading https://ultralytics.com/assets/Arial.ttf to /root/.config/Ultralytics/Arial.ttf...
[34m[1mwandb[0m: Currently logged in as: [33mtatanko[0m (use `wandb login --relogin` to force relogin)
[34m[1mtrain: [0mweights=yolov5m6.pt, cfg=, data=reef_seq_data.yaml, hyp=data/hyps/hyp.custom.seq.yaml, epochs=60, batch_size=2, imgsz=3008, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=True, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=yolov5m6_seq_3008_0, exist_ok=False, quad=False, linear_lr=False, label_smoothing=0.0, patience=10, freeze=[0], save_period=-1, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mskipping check (Docker image), for updates see https://github.com/ultralytics/yolov5
YOLOv5 🚀 v6.0-193-gdb1f83b torch 1.9.1+cu111 CUDA:0 (NVIDIA GeForce RTX 3090, 24265MiB)



In [25]:
paths = [x for x in glob.glob("/app/_data/*/runs/train/*/*/*.pt") if 'seq' in x and 'best' in x]

In [26]:
paths

['/app/_data/yolov5_f2/runs/train/yolov5m6_seq_3008_0_f2/weights/best.pt',
 '/app/_data/yolov5/runs/train/yolov5m6_seq_3008_0/weights/best.pt',
 '/app/_data/yolov5/runs/train/yolov5m6_seq_val8_3008/weights/best.pt']

In [36]:
base_path = '/app/_data/yolo5_seq_weights_0'
if not os.path.exists(base_path):
    os.makedirs(base_path)
for path in paths:
    mod_name = f"{path.split('/')[-3]}_{path.split('/')[-1]}"
    new_path = f'{base_path}/{mod_name}'
    new_path
    if os.path.exists(new_path):
        print(f'Path {new_path} already exists, do you want to overwrite model?\nIf yes, print "Y"')
        ans = input()
        if ans == 'Y':
            shutil.copy(path, new_path)
            print(f'Model from    {path} \nare copied to {new_path}')
    else:
        shutil.copy(path, new_path)
        print(f'Model from    {path} \nare copied to {new_path}')

'/app/_data/yolo5_seq_weights_0/yolov5m6_seq_3008_0_f2_best.pt'

'/app/_data/yolo5_seq_weights_0/yolov5m6_seq_3008_0_f2_best.pt'

Model from    /app/_data/yolov5_f2/runs/train/yolov5m6_seq_3008_0_f2/weights/best.pt 
are copied to /app/_data/yolo5_seq_weights_0/yolov5m6_seq_3008_0_f2_best.pt


'/app/_data/yolo5_seq_weights_0/yolov5m6_seq_3008_0_best.pt'

'/app/_data/yolo5_seq_weights_0/yolov5m6_seq_3008_0_best.pt'

Model from    /app/_data/yolov5/runs/train/yolov5m6_seq_3008_0/weights/best.pt 
are copied to /app/_data/yolo5_seq_weights_0/yolov5m6_seq_3008_0_best.pt


'/app/_data/yolo5_seq_weights_0/yolov5m6_seq_val8_3008_best.pt'

'/app/_data/yolo5_seq_weights_0/yolov5m6_seq_val8_3008_best.pt'

Model from    /app/_data/yolov5/runs/train/yolov5m6_seq_val8_3008/weights/best.pt 
are copied to /app/_data/yolo5_seq_weights_0/yolov5m6_seq_val8_3008_best.pt
