In [2]:
import ast
import glob
import itertools
import json
import os
import re
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from IPython.core.interactiveshell import InteractiveShell
from PIL import Image
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm

InteractiveShell.ast_node_interactivity = "all"
import seaborn as sns
import torch
import torchvision
from IPython.display import clear_output

In [3]:
# Read the stored split definitions; later cells add a new entry keyed by
# str(val_idx) and write the dictionary back to the same file.
with open("/app/_data/sequences.json", "r") as sequences_file:
    seq_dict = json.loads(sequences_file.read())

In [4]:
# CSV that is read with pd.read_csv to build the working frame `df`.
TRAIN_DF_PART = "/app/_data/tensorflow-great-barrier-reef/train.csv"
# Sub-folder of /app/_data used when composing `new_img_path` for each frame.
IMAGE_FOLDER = "images"
# NOTE(review): not referenced in the visible cells; presumably the label
# counterpart of IMAGE_FOLDER — confirm against the rest of the notebook.
LABEL_FOLDER = "labels"
# Random state handed to the StratifiedKFold splitter.
SEED = 37

In [5]:
# Per-sequence metadata, keyed by the value of the `sequence` column.
#   value[0] -> copied into df["type"] ("water", "reef" or "water_reef")
#   value[1] -> df["train"] is 1 when this equals "train", otherwise 0
# NOTE(review): the meaning of "all" is not shown here; the code only tests
# for equality with "train".
kfold_dict = {
    22643: ["water_reef", "train"],
    60754: ["water_reef", "train"],
    53708: ["water", "train"],
    8503: ["reef", "train"],
    18048: ["water_reef", "all"],
    26651: ["water_reef", "all"],
    15827: ["water_reef", "all"],
    29859: ["water_reef", "all"],
    59337: ["water", "all"],
    8399: ["water", "all"],
    45518: ["water", "all"],
    35305: ["water", "all"],
    45015: ["water", "all"],
    17665: ["water", "all"],
    40258: ["water", "all"],
    996: ["water", "all"],
    60510: ["reef", "all"],
    29424: ["water_reef", "all"],
    37114: ["reef", "all"],
    44160: ["water", "all"],
}

In [6]:
# Working frame: one row per entry of train.csv, extended with image paths,
# parsed annotations and the per-sequence metadata from `kfold_dict`.
df = pd.read_csv(TRAIN_DF_PART)

# Original image location: .../train_images/video_<video_id>/<video_frame>.jpg
df["img_path"] = (
    "/app/_data/tensorflow-great-barrier-reef/train_images/video_"
    + df["video_id"].astype("str")
    + "/"
    + df["video_frame"].astype("str")
    + ".jpg"
)

# The annotations column holds Python literals stored as text; parse them and
# count the entries per row.
df["annotations"] = df["annotations"].apply(ast.literal_eval)
df["len_annotation"] = df["annotations"].str.len()

# Image ids use underscores in the flattened image folder.
df["image_id"] = df["image_id"].str.replace("-", "_", regex=True)
df["new_img_path"] = "/app/_data/" + IMAGE_FOLDER + "/" + df["image_id"] + ".jpg"

# True when the row has at least one annotation.
df["label"] = df["len_annotation"].ne(0)

# Sequence-level metadata looked up in `kfold_dict` (KeyError on unknown ids).
df["type"] = df["sequence"].map(lambda seq: kfold_dict[seq][0])
df["train"] = df["sequence"].map(lambda seq: int(kfold_dict[seq][1] == "train"))

In [449]:
# Complement of `label`: True for rows without any annotation.
df["no_label"] = df["len_annotation"].eq(0)

In [450]:
# A row is a label change when it is annotated and the two rows before it are
# not, or when it is unannotated and the two rows before it are annotated.
df["label_change"] = (
    (df["label"] & df["no_label"].shift(1) & df["no_label"].shift(2))
    | (df["no_label"] & df["label"].shift(1) & df["label"].shift(2))
)
# True on the first row of every run of identical `sequence` values.
df["sequense_change"] = df["sequence"].ne(df["sequence"].shift(1))
# A sub-sequence starts at a label change or at a sequence change.
df["start_subseq"] = df["label_change"] | df["sequense_change"]
# Mark the final row too so the last sub-sequence gets a closing boundary.
df.at[df.index[-1], "start_subseq"] = True
df["start_subseq"].sum()

138

In [451]:
# Number the sub-sequences by walking over the boundary rows. `.loc` slices
# include both ends, so each boundary row is first written with the previous
# id and then overwritten by the next iteration.
start_idx = 0
boundary_rows = df.index[df["start_subseq"]]
for subsequence_id, end_idx in enumerate(boundary_rows):
    df.loc[start_idx:end_idx, "subsequence_id"] = subsequence_id
    start_idx = end_idx

df["subsequence_id"] = df["subsequence_id"].astype(int)
df["subsequence_id"].nunique()

137

In [423]:
# Assign a running `group` id to every row by scanning the frame top to bottom.
# A row stays in the current group while its label matches the next row or the
# row after that, and the next row belongs to the same sequence; otherwise the
# counter is advanced after the row is written.
# The loop treats index labels as positions, so it expects a 0..len(df)-1 index.
gr_init = 1
last_pos = len(df) - 1

for i in tqdm(df.index.tolist()):
    if i != last_pos:
        # Row i + 2 does not exist for the penultimate row; the unguarded
        # lookup raised KeyError whenever the last two labels differed.
        if df.loc[i, "label"] == df.loc[i + 1, "label"] or (
            i + 2 <= last_pos and df.loc[i, "label"] == df.loc[i + 2, "label"]
        ):
            if df.loc[i, "sequence"] == df.loc[i + 1, "sequence"]:
                df.loc[i, "group"] = gr_init
                # Pre-fill the next row; it is rewritten on its own iteration.
                df.loc[i + 1, "group"] = gr_init
            else:
                df.loc[i, "group"] = gr_init
                gr_init += 1
        else:
            df.loc[i, "group"] = gr_init
            gr_init += 1
    else:
        # Last row: compare with the previous row, if there is one.
        if i == 0 or df.loc[i, "label"] == df.loc[i - 1, "label"]:
            df.loc[i, "group"] = gr_init
        else:
            df.loc[i, "group"] = gr_init + 1
df["group"] = df["group"].astype("int")

100% 23501/23501 [00:16<00:00, 1431.02it/s]


In [320]:
# Number of distinct ids stored in the `group` column.
df["group"].nunique()

137

# Subsequence_kfold

In [470]:
# One row per sub-sequence: whether it has any annotated frame, the total
# number of annotations, and how many frames it contains.
df_split = (
    df.groupby("subsequence_id")
    .agg(
        label=("label", "max"),
        len_annotation=("len_annotation", "sum"),
        video_frame=("video_frame", "count"),
    )
    .astype(int)
    .reset_index()
)
df_split.head()

Unnamed: 0,subsequence_id,label,len_annotation,video_frame
0,1,0,0,16
1,2,1,194,164
2,3,0,0,52
3,4,1,58,58
4,5,0,0,35


In [468]:
# n_splits = 10
# kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
# for fold_id, (train_idx, val_idx) in enumerate(kf.split(df_split['subsequence_id'], y=df_split["label"])):
#     subseq_val_idx = df_split['subsequence_id'].iloc[val_idx]
#     df.loc[df['subsequence_id'].isin(subseq_val_idx), 'fold'] = fold_id
    
# df['fold'] = df['fold'].astype(int)
# df['fold'].value_counts(dropna=False)

In [469]:
# for fold in range(10):
#     print(f'\nFold {fold}')
#     df.query('fold != @fold')[["len_annotation", "label"]].sum()  / df.query('fold == @fold')[["len_annotation", "label"]].sum()
# fold = 3

In [472]:
# Split the sub-sequences into folds and copy each sub-sequence's fold id onto
# all of its frames, so frames of one sub-sequence never straddle two folds.
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
# NOTE(review): `y` is the raw annotation count per sub-sequence, which
# StratifiedKFold treats as class labels; most counts are likely unique, so
# the stratification may be weak. Confirm this is intended rather than `label`.
for fold_id, (train_idx, val_idx) in enumerate(
    skf.split(df_split["subsequence_id"], y=df_split["len_annotation"])
):
    subseq_val_idx = df_split["subsequence_id"].iloc[val_idx]
    df.loc[df["subsequence_id"].isin(subseq_val_idx), "fold"] = fold_id

df["fold"] = df["fold"].astype(int)
df["fold"].value_counts(dropna=False)

# Train/validation ratio of annotations and annotated frames for every fold.
# Iterate over n_splits (not a literal 10) so the report follows the setting.
for fold in range(n_splits):
    print(f"\nFold {fold}")
    df.query("fold != @fold")[["len_annotation", "label"]].sum() / df.query("fold == @fold")[["len_annotation", "label"]].sum()



6    4398
3    3465
1    3063
7    3040
2    2762
5    1682
4    1420
9    1315
8    1241
0    1115
Name: fold, dtype: int64


Fold 0


len_annotation    18.764120
label             11.453165
dtype: float64


Fold 1


len_annotation    4.283304
label             8.957490
dtype: float64


Fold 2


len_annotation    16.218524
label             12.476712
dtype: float64


Fold 3


len_annotation    21.280899
label             11.328321
dtype: float64


Fold 4


len_annotation    15.948718
label              9.018330
dtype: float64


Fold 5


len_annotation    26.929577
label             13.510324
dtype: float64


Fold 6


len_annotation    5.342217
label             5.957567
dtype: float64


Fold 7


len_annotation    3.092879
label             5.347097
dtype: float64


Fold 8


len_annotation    18.797005
label             10.466200
dtype: float64


Fold 9


len_annotation    8.103290
label             8.369524
dtype: float64

## KFold split

In [8]:
# Total annotations per (type, train) combination.
df.pivot_table(index=["type", "train"], values=["len_annotation"], aggfunc=["sum"])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,len_annotation
type,train,Unnamed: 2_level_2
reef,0,113
reef,1,3195
water,0,2174
water,1,1146
water_reef,0,289
water_reef,1,4981


In [9]:
# Sequences flagged as training material in `kfold_dict`.
train_sequences = df.loc[df["train"] == 1, "sequence"].unique().tolist()

# Non-training "reef" / "water_reef" sequences with at least one annotated frame.
is_val_candidate = (
    (df["train"] != 1)
    & df["type"].isin(["water_reef", "reef"])
    & (df["len_annotation"] != 0)
)
val_sequences = df.loc[is_val_candidate, "sequence"].unique().tolist()
train_sequences, val_sequences

([53708, 8503, 60754, 22643], [60510, 15827, 18048, 26651, 29859])

In [10]:
# Non-training "water" frames, and the sequences among them that contain at
# least one annotated frame.
water_df = df[(df["type"] == "water") & (df["train"] == 0)].reset_index(drop=True)
seqs = water_df.loc[water_df["len_annotation"] != 0, "sequence"].unique().tolist()
len(seqs)

8

In [11]:
# Annotation and annotated-frame totals over the training sequences.
df[df["sequence"].isin(train_sequences)][["len_annotation", "label"]].sum()

len_annotation    9322
label             3035
dtype: int64

In [12]:
# Annotation and annotated-frame totals over the validation sequences.
df[df["sequence"].isin(val_sequences)][["len_annotation", "label"]].sum()

len_annotation    402
label             358
dtype: int64

In [13]:
# Train / validation ratio of the two totals.
(
    df[df["sequence"].isin(train_sequences)][["len_annotation", "label"]].sum()
    / df[df["sequence"].isin(val_sequences)][["len_annotation", "label"]].sum()
)

len_annotation    23.189055
label              8.477654
dtype: float64

In [76]:
def _water_ratios(held_out):
    """Return (labels_ratio, sum_ratio) of the remaining vs. held-out water sequences.

    `held_out` is a list of sequence ids; the ratios are the sums of `label`
    and `len_annotation` over the other rows of `water_df`, divided by the
    sums over the held-out rows.
    """
    in_held_out = water_df["sequence"].isin(held_out)
    remaining, held = water_df[~in_held_out], water_df[in_held_out]
    return (
        remaining["label"].sum() / held["label"].sum(),
        remaining["len_annotation"].sum() / held["len_annotation"].sum(),
    )


# Keep every single sequence and every pair whose two ratios both fall in [4, 10].
val_seqs = []
for seq1 in seqs:
    labels_ratio, sum_ratio = _water_ratios([seq1])
    if 4 <= labels_ratio <= 10 and 4 <= sum_ratio <= 10:
        val_seqs.append([seq1])
for seq1 in seqs:
    for seq2 in seqs:
        if seq1 == seq2:
            continue
        labels_ratio, sum_ratio = _water_ratios([seq1, seq2])
        if not (4 <= labels_ratio <= 10 and 4 <= sum_ratio <= 10):
            continue
        # Skip the pair if it was already stored in the opposite order.
        if [seq2, seq1] not in val_seqs:
            val_seqs.append([seq1, seq2])

In [182]:
# Collect every combination of one, two or three annotated water sequences
# that could serve as the water part of a validation set. A combination
# qualifies when the annotated-frame count of the remaining water sequences
# is between 3 and 10 times that of the held-out ones.
# itertools.combinations yields singles, then pairs, then triples in the order
# of `seqs`, which is the order the nested index loops produced. The unused
# annotation-count ratio is no longer computed.
val_seqs = []
for group_size in (1, 2, 3):
    for held_out in itertools.combinations(seqs, group_size):
        in_held_out = water_df["sequence"].isin(list(held_out))
        labels_ratio = (
            water_df.loc[~in_held_out, "label"].sum()
            / water_df.loc[in_held_out, "label"].sum()
        )
        if 3 <= labels_ratio <= 10:
            val_seqs.append(list(held_out))

In [183]:
# Show the candidate hold-out combinations currently stored in `val_seqs`.
val_seqs

[[40258],
 [59337],
 [996],
 [40258, 45015],
 [40258, 35305],
 [40258, 17665],
 [45518, 59337],
 [45518, 45015],
 [45518, 35305],
 [45518, 996],
 [45518, 17665],
 [59337, 45015],
 [59337, 35305],
 [59337, 996],
 [59337, 17665],
 [45015, 996],
 [35305, 996],
 [35305, 17665],
 [996, 17665],
 [45518, 59337, 45015],
 [45518, 59337, 35305],
 [45518, 59337, 17665],
 [45518, 45015, 35305],
 [45518, 45015, 996],
 [45518, 45015, 17665],
 [45518, 35305, 17665],
 [59337, 45015, 35305],
 [59337, 45015, 17665],
 [59337, 35305, 17665],
 [45015, 35305, 996],
 [45015, 35305, 17665],
 [45015, 996, 17665],
 [35305, 996, 17665]]

In [184]:
# Reset both lists to their base content: the training sequences, and the
# non-training "reef" / "water_reef" sequences with annotated frames.
train_sequences = df.loc[df["train"] == 1, "sequence"].unique().tolist()
val_sequences = (
    df.loc[
        (df["train"] != 1)
        & df["type"].isin(["water_reef", "reef"])
        & (df["len_annotation"] != 0),
        "sequence",
    ]
    .unique()
    .tolist()
)

In [185]:
# Print the train/validation ratios for every candidate in `val_seqs`.
# NOTE(review): `zero_seqs` is not defined in the visible cells; it is
# presumably a list of three sequences without annotations — confirm it is
# created before this cell on a fresh kernel.

# The base lists do not depend on the candidate, so query them once.
base_train_sequences = df.query("train == 1")["sequence"].unique().tolist()
base_val_sequences = (
    df.query('train != 1 and type in ["water_reef", "reef"] and len_annotation != 0')[
        "sequence"
    ]
    .unique()
    .tolist()
)

for i in range(len(val_seqs)):
    val_idx = i
    train_sequences = list(base_train_sequences)
    val_sequences = list(base_val_sequences)
    train_sequences.extend(list(set(seqs) - set(val_seqs[val_idx])))
    train_sequences.extend(zero_seqs[:2])
    val_sequences.extend(val_seqs[val_idx])
    # Validation receives the third zero sequence, as in the single-candidate
    # cell. Appending zero_seqs[:2] put a nested list of the two training
    # sequences into the validation list.
    val_sequences.append(zero_seqs[2])
    print(f"\nval_idx = {val_idx}")
    df.query("sequence in @train_sequences")[
        ["len_annotation", "label"]
    ].sum() / df.query("sequence in @val_sequences")[["len_annotation", "label"]].sum()


val_idx = 0


len_annotation    15.594142
label              6.650078
dtype: float64


val_idx = 1


len_annotation    18.996639
label              8.570039
dtype: float64


val_idx = 2


len_annotation    16.471366
label              7.675485
dtype: float64


val_idx = 3


len_annotation    15.056680
label              6.396992
dtype: float64


val_idx = 4


len_annotation    13.761787
label              5.803596
dtype: float64


val_idx = 5


len_annotation    11.240741
label              5.738356
dtype: float64


val_idx = 6


len_annotation    15.571031
label              6.722135
dtype: float64


val_idx = 7


len_annotation    20.672131
label              8.779324
dtype: float64


val_idx = 8


len_annotation    18.377850
label              7.768271
dtype: float64


val_idx = 9


len_annotation    13.798507
label              6.128986
dtype: float64


val_idx = 10


len_annotation    14.253846
label              7.660211
dtype: float64


val_idx = 11


len_annotation    18.221325
label              8.177239
dtype: float64


val_idx = 12


len_annotation    16.394737
label              7.281145
dtype: float64


val_idx = 13


len_annotation    12.613272
label              5.803596
dtype: float64


val_idx = 14


len_annotation    12.997647
label              7.184692
dtype: float64


val_idx = 15


len_annotation    15.876596
label              7.351443
dtype: float64


val_idx = 16


len_annotation    14.451948
label              6.602782
dtype: float64


val_idx = 17


len_annotation    14.949062
label              8.369524
dtype: float64


val_idx = 18


len_annotation    11.711538
label              6.521407
dtype: float64


val_idx = 19


len_annotation    15.03504
label              6.46434
dtype: float64


val_idx = 20


len_annotation    13.743494
label              5.860530
dtype: float64


val_idx = 21


len_annotation    11.228160
label              5.794199
dtype: float64


val_idx = 22


len_annotation    17.648903
label              7.437393
dtype: float64


val_idx = 23


len_annotation    13.369565
label              5.908708
dtype: float64


val_idx = 24


len_annotation    13.798507
label              7.337288
dtype: float64


val_idx = 25


len_annotation    12.691600
label              6.591049
dtype: float64


val_idx = 26


len_annotation    15.805085
label              6.985390
dtype: float64


val_idx = 27


len_annotation    12.613272
label              6.895666
dtype: float64


val_idx = 28


len_annotation    11.670927
label              6.223201
dtype: float64


val_idx = 29


len_annotation    13.984887
label              6.352765
dtype: float64


val_idx = 30


len_annotation    14.451948
label              7.992687
dtype: float64


val_idx = 31


len_annotation    11.393750
label              6.276627
dtype: float64


val_idx = 32


len_annotation    10.607805
label              5.701635
dtype: float64

In [148]:
# Build the final train/validation sequence lists for one chosen candidate.
val_idx = 3

train_sequences = df.loc[df["train"] == 1, "sequence"].unique().tolist()
val_sequences = (
    df.loc[
        (df["train"] != 1)
        & df["type"].isin(["water_reef", "reef"])
        & (df["len_annotation"] != 0),
        "sequence",
    ]
    .unique()
    .tolist()
)

# Water sequences not held out go to train, together with the first two
# entries of `zero_seqs`; validation gets the candidate and the third entry.
train_sequences += list(set(seqs) - set(val_seqs[val_idx]))
train_sequences += zero_seqs[:2]
val_sequences += val_seqs[val_idx]
val_sequences += [zero_seqs[2]]

print(f"\nval_idx = {val_idx}")
(
    df[df["sequence"].isin(train_sequences)][["len_annotation", "label"]].sum()
    / df[df["sequence"].isin(val_sequences)][["len_annotation", "label"]].sum()
)


val_idx = 3


len_annotation    16.471366
label              7.675485
dtype: float64

In [149]:
# Frames, annotated frames and annotations in the training sequences.
train_rows = df[df["sequence"].isin(train_sequences)]
train_rows.shape[0]
train_rows["label"].sum()
train_rows["len_annotation"].sum()

13257

4352

11217

In [150]:
# Frames, annotated frames and annotations in the validation sequences.
val_rows = df[df["sequence"].isin(val_sequences)]
val_rows.shape[0]
val_rows["label"].sum()
val_rows["len_annotation"].sum()

10244

567

681

In [151]:
# Combined number of entries in the train and validation sequence lists.
len(train_sequences) + len(val_sequences)

20

In [152]:
# Store the chosen split under its candidate index. Ids are converted to
# plain int before being written to the dictionary that is dumped as JSON.
seq_dict[str(val_idx)] = {
    "val": [int(seq) for seq in val_sequences],
    "train": [int(seq) for seq in train_sequences],
}

In [153]:
# Show all stored splits, keyed by the string form of their candidate index.
seq_dict

{'0': {'val': [60510, 15827, 18048, 26651, 29859, 35305, 17665, 37114],
  'train': [53708,
   8503,
   60754,
   22643,
   40258,
   996,
   59337,
   45518,
   8399,
   45015,
   44160,
   29424]},
 '22': {'val': [60510, 15827, 18048, 26651, 29859, 45015, 35305, 17665, 37114],
  'train': [53708,
   8503,
   60754,
   22643,
   40258,
   996,
   59337,
   45518,
   8399,
   44160,
   29424]},
 '3': {'val': [60510, 15827, 18048, 26651, 29859, 996, 37114],
  'train': [53708,
   8503,
   60754,
   22643,
   17665,
   40258,
   59337,
   35305,
   45518,
   8399,
   45015,
   44160,
   29424]}}

In [154]:
# Write the updated split definitions back to the file they were loaded from.
with open("/app/_data/sequences.json", "w") as sequences_file:
    sequences_file.write(json.dumps(seq_dict))

# Choose ids

In [120]:
# Training frame: every annotated frame of the training sequences plus a 7 %
# sample of their unannotated frames, shuffled.
# Both random draws are seeded with SEED so the selection is identical on
# every run; without a seed each execution produced a different frame.
annotated_train_rows = df.query("sequence in @train_sequences and len_annotation!=0")
empty_train_rows = df.query("sequence in @train_sequences and len_annotation==0")
train = pd.concat(
    [
        annotated_train_rows,
        empty_train_rows.sample(
            int(empty_train_rows.shape[0] * 0.07), random_state=SEED
        ),
    ]
).sample(frac=1, random_state=SEED)

In [15]:
# Row ids for the training list: all annotated frames of the training
# sequences, 400 sampled unannotated frames, and every frame of the first two
# `zero_seqs` sequences.
# The samples are seeded with SEED so the files written from these ids are
# reproducible.
# NOTE(review): `zero_seqs` is not defined in the visible cells. If
# `train_sequences` / `val_sequences` already contain the zero sequences when
# this runs, some ids can appear twice — confirm the intended execution order.
train_ids = df.query(
    "sequence in @train_sequences and len_annotation!=0"
).index.tolist()
train_ids.extend(
    df.query("sequence in @train_sequences and len_annotation==0")
    .sample(400, random_state=SEED)
    .index.tolist()
)
train_ids.extend(df.query("sequence in [@zero_seqs[0], @zero_seqs[1]]").index.tolist())

# Row ids for the validation list: every frame of the validation sequences
# plus 400 sampled frames of the third `zero_seqs` sequence.
val_ids = df.query("sequence in @val_sequences").index.tolist()
val_ids.extend(
    df.query("sequence == @zero_seqs[2]")
    .sample(400, random_state=SEED)
    .index.tolist()
)

len(train_ids), len(val_ids)

(5129, 7861)

In [16]:
# Resolve the selected row ids to image paths and write one path per line.
train_img_path = list(df.loc[train_ids, "new_img_path"])
val_img_path = list(df.loc[val_ids, "new_img_path"])
np.savetxt("/app/_data/train_seq_0.txt", train_img_path, fmt="%s")
np.savetxt("/app/_data/val_seq_0.txt", val_img_path, fmt="%s")

# Annotation split

In [7]:
# Load a second table from train_alb.csv. The next cell relies on it having
# `video_id`, `label` and `len_annotation` columns.
# NOTE(review): how this file was produced is not shown here — confirm its provenance.
df_ann = pd.read_csv('/app/_data/train_alb.csv')

In [9]:
# Sum of `label` and `len_annotation` per video.
df_ann.groupby('video_id')[['label', 'len_annotation']].sum()

Unnamed: 0_level_0,label,len_annotation
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7391,9707
1,7470,21393
2,2248,8098


In [6]:
# Rebuild the working frame from train.csv for the annotation split, with the
# same derived columns as the first load.
df = pd.read_csv(TRAIN_DF_PART)

# Original image location: .../train_images/video_<video_id>/<video_frame>.jpg
df["img_path"] = (
    "/app/_data/tensorflow-great-barrier-reef/train_images/video_"
    + df["video_id"].astype("str")
    + "/"
    + df["video_frame"].astype("str")
    + ".jpg"
)

# Parse the textual annotation literals and count the entries per row.
df["annotations"] = df["annotations"].apply(ast.literal_eval)
df["len_annotation"] = df["annotations"].str.len()

# Image ids use underscores in the flattened image folder.
df["image_id"] = df["image_id"].str.replace("-", "_", regex=True)
df["new_img_path"] = "/app/_data/" + IMAGE_FOLDER + "/" + df["image_id"] + ".jpg"

# True when the row has at least one annotation.
df["label"] = df["len_annotation"].ne(0)

# Sequence-level metadata looked up in `kfold_dict` (KeyError on unknown ids).
df["type"] = df["sequence"].map(lambda seq: kfold_dict[seq][0])
df["train"] = df["sequence"].map(lambda seq: int(kfold_dict[seq][1] == "train"))

In [449]:
# Complement of `label`: True for rows without any annotation.
df["no_label"] = df["len_annotation"].eq(0)

In [450]:
# A row is a label change when it is annotated and the two rows before it are
# not, or when it is unannotated and the two rows before it are annotated.
df["label_change"] = (
    (df["label"] & df["no_label"].shift(1) & df["no_label"].shift(2))
    | (df["no_label"] & df["label"].shift(1) & df["label"].shift(2))
)
# True on the first row of every run of identical `sequence` values.
df["sequense_change"] = df["sequence"].ne(df["sequence"].shift(1))
# A sub-sequence starts at a label change or at a sequence change.
df["start_subseq"] = df["label_change"] | df["sequense_change"]
# Mark the final row too so the last sub-sequence gets a closing boundary.
df.at[df.index[-1], "start_subseq"] = True
df["start_subseq"].sum()

138

In [451]:
# Number the sub-sequences by walking over the boundary rows. `.loc` slices
# include both ends, so each boundary row is first written with the previous
# id and then overwritten by the next iteration.
start_idx = 0
boundary_rows = df.index[df["start_subseq"]]
for subsequence_id, end_idx in enumerate(boundary_rows):
    df.loc[start_idx:end_idx, "subsequence_id"] = subsequence_id
    start_idx = end_idx

df["subsequence_id"] = df["subsequence_id"].astype(int)
df["subsequence_id"].nunique()

137