In [None]:
%pip install tensorboard
%pip install fastai
%pip install pypdl

In [2]:
import json
import pandas as pd
from pypdl import Pypdl

In [3]:
# Create a pypdl downloader. The start() call below is commented out, so this
# cell does not download anything as written.
dl = Pypdl()
#dl.start('https://ml-inat-competition-datasets.s3.amazonaws.com/2021/train.tar.gz')

In [4]:
# Rebinds `dl` to a fresh downloader for the annotation archive. The start()
# call is commented out, so nothing is fetched here either.
dl = Pypdl()
#dl.start('https://ml-inat-competition-datasets.s3.amazonaws.com/2021/train.json.tar.gz')

In [5]:
# Values of the annotation "class" column to keep. Only one class is enabled;
# the other candidates are listed here so they can be switched back on:
#     "Gastropoda", "Malacostraca", "Bivalvia", "Anthozoa", "Elasmobranchii",
#     "Asteroidea", "Polyplacophora", "Hexanauplia", "Echinoidea", "Scyphozoa",
#     "Cephalopoda", "Hydrozoa", "Ascidiacea", "Holothuroidea", "Ophiuroidea"
marine_life_classes = ["Actinopterygii"]

# Paths for the "mini" variant of the dataset.
train_data = "train_mini/train_mini"
annotation_json = "train_mini.json"

# Archive and annotation file names for the full training set.
train_file = "train.tar.gz"
train_annotations_tar = "train.json.tar.gz"
train_annotations = "train.json"

In [6]:
import tarfile

# Unpack the annotation archive into the working directory. Where the running
# Python supports extraction filters, opt into the "data" filter so a crafted
# archive cannot write outside the target directory.
_extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
with tarfile.open(train_annotations_tar, "r:gz") as archive:
    archive.extractall(path=".", **_extract_kwargs)

# JSON is UTF-8 by specification; do not rely on the platform default encoding.
with open(train_annotations, encoding="utf-8") as f:
    data = json.load(f)

# One row per entry of data["categories"]. Building the frame from the records
# themselves aligns columns by key, so it does not depend on every record
# listing its keys in the same order, and it tolerates an empty list.
df = pd.DataFrame(data["categories"])

In [None]:
# Display the category table built from the annotation file.
df

In [None]:
# Distinct values found in the "class" column.
df["class"].unique()

In [None]:
# Spot check: show the row(s) whose "name" is exactly "Tursiops truncatus".
df[df["name"] == "Tursiops truncatus"]

In [None]:
# Number of category rows for each "class" value.
df["class"].value_counts()

In [None]:
# Distinct common names among the rows whose class is in marine_life_classes.
df[df["class"].isin(marine_life_classes)]["common_name"].unique()

In [12]:
# Keep only the rows whose class is in marine_life_classes. This rebinds `df`,
# so the unfiltered table is gone until the loading cell is run again.
df = df[df["class"].isin(marine_life_classes)]

In [13]:
from fastai.vision.all import *
from fastai.callback.tensorboard import TensorBoardCallback
import os

In [None]:
# Distinct "image_dir_name" values of the selected categories, held in a set
# because a later cell tests archive paths for membership against it.
extraction_folders = set(pd.unique(df["image_dir_name"]))

extraction_folders

In [15]:
import tarfile

# Read the member list of the training archive once; a later cell filters it
# before anything is extracted.
members = []
with tarfile.open(train_file, mode="r:gz") as train_tar:
    members = train_tar.getmembers()

In [None]:
# Peek at one entry's file name to sanity-check the archive layout. Guarded so
# an archive with fewer than two members does not raise IndexError.
if len(members) > 1:
    print(members[1].name.split("/")[-1])

# Keep the members whose second path component is one of the selected image
# directories. Each path is split once and the parts reused for both checks.
filtered_members = [
    member
    for member, parts in ((m, m.name.split("/")) for m in members)
    if len(parts) > 1 and parts[1] in extraction_folders
]

# Names of every archive member, kept for interactive inspection.
top_members = [member.name for member in members]

# Report how many members were selected and show a small sample, rather than
# printing the whole TarInfo list.
print(f"{len(filtered_members)} of {len(members)} archive members selected")
filtered_members[:5]

In [None]:
# Extract only the pre-filtered members into the working directory. Where the
# running Python supports extraction filters, use the "data" filter so archive
# entries cannot escape the target directory.
_extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
with tarfile.open(train_file, "r:gz") as archive:
    archive.extractall(".", members=filtered_members, **_extract_kwargs)

In [12]:
# Root folder passed to `cells.dataloaders(...)` in the next cell.
# NOTE(review): the extraction cell unpacks `train_file` ("train.tar.gz") into
# ".", while this points at "train_mini" — confirm this folder really holds the
# images selected above.
path = Path("train_mini")

In [13]:
# Batch-level pipeline: tensor conversion, augmentation, then normalisation.
# NOTE(review): ToTensor is normally an item-level transform in fastai;
# confirm it is needed in the batch pipeline.
augmentations = aug_transforms(
    flip_vert=True,
    max_lighting=0.1,
    max_zoom=1.05,
    max_warp=0.0,
)
batch_tfms = [ToTensor(), *augmentations, Normalize()]

# Images are discovered under the root folder and labelled by the name of
# their parent directory. 20% are held out for validation with a fixed seed.
cells = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=parent_label,
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    item_tfms=Resize(256),
    batch_tfms=batch_tfms,
)

dls = cells.dataloaders(path, bs=128)

In [None]:
# Visual sanity check: preview up to four samples from the data loaders.
dls.show_batch(max_n=4, figsize=(12, 9))

In [15]:
# Pull a single batch (inputs `xb`, targets `yb`) to inspect its shapes.
xb, yb = dls.one_batch()

# Confirm that the shape of the data is correct

In [None]:
# Shapes of the input batch and of the matching target batch.
xb.shape, yb.shape  

In [None]:
# Number of classes; simple_cnn() uses this value as the width of its final
# linear layer.
dls.c

# Convolutional Neural Network

We start by defining the building blocks of a convolutional neural network. The first function, `conv2`, creates a convolutional layer with a kernel size of 3 and a stride of 2. The second, `ResBlock`, is a residual block: two convolutional layers (each combining convolution, batch normalization and ReLU in one) whose output is added back to the block's input. The third, `simple_cnn`, stacks these two building blocks and then adds the classifier head: an AdaptiveAvgPool2d layer reduces each feature map to 1x1 before passing it to an AdaptiveMaxPool2d layer, followed by batch normalization, a dropout layer to prevent overfitting, and a final linear layer without any activation function.

In [18]:
def conv2(ni, nf):
    """Return a stride-2 `ConvLayer` taking `ni` input channels to `nf` output channels."""
    downsampling_layer = ConvLayer(ni, nf, stride=2)
    return downsampling_layer

In [19]:
class ResBlock(Module):
    """Residual block: two `ConvLayer`s with `nf` channels in and out, added back onto the input.

    NOTE(review): no explicit `super().__init__()` call is made, so this relies
    on the `Module` base class in scope handling that itself — confirm it is
    fastai's `Module` rather than `torch.nn.Module`.
    """

    def __init__(self, nf):
        # Channel count is preserved so the skip connection can add tensors directly.
        self.conv1, self.conv2 = ConvLayer(nf, nf), ConvLayer(nf, nf)

    def forward(self, x):
        """Return `x` plus the result of passing `x` through both conv layers in order."""
        residual = self.conv1(x)
        residual = self.conv2(residual)
        return x + residual

In [20]:
def simple_cnn(n_out=None):
    """Build the small residual CNN used in this notebook.

    Three stages of (stride-2 conv, residual block) take the channels from
    3 -> 16 -> 32 -> 64. A pooled, flattened head then ends in a linear layer.

    Args:
        n_out: Width of the final linear layer. When omitted, falls back to
            the notebook-level `dls.c`, so existing `simple_cnn()` calls behave
            as before. Passing it explicitly lets the model be built without
            relying on that global.

    Returns:
        An `nn.Sequential` whose last layer is `nn.Linear(64, n_out)` with no
        activation after it.
    """
    if n_out is None:
        n_out = dls.c

    return nn.Sequential(
        conv2(3, 16),
        ResBlock(16),
        conv2(16, 32),
        ResBlock(32),
        conv2(32, 64),
        ResBlock(64),
        nn.AdaptiveAvgPool2d(1),
        # Receives the 1x1 output of the average pool above, so it cannot
        # change the values. Kept in place so the positions of the later layers
        # (and any weights saved from this architecture) stay the same.
        nn.AdaptiveMaxPool2d(1),
        Flatten(),
        nn.BatchNorm1d(64),
        nn.Dropout(0.25),
        nn.Linear(64, n_out),
    )

In [21]:
# Wire the data loaders and a freshly built simple_cnn() into a Learner.
# Accuracy is the reported metric, and an ActivationStats callback (with
# histograms enabled) is attached for the lifetime of the learner.
learn = Learner(
    dls,
    simple_cnn(),
    cbs=ActivationStats(with_hist=True),
    metrics=[accuracy],
    loss_func=CrossEntropyLossFlat(),
)

In [None]:
# Show the model summary for this learner.
learn.summary()

In [None]:
# Run the learning-rate finder before choosing a rate for training.
learn.lr_find()

In [24]:
# NOTE(review): this rebinds `path`, which earlier held the image root folder.
# Re-running an earlier cell that reads `path` after this one will see "models".
path = Path('models')

# Early Stopping Callback to prevent overfitting and TensorBoardCallback to monitor the training process

In [None]:
# Train once and reuse the weights on later runs.
#
# The cache check has to look at the file that learn.save() actually writes:
# <learn.path>/<learn.model_dir>/<name>.pth. The previous check tested
# 'models/cell_detection', which no cell creates, and the only artifact written
# was the export pickle, which learn.load() does not read. Training therefore
# always re-ran.
checkpoint_name = 'cell_detection'
checkpoint_file = learn.path/learn.model_dir/f'{checkpoint_name}.pth'

if checkpoint_file.exists():
    learn.load(checkpoint_name)
else:
    learn.fit_one_cycle(
        20,
        0.001,
        cbs=[
            # Log the run for TensorBoard under tmp/runs/tb.
            TensorBoardCallback('tmp/runs/tb', trace_model=True),
            # Stop when valid_loss has not improved by at least 0.1 for 3 epochs.
            EarlyStoppingCallback(monitor='valid_loss', min_delta=0.1, patience=3),
        ],
    )
    # Weights checkpoint, consumed by the load branch above on the next run.
    learn.save(checkpoint_name)
    # Full pickled learner, kept as before.
    learn.export('models/cell_detection.pkl')

In [None]:
# Plot the hyper-parameter schedule recorded during training.
# NOTE(review): if the previous cell loaded saved weights instead of training,
# the recorder may have nothing to plot — confirm.
learn.recorder.plot_sched()

In [None]:
# Build an interpretation object from the learner and show the 9 samples with
# the highest loss.
interp = Interpretation.from_learner(learn)
interp.plot_top_losses(9)

In [None]:
# Show a few samples together with the model's predictions.
learn.show_results()

In [60]:
# Unpack the public test annotations into the working directory. Where the
# running Python supports extraction filters, use the "data" filter so archive
# entries cannot escape the target directory.
_extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
with tarfile.open("public_test.json.tar.gz", "r:gz") as archive:
    archive.extractall(path=".", **_extract_kwargs)

In [None]:
# Rebinding `members` to an empty list drops the reference to the member list
# of the training archive built earlier.
members = []

# Unpack the public test images under "test_data". Where the running Python
# supports extraction filters, use the "data" filter so archive entries cannot
# escape the target directory.
_extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
with tarfile.open("public_test.tar.gz", "r:gz") as archive:
    archive.extractall("test_data", **_extract_kwargs)

In [None]:
# Build a labelled test DataLoader from every image found under "test_data/",
# reusing the pipeline of `dls`.
# NOTE(review): with_labels=True means labels come from the training `get_y`
# (parent_label), so each test image must sit in a folder named after a class in
# `dls.vocab` — confirm the extracted public test set is laid out that way.
test_dls = dls.test_dl(
    get_image_files("test_data/"), bs=32, with_labels=True
)

In [None]:
# Run the model over the test DataLoader and collect predictions and targets.
preds,targs = learn.get_preds(dl=test_dls)

In [None]:
# Inspect the targets returned for the test set.
targs

In [None]:
# Predicted class index per sample: position of the largest value in each row.
(preds).float().argmax(axis=1)

# Heatmap of the predictions on test set

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted class = index of the highest score in each row. Thresholding at 0.5
# before the argmax is wrong for multi-class output: whenever no class exceeds
# 0.5 the row becomes all zeros and argmax silently reports class 0.
pred_classes = preds.argmax(dim=1)

# Fix the label set to the full vocabulary so the matrix always has one
# row/column per class, even for classes absent from the test set. This keeps
# it aligned with the `dls.vocab` tick labels used by the heatmap.
cm = confusion_matrix(targs, pred_classes, labels=list(range(len(dls.vocab))))

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

fig, ax = plt.subplots(figsize=(10, 10))

# Row-normalise to percentages of each actual class. Rows with no samples are
# left at 0 instead of dividing by zero, which would fill them with NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cmn = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
) * 100

# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(cmn, annot=True, fmt='.2f',
            xticklabels=dls.vocab, yticklabels=dls.vocab, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
%tensorboard --logdir='data/tmp/runs/tb'