# Kaggle Competition

## Installing required libraries and downloading the dataset
Note that the location where the `kaggle` executable is installed may differ on your machine. Adjust the path in the command below as needed.

In [3]:
# !pip install kaggle
# !~/.local/bin/kaggle competitions download -c tensorflow-great-barrier-reef
# !pip install numpy
# !pip install opencv-python

## Importing required libraries
Another note: the `greatbarrierreef` library requires Python 3.7.10. If you are running Python 3.7.10 and have a folder called `greatbarrierreef` in your current directory, you can uncomment the library import below.

In [90]:
import numpy as np
import pandas as pd
from tqdm import tqdm 

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from PIL import Image
import tensorflow as tf

import os
import ast  ## Change str -> list.
import sys
import time

import warnings
# Silence every warning for the rest of the session. This also hides any
# deprecation warnings raised by the libraries imported above, so remove
# this line when debugging.
warnings.filterwarnings('ignore')

# Directory that is prepended to the module search path below.
INPUT_DIR = './input/'
# Put INPUT_DIR first on sys.path so modules located there can be imported.
sys.path.insert(0, INPUT_DIR)

# import greatbarrierreef

## Reading the training data

In [5]:
# Load the training metadata and append an `img_path` column of the form
# input/train_images/video_<video_id>/<video_frame>.jpg for every row.
df_train = pd.read_csv('input/train.csv').assign(
    img_path=lambda frames: (
        'input/train_images'
        + "/video_" + frames['video_id'].astype(str)
        + "/" + frames['video_frame'].astype(str)
        + ".jpg"
    )
)
df_train.head()

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,img_path
0,0,40258,0,0,0-0,[],input/train_images/video_0/0.jpg
1,0,40258,1,1,0-1,[],input/train_images/video_0/1.jpg
2,0,40258,2,2,0-2,[],input/train_images/video_0/2.jpg
3,0,40258,3,3,0-3,[],input/train_images/video_0/3.jpg
4,0,40258,4,4,0-4,[],input/train_images/video_0/4.jpg


In [6]:
# Rows whose `annotations` field is anything other than the string '[]'.
has_boxes = df_train['annotations'] != '[]'

# Count the rows on each side of the mask.
with_annotation = int(has_boxes.sum())
without_annotation = int((~has_boxes).sum())

print('Images with annotations:', with_annotation)
print('Images without annotations:', without_annotation)

Images with annotations: 4919
Images without annotations: 18582


In [64]:
# Keep only the rows whose annotation string is not '[]'. Take an explicit
# copy so the column assignment below writes to an independent frame instead
# of a slice of df_train (which would raise SettingWithCopyWarning).
df_annotated = df_train[df_train['annotations'].astype(str) != "[]"].copy()

# Parse each annotation string into a Python list of box dicts. Only the
# retained rows are parsed, rather than every row of df_train.
df_annotated['annotations'] = df_annotated['annotations'].apply(ast.literal_eval)
df_annotated.head()

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,img_path
16,0,40258,16,16,0-16,"[{'x': 559, 'y': 213, 'width': 50, 'height': 32}]",input/train_images/video_0/16.jpg
17,0,40258,17,17,0-17,"[{'x': 558, 'y': 213, 'width': 50, 'height': 32}]",input/train_images/video_0/17.jpg
18,0,40258,18,18,0-18,"[{'x': 557, 'y': 213, 'width': 50, 'height': 32}]",input/train_images/video_0/18.jpg
19,0,40258,19,19,0-19,"[{'x': 556, 'y': 214, 'width': 50, 'height': 32}]",input/train_images/video_0/19.jpg
20,0,40258,20,20,0-20,"[{'x': 555, 'y': 214, 'width': 50, 'height': 32}]",input/train_images/video_0/20.jpg


## Preprocessing the Data
Steps taken:
1. Crop every annotated image to its bounding boxes so that only the starfish samples remain.
2. Resize each crop to 200x200 (upscaling where necessary) using Lanczos resampling.
3. (WIP) Apply data augmentations:
    - Rotation
    - Segmentation

In [148]:
# Crop every annotated bounding box out of its frame and resize each crop to
# a fixed size so that all samples can be stacked into a single array.
CROP_SIZE = (200, 200)

img_data = []

# Walk the rows pairwise so each iteration uses that row's own image path and
# boxes (previously a hard-coded row label was read on every iteration, so the
# same frame was cropped repeatedly).
annotated_rows = zip(df_annotated['img_path'], df_annotated['annotations'])
for image_path, boxes in tqdm(annotated_rows, total=len(df_annotated), position=0, leave=True):
    # The context manager closes the file handle once all crops are taken.
    with Image.open(image_path) as img:
        for box in boxes:
            # PIL crop box is (left, upper, right, lower).
            area = (box['x'], box['y'], box['x'] + box['width'], box['y'] + box['height'])
            # Image.LANCZOS is the same filter as the old Image.ANTIALIAS
            # alias, which was removed in Pillow 10.
            cropped_img = img.crop(area).resize(CROP_SIZE, resample=Image.LANCZOS)
            img_data.append(np.array(cropped_img.convert('RGB')))

img_data = np.array(img_data)

100%|██████████| 4919/4919 [02:14<00:00, 36.48it/s]
