# Drive mount

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# competition archive stored on Drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install modules

In [2]:
! pip install kaggle
! pip install --upgrade --force-reinstall --no-deps kaggle
! pip install -q efficientnet
! pip install kaggledatasets
! kaggle --version

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 3.1 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=4a04a7dd99b086e993a0c3db736ec34ac92ca38fa2d8ec9d333912bf3003ba27
  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    Uninstalling kaggle-1.5.12:
      Successfully uninstalled kaggle-1.5.12
Successfully installed kaggle-1.5.12
[K     |████████████████████████████████| 50 kB 3.0 MB/s 
[?25hCollecting kaggledatasets
  Downloading kaggledatasets-0.0.1-py2.py3-none-any.whl (15 kB)
Installing collected packages: kaggledatasets
Successfully installed kaggledatasets-0.0.1
Tracebac

# Upload the Kaggle API token (kaggle.json)

In [4]:
from google.colab import files
import os
import shutil

# Location where the token is installed (same target the original `mv` used).
kaggle_dir = os.path.expanduser("~/.kaggle")
kaggle_json_path = os.path.join(kaggle_dir, "kaggle.json")

# Only prompt for an upload when the token is neither in the working directory
# nor already installed; this keeps the cell re-runnable after the first move.
if not os.path.isfile("kaggle.json") and not os.path.isfile(kaggle_json_path):
    uploaded = files.upload()
    for fn in uploaded.keys():
        print('uploaded file "{name}" with length {length} bytes'.format(
            name=fn, length=len(uploaded[fn])))

# A token present in the working directory always replaces the installed one,
# matching the original `mv kaggle.json ~/.kaggle/` behavior.
if os.path.isfile("kaggle.json"):
    os.makedirs(kaggle_dir, exist_ok=True)
    shutil.move("kaggle.json", kaggle_json_path)

if not os.path.isfile(kaggle_json_path):
    raise FileNotFoundError(
        "kaggle.json was not uploaded; upload a file named exactly 'kaggle.json'")

# Restrict the token to the owner (equivalent to `chmod 600`).
os.chmod(kaggle_json_path, 0o600)

Saving kaggle.json to kaggle.json
uploaded file "kaggle.json" with length 65 bytes


# Unzip to the local session storage (because of Google Drive limitations)

In [None]:
# ! kaggle competitions download -c happy-whale-and-dolphin -p "/content/drive/MyDrive/kaggle/happy_whale/data"
! mkdir ./happy_whale -p
! unzip /content/drive/MyDrive/kaggle/happy_whale/data/happy-whale-and-dolphin.zip -d ./happy_whale

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  inflating: ./happy_whale/train_images/6e77b05ad80078.jpg  
  inflating: ./happy_whale/train_images/6e7b63b85261f2.jpg  
  inflating: ./happy_whale/train_images/6e7b8fbdd64148.jpg  
  inflating: ./happy_whale/train_images/6e804b713c0d5a.jpg  
  inflating: ./happy_whale/train_images/6e8086f3319651.jpg  
  inflating: ./happy_whale/train_images/6e822d56b4e2b6.jpg  
  inflating: ./happy_whale/train_images/6e82c7e6fc3a35.jpg  
  inflating: ./happy_whale/train_images/6e8343b40d072b.jpg  
  inflating: ./happy_whale/train_images/6e857b447feddf.jpg  
  inflating: ./happy_whale/train_images/6e890839cc8815.jpg  
  inflating: ./happy_whale/train_images/6e8b0722cace61.jpg  
  inflating: ./happy_whale/train_images/6e8b5e059a8a53.jpg  
  inflating: ./happy_whale/train_images/6e8bd97cdd5b20.jpg  
  inflating: ./happy_whale/train_images/6e8d54e6819e3a.jpg  
  inflating: ./happy_whale/train_images/6e8f6cd8454292.jpg  
  inflating: ./happy_whale/train_im

# Import the modules used

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
% matplotlib inline
sns.set()


import math
from sklearn.model_selection import KFold

import PIL
import PIL.Image

import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
import efficientnet.tfkeras as efn

# Shorthand for tf.data's AUTOTUNE constant; presumably consumed by input-pipeline
# cells outside this view — TODO confirm.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Fetch the distribution strategy currently in effect and report how many
# replicas it synchronizes (the saved run printed 1).
strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


# Make TFRecord

In [None]:
# Root of the locally extracted competition data.
WORK_DIR = "./happy_whale"

# CSV files: the sample submission lists the test images, train.csv the training ones.
submission_file_path, train_csv_file_path = (
    os.path.join(WORK_DIR, csv_name)
    for csv_name in ("sample_submission.csv", "train.csv")
)

# Image directories.
train_img_path, test_img_path = (
    os.path.join(WORK_DIR, sub_dir)
    for sub_dir in ("train_images", "test_images")
)

# Load both tables; test_df is read from the sample submission file.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_csv_file_path, submission_file_path)
)

In [None]:
# --- image name <-> integer image id (covers train and test images) ---
concat_df = pd.concat([train_df['image'], test_df['image']])
image_name_to_image_id = {
    image_name: index for index, image_name in enumerate(concat_df.unique())
}
image_id_to_image_name = dict(
    zip(image_name_to_image_id.values(), image_name_to_image_id.keys())
)
image_ids = list(map(image_name_to_image_id.__getitem__, train_df['image']))
train_df['image_id'] = image_ids

# --- individual id <-> integer label ---
unique_individual_ids = train_df['individual_id'].unique()
individual_id_to_label = {
    i_id: index for index, i_id in enumerate(unique_individual_ids)
}
# Extra class placed right after the labels of the individuals seen in train.csv.
individual_id_to_label['new_individual'] = unique_individual_ids.shape[0]
label_to_individual_id = dict(
    zip(individual_id_to_label.values(), individual_id_to_label.keys())
)
train_df['label'] = list(
    map(individual_id_to_label.__getitem__, train_df['individual_id'])
)

# --- image id -> label, for the training images only ---
image_id_to_label = {
    img_id: lbl for img_id, lbl in zip(train_df["image_id"], train_df["label"])
}

In [None]:
from tqdm import tqdm
from glob import glob

def _bytes_feature(value):
  """Wrap a single string / bytes value in a tf.train.Feature holding a BytesList."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList cannot unpack an EagerTensor, so pull the raw value out first.
  raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw_value])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float / double in a tf.train.Feature holding a FloatList."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature holding an Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def make_example(img_str, file_id, label):
    """Build a tf.train.Example for one image.

    Args:
        img_str: image content, stored under "image_raw" as a bytes feature.
        file_id: integer image id, stored under "image_id" as an int64 feature.
        label: integer class label, stored under "label" as an int64 feature.

    Returns:
        A tf.train.Example with the three features above.
    """
    features = tf.train.Features(feature=dict(
        image_id=_int64_feature(file_id),
        image_raw=_bytes_feature(img_str),
        label=_int64_feature(label),
    ))
    return tf.train.Example(features=features)

# Source image directories and the directory that receives the TFRecord files.
train_img_path = "./happy_whale/train_images"
test_img_path = "./happy_whale/test_images"
output_dir = "./happy_whale/tfrecord"
# makedirs(exist_ok=True) also creates missing parent directories and is a
# no-op when the directory already exists, so the cell is safe to re-run.
os.makedirs(output_dir, exist_ok=True)
# Maps an image-directory name to its list of (path, image_id, label) tuples;
# filled by the next cell.
samples = {}
print("reading data list")

reading data list


In [None]:
# Fallback label for images without an entry in image_id_to_label
# (presumably the 'new_individual' class — verify against the mapping cell).
max_label = len(individual_id_to_label) - 1


def _sample_for(dir_path, file_name):
    """Return the (path, image_id, label) tuple describing one image file."""
    image_id = image_name_to_image_id[file_name]
    return (
        os.path.join(dir_path, file_name),
        image_id,
        image_id_to_label.get(image_id, max_label),
    )


for dir_path in (train_img_path, test_img_path):
    dir_name = os.path.basename(dir_path)
    print(dir_name)
    samples[dir_name] = [
        _sample_for(dir_path, file_name)
        for file_name in tqdm(os.listdir(dir_path))
    ]

train_images


100%|██████████| 51033/51033 [00:00<00:00, 206315.22it/s]


test_images


100%|██████████| 27956/27956 [00:00<00:00, 263776.33it/s]


In [None]:
# Serialize every collected sample into its own TFRecord file named
# "<image dir name>_<image file stem>.tfrecord" inside output_dir.
print("Writing tfrecord file...")
for key, val in samples.items():
    print(key)
    for img_path, image_id, image_label in tqdm(val):
        # Read through a context manager so each image handle is closed right
        # away instead of being left open until garbage collection.
        with open(img_path, 'rb') as img_file:
            img_bytes = img_file.read()
        tf_example = make_example(
                img_str=img_bytes,
                file_id=image_id,
                label=image_label
        )
        # Everything before the first "." of the file name is used as the stem.
        file_name = os.path.basename(img_path).split(".")[0]
        output_path = os.path.join(output_dir, f"{key}_{file_name}.tfrecord")
        with tf.io.TFRecordWriter(output_path) as writer:
            writer.write(tf_example.SerializeToString())

Writing tfrecord file...
train_images


100%|██████████| 51033/51033 [09:21<00:00, 90.88it/s]


test_images


100%|██████████| 27956/27956 [05:08<00:00, 90.63it/s] 


In [None]:
! zip -r /content/drive/MyDrive/kaggle/happy_whale/tfrecord.zip ./happy_whale/tfrecord

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  adding: happy_whale/tfrecord/train_images_0f1c63c7050d80.tfrecord (deflated 1%)
  adding: happy_whale/tfrecord/train_images_f7c9e6ebafd747.tfrecord (deflated 0%)
  adding: happy_whale/tfrecord/test_images_d816bb7558f620.tfrecord (deflated 0%)
  adding: happy_whale/tfrecord/train_images_efc9bf4041275c.tfrecord (deflated 0%)
  adding: happy_whale/tfrecord/test_images_d4880d9f41a258.tfrecord (deflated 0%)
  adding: happy_whale/tfrecord/train_images_beb0a5dde6d2cc.tfrecord (deflated 2%)
  adding: happy_whale/tfrecord/test_images_37926c1be2d21f.tfrecord (deflated 0%)
  adding: happy_whale/tfrecord/train_images_b2422ee114631f.tfrecord (deflated 1%)
  adding: happy_whale/tfrecord/test_images_8071834cfc5304.tfrecord (deflated 0%)
  adding: happy_whale/tfrecord/train_images_b3410e3c23ae03.tfrecord (deflated 0%)
  adding: happy_whale/tfrecord/test_images_6f6651edccec50.tfrecord (deflated 0%)
  adding: happy_whale/tfrecord/train_images_d10e8c8f6