In [None]:
import pandas as pd
from io import BytesIO
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
from tensorflow.python.lib.io import file_io

# Record the TensorFlow version in the notebook output so a run can be traced
# back to the library build it used.
print("Tensorflow version " + tf.__version__)

In [None]:
# Ask KaggleDatasets for the dataset's GCS path; the cells below build every
# image and CSV location from this value.
GCS_DS_PATH = KaggleDatasets().get_gcs_path() # you can list the bucket with "!gsutil ls $GCS_DS_PATH"

In [None]:
# get handles
data_dir  = GCS_DS_PATH
# NOTE: despite the names, these two values are glob patterns matching the
# JPEG files of each split, not directory paths.
test_dir  = data_dir + '/test/*.jpg'
train_dir = data_dir + '/train/*.jpg'

# List the images through TensorFlow's filesystem layer instead of capturing
# `!gsutil ls` output: a shell capture does not raise when the command fails,
# so a listing problem would only surface later. Sorted so the order is stable
# between runs.
test_images = sorted(tf.io.gfile.glob(test_dir))
train_images = sorted(tf.io.gfile.glob(train_dir))

# Per-image metadata tables, one row per image id.
test_labels = pd.read_csv(data_dir + '/test.csv')
train_labels = pd.read_csv(data_dir + '/train.csv')

In [None]:
# https://www.tensorflow.org/tutorials/load_data/tfrecord
# The following functions can be used to convert a value to a type compatible with tf.train.Example.

def _bytes_feature(value):
  """Returns a bytes_list feature from a string / bytes value.

  Args:
    value: raw ``bytes``, a Python ``str`` (stored as its UTF-8 encoding), or
      an eager tensor whose ``.numpy()`` yields bytes.

  Returns:
    A ``tf.train.Feature`` whose ``bytes_list`` holds the single value.
  """
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  if isinstance(value, str):
    # BytesList only accepts bytes, so text has to be encoded before wrapping.
    value = value.encode('utf-8')
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Packs one float / double into a Feature backed by a FloatList.

  Args:
    value: the single number to store.

  Returns:
    A ``tf.train.Feature`` whose ``float_list`` holds exactly that value.
  """
  packed = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=packed)

def _int64_feature(value):
  """Packs one bool / enum / int / uint into a Feature backed by an Int64List.

  Args:
    value: the single integer-like value to store.

  Returns:
    A ``tf.train.Feature`` whose ``int64_list`` holds exactly that value.
  """
  packed = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=packed)

In [None]:
# Show the column names of train.csv; the TFRecord cell below looks rows up by
# these names.
print(train_labels.columns)

In [None]:
from sklearn.model_selection import train_test_split

# Split the rows of train.csv into train / validate subsets, then serialise
# each split (plus the test set) into its own `<split>.tfrecords` file.

# Fixed seed so that Restart & Run All reproduces the same train/validate split.
SPLIT_SEED = 42

# split into train validate
train, validate = train_test_split(train_labels, test_size=0.2, random_state=SPLIT_SEED)

datasets = {'test': test_labels, 'train': train, 'validate': validate}

# Folder under data_dir that holds each split's JPEGs. The validation rows are
# taken from train.csv, so their images live in the train folder.
IMAGE_SUBDIRS = {'test': 'test', 'train': 'train', 'validate': 'train'}

# Splits whose records also store the Pawpularity value.
LABELLED_SPLITS = ('train', 'validate')

# Metadata columns written to every record as int64 features.
METADATA_COLUMNS = [
    'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
    'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur',
]


def _row_to_example(row, image_subdir, with_label):
    """Build the ``tf.train.Example`` for one CSV row and its JPEG file.

    Args:
        row: one row of a labels table; must provide ``'Id'``, every name in
            ``METADATA_COLUMNS`` and, when ``with_label`` is true,
            ``'Pawpularity'``.
        image_subdir: folder under ``data_dir`` containing ``<Id>.jpg``.
        with_label: whether to add the ``'Pawpularity'`` float feature.

    Returns:
        A ``tf.train.Example`` holding the image dimensions, the undecoded
        JPEG bytes, the metadata columns and (optionally) the label.
    """
    image_path = data_dir + f"/{image_subdir}/{row['Id']}.jpg"
    # binary_mode=True already hands back the file contents as bytes, so no
    # BytesIO round-trip is needed before storing or decoding them.
    image_string = file_io.read_file_to_string(image_path, binary_mode=True)
    # The image is decoded only to learn its dimensions; the record keeps the
    # original encoded bytes.
    image_shape = tf.io.decode_jpeg(image_string).shape

    feature = {
        'height': _int64_feature(image_shape[0]),
        'width': _int64_feature(image_shape[1]),
        'depth': _int64_feature(image_shape[2]),
        'image_raw': _bytes_feature(image_string),
    }
    for column in METADATA_COLUMNS:
        feature[column] = _int64_feature(row[column])
    if with_label:
        feature['Pawpularity'] = _float_feature(row['Pawpularity'])

    return tf.train.Example(features=tf.train.Features(feature=feature))


# make tfrecords
for key, df in datasets.items():
    record_file = f'{key}.tfrecords'
    with_label = key in LABELLED_SPLITS
    with tf.io.TFRecordWriter(record_file) as writer:
        for _index, row in df.iterrows():
            tf_example = _row_to_example(row, IMAGE_SUBDIRS[key], with_label)
            writer.write(tf_example.SerializeToString())