In [1]:
import re
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Use a TPU when the runtime exposes one; otherwise fall back to the default
# strategy so the rest of the notebook still runs.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt
    # and SystemExit propagate, and the message records why the fallback ran.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

print(tf.__version__)

In [2]:
# Read the "__gcloud_sdk_auth__" entry through the Kaggle secrets client.
# NOTE(review): `secret_value_0` is not referenced by any other visible cell —
# confirm this cell is still needed.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("__gcloud_sdk_auth__")

In [3]:
# Obtain a credential with `get_gcloud_credential()` and hand it to
# `set_tensorflow_credential()`.
# NOTE(review): presumably this is what lets TensorFlow read the GCS paths
# used below — confirm.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
user_secrets.set_tensorflow_credential(user_credential)

In [4]:
# Passed as `num_parallel_calls` / prefetch `buffer_size` in the input pipeline.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Path returned by KaggleDatasets; every glob pattern below is rooted here.
GCS_PATH = KaggleDatasets().get_gcs_path()
# Global batch size: 16 examples for each replica of the active strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Size every image is resized to; also the first two dimensions of the model input.
IMAGE_SIZE = [45,45]
# NOTE(review): the `model.fit` call in this notebook passes epochs=100 and
# does not read EPOCHS — confirm which value is intended.
EPOCHS = 50

In [5]:
# Fixed seed so the train/validation split is identical on every
# Restart & Run All.
SPLIT_SEED = 42


def _glob_images(split):
    """Return the paths under `Vehicles_vs_Plants/<split>/<class>/` matching `*.jp*g` or `*.png`."""
    split_dir = GCS_PATH + "/Vehicles_vs_Plants/" + split
    # JPEG matches come first, PNG matches are appended after them.
    paths = tf.io.gfile.glob(split_dir + "/*/*.jp*g")
    paths.extend(tf.io.gfile.glob(split_dir + "/*/*.png"))
    return paths


filenames = _glob_images("train")
# Hold out 20% of the training files for validation.
train_filenames, val_filenames = train_test_split(
    filenames, test_size=0.2, random_state=SPLIT_SEED)
test_filenames = _glob_images("test")

In [6]:
# The class of a training file is the name of its parent directory, i.e. the
# same path component `get_label` inspects. Matching that component exactly
# avoids the false hits a substring test produces when a class name also
# appears somewhere else in the path, and keeps all three counts consistent.
train_class_dirs = [filename.split("/")[-2] for filename in train_filenames]

COUNT_OTHER = train_class_dirs.count("Other")
print("Other images count in training set: " + str(COUNT_OTHER))

COUNT_PLANTS = train_class_dirs.count("Plants")
print("Plants images count in training set: " + str(COUNT_PLANTS))

COUNT_VEHICLES = train_class_dirs.count("Vehicles")
print("Vehicles images count in training set: " + str(COUNT_VEHICLES))


In [7]:
# Wrap the two filename lists in tf.data datasets, one element per path.
train_list_ds = tf.data.Dataset.from_tensor_slices(train_filenames)
val_list_ds = tf.data.Dataset.from_tensor_slices(val_filenames)

# Peek at the first five training paths.
for path_tensor in train_list_ds.take(5):
    print(path_tensor.numpy())

In [9]:
# Number of examples in each split; used later for steps per epoch and class weights.
TRAIN_IMG_COUNT = tf.data.experimental.cardinality(train_list_ds).numpy()
print("Training images count: {}".format(TRAIN_IMG_COUNT))

VAL_IMG_COUNT = tf.data.experimental.cardinality(val_list_ds).numpy()
print("Validating images count: {}".format(VAL_IMG_COUNT))

In [10]:
# Class names are the last path component of each entry directly under train/.
# `tf.io.gfile.glob` returns Python strings, so split them directly; slicing
# the `b'...'` repr of a bytes value would mangle names that contain
# non-ASCII characters or quotes.
CLASS_NAMES = np.array([item.split(os.path.sep)[-1]
                        for item in tf.io.gfile.glob(GCS_PATH + "/Vehicles_vs_Plants/train/*")])
CLASS_NAMES

In [11]:
def get_label(file_path):
    """Map an image path to its integer class id.

    The class is read from the directory that directly contains the file:
    "Other" -> 0, "Plants" -> 1, any other directory -> 2.
    """
    # Component [-2] is the parent directory; [-1] is the file name itself.
    class_dir = tf.strings.split(file_path, os.path.sep)[-2]
    if class_dir == "Other":
        label = 0
    elif class_dir == "Plants":
        label = 1
    else:
        label = 2
    return label

In [12]:
def decode_img(img):
    """Decode encoded JPEG/PNG bytes into a float32 image resized to IMAGE_SIZE."""
    # The decoder is chosen from the bytes themselves, not the file extension;
    # both decoders are asked for 3 channels.
    def _from_jpeg():
        return tf.image.decode_jpeg(img, channels=3)

    def _from_png():
        return tf.image.decode_png(img, channels=3)

    pixels = tf.cond(tf.image.is_jpeg(img), _from_jpeg, _from_png)
    # `convert_image_dtype` rescales the uint8 values to floats in [0, 1].
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)
    return tf.image.resize(pixels, IMAGE_SIZE)

In [13]:
def process_path(file_path):
    """Load one example and return it as an `(image, label)` pair."""
    # Read the encoded file contents, then decode and resize them.
    image = decode_img(tf.io.read_file(file_path))
    return image, get_label(file_path)

In [14]:
# Turn every file path into an (image, label) pair via `process_path`.
train_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)

val_ds = val_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)

In [16]:
# Sanity check: decode a single training example and report its shape and label.
for image, label in train_ds.take(1):
    image_array = image.numpy()
    print("Image shape: ", image_array.shape)
    print("Label: ", label.numpy())

In [17]:
# Build the test pipeline from `test_filenames`, the list the prediction cell
# indexes by position, so prediction i belongs to test_filenames[i].
# Do not use `Dataset.list_files` here: it shuffles the file names by default
# (pairing predictions with the wrong names), and a `*/*` pattern would also
# pick up files that are not images.
test_list_ds = tf.data.Dataset.from_tensor_slices(test_filenames)
test_ds = test_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE)
TEST_IMAGE_COUNT = tf.data.experimental.cardinality(test_list_ds).numpy()
TEST_IMAGE_COUNT

In [18]:
# Same single-example check as the cell before the test pipeline was built.
for image, label in train_ds.take(1):
    image_array = image.numpy()
    print("Image shape: ", image_array.shape)
    print("Label: ", label.numpy())

In [19]:
def prepare_for_training(ds, cache=True, shuffle_buffer_size=2000):
    """Turn a dataset of examples into an endless stream of shuffled batches.

    Args:
        ds: dataset to prepare.
        cache: falsy to skip caching, a non-empty string to cache under that
            file name, any other truthy value to use `cache()` with no argument.
        shuffle_buffer_size: buffer size passed to `shuffle`.

    Returns:
        The dataset after cache -> shuffle -> repeat -> batch(BATCH_SIZE) -> prefetch.
    """
    # Caching comes first so everything upstream is only computed once.
    if cache:
        ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

    return (
        ds.shuffle(buffer_size=shuffle_buffer_size)
        .repeat()  # never runs out; the caller decides how many batches to take
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=AUTOTUNE)  # fetch upcoming batches in the background
    )

In [20]:
# Apply the cache/shuffle/repeat/batch/prefetch pipeline to both splits.
# NOTE(review): both names are rebound from their own previous value, so
# running this cell a second time wraps the already-batched datasets again.
train_ds = prepare_for_training(train_ds)
val_ds = prepare_for_training(val_ds)

# Pull one training batch (the plotting cell below displays it).
image_batch, label_batch = next(iter(train_ds))

In [21]:
def show_batch(image_batch, label_batch):
    """Plot up to the first 16 images of a batch on a 5x5 grid, titled by class.

    Args:
        image_batch: indexable batch of images accepted by `plt.imshow`.
        label_batch: matching class ids; 0 is titled "OTHER", 1 "PLANTS" and
            any other value "VEHICLES".
    """
    titles = {0: "OTHER", 1: "PLANTS"}
    # Clamp to the batch length so a batch shorter than 16 does not raise IndexError.
    n_shown = min(16, len(image_batch), len(label_batch))
    plt.figure(figsize=(10,10))
    for n in range(n_shown):
        plt.subplot(5,5,n+1)
        plt.imshow(image_batch[n])
        plt.title(titles.get(int(label_batch[n]), "VEHICLES"))
        plt.axis("off")

In [166]:
# Convert the sampled batch to NumPy arrays and plot it.
show_batch(image_batch.numpy(), label_batch.numpy())

In [26]:
# Natural log of the ratio between the "Other" and "Plants" training counts.
# NOTE(review): `initial_bias` is not referenced by any other visible cell and
# does not involve COUNT_VEHICLES — confirm whether it is still needed.
initial_bias = np.log([COUNT_OTHER/COUNT_PLANTS])
initial_bias

In [27]:
# Per-class loss weights: (1 / class count) * total training images / 3, so a
# class with fewer images gets a proportionally larger weight.
class_weight = {
    class_id: (1 / class_count)*(TRAIN_IMG_COUNT)/3.0
    for class_id, class_count in enumerate([COUNT_OTHER, COUNT_PLANTS, COUNT_VEHICLES])
}
weight_for_0 = class_weight[0]
weight_for_1 = class_weight[1]
weight_for_2 = class_weight[2]

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
print('Weight for class 2: {:.2f}'.format(weight_for_2))

In [161]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Build the model inside the distribution strategy's scope so its variables
# are created under that strategy; a model built outside the scope does not
# use the TPU replicas the strategy was set up for. With the default
# (non-TPU) strategy the scope changes nothing.
with strategy.scope():
    model = keras.Sequential([
        # Block One
        layers.Conv2D(filters=4, kernel_size=3, activation='relu', padding='same',
                      input_shape=[IMAGE_SIZE[0], IMAGE_SIZE[1], 3]),
        layers.MaxPool2D(),

        # Block Two
        layers.Conv2D(filters=8, kernel_size=3, activation='relu', padding='same'),
        layers.MaxPool2D(),

        # Block Three
        layers.Conv2D(filters=16, kernel_size=3, activation='relu', padding='same'),
        layers.Conv2D(filters=16, kernel_size=3, activation='relu', padding='same'),
        layers.MaxPool2D(),

        # Dense layers
        layers.Dropout(0.5),
        layers.Flatten(),
        layers.Dense(36, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(9, activation='relu'),
        # One softmax output per class id (0, 1, 2).
        layers.Dense(3, activation='softmax'),
    ])

In [162]:
# Labels are integer class ids and the model ends in a softmax, hence the
# sparse categorical loss and accuracy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    # `metrics` is documented as a list; a bare string is not accepted by
    # every Keras version.
    metrics=['sparse_categorical_accuracy'],
)

In [163]:
# Both datasets repeat forever, so the length of an epoch has to be given
# explicitly as a number of batches.
train_steps = TRAIN_IMG_COUNT // (BATCH_SIZE)
val_steps = VAL_IMG_COUNT // (BATCH_SIZE)

# NOTE(review): epochs is hard-coded to 100 here while the config cell sets
# EPOCHS = 50 — confirm which value is intended.
history = model.fit(
    train_ds,
    epochs=100,
    steps_per_epoch=train_steps,
    validation_data=val_ds,
    validation_steps=val_steps,
    class_weight=class_weight,
)

In [164]:
import pandas as pd
# Training curves: one figure for the losses, one for the accuracies.
history_frame = pd.DataFrame(history.history)
history_frame[['loss', 'val_loss']].plot()
history_frame[['sparse_categorical_accuracy', 'val_sparse_categorical_accuracy']].plot();

In [36]:
# Class probabilities for every test example, then the winning class and its probability.
pred = model.predict(test_ds)
predictions = np.argmax(pred, axis=-1)
probability = np.max(pred, axis=-1)
labels = {1:'Plants',2:'Vehicles'}

# Print file name, probability and class name for every example predicted as
# Plants or Vehicles (class 0 is skipped), and count how many were printed.
# NOTE(review): this pairs prediction i with test_filenames[i], so it relies on
# test_ds yielding examples in the same order as test_filenames — confirm how
# test_ds is built.
reported = 0
for i, predicted_class in enumerate(predictions.tolist()):
    if predicted_class in labels:
        print(test_filenames[i].split('/')[-1], probability[i], labels[predicted_class])
        reported += 1
reported