In [1]:
# import pre-processed data
import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import PIL
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout, BatchNormalization, Activation, GlobalAveragePooling2D
from tensorflow.keras import applications, layers, losses, optimizers, Model
from tensorflow.keras.models import Sequential
import keras_toolkit as kt
from sklearn.model_selection import train_test_split
from textwrap import wrap


2023-04-04 19:55:47.349057: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# # change tensorflow device to GPU
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

## Pre-process image data so that it can be used in the model

In [3]:
PATH = '/workspaces/Shopee-Price-Match-Guarantee/00_source_data/shopee-product-matching/'
PATH_TO_IMG = '/workspaces/Shopee-Price-Match-Guarantee/00_source_data/shopee-product-matching/train_images/'
PATH_TO_TEST = '/workspaces/Shopee-Price-Match-Guarantee/00_source_data/shopee-product-matching/test_images/'
os.listdir(PATH)

['train.csv',
 'sample_submission.csv',
 'train_images',
 'test.csv',
 'test_images']

In [4]:
# how many images are there in trainig set
len(os.listdir(PATH_TO_IMG))

32411

In [None]:
COMPUTE_CV = True
if len(pd.read_csv(PATH + 'test.csv')) > 3: COMPUTE_CV = False

In [None]:
if COMPUTE_CV:
    dataset = pd.read_csv(PATH + 'train.csv')
    tmp = dataset.groupby('label_group').posting_id.agg('unique').to_dict() 
    dataset['target'] = dataset.label_group.map(tmp) # map the posting_ids that have same image
else:    
    dataset = pd.read_csv(PATH + 'test.csv')

In [None]:
dataset.head()

In [None]:
def show_random_img():
    # choose randomly two instances per each class
    labels_to_show = np.random.choice(dataset.label_group.unique(), 
                                      replace=False, size=27)
    img_to_show = []
    for label in labels_to_show:
        rows = dataset[dataset.label_group==label].copy()
        pair = np.random.choice([i for i in range(len(rows))], 
                                    replace=False, size=2)
        img_pair = rows.iloc[pair][['image', 'title']].values
        
        img_to_show += list(img_pair)
    
    fig, axes = plt.subplots(figsize = (18, 12), nrows=4,ncols=6)
    for imp, ax in zip(img_to_show, axes.ravel()):
        img = cv2.imread(PATH_TO_IMG + imp[0])
        title = '\n'.join(wrap(imp[1], 20))
        ax.set_title(title)
        ax.imshow(img)
        ax.axis('off')

    fig.tight_layout()

In [None]:
if COMPUTE_CV:
    show_random_img()

### Reorganize the data into a directory structure that can be used by the model

In [None]:
# create 20_intermediate_data folder if not exist
if not os.path.exists('/workspaces/Shopee-Price-Match-Guarantee/20_intermediate_data'):
    os.mkdir('/workspaces/Shopee-Price-Match-Guarantee/20_intermediate_data')
    print("Folder Created")
else:
    print("Folder already exists")

In [None]:
# create folder for each class
for label in dataset.label_group.unique():
    if not os.path.exists('/workspaces/Shopee-Price-Match-Guarantee/20_intermediate_data/' + str(label)):
        os.mkdir('/workspaces/Shopee-Price-Match-Guarantee/20_intermediate_data/' + str(label))
        print("Folder Created")
    else:
        print("Folder already exists")

In [None]:
# copy images to their respective class folder
for label in dataset.label_group.unique():
    for img in dataset[dataset.label_group==label].image.values:
        os.system('cp ' + PATH_TO_IMG + img + ' /workspaces/Shopee-Price-Match-Guarantee/20_intermediate_data/' + str(label) + '/')
        print("Image copied")

## Set up and Train the Model

In [5]:
# create train and validation set
train_ds = tf.keras.utils.image_dataset_from_directory(
    '/workspaces/Shopee-Price-Match-Guarantee/20_intermediate_data/',
    labels="inferred",
    label_mode="categorical",
    class_names=None,
    color_mode="rgb",
    batch_size=32,
    image_size=(256, 256),
    validation_split=0.2,
    subset="training",
    seed=17,
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    '/workspaces/Shopee-Price-Match-Guarantee/20_intermediate_data/',
    labels="inferred",
    label_mode="categorical",
    class_names=None,
    color_mode="rgb",
    batch_size=32,
    image_size=(256, 256),
    validation_split=0.2,
    subset="validation",
    seed=17,
)

Found 32459 files belonging to 11014 classes.
Using 25968 files for training.


2023-04-04 19:56:21.423142: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14575 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0001:00:00.0, compute capability: 7.0


Found 32459 files belonging to 11014 classes.
Using 6491 files for validation.


In [6]:
# check the image size
for image_batch, labels_batch in train_ds:
    print(image_batch.shape)
    print(labels_batch.shape)
    break

2023-04-04 19:56:29.726080: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [25968]
	 [[{{node Placeholder/_0}}]]
2023-04-04 19:56:29.726444: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [25968]
	 [[{{node Placeholder/_0}}]]


(32, 256, 256, 3)
(32, 11014)


In [8]:
num_classes = len(train_ds.class_names)
print(f"We have {num_classes} classes.")

We have 11014 classes.


In [None]:
class_names = train_ds.class_names
# print(class_names)

### Visualize 6 images from the training set

In [None]:
import matplotlib.pyplot as plotter_lib

plotter_lib.figure(figsize=(10, 10))

epochs=10

for images, labels in train_ds.take(1):

  for var in range(6):

    ax = plt.subplot(3, 3, var + 1)

    plotter_lib.imshow(images[var].numpy().astype("uint8"))

    plotter_lib.axis("off")

In [9]:
# import the ResNet50 model from keras

resnet = Sequential()
pretrained_model = applications.ResNet50V2(
    include_top=False,
    weights='imagenet',
    input_shape=(256, 256, 3),
    pooling='avg',
    classes=num_classes
)

for layer in pretrained_model.layers:
    layer.trainable = False

resnet.add(pretrained_model)
# add a fully connected layer
resnet.add(Flatten())
resnet.add(Dense(512, activation='relu'))
resnet.add(Dense(num_classes, activation='softmax'))


# save the model
# if not os.path.exists('/workspaces/Shopee-Price-Match-Guarantee/30_results/ResNet50'):
#     os.mkdir('/workspaces/Shopee-Price-Match-Guarantee/30_results/ResNet50')
#     print("Folder Created")
#     model.save('/workspaces/Shopee-Price-Match-Guarantee/30_results/ResNet50/resnet50v2.h5')
#     print("Model Saved")
# else:
#     print("Folder already exists")
#     model.save('/workspaces/Shopee-Price-Match-Guarantee/30_results/ResNet50/resnet50v2.h5')
#     print("Model Saved")

In [10]:
resnet.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet50v2 (Functional)     (None, 2048)              23564800  
                                                                 
 flatten (Flatten)           (None, 2048)              0         
                                                                 
 dense (Dense)               (None, 512)               1049088   
                                                                 
 dense_1 (Dense)             (None, 11014)             5650182   
                                                                 
Total params: 30,264,070
Trainable params: 6,699,270
Non-trainable params: 23,564,800
_________________________________________________________________


## Train and Evaluate the Model

In [11]:
epochs=10 # set default to 10

In [12]:
resnet.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

history = resnet.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/10


2023-04-04 19:57:02.861417: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-04-04 19:57:03.517602: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-04 19:57:03.518134: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-04 19:57:03.518195: W tensorflow/compiler/xla/stream_executor/gpu/asm_compiler.cc:109] Couldn't get ptxas version : FAILED_PRECONDITION: Couldn't get ptxas/nvlink version string: INTERNAL: Couldn't invoke ptxas --version
2023-04-04 19:57:03.518791: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-04 19:57:03.518885: W tensorflow/compiler/xla/stream_executor/gpu/redzone_allocator.cc:317] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This mes

InternalError: Graph execution error:

Detected at node 'StatefulPartitionedCall_3' defined at (most recent call last):
    File "<frozen runpy>", line 198, in _run_module_as_main
    File "<frozen runpy>", line 88, in _run_code
    File "/home/vscode/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/vscode/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/home/vscode/.venv/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 725, in start
      self.io_loop.start()
    File "/home/vscode/.venv/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/usr/local/lib/python3.11/asyncio/base_events.py", line 607, in run_forever
      self._run_once()
    File "/usr/local/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once
      handle._run()
    File "/usr/local/lib/python3.11/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/vscode/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/home/vscode/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/home/vscode/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/home/vscode/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/home/vscode/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/home/vscode/.venv/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/vscode/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "/home/vscode/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "/home/vscode/.venv/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/vscode/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/vscode/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/vscode/.venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_19139/1582739919.py", line 5, in <module>
      history = resnet.fit(train_ds, epochs=epochs, validation_data=val_ds)
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/engine/training.py", line 1054, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/optimizers/optimizer.py", line 543, in minimize
      self.apply_gradients(grads_and_vars)
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/optimizers/optimizer.py", line 1174, in apply_gradients
      return super().apply_gradients(grads_and_vars, name=name)
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/optimizers/optimizer.py", line 650, in apply_gradients
      iteration = self._internal_apply_gradients(grads_and_vars)
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/optimizers/optimizer.py", line 1200, in _internal_apply_gradients
      return tf.__internal__.distribute.interim.maybe_merge_call(
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/optimizers/optimizer.py", line 1250, in _distributed_apply_gradients_fn
      distribution.extended.update(
    File "/home/vscode/.venv/lib/python3.11/site-packages/keras/optimizers/optimizer.py", line 1245, in apply_grad_to_update_var
      return self._update_step_xla(grad, var, id(self._var_key(var)))
Node: 'StatefulPartitionedCall_3'
libdevice not found at ./libdevice.10.bc
	 [[{{node StatefulPartitionedCall_3}}]] [Op:__inference_train_function_9674]

## Template code below

In [None]:
# template code

# Data Preparation
 # training dataset train_ds
# validation dataset val_ds

# Model Architecture
base_model = tf.keras.applications.ResNet50V2(
    include_top=True,
    weights="imagenet",
    input_tensor=None,
    input_shape=None,
    pooling=None,
    classes=1000,
    classifier_activation="softmax",
)

x = tf.keras.layers.GlobalAveragePooling1D()(tf.expand_dims(x, axis=1))
x = tf.keras.layers.Dense(256, activation='relu')(x)
predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
model = tf.keras.models.Model(inputs=base_model.input, outputs=predictions)

# Hyperparameter Tuning
learning_rate = 0.001
batch_size = 32
num_epochs = 10
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.CategoricalCrossentropy()

# Compile the model
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Training
history = model.fit(train_ds, epochs=num_epochs, batch_size=batch_size, validation_data=val_ds)

# Evaluation on validation set
model.evaluate(val_ds)


# Fine-tuning
model.trainable = True
fine_tune_at = 100
for layer in model.layers[:fine_tune_at]:
  layer.trainable = False
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate/10)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])
history_fine = model.fit(train_ds, epochs=num_epochs, batch_size=batch_size, validation_data=val_ds)

# mkdir for saving model
if not os.path.exists('/workspaces/Shopee-Price-Match-Guarantee/30_results/ResNet50'):
    os.mkdir('/workspaces/Shopee-Price-Match-Guarantee/30_results/ResNet50')
    print("Folder Created")
    model.save('/workspaces/Shopee-Price-Match-Guarantee/30_results/ResNet50/resnet50v2.h5')
    print("Model Saved")
else:
    print("Folder already exists")
    model.save('/workspaces/Shopee-Price-Match-Guarantee/30_results/ResNet50/resnet50v2.h5')
    print("Model Saved")
