In [1]:
from google.colab import drive 
drive.mount("/content/drive", force_remount=True)
# Change directory to the package folder 
%cd '/content/drive/MyDrive/dl-project'


Mounted at /content/drive
/content/drive/MyDrive/dl-project


In [2]:
!pip install ray[tune]
!apt install libomp-dev
!python -m pip install --upgrade faiss-gpu==1.7.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ray[tune]
  Downloading ray-2.3.1-cp39-cp39-manylinux2014_x86_64.whl (58.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.6/58.6 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting frozenlist
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Collecting aiosignal
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting virtualenv>=20.0.24
  Downloading virtualenv-20.22.0-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m109.0 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX>=1.9
  Downloading tensorboardX-2.6-py2.py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━

In [3]:
# copy dataset from google drive to the actual machine
# !rm -rf /content/data/

import os 
DATA_FOLDER = '/content/data/shopee-product-matching/'
if not os.path.isdir(DATA_FOLDER):    
    !mkdir /content/data/
    !cp /content/drive/MyDrive/dl-project/shopee-product-matching.zip /content/data/shopee-product-matching.zip
    !unzip -q /content/data/shopee-product-matching.zip -d /content/data/shopee-product-matching

In [4]:
import numpy as np
import pandas as pd

import torch
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data.dataset import Dataset
from tqdm import tqdm
from modules.datasets.ImageContrastiveLossShopeeDataset import ImageContrastiveLossShopeeDataset
from modules.losses.ContrastiveLoss import ContrastiveLoss
from modules.models.SiameseNet import SiameseNet
from modules.distances.CosineDistance import CosineDistance
from modules.models.ResNet18EmbeddingsShopeeNet import ResNet18EmbeddingsShopeeNet
import modules.utils.dataset_utils as dataset_utils
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from functools import partial
import faiss

In [5]:
train_df = dataset_utils.get_dataset(DATA_FOLDER, is_test=False)
train_df = dataset_utils.add_target(train_df)
train_df.shape

(34250, 6)

In [6]:
# !rm  ./contrastive_loss_data.csv
CONTRASTIVE_LOSS_DATASET_PATH = './contrastive_loss_data.csv'
train_cl_df = dataset_utils.get_contrastive_loss_dataset(train_df, read_path=CONTRASTIVE_LOSS_DATASET_PATH)
train_cl_df.to_csv(CONTRASTIVE_LOSS_DATASET_PATH, index=False)
train_cl_df.shape

(68500, 7)

In [7]:
class CFG:
    IMG_SZ = 256
    TRAIN_RATIO = 0.8
    EPOCHS = 10
    MARGIN = 0.5
    DISTANCE = torch.nn.PairwiseDistance()



In [8]:
def get_datasets(cl_df):
    unique_ids = np.unique(
        np.concatenate([
            cl_df['posting_id_1'].unique(), 
            cl_df['posting_id_2'].unique()
            ])
        )
    
    np.random.shuffle(unique_ids)
    train_size = int(CFG.TRAIN_RATIO * len(unique_ids))
    train_ids = unique_ids[:train_size]  # 80% for training
    valid_ids = unique_ids[train_size:]
    train_cl_df = cl_df[(cl_df['posting_id_1'].isin(train_ids)) | (cl_df['posting_id_2'].isin(train_ids))]
    valid_cl_df = cl_df[(cl_df['posting_id_1'].isin(valid_ids)) | (cl_df['posting_id_2'].isin(valid_ids))] 
    
    train_transforms = transforms.Compose([
            transforms.Resize((CFG.IMG_SZ, CFG.IMG_SZ)),
            transforms.RandomPosterize(bits=2, p=0.3),
            transforms.RandomHorizontalFlip(p=0.4),
            transforms.RandomAutocontrast(p=0.3),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    valid_transforms = transforms.Compose([
            transforms.Resize((CFG.IMG_SZ, CFG.IMG_SZ)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    train_dataset = ImageContrastiveLossShopeeDataset(
        train_cl_df['image_1'].values,
        train_cl_df['image_2'].values,
        train_cl_df['label'].values,
        train_transforms,
    )

    valid_dataset = ImageContrastiveLossShopeeDataset(
        valid_cl_df['image_1'].values,
        valid_cl_df['image_2'].values,
        valid_cl_df['label'].values,
        valid_transforms,
    )
    
    return train_dataset, valid_dataset


In [9]:
def train_model(config, cl_df, checkpoint_dir=None):
    model = ResNet18EmbeddingsShopeeNet(config['bn'], freeze_layers=config['freeze'])
    
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
    

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    siamese_net = SiameseNet(model)
    if torch.cuda.device_count() > 1:
        siamese_net = torch.nn.DataParallel(siamese_net)
    siamese_net.to(device)
    criterion = ContrastiveLoss(device=device, m=CFG.MARGIN, distance=CFG.DISTANCE)
    optimizer = optim.Adam(siamese_net.parameters(), lr=config['lr'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=config['factor'])

    # Datasets and data loaders
    train_dataset, valid_dataset = get_datasets(cl_df)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=0,
        # persistent_workers=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=0,
        # persistent_workers=True,
    )

    ## Training loop: training + validation
    for epoch in range(CFG.EPOCHS):
        ## Training
        train_loss = 0.0
        epoch_steps = 0
        siamese_net.train()
        for i, data in enumerate(train_loader, 0):
            input_1, input_2, target = data
            input_1, input_2, target = input_1.to(device), input_2.to(device), target.to(device)
            optimizer.zero_grad()
            output_1, output_2 = siamese_net(input_1, input_2)
            loss = criterion(output_1, output_2, target)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            epoch_steps += 1
            if i % 50 == 0:
                print("[%d, %5d / %d batches] loss: %.3f" % (epoch + 1, i + 1, len(train_loader),
                                                train_loss / epoch_steps))
        

        ## Validation
        val_loss = 0.0
        siamese_net.eval()
        for i, data in enumerate(valid_loader, 0):
            with torch.no_grad():
                input_1, input_2, target = data
                input_1, input_2, target = input_1.to(device), input_2.to(device), target.to(device)
                output_1, output_2 = siamese_net(input_1, input_2)
                loss = criterion(output_1, output_2, target)
                val_loss += loss.cpu().numpy()

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)
        
        tune.report(
            val_loss=(val_loss / len(valid_loader)),
            train_loss=(train_loss / len(train_loader))
            )

In [10]:
config = {
    "bn": tune.choice([False, True]),
    "lr": tune.loguniform(1e-6, 1e-1),
    "batch_size": tune.choice([32, 64]),
    "factor": tune.uniform(0.1, 0.99),
    "freeze": tune.choice([0, 3, 6, 6])
}


In [11]:
def run_hyperparameter_search(num_samples, max_epochs, gpus_per_trial):
    cur_cl_df = train_cl_df.sample(frac=0.5)
    scheduler = ASHAScheduler(
        metric="val_loss",
        mode="min",
        max_t=max_epochs,
        grace_period=3,
        reduction_factor=3)
    reporter = CLIReporter(
        metric_columns=["train_loss", "val_loss", "training_iteration"]
        )
    result = tune.run(
        partial(train_model, cl_df=cur_cl_df),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)
    return result


In [None]:
result = run_hyperparameter_search(num_samples=8, max_epochs=CFG.EPOCHS, gpus_per_trial=0.25)

2023-04-22 21:34:00,548	INFO worker.py:1553 -- Started a local Ray instance.

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/tune/api_docs/trainable.html



== Status ==
Current time: 2023-04-22 21:34:02 (running for 00:00:00.91)
Memory usage on this node: 2.8/51.0 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 9.000: None | Iter 3.000: None
Resources requested: 2.0/8 CPUs, 0.25/1 GPUs, 0.0/30.37 GiB heap, 0.0/15.19 GiB objects
Result logdir: /root/ray_results/train_model_2023-04-22_21-34-01
Number of trials: 8/8 (7 PENDING, 1 RUNNING)
+-------------------------+----------+------------------+--------------+-------+----------+----------+-------------+
| Trial name              | status   | loc              |   batch_size | bn    |   factor |   freeze |          lr |
|-------------------------+----------+------------------+--------------+-------+----------+----------+-------------|
| train_model_66040_00000 | RUNNING  | 172.28.0.12:3210 |           64 | True  | 0.335834 |        0 | 0.00230678  |
| train_model_66040_00001 | PENDING  |                  |           64 | False | 0.500207 |        6 | 9.54807e-05 |
| train_model_66040_00

[2m[36m(func pid=3210)[0m Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
  0%|          | 0.00/44.7M [00:00<?, ?B/s]
 28%|██▊       | 12.7M/44.7M [00:00<00:00, 106MB/s]
 75%|███████▌  | 33.5M/44.7M [00:00<00:00, 166MB/s]
100%|██████████| 44.7M/44.7M [00:00<00:00, 173MB/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| train_model_66040_00007 | PENDING  |                  |           32 | True  | 0.694962 |        3 | 4.59365e-06 |
+-------------------------+----------+------------------+--------------+-------+----------+----------+-------------+


[2m[36m(func pid=3292)[0m [1,   151 / 513 batches] loss: 0.055
== Status ==
Current time: 2023-04-22 21:39:36 (running for 00:05:34.59)
Memory usage on this node: 14.3/51.0 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 9.000: None | Iter 3.000: None
Resources requested: 8.0/8 CPUs, 1.0/1 GPUs, 0.0/30.37 GiB heap, 0.0/15.19 GiB objects
Result logdir: /root/ray_results/train_model_2023-04-22_21-34-01
Number of trials: 8/8 (4 PENDING, 4 RUNNING)
+-------------------------+----------+------------------+--------------+-------+----------+----------+-------------+
| Trial name              | status   | loc              |   batch_size | bn    |   factor |   freeze |          lr |
|-----

Trial name,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,train_loss,training_iteration,trial_id,val_loss,warmup_time
train_model_66040_00000,2023-04-23_01-32-34,False,,378743de3fde456ea3562d6f26c507c4,579fce779cb1,9,172.28.0.12,3210,True,14309.0,1567.56,14309.0,1682213554,0,,0.0522542,9,66040_00000,0.053379,0.00344133
train_model_66040_00001,2023-04-23_01-10-45,False,,b8611f98fc8b4749b98ff13a245f99c3,579fce779cb1,9,172.28.0.12,3292,True,12995.6,1441.36,12995.6,1682212245,0,,0.0547474,9,66040_00001,0.0565145,0.00382257
train_model_66040_00002,2023-04-23_01-33-20,True,,ffe8d7cb119b45e3903b9deed2d3ea5e,579fce779cb1,10,172.28.0.12,3295,True,14350.4,1415.95,14350.4,1682213600,0,,0.0551961,10,66040_00002,0.0581232,0.00377345
train_model_66040_00003,2023-04-23_01-25-53,False,,fe576df11a0b48df970a256920eb3453,579fce779cb1,9,172.28.0.12,3297,True,13903.5,1527.49,13903.5,1682213153,0,,0.0551445,9,66040_00003,0.0561846,0.00696993


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| train_model_66040_00006 | PENDING  |                  |           32 | False | 0.241172 |        6 | 6.89185e-05 |              |            |                      |
| train_model_66040_00007 | PENDING  |                  |           32 | True  | 0.694962 |        3 | 4.59365e-06 |              |            |                      |
+-------------------------+----------+------------------+--------------+-------+----------+----------+-------------+--------------+------------+----------------------+


== Status ==
Current time: 2023-04-23 01:14:35 (running for 03:40:33.71)
Memory usage on this node: 14.8/51.0 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 9.000: -0.05706640437168161 | Iter 3.000: -0.05626609390577451
Resources requested: 8.0/8 CPUs, 1.0/1 GPUs, 0.0/30.37 GiB heap, 0.0/15.19 GiB objects
Result logdir: /root/ray_results/train_model_2023-04-22_21-34-01
Number of trials: 8/8 (4 PENDING, 4 RUNNING)
+---

[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_env/py_modules.py", line 24, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.working_dir import set_pythonpath_in_context
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/run

[2m[36m(func pid=3297)[0m [10,   451 / 1030 batches] loss: 0.055


[2m[33m(raylet)[0m [2023-04-23 01:34:20,451 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(68869) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3210)[0m [10,    51 / 514 batches] loss: 0.051
[2m[36m(func pid=3297)[0m [10,   501 / 1030 batches] loss: 0.055


[2m[33m(raylet)[0m [2023-04-23 01:35:20,455 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(69156) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3297)[0m [10,   551 / 1030 batches] loss: 0.055
[2m[36m(func pid=3210)[0m [10,   101 / 514 batches] loss: 0.052


[2m[33m(raylet)[0m [2023-04-23 01:36:20,459 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(69439) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3297)[0m [10,   601 / 1030 batches] loss: 0.055


[2m[33m(raylet)[0m [2023-04-23 01:37:20,464 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(69722) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3297)[0m [10,   651 / 1030 batches] loss: 0.055
[2m[36m(func pid=3210)[0m [10,   151 / 514 batches] loss: 0.052
[2m[36m(func pid=3297)[0m [10,   701 / 1030 batches] loss: 0.055


[2m[33m(raylet)[0m [2023-04-23 01:38:20,469 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(70002) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3297)[0m [10,   751 / 1030 batches] loss: 0.055


[2m[33m(raylet)[0m [2023-04-23 01:39:20,473 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(70281) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3210)[0m [10,   201 / 514 batches] loss: 0.053
[2m[36m(func pid=3297)[0m [10,   801 / 1030 batches] loss: 0.055


[2m[33m(raylet)[0m [2023-04-23 01:40:20,477 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(70560) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3297)[0m [10,   851 / 1030 batches] loss: 0.055


[2m[33m(raylet)[0m [2023-04-23 01:41:20,481 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(70842) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3210)[0m [10,   251 / 514 batches] loss: 0.053
[2m[36m(func pid=3297)[0m [10,   901 / 1030 batches] loss: 0.055


[2m[33m(raylet)[0m [2023-04-23 01:42:20,486 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(71121) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3297)[0m [10,   951 / 1030 batches] loss: 0.055
[2m[36m(func pid=3210)[0m [10,   301 / 514 batches] loss: 0.053
[2m[36m(func pid=3297)[0m [10,  1001 / 1030 batches] loss: 0.055


[2m[33m(raylet)[0m [2023-04-23 01:43:20,491 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(71403) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3210)[0m [10,   351 / 514 batches] loss: 0.053


[2m[33m(raylet)[0m [2023-04-23 01:45:20,501 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(71971) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3210)[0m [10,   401 / 514 batches] loss: 0.052


[2m[33m(raylet)[0m [2023-04-23 01:47:20,512 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(72530) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3210)[0m [10,   451 / 514 batches] loss: 0.052


[2m[33m(raylet)[0m [2023-04-23 01:48:20,517 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(72814) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

[2m[36m(func pid=3210)[0m [10,   501 / 514 batches] loss: 0.052


[2m[33m(raylet)[0m [2023-04-23 01:50:20,527 E 2692 2692] (raylet) worker_pool.cc:525: Some workers of the worker process(73373) have not registered within the timeout. The process is dead, probably it crashed during start.
[2m[33m(raylet)[0m Traceback (most recent call last):
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/workers/setup_worker.py", line 4, in <module>
[2m[33m(raylet)[0m     from ray._private.ray_constants import LOGGER_FORMAT, LOGGER_LEVEL
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/__init__.py", line 136, in <module>
[2m[33m(raylet)[0m     from ray._private.worker import (  # noqa: E402,F401
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/worker.py", line 74, in <module>
[2m[33m(raylet)[0m     from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
[2m[33m(raylet)[0m   File "/usr/local/lib/python3.9/dist-packages/ray/_private/runtime_e

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-ce65bca7c9d9>", line 1, in <cell line: 1>
    result = run_hyperparameter_search(num_samples=8, max_epochs=CFG.EPOCHS, gpus_per_trial=0.25)
  File "<ipython-input-11-5bfc5ad920e6>", line 12, in run_hyperparameter_search
    result = tune.run(
  File "/usr/local/lib/python3.9/dist-packages/ray/tune/tune.py", line 758, in run
    _report_progress(runner, progress_reporter)
  File "/usr/local/lib/python3.9/dist-packages/ray/tune/tune.py", line 138, in _report_progress
    reporter.report(trials, done, sched_debug_str, executor_debug_str)
  File "/usr/local/lib/python3.9/dist-packages/ray/tune/progress_reporter.py", line 715, in report
    self._print(self._progress_str(trials, done, *sys_info))
  File "/usr/local/lib/python3.9/dist-packages/ray/tune/progress_reporter.py", 

In [None]:
best_trial = result.get_best_trial("val_loss", "min", "last")
best_checkpoint = result.get_best_checkpoint(trial=best_trial, metric="val_loss", mode="min")

my_trials = [ trial for trial in list(result.trial_dataframes.keys()) if str(best_trial) in trial]
result.trial_dataframes[my_trials[0]]["val_loss"]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-444b9e0dd449>", line 1, in <cell line: 1>
    best_trial = result.get_best_trial("val_loss", "min", "last")
NameError: name 'result' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/usr/local/lib/pyt

In [None]:

print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
    best_trial.last_result["val_loss"]))

best_model = ResNet18EmbeddingsShopeeNet(config['bn'])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

best_checkpoint = result.get_best_checkpoint(trial=best_trial, metric="val_loss", mode="min")
best_checkpoint_dir = best_checkpoint.to_directory(path="directory")

model_state, optimizer_state = torch.load(os.path.join(best_checkpoint_dir, "checkpoint"))
best_model.load_state_dict(model_state)
torch.save(best_model.state_dict(), './best_model_params.pt')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-69f9c72f92c6>", line 1, in <cell line: 1>
    print("Best trial config: {}".format(best_trial.config))
NameError: name 'best_trial' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/usr/local/lib/pyth

In [None]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_model.to(DEVICE)

from modules.datasets.ImageShopeeDataset import ImageShopeeDataset

images_dataset = ImageShopeeDataset(
    train_df['image'].values,
    transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]))

images_loader = torch.utils.data.DataLoader(
    images_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=2
)

image_embeddings = []
i = 0
with torch.no_grad():
    for data in tqdm(images_loader):
        i += 1
        data = data.to(DEVICE)
        embeddings = best_model(data)
        embeddings = embeddings.reshape(embeddings.shape[0], embeddings.shape[1])
        embeddings = embeddings.detach().cpu().numpy()
        image_embeddings.extend(embeddings)

In [None]:
image_embeddings = np.stack(image_embeddings)
norms = np.linalg.norm(image_embeddings, axis=1)
image_embeddings = image_embeddings / norms[:, np.newaxis]
image_embeddings.shape

In [None]:
k = 100
res = faiss.StandardGpuResources()
index_img = faiss.IndexFlatIP(512)
# index_img = faiss.index_cpu_to_gpu(res, 0, index_img)
index_img.add(image_embeddings)
similarities_img, indexes_img = index_img.search(image_embeddings, k)

In [None]:
def calc_f1_score(targets, results):
    intersect = len(np.intersect1d(targets, results))
    return 2 * intersect / (len(targets) + len(results))


def process_for_threshold(similarities, indexes, threshold):
    f1_score_accumulated = 0
    for i in range(len(image_embeddings)):
        cur_sims = similarities[i]
        cur_indexes = indexes[i]
        duplicate_indexes = cur_indexes[cur_sims >= threshold]
        results = train_df.iloc[duplicate_indexes]['posting_id'].values
        targets = train_df.iloc[i]['target']
        f1_score = calc_f1_score(targets, results)
        f1_score_accumulated += f1_score
    return f1_score_accumulated / len(image_embeddings)


thresholds = np.arange(0.85, 0.99, 0.01)
f1_avg_scores = []
for threshold in tqdm(thresholds):
    f1_avg = process_for_threshold(similarities_img, indexes_img, threshold)
    f1_avg_scores.append(f1_avg)

In [None]:
import matplotlib.pyplot as plt

plt.plot(thresholds, f1_avg_scores)

# Add labels and title
plt.xlabel('Threshold for cosine similarity')
plt.ylabel('Average F1-score')
plt.title('F1-score vs threshold for cosine image similarity')
plt.grid(True)

max_f1 = max(f1_avg_scores)
max_threshold = thresholds[np.argmax(f1_avg_scores)]

# Save the plot to a file
plt.savefig('cnn-contrastive-loss-thresholds.png')

In [None]:
print(f"Max f1-score: {max_f1}, threshold: {max_threshold}") 