In [1]:
from google.colab import drive 
drive.mount("/content/drive", force_remount=True)
# Change directory to the package folder 
%cd '/content/drive/MyDrive/dl-project'
# Verify the contents of the current folder 
# !ls

Mounted at /content/drive
/content/drive/MyDrive/dl-project


In [2]:
!pip install transformers
!apt install libomp-dev
!python -m pip install --upgrade faiss-gpu==1.7.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (1:10.0-50~exp1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# copy dataset from google drive to the actual machine
# !rm -rf /content/data/

import os 
DATA_FOLDER = '/content/data/shopee-product-matching/'
if not os.path.isdir(DATA_FOLDER):    
    !mkdir /content/data/
    !cp /content/drive/MyDrive/dl-project/shopee-product-matching.zip /content/data/shopee-product-matching.zip
    !unzip -q /content/data/shopee-product-matching.zip -d /content/data/shopee-product-matching
    !cp /content/drive/MyDrive/dl-project/data/train.csv /content/data/shopee-product-matching/train.csv
    !cp /content/drive/MyDrive/dl-project/data/train80.csv /content/data/shopee-product-matching/train80.csv
    !cp /content/drive/MyDrive/dl-project/data/valid20.csv /content/data/shopee-product-matching/valid20.csv

In [4]:
import numpy as np
import pandas as pd

import torch
import torchvision.transforms as transforms
from torch.utils.data.dataset import Dataset
from tqdm import tqdm
from modules.datasets.ImageShopeeDataset import ImageShopeeDataset
from modules.models.ResNet18EmbeddingsShopeeNet import ResNet18EmbeddingsShopeeNet
from modules.models.DenseNetEmbeddingsShopeeNet import DenseNetEmbeddingsShopeeNet
from modules.utils.CommonVIsualizer import CommonVisualizer
from modules.utils.F1ScoreEvaluator import F1ScoreEvaluator
from modules.utils.EmbeddingsProducer import EmbeddingsProducer
import modules.utils.dataset_utils as dataset_utils

In [5]:
train_df = dataset_utils.get_dataset(DATA_FOLDER, is_test=False, file_name='train80.csv')
train_df = dataset_utils.add_target(train_df)

valid_df = dataset_utils.get_dataset(DATA_FOLDER, is_test=False, file_name='valid20.csv')
valid_df = dataset_utils.add_target(valid_df)


all_df = train_df.append(valid_df, ignore_index=True)
print(f"All shape {all_df.shape}")
print(f"Train shape {train_df.shape}")
print(f"Valid shape {valid_df.shape}")

All shape (34250, 7)
Train shape (27399, 7)
Valid shape (6851, 7)


  all_df = train_df.append(valid_df, ignore_index=True)


In [6]:
trans = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]) 
train_dataset = ImageShopeeDataset(
    train_df['image'].values,
    trans
    )
valid_dataset = ImageShopeeDataset(
    valid_df['image'].values,
    trans
)

all_dataset = ImageShopeeDataset(
    all_df['image'].values,
    trans
)


In [7]:
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=2
)

valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=2
)

all_loader = torch.utils.data.DataLoader(
    all_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=2
)

In [8]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
resnet18_emb_model = ResNet18EmbeddingsShopeeNet()
resnet18_emb_model.to(DEVICE)
producer = EmbeddingsProducer(resnet18_emb_model, DEVICE)

resnet18_train_embeddings = producer.get_embeddings(train_loader, normalize=True)
resnet18_train_embeddings = resnet18_train_embeddings.squeeze()

resnet18_valid_embeddings = producer.get_embeddings(valid_loader, normalize=True)
resnet18_valid_embeddings = resnet18_valid_embeddings.squeeze()

resnet18_all_embeddings = np.concatenate((resnet18_train_embeddings, resnet18_valid_embeddings))

print(resnet18_all_embeddings.shape)

100%|██████████| 429/429 [02:47<00:00,  2.56it/s]
100%|██████████| 108/108 [00:42<00:00,  2.53it/s]

(34250, 512)





In [17]:
densenet_model = DenseNetEmbeddingsShopeeNet()
densenet_model.to(DEVICE)
producer = EmbeddingsProducer(densenet_model, DEVICE)
densenet_train_embeddings = producer.get_embeddings(train_loader, normalize=True)
densenet_valid_embeddings = producer.get_embeddings(valid_loader, normalize=True)
densenet_all_embeddings = np.concatenate((densenet_train_embeddings, densenet_valid_embeddings))
print(densenet_all_embeddings.shape)

100%|██████████| 429/429 [02:47<00:00,  2.57it/s]
100%|██████████| 108/108 [00:42<00:00,  2.56it/s]

(34250, 1000)





In [22]:
thresholds = np.arange(0.8, 0.99, 0.01)
visualizer = CommonVisualizer()

In [20]:
f1_score_eval_resnet18_train = F1ScoreEvaluator(train_df, resnet18_train_embeddings, k=100)
f1_score_eval_resnet18_valid = F1ScoreEvaluator(valid_df, resnet18_valid_embeddings, k=100)
f1_score_eval_resnet18_all = F1ScoreEvaluator(all_df, resnet18_all_embeddings, k=100)

f1_avg_scores_resnet18_train = f1_score_eval_resnet18_train.get_avg_f1_scores_for_thresholds(thresholds)
f1_avg_scores_resnet18_valid = f1_score_eval_resnet18_valid.get_avg_f1_scores_for_thresholds(thresholds)
f1_avg_scores_resnet18_all = f1_score_eval_resnet18_all.get_avg_f1_scores_for_thresholds(thresholds)

100%|██████████| 19/19 [02:42<00:00,  8.54s/it]
100%|██████████| 19/19 [00:40<00:00,  2.13s/it]
100%|██████████| 19/19 [03:24<00:00, 10.76s/it]


In [26]:
import matplotlib.pyplot as plt 

visualizer.plt_f1_score_vs_threshold(thresholds, f1_avg_scores_resnet18_train, 'ResNet18, baseline, train', './final-charts/resnet18-baseline-thresholds-train.png')
max_f1_resnet_train = max(f1_avg_scores_resnet18_train)
max_threshold_resnet_train = thresholds[np.argmax(f1_avg_scores_resnet18_train)]
plt.clf()

visualizer.plt_f1_score_vs_threshold(thresholds, f1_avg_scores_resnet18_valid, 'ResNet18, baseline, valid', './final-charts/resnet18-baseline-thresholds-valid.png')
max_f1_resnet_valid = max(f1_avg_scores_resnet18_valid)
max_threshold_resnet_valid = thresholds[np.argmax(f1_avg_scores_resnet18_valid)]
plt.clf()

visualizer.plt_f1_score_vs_threshold(thresholds, f1_avg_scores_resnet18_all, 'ResNet18, baseline', './final-charts/resnet18-baseline-thresholds-all.png')
max_f1_resnet_all = max(f1_avg_scores_resnet18_all)
max_threshold_resnet_all = thresholds[np.argmax(f1_avg_scores_resnet18_all)]
plt.clf()



<Figure size 640x480 with 0 Axes>

In [27]:
print(f"ResNet18, Train | Max f1-score: {max_f1_resnet_train}, max threshold: {max_threshold_resnet_train}")
print(f"ResNet18, Valid | Max f1-score: {max_f1_resnet_valid}, max threshold: {max_threshold_resnet_valid}")
print(f"ResNet18, All | Max f1-score: {max_f1_resnet_all}, max threshold: {max_threshold_resnet_all}")

Train | Max f1-score: 0.663419303454426, max threshold: 0.9000000000000001
Valid | Max f1-score: 0.6850545919519022, max threshold: 0.8800000000000001
  All | Max f1-score: 0.6555288254243538, max threshold: 0.9100000000000001


---------

In [28]:
f1_score_eval_densenet_train = F1ScoreEvaluator(train_df, densenet_train_embeddings, k=100)
f1_score_eval_densenet_valid = F1ScoreEvaluator(valid_df, densenet_valid_embeddings, k=100)
f1_score_eval_densenet_all = F1ScoreEvaluator(all_df, densenet_all_embeddings, k=100)

f1_avg_scores_densenet_train = f1_score_eval_densenet_train.get_avg_f1_scores_for_thresholds(thresholds)
f1_avg_scores_densenet_valid = f1_score_eval_densenet_valid.get_avg_f1_scores_for_thresholds(thresholds)
f1_avg_scores_densenet_all = f1_score_eval_densenet_all.get_avg_f1_scores_for_thresholds(thresholds)

100%|██████████| 19/19 [02:48<00:00,  8.85s/it]
100%|██████████| 19/19 [00:41<00:00,  2.19s/it]
100%|██████████| 19/19 [03:31<00:00, 11.14s/it]


In [31]:
import matplotlib.pyplot as plt 

visualizer.plt_f1_score_vs_threshold(thresholds, f1_avg_scores_densenet_train, 'DenseNet, baseline, train', './final-charts/densenet-baseline-thresholds-train.png')
max_f1_densenet_train = max(f1_avg_scores_densenet_train)
max_threshold_densenet_train = thresholds[np.argmax(f1_avg_scores_densenet_train)]
plt.clf()

visualizer.plt_f1_score_vs_threshold(thresholds, f1_avg_scores_densenet_valid, 'DenseNet, baseline, valid', './final-charts/densenet-baseline-thresholds-valid.png')
max_f1_densenet_valid = max(f1_avg_scores_densenet_valid)
max_threshold_densenet_valid = thresholds[np.argmax(f1_avg_scores_densenet_valid)]
plt.clf()

visualizer.plt_f1_score_vs_threshold(thresholds, f1_avg_scores_densenet_valid, 'DenseNet, baseline', './final-charts/densenet-baseline-thresholds-all.png')
max_f1_densenet_all = max(f1_avg_scores_densenet_all)
max_threshold_densenet_all = thresholds[np.argmax(f1_avg_scores_densenet_all)]
plt.clf()



<Figure size 640x480 with 0 Axes>

In [32]:
print(f"DenseNet, Train | Max f1-score: {max_f1_densenet_train}, max threshold: {max_threshold_densenet_train}")
print(f"DenseNet, Valid | Max f1-score: {max_f1_densenet_valid}, max threshold: {max_threshold_densenet_valid}")
print(f"DenseNet,   All | Max f1-score: {max_f1_densenet_all}, max threshold: {max_threshold_densenet_all}")

DenseNet, Train | Max f1-score: 0.6571591522253317, max threshold: 0.9200000000000002
DenseNet, Valid | Max f1-score: 0.6757532167626633, max threshold: 0.9100000000000001
DenseNet,   All | Max f1-score: 0.6501934780075753, max threshold: 0.9200000000000002
