### Effnet

In [1]:
!pip install ../input/externalshopee/Keras_Applications-1.0.8-py3-none-any.whl >> /dev/null
!pip install ../input/externalshopee/efficientnet-1.1.1-py3-none-any.whl >> /dev/null

In [2]:
import gc
import time
import math

import numpy as np
import pandas as pd

import cudf
import cuml
import cupy
from cuml import PCA
from cuml.neighbors import NearestNeighbors
from cuml.feature_extraction.text import TfidfVectorizer

from tqdm.notebook import tqdm
tqdm.pandas()

import tensorflow as tf
import efficientnet.tfkeras as efn

  from pandas import Panel


In [3]:
# Let tf.data pick parallelism / prefetch depth on its own
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# NOTE(review): BATCH_SIZE is rebound to 16 in the SBert configuration cell further down;
# get_dataset() reads the global at call time, so the value in effect depends on cell order.
BATCH_SIZE = 8
# Read by decode_image() at call time; the EfficientNet inference cells reassign it
# before each model (512 / 384 / 456).
IMAGE_SIZE = [512, 512]
# Seed
SEED = 42
# Verbosity
VERBOSE = 1
# Number of classes (size of the ArcMargin head when the models are rebuilt)
N_CLASSES = 11014

# Paths
WORK_DIR = "../input/shopee-product-matching/"

# Weight files; get_image_embeddings() infers the backbone from the 'B3'/'B4'/'B5' substring.
EFFNET_B3 = '../input/all-data-shopee-efficientnetb3-arcmarginproduct-v2/EfficientNetB3_512_42.h5'
EFFNET_B4 = '../input/all-data-shopee-efficientnetb4-coslr-to-ens/EfficientNetB4_384_42.h5'
# EFFNET_B4 = '../input/all-data-shopee-efficientnetb4-to-ens/EfficientNetB4_384_42.h5'
EFFNET_B4_2 = '../input/last-validation-data-shopee-efficientnetb4/EfficientNetB4_456_2020.h5'
EFFNET_B5 = '../input/all-data-efficientnetb5-more-aug-to-ens/EfficientNetB5_512_42.h5'

In [4]:
# Restrict TensorFlow to LIMIT GB of GPU RAM so the remainder stays free for RAPIDS.
# This has to run before TensorFlow touches the GPU; if it runs too late the
# RuntimeError branch below just prints the message and the cap is not applied.
LIMIT = 2.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # memory_limit is LIMIT * 1024 (GB -> MB); only the first GPU is configured.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# NOTE(review): the 16 assumes a 16 GB card -- confirm for the target hardware.
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

We will restrict TensorFlow to max 2GB GPU RAM
then RAPIDS can use 14GB GPU RAM


In [5]:
# Flag to get the CV score (read_dataset() loads train.csv when True, test.csv otherwise)
GET_CV = False
# Flag to check RAM allocations (debug): read_dataset() duplicates the train frame when True
CHECK_SUB = False

df = pd.read_csv(WORK_DIR + 'test.csv')
# If we are committing (the real test set has more than 3 rows), don't compute the CV score.
# NOTE(review): GET_CV is already False above, so this branch is a no-op as written.
if len(df) > 3:
    GET_CV = False
del df

In [6]:
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Row-wise F1 between two series of space-separated id strings.

    Each string is split on whitespace and treated as a set; the score for a
    row is 2 * |true & pred| / (|true| + |pred|).

    Returns:
        NumPy array with one F1 value per row.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]

    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_pred = np.array([len(p) for p in pred_sets])
    n_true = np.array([len(t) for t in true_sets])

    return 2 * overlap / (n_pred + n_true)

# Function to combine predictions
def combine_predictions(row):
    """Merge a row's image and text predictions into one sorted, de-duplicated, space-joined string."""
    merged = np.concatenate((row['image_predictions'], row['text_predictions']))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

def norm(x):
    """Scale `x` by its L2 (Frobenius) norm, as computed by np.linalg.norm."""
    length = np.linalg.norm(x)
    return x / length

In [7]:
# Function to read our dataset
def read_dataset():
    """Load the working frame, a cuDF copy of it, and the image paths.

    With GET_CV set, train.csv is read and a ground-truth 'matches' column
    (space-joined posting_ids sharing a label_group) is added; CHECK_SUB
    additionally stacks the frame on itself. Otherwise test.csv is read as is.

    Returns:
        (pandas DataFrame, cudf DataFrame, Series of image file paths)
    """
    csv_name, image_dir = ('train.csv', 'train_images/') if GET_CV else ('test.csv', 'test_images/')
    frame = pd.read_csv(WORK_DIR + csv_name)

    if GET_CV:
        group_members = frame.groupby(['label_group'])['posting_id'].unique().to_dict()
        frame['matches'] = frame['label_group'].map(group_members)
        frame['matches'] = frame['matches'].apply(lambda ids: ' '.join(ids))
        if CHECK_SUB:
            # Double the data to probe memory usage.
            frame = pd.concat([frame, frame], axis = 0)
            frame.reset_index(drop = True, inplace = True)

    frame_gpu = cudf.DataFrame(frame)
    paths = WORK_DIR + image_dir + frame['image']
    return frame, frame_gpu, paths

# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes to 3 channels, resize to IMAGE_SIZE and scale by 1/255 as float32."""
    rgb = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(rgb, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our test image and return image
def read_image(image):
    """Read the file at path `image` and return the decoded, resized image tensor."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)

# Function to get our dataset that read images
def get_dataset(image):
    """Build a batched, prefetched tf.data pipeline that loads the images at the given paths."""
    return (
        tf.data.Dataset.from_tensor_slices(image)
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# Arcmarginproduct class keras layer
class ArcMarginProduct(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    The layer is called on a pair ``[X, y]`` (features, integer labels). It
    computes the cosine between the L2-normalised rows of ``X`` and the
    L2-normalised columns of the weight ``W``, applies the angular margin ``m``
    to the entries selected by ``y``, and multiplies the result by ``s``.

    Args:
        n_classes: number of columns of ``W`` / depth of the one-hot labels.
        s: scale applied to the output.
        m: angular margin in radians.
        easy_margin: if True, apply the margin only where cosine > 0.
        ls_eps: label-smoothing epsilon applied to the one-hot labels (0 disables).

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        # Constants for cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        # Threshold cos(pi - m) and fallback penalty used in call() when easy_margin is False
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        # Expose the constructor arguments so Keras can serialise / rebuild the layer.

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        # input_shape is a pair (features shape, labels shape); only the features shape is used.
        super(ArcMarginProduct, self).build(input_shape[0])

        # Weight matrix of shape (feature_dim, n_classes)
        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        # Cosine similarity between each feature row and each column of W
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        # phi = cos(theta + m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            # Where cosine <= cos(pi - m), fall back to the linear penalty cosine - mm
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        # Margin-adjusted value on the labelled entries, plain cosine elsewhere
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

# Function to get the embeddings of our images with the fine-tuned model
def get_image_embeddings(image_paths, model_weight):
    """Compute image embeddings with a fine-tuned EfficientNet.

    The training graph (backbone -> global average pooling -> ArcMargin ->
    softmax) is rebuilt so the saved weights can be loaded, then truncated to
    `model.layers[-4]` (presumably the pooled backbone features -- confirm if
    the graph is ever changed) and run over `image_paths` in chunks.

    Args:
        image_paths: sliceable sequence of image file paths (e.g. pandas Series).
        model_weight: path to the .h5 weights; must contain 'B3', 'B4' or 'B5',
            which selects the backbone (first match in that order).

    Returns:
        NumPy array with one embedding row per image path.

    Raises:
        ValueError: if no supported backbone tag is found in `model_weight`.
    """
    # Resolve the backbone first so an unsupported path fails with a clear message
    # instead of an unbound-variable error further down.
    if 'B3' in model_weight:
        backbone = efn.EfficientNetB3
    elif 'B4' in model_weight:
        backbone = efn.EfficientNetB4
    elif 'B5' in model_weight:
        backbone = efn.EfficientNetB5
    else:
        raise ValueError(
            f'Cannot infer EfficientNet variant (B3/B4/B5) from weight path: {model_weight}'
        )

    embeds = []

    margin = ArcMarginProduct(
            n_classes = N_CLASSES,
            s = 30,
            m = 0.5,
            name='head/arc_margin',
            dtype='float32'
            )

    inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3), name = 'inp1')
    label = tf.keras.layers.Input(shape = (), name = 'inp2')
    x = backbone(weights = None, include_top = False)(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = margin([x, label])

    output = tf.keras.layers.Softmax(dtype='float32')(x)

    model = tf.keras.models.Model(inputs = [inp, label], outputs = [output])
    model.load_weights(model_weight)
    # Keep only the image input and the intermediate feature output.
    model = tf.keras.models.Model(inputs = model.input[0], outputs = model.layers[-4].output)

    # Chunk over the paths actually passed in (not the notebook-global df).
    chunk = 5000
    for start in range(0, len(image_paths), chunk):
        image_dataset = get_dataset(image_paths[start:start + chunk])
        embeds.append(model.predict(image_dataset))

    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

"""
KNN
"""
# Function to get 50 nearest neighbors of each image and apply a distance threshold to maximize cv
def get_neighbors(df, embeddings, KNN = 50, th = 0.3):
    """For every row, collect the posting_ids of its cosine nearest neighbours closer than `th`.

    Args:
        df: frame with a 'posting_id' column aligned row-by-row with `embeddings`.
        embeddings: 2-D array of embeddings.
        KNN: number of neighbours queried per row.
        th: cosine-distance cut-off (strictly less than).

    Returns:
        (df, list with one array of matching posting_ids per row)
    """
    knn_model = NearestNeighbors(n_neighbors = KNN, metric = 'cosine')
    knn_model.fit(embeddings)
    distances, indices = knn_model.kneighbors(embeddings)

    posting_col = df['posting_id']
    predictions = []
    for row in tqdm(range(embeddings.shape[0])):
        close_enough = np.where(distances[row,] < th)[0]
        neighbour_rows = indices[row, close_enough]
        predictions.append(posting_col.iloc[neighbour_rows].values)

    del knn_model, distances, indices
    gc.collect()
    return df, predictions

In [8]:
# Load the working frame (pandas), its cuDF copy and the image paths used by every cell below.
df, df_cu, image_paths = read_dataset()

### Nfnet

In [9]:
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

import os 
import cv2
import timm
import random 

from tqdm import tqdm 

import albumentations as A 
from albumentations.pytorch.transforms import ToTensorV2

import torch 
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset 

In [10]:
class CFG:
    # Settings for the PyTorch (timm) image model.
    
    img_size = 512      # side length passed to A.Resize in get_test_transforms()
    batch_size = 12     # DataLoader batch size in get_image_embeddings_torch()
    seed = 42
    
    device = 'cuda'
    classes = 11014     # size of the ArcMargin head
    
    # Backbone name and matching checkpoint; the commented pair is the L0 alternative.
    model_name = 'eca_nfnet_l1'
    model_path = '../input/shopeetorchmodel/nfnet_l1_mish_all_data_augs_20_epo.pt'
#     model_name = 'eca_nfnet_l0'
#     model_path = '../input/shopeetorchmodel/nfnet_l0_mish_all_data_augs_25_epo.pt'
    
    # ArcMargin hyper-parameters
    scale = 30 
    margin = 0.5
    
def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch (CPU + current CUDA device) and make cuDNN deterministic."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed everything once, right after the configuration is defined.
seed_torch(CFG.seed)

In [11]:
def get_test_transforms():
    """Inference-time pipeline: resize to CFG.img_size, normalise, convert to a torch tensor."""
    steps = [
        A.Resize(CFG.img_size, CFG.img_size, always_apply=True),
        A.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps)

class ShopeeDatasetTorchImage(Dataset):
    """Torch dataset that loads images from disk for inference.

    Args:
        image_paths: indexable sequence of image file paths (NumPy array, list, ...).
        transforms: optional callable invoked as ``transforms(image=img)`` that
            returns a mapping with an ``'image'`` entry.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        # len() works for arrays as well as plain lists (the old .shape[0] did not).
        return len(self.image_paths)

    def __getitem__(self, index):
        """Return ``(image, dummy_label)``; the label is a constant tensor(1)."""
        image_path = self.image_paths[index]

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail with the offending path
            # rather than an opaque cvtColor assertion.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        return image, torch.tensor(1)
    
class ArcMarginProductTorchImage(nn.Module):
    """ArcFace-style margin head.

    Computes the cosine between L2-normalised inputs and L2-normalised class
    weights, adds the angular `margin` on the labelled class and multiplies by
    `scale`.

    Args:
        in_features: feature dimension of the input.
        out_features: number of classes (rows of `weight`).
        scale: multiplier applied to the output.
        margin: angular margin in radians.
        easy_margin: if True, apply the margin only where cosine > 0.
        ls_eps: label-smoothing epsilon for the one-hot labels (0 disables).
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProductTorchImage, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

    def forward(self, input, label):
        """Return scaled logits of shape (batch, out_features)."""
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Clamp so rounding that pushes cosine^2 just above 1 cannot produce NaN.
        sine = torch.sqrt(torch.clamp(1.0 - torch.pow(cosine, 2), min=0.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Allocate on the same device as the logits instead of hard-coding 'cuda'.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin-adjusted value on the labelled class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output

class ShopeeModelTorchImage(nn.Module):
    """timm backbone + pooling + optional FC/BN neck used to extract image embeddings.

    The ArcMargin head (`self.final`) is constructed so checkpoints load with
    matching keys, but `forward` returns the embedding and never calls it.

    Args:
        n_classes: number of classes of the ArcMargin head.
        model_name: timm model name; one of 'resnext50_32x4d', 'efficientnet_b3',
            'tf_efficientnet_b5_ns', 'eca_nfnet_l0', 'eca_nfnet_l1'.
        fc_dim: output size of the FC neck (the embedding size when `use_fc`).
        margin, scale: ArcMargin hyper-parameters.
        use_fc: whether `extract_feat` applies dropout -> fc -> bn.
        pretrained: forwarded to `timm.create_model`.

    Raises:
        ValueError: if `model_name` is not one of the supported backbones.
    """

    def __init__(
        self,
        n_classes = CFG.classes,
        model_name = CFG.model_name,
        fc_dim = 512,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True,
        pretrained = False):

        super(ShopeeModelTorchImage,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Strip the classifier and the built-in pooling so the backbone returns a feature map.
        if model_name == 'resnext50_32x4d':
            final_in_features = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name in ('efficientnet_b3', 'tf_efficientnet_b5_ns'):
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()

        elif model_name in ('eca_nfnet_l0', 'eca_nfnet_l1'):
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        else:
            # Previously this fell through and crashed later on an unbound final_in_features.
            raise ValueError('Unsupported model_name: {}'.format(model_name))

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        # The neck is always created (regardless of use_fc); use_fc only gates its use.
        self.dropout = nn.Dropout(p=0.0)
        self.fc = nn.Linear(final_in_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        final_in_features = fc_dim

        self.final = ArcMarginProductTorchImage(
            final_in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Initialise the FC/BN neck (Xavier-normal weights, zero bias, unit BN scale)."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return the embedding for `image`; `label` is accepted but unused."""
        feature = self.extract_feat(image)
        #logits = self.final(feature,label)
        return feature

    def extract_feat(self, x):
        """Backbone -> adaptive average pool -> flatten -> optional dropout/fc/bn."""
        batch_size = x.shape[0]
        x = self.backbone(x)
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x
    
class Mish_func(torch.autograd.Function):

    """Mish activation, x * tanh(softplus(x)), with a hand-written backward pass.

    from: https://github.com/tyunist/memory_efficient_mish_swish/blob/master/mish.py
    """

    @staticmethod
    def forward(ctx, i):
        result = i * torch.tanh(F.softplus(i))
        # Only the input is saved; the backward pass recomputes what it needs.
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        # saved_tensors is the supported accessor (saved_variables is deprecated).
        i, = ctx.saved_tensors

        # d/dx [x * tanh(sp(x))] = tanh(sp) + x * (1 - tanh(sp)^2) * sigmoid(x),
        # using 1 / cosh(h)^2 == 1 - tanh(h)^2 and d softplus / dx == sigmoid.
        tanh_sp = torch.tanh(F.softplus(i))
        grad_gx = (1. - tanh_sp * tanh_sp) * i.sigmoid()
        grad_f = tanh_sp + i * grad_gx

        return grad_output * grad_f

class Mish(nn.Module):
    """Module wrapper around Mish_func so it can be dropped in as an activation layer."""

    def __init__(self, **kwargs):
        # Keyword arguments are accepted and ignored.
        super().__init__()

    def forward(self, input_tensor):
        activated = Mish_func.apply(input_tensor)
        return activated

def replace_activations(model, existing_layer, new_layer):

    """Recursively swap every sub-module whose exact type is `existing_layer` for `new_layer`.

    The model is modified in place and also returned. The same `new_layer`
    object is installed at every replaced position.
    """

    for child_name, child in reversed(model._modules.items()):
        has_children = any(True for _ in child.children())
        if has_children:
            # Descend first; the recursive call hands back the same (mutated) child.
            model._modules[child_name] = replace_activations(child, existing_layer, new_layer)

        if type(child) == existing_layer:
            model._modules[child_name] = new_layer
    return model

def get_image_embeddings_torch(image_paths, model_name = CFG.model_name):
    """Compute image embeddings with the PyTorch (timm) model.

    Args:
        image_paths: indexable sequence of image file paths.
        model_name: timm backbone name. NOTE: the checkpoint is always read from
            CFG.model_path, so the two must match.

    Returns:
        NumPy array with one embedding row per image.
    """
    embeds = []

    model = ShopeeModelTorchImage(model_name = model_name)

    # The NFNet checkpoints were trained with Mish in place of SiLU.
    if model_name == 'eca_nfnet_l0' or model_name == 'eca_nfnet_l1':
        model = replace_activations(model, torch.nn.SiLU, Mish())

    # map_location keeps loading independent of the device the checkpoint was saved from.
    model.load_state_dict(torch.load(CFG.model_path, map_location=CFG.device))
    model = model.to(CFG.device)
    model.eval()

    image_dataset = ShopeeDatasetTorchImage(image_paths=image_paths,transforms=get_test_transforms())
    image_loader = torch.utils.data.DataLoader(
        image_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,
        num_workers=4
    )

    with torch.no_grad():
        for img, label in tqdm(image_loader):
            # Use the configured device (not a hard-coded .cuda()); the label is a
            # dummy that the model's forward ignores, so it is passed through as is.
            feat = model(img.to(CFG.device), label)
            embeds.append(feat.detach().cpu().numpy())

    del model
    image_embeddings = np.concatenate(embeds)
    print(f'Our image embeddings shape is {image_embeddings.shape}')
    del embeds
    gc.collect()
    return image_embeddings

### TF-IDF

In [12]:
# https://adrien.barbaresi.eu/blog/simple-multilingual-lemmatizer-python.html
!cp -r ../input/shopee-simplemma/simplemma-main/* ./

import re
import nltk
import string
import simplemma
from tqdm import tqdm
from simplemma import text_lemmatizer
langdata = simplemma.load_data('id')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import torch

In [13]:
def split_units(text):
    """
    Insert a space at every boundary between a run of digits and a run of non-digits.
    Ex. ex15ap => ex 15 ap
    """
    text = text.strip()
    pieces = []
    run_start = 0
    for pos in range(1, len(text) + 1):
        # Close the current run at the end of the text or when the digit-ness flips.
        at_end = pos == len(text)
        if at_end or text[pos].isdigit() != text[pos - 1].isdigit():
            pieces.append(text[run_start:pos])
            run_start = pos
    return ' '.join(pieces)

def uniform_units(text):
    """Rewrite the stand-alone tokens 'g', 'gr' and 'grm' as 'gram'; whitespace is collapsed."""
    gram_aliases = ['g', 'gr', 'grm']
    return ' '.join('gram' if token in gram_aliases else token for token in text.split())

# Marketplace boilerplate words that are dropped from titles by remove_sw().
e_commerce_stopwords = ['ready', 'stock', 'free', 'gift', 'jaring', 'sabun', 'siap', 'kirim', 'diskon', '11', 'buruan',
                        'order', 'open', 'reseller', 'langsung', 'readystock', 'cod', 'bisa', 'promo', 'promotion',
                        'best', 'seller', 'sku', 'fast', 'delivery', 'bayar', 'ditempat', 'belanja', 'aman', 'nyaman',
                        'pos', 'today', 'hot', 'di', 'tempat', 'terlaris', 'garansi', 'stok', 'mohon', 'baca', 
                        'deskripsi', 'description', 'resmi', 'distributor', 'sold', 'out', 'ress', 'distri',
                        'ori', 'origin', 'original', 'new', 'import', 'lokal'
                       ]

def remove_sw(text):
    """Drop stopword tokens from `text`; if nothing would remain, return `text` unchanged."""
    kept = ' '.join(filter(lambda token: token not in e_commerce_stopwords, text.split()))
    return kept if kept else text

# text cleaning V5
def text_preprocess(text):
    """Clean one title: lowercase, strip escapes and punctuation, split units, lemmatise, drop stopwords."""
    lowered = text.lower()

    # Blank out literal "\x.." escape sequences (backslash, 'x', two characters).
    no_escapes = re.sub(r'\\x(.){2}', ' ', lowered)

    # Replace every punctuation character with a space.
    punct = string.punctuation
    no_punct = no_escapes.translate(str.maketrans(punct, ' ' * len(punct)))

    # Separate digit runs from letters, then unify gram abbreviations.
    unit_text = uniform_units(split_units(no_punct))

    # Lemmatise with simplemma (language data loaded for 'id'), then with WordNet.
    id_lemmas = text_lemmatizer(unit_text, langdata)
    en_lemmatizer = WordNetLemmatizer()
    lemmatized = ' '.join([en_lemmatizer.lemmatize(token) for token in id_lemmas])

    return remove_sw(lemmatized)

df['title_to_use'] = df['title'].apply(text_preprocess)

In [14]:
# Fit a vectorizer only to obtain the vocabulary of the cleaned titles.
tfidf_vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", binary=True)
tfidf_vec.fit(df['title_to_use'])
# vocabulary_ (term -> column index) is available in every scikit-learn version, whereas
# get_feature_names() was removed in 1.2. A set makes the later `in dictionary` tests O(1).
dictionary = set(tfidf_vec.vocabulary_)

In [15]:
"""Remove duplicates like : fresh care=>freshcare"""
# Maps a spaced n-gram ("fresh care") to its concatenated form ("freshcare") whenever the
# concatenated form is itself a vocabulary term.
dict_dup = {}

for text in tqdm(df['title_to_use']):
    words = nltk.word_tokenize(text)
    if len(words) < 2: continue
    # Bigrams need a joined length of at least 6 characters, trigrams at least 9.
    for make_grams, min_len in ((nltk.bigrams, 6), (nltk.trigrams, 9)):
        for grams in make_grams(words):
            tmp = ''.join(grams)
            k = ' '.join(grams)
            # dict_dup is keyed by the spaced form, so that is the key to test for
            # (the old `tmp not in dict_dup` could never be False). Cheap checks go first.
            if len(tmp) >= min_len and not tmp.isdigit() and k not in dict_dup and tmp in dictionary:
                dict_dup[k] = tmp

print(len(dict_dup))

def remove_duplicates(text):
    """Replace every bigram/trigram of `text` found in dict_dup with its concatenated form."""
    tokens = text.split()
    candidates = []
    # Bigrams first, then trigrams -- the same order the replacements are applied in.
    for size in (2, 3):
        candidates.extend(' '.join(tokens[pos:pos + size]) for pos in range(len(tokens) - size + 1))
    for phrase in candidates:
        if phrase in dict_dup:
            text = text.replace(phrase, dict_dup[phrase])
    return text

df['title_to_use_'] = df['title_to_use'].progress_apply(remove_duplicates)
del dictionary; gc.collect()

100%|██████████| 3/3 [00:00<00:00, 258.30it/s]

0





  0%|          | 0/3 [00:00<?, ?it/s]

26

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def text_knn(df, text_tensor, K, th, chunk = 128):
    """Threshold + top-K matching on a single similarity matrix, computed in row chunks.

    Args:
        df: frame with a 'posting_id' column aligned row-by-row with `text_tensor`.
        text_tensor: 2-D tensor of embeddings (similarity = dot product).
        K: keep a column only if its score is at least the row's K-th largest
            (applied only when df has more than 3 rows).
        th: a score must be strictly greater than this to count as a match.
        chunk: number of query rows processed per step.

    Returns:
        List of (query posting_id, [matched posting_ids]) in row order.
    """
    n_rows = df.shape[0]
    posting_ids = df['posting_id'].to_numpy()
    out_preds = []
    for start in tqdm(range(0, n_rows, chunk)):
        arr = text_tensor[start : start + chunk] @ text_tensor.T
        mask = arr > th
        if len(df) > 3:
            # Guard against K exceeding the number of candidates.
            k_eff = min(K, arr.shape[1])
            kth_best = arr.sort(descending=True).values[:, k_eff - 1].reshape(arr.shape[0], -1)
            mask = mask & (arr >= kth_best)

        # One transfer for the whole chunk instead of an .item() call per match.
        matches = [[] for _ in range(arr.shape[0])]
        for row, col in torch.nonzero(mask).cpu().tolist():
            matches[row].append(col)

        # `row` is chunk-local: offset by `start` to get the query's position in df.
        out_preds.extend(
            (posting_ids[start + row], posting_ids[cols].tolist())
            for row, cols in enumerate(matches)
        )
    return out_preds

def text_knn_2(df, tensor1, tensor2, K, th, chunk = 128):
    """Like text_knn, but the score is the sum of two similarity matrices.

    Args:
        df: frame with a 'posting_id' column aligned row-by-row with the tensors.
        tensor1, tensor2: 2-D embedding tensors; score = t1 @ t1.T + t2 @ t2.T.
        K: keep a column only if its score is at least the row's K-th largest
            (applied only when df has more than 3 rows).
        th: a score must be strictly greater than this to count as a match.
        chunk: number of query rows processed per step.

    Returns:
        List of (query posting_id, [matched posting_ids]) in row order.
    """
    n_rows = df.shape[0]
    posting_ids = df['posting_id'].to_numpy()
    out_preds = []
    for start in tqdm(range(0, n_rows, chunk)):
        stop = start + chunk
        arr = tensor1[start:stop] @ tensor1.T + (tensor2[start:stop] @ tensor2.T)
        mask = arr > th
        if len(df) > 3:
            # Guard against K exceeding the number of candidates.
            k_eff = min(K, arr.shape[1])
            kth_best = arr.sort(descending=True).values[:, k_eff - 1].reshape(arr.shape[0], -1)
            mask = mask & (arr >= kth_best)

        # One transfer for the whole chunk instead of an .item() call per match.
        matches = [[] for _ in range(arr.shape[0])]
        for row, col in torch.nonzero(mask).cpu().tolist():
            matches[row].append(col)

        # `row` is chunk-local: offset by `start` to get the query's position in df.
        out_preds.extend(
            (posting_ids[start + row], posting_ids[cols].tolist())
            for row, cols in enumerate(matches)
        )
    return out_preds

def text_knn_3(df, tensor1, tensor2, tensor3, weight1, weight2, weight3, K, th, chunk = 128):
    """Like text_knn, but the score is a weighted sum of three similarity matrices.

    Args:
        df: frame with a 'posting_id' column aligned row-by-row with the tensors.
        tensor1..tensor3: 2-D embedding tensors.
        weight1..weight3: multipliers for the corresponding t @ t.T terms.
        K: keep a column only if its score is at least the row's K-th largest
            (applied only when df has more than 3 rows).
        th: a score must be strictly greater than this to count as a match.
        chunk: number of query rows processed per step.

    Returns:
        List of (query posting_id, [matched posting_ids]) in row order.
    """
    n_rows = df.shape[0]
    posting_ids = df['posting_id'].to_numpy()
    out_preds = []
    for start in tqdm(range(0, n_rows, chunk)):
        stop = start + chunk
        arr = (tensor1[start:stop] @ tensor1.T) * weight1 + \
              (tensor2[start:stop] @ tensor2.T) * weight2 + \
              (tensor3[start:stop] @ tensor3.T) * weight3
        mask = arr > th
        if len(df) > 3:
            # Guard against K exceeding the number of candidates.
            k_eff = min(K, arr.shape[1])
            kth_best = arr.sort(descending=True).values[:, k_eff - 1].reshape(arr.shape[0], -1)
            mask = mask & (arr >= kth_best)

        # One transfer for the whole chunk instead of an .item() call per match.
        matches = [[] for _ in range(arr.shape[0])]
        for row, col in torch.nonzero(mask).cpu().tolist():
            matches[row].append(col)

        # `row` is chunk-local: offset by `start` to get the query's position in df.
        out_preds.extend(
            (posting_ids[start + row], posting_ids[cols].tolist())
            for row, cols in enumerate(matches)
        )
    return out_preds

### SBert

In [17]:
import os
import random
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import transformers

import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer

In [18]:
# DataLoader settings for the text model.
NUM_WORKERS = 4
# NOTE(review): this rebinds the notebook-wide BATCH_SIZE (8 in the EfficientNet config cell).
# get_dataset() reads the global at call time, so the TensorFlow image inference that runs
# after this cell batches with 16 as well -- confirm this is intended.
BATCH_SIZE = 16
SEED = 42

# Local copy of the sentence-transformers checkpoint; used for both the tokenizer and the backbone.
transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

# Fine-tuned weights loaded by get_text_embeddings()
TEXT_MODEL_PATH = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'

# Keyword arguments passed to ShopeeNet(**model_params)
model_params = {
    'n_classes':11014,
    'model_name':transformer_model,
    'use_fc':False,
    'fc_dim':512,
    'dropout':0.3,
}

In [19]:
class ShopeeDataset(Dataset):
    """Torch dataset yielding tokenised titles as (input_ids, attention_mask) pairs."""

    def __init__(self, csv):
        # reset_index() so rows are addressed positionally from 0.
        self.csv = csv.reset_index()

    def __len__(self):
        return len(self.csv)

    def __getitem__(self, index):
        title = self.csv.iloc[index].title

        # Pad / truncate every title to 128 tokens; [0] drops the batch dimension of size 1.
        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
        return encoded['input_ids'][0], encoded['attention_mask'][0]
    
class ShopeeNet(nn.Module):
    """Transformer encoder with an optional FC/BN neck that outputs L2-normalised text embeddings."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes: accepted for signature compatibility; not used in this class
        :param model_name: name or path given to transformers.AutoModel.from_pretrained
        :param use_fc: if True, add dropout -> linear -> batch-norm after the encoder
        :param fc_dim: output size of the linear layer (only with use_fc)
        :param dropout: dropout probability (only with use_fc)
        """
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size

        self.use_fc = use_fc
        if use_fc:
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(hidden_size, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()

    def _init_params(self):
        # Xavier-normal weights, zero biases, unit batch-norm scale.
        nn.init.xavier_normal_(self.fc.weight)
        for tensor, value in ((self.fc.bias, 0), (self.bn.weight, 1), (self.bn.bias, 0)):
            nn.init.constant_(tensor, value)

    def forward(self, input_ids,attention_mask):
        """Return the L2-normalised embedding for each sequence in the batch."""
        return F.normalize(self.extract_feat(input_ids, attention_mask))

    def extract_feat(self, input_ids,attention_mask):
        """Take the first token's vector from the encoder's first output, then the optional neck."""
        encoder_out = self.transformer(input_ids=input_ids,attention_mask=attention_mask)
        features = encoder_out[0][:,0,:]

        if not self.use_fc:
            return features

        features = self.dropout(features)
        features = self.fc(features)
        return self.bn(features)
    
def get_text_embeddings(df):
    """Embed every title in `df` with the fine-tuned transformer.

    Args:
        df: frame with a 'title' column.

    Returns:
        NumPy array with one embedding row per title (rows are L2-normalised
        by ShopeeNet.forward).
    """
    embeds = []

    model = ShopeeNet(**model_params)

    # The checkpoint's last entry is dropped before loading (presumably the ArcFace head
    # weight, which ShopeeNet does not define -- confirm against the training code).
    state = torch.load(TEXT_MODEL_PATH, map_location=device)
    model.load_state_dict(dict(list(state.items())[:-1]))
    model = model.to(device)
    model.eval()

    text_dataset = ShopeeDataset(df)
    text_loader = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        drop_last=False,
        num_workers=NUM_WORKERS
    )

    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            # Follow the model's device rather than hard-coding .cuda(), so the CPU
            # fallback of `device` works.
            feat = model(input_ids.to(device), attention_mask.to(device))
            embeds.append(feat.detach().cpu().numpy())

    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    return text_embeddings

## Inference

### 1. Text

1.1 TF-IDF

In [20]:
%%time
# Binary TF-IDF over the de-duplicated titles; the token pattern also keeps single-character tokens.
tfidf_vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", binary=True)
# NOTE(review): .toarray() densifies the whole (rows x vocabulary) matrix in host memory
# before it is moved to the device -- watch RAM on the full test set.
tfidf_embeddings = tfidf_vec.fit_transform(df['title_to_use_']).toarray().astype(np.float32)
tfidf_tensor = torch.from_numpy(tfidf_embeddings).to(device)
del tfidf_embeddings, tfidf_vec; gc.collect()

CPU times: user 1.64 s, sys: 643 ms, total: 2.28 s
Wall time: 4.38 s


26

In [21]:
gc.collect()

20

1.2 SBert

In [22]:
%%time
bert_embeddings = get_text_embeddings(df)
# Row-wise L2 normalisation (ShopeeNet.forward already applies F.normalize, so this is a safeguard).
bert_embeddings = np.apply_along_axis(norm, 1, bert_embeddings)

bert_tensor = torch.from_numpy(bert_embeddings).to(device)
del bert_embeddings; gc.collect()

100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


Our text embeddings shape is (3, 768)
CPU times: user 6.79 s, sys: 2.61 s, total: 9.4 s
Wall time: 31 s


0

1.N All text

### 2. Image

2.1 Effnet B3 (loss 14.20)

In [23]:
%%time
# IMAGE_SIZE is a global read by decode_image() and get_image_embeddings(); set it to this model's input size.
IMAGE_SIZE = [512, 512]
effb3_embeddings = get_image_embeddings(image_paths, EFFNET_B3)
# L2-normalise each row so every model contributes equally to the concatenation below.
effb3_embeddings = np.apply_along_axis(norm, 1, effb3_embeddings)

# First model: start the running concatenation of image embeddings.
image_embeddings = effb3_embeddings
del effb3_embeddings; gc.collect()

Our image embeddings shape is (3, 1536)
CPU times: user 6.94 s, sys: 528 ms, total: 7.47 s
Wall time: 11.5 s


35600

2.2 Nfnet (loss 14.31)

In [24]:
%%time
# `if True:` is a manual switch for including this model in the ensemble.
if True:
    # .values: ShopeeDatasetTorchImage indexes the paths positionally.
    nfnet_embeddings = get_image_embeddings_torch(image_paths.values)
    nfnet_embeddings = np.apply_along_axis(norm, 1, nfnet_embeddings)
    
    # Append this model's features column-wise to the running image embedding.
    image_embeddings = np.concatenate([image_embeddings, nfnet_embeddings], axis=1)
    del nfnet_embeddings; gc.collect()

Building Model Backbone for eca_nfnet_l1 model


100%|██████████| 1/1 [00:00<00:00,  2.04it/s]


Our image embeddings shape is (3, 512)
CPU times: user 1.69 s, sys: 345 ms, total: 2.04 s
Wall time: 5.76 s


2.3 Effnet B4 (cosLR, 384, loss 14.34)

In [25]:
%%time
# `if True:` is a manual switch for including this model in the ensemble.
if True:
    # Global input size for this checkpoint (read by decode_image / get_image_embeddings).
    IMAGE_SIZE = [384, 384]
    effb4_embeddings = get_image_embeddings(image_paths, EFFNET_B4)
    effb4_embeddings = np.apply_along_axis(norm, 1, effb4_embeddings)
    
    # Append this model's features column-wise to the running image embedding.
    image_embeddings = np.concatenate([image_embeddings, effb4_embeddings], axis=1)
    del effb4_embeddings; gc.collect()

Our image embeddings shape is (3, 1792)
CPU times: user 7.59 s, sys: 228 ms, total: 7.82 s
Wall time: 11.1 s


2.4 Effnet B4_2 (Mosaic aug, 456, loss 14.31)

In [26]:
%%time
# `if True:` is a manual switch for including this model in the ensemble.
if True:
    # Global input size for this checkpoint (read by decode_image / get_image_embeddings).
    IMAGE_SIZE = [456, 456]
    effb4_2_embeddings = get_image_embeddings(image_paths, EFFNET_B4_2)
    effb4_2_embeddings = np.apply_along_axis(norm, 1, effb4_2_embeddings)
    
    # Append this model's features column-wise to the running image embedding.
    image_embeddings = np.concatenate([image_embeddings, effb4_2_embeddings], axis=1)
    del effb4_2_embeddings; gc.collect()

Our image embeddings shape is (3, 1792)
CPU times: user 7.31 s, sys: 161 ms, total: 7.47 s
Wall time: 10.1 s


2.5 Effnet B5

In [27]:
%%time
# Disabled: flip to True to add EfficientNet-B5 to the ensemble.
if False:
    IMAGE_SIZE = [512, 512]
    effb5_embeddings = get_image_embeddings(image_paths, EFFNET_B5)
    effb5_embeddings = np.apply_along_axis(norm, 1, effb5_embeddings)
    
    # image_embeddings is a NumPy array (built with np.concatenate in the cells above) and
    # has no .append(); extend it column-wise like the other models.
    image_embeddings = np.concatenate([image_embeddings, effb5_embeddings], axis=1)
    del effb5_embeddings; gc.collect()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 7.15 µs


2.N All images

In [28]:
# Re-normalise each concatenated row to unit length, so the dot products in text_img_knn are cosines.
image_embeddings = np.apply_along_axis(norm, 1, image_embeddings)

In [29]:
# # basic image match
# df, image_predictions = get_neighbors(df, image_embeddings, KNN = 51 if len(df)>3 else 3, th = 0.24)

In [30]:
# df['image_predictions'] = image_predictions
# del image_predictions; gc.collect()

In [31]:
# tfidf_tensor = torch.from_numpy(tfidf_embeddings).to(device)
# bert_tensor = torch.from_numpy(bert_embeddings).to(device)
# del bert_embeddings,tfidf_embeddings; gc.collect()
# tensor_concat_txt = torch.cat((tfidf_tensor, bert_tensor), 1)
# tensor_concat_txt = tensor_concat_txt / torch.norm(tensor_concat_txt,2, 1).view([-1,1])
# del tfidf_tensor, bert_tensor

In [32]:
# Move the concatenated image embeddings to the torch device and free the NumPy copy.
tensor_concat_img = torch.from_numpy(image_embeddings).to(device)
del image_embeddings; gc.collect()

120

### 3. Concat image & text 

3.1 Concat pred 51 : to recall stable matches

In [33]:
def text_img_knn(df, tensor_concat_img, tfidf_tensor, bert_tensor, K, th_min, img_th_max, text_th_max, th_mean, th_top, chunk = 128):
    """Match every row of ``df`` against all rows using image and text similarity.

    Similarities are plain matrix products of the given tensors, computed ``chunk``
    rows at a time. The text similarity is the mean of the TF-IDF and BERT products.
    Row ``j`` is reported as a match of row ``i`` when at least one of these holds:

    * image similarity > ``th_top`` or text similarity > ``th_top``
    * image similarity > ``th_min`` and text similarity > ``text_th_max``
    * image similarity > ``img_th_max`` and text similarity > ``th_min``
    * image similarity + text similarity > ``2 * th_mean``

    Parameters
    ----------
    df : DataFrame with a ``posting_id`` column; row order must match the tensors.
    tensor_concat_img, tfidf_tensor, bert_tensor : 2-D torch tensors with one row per
        row of ``df``.
    K : accepted for call compatibility only; the thresholds alone decide the matches.
    th_min, img_th_max, text_th_max, th_mean, th_top : similarity thresholds (see above).
    chunk : number of query rows processed per iteration.

    Returns
    -------
    list of ``(posting_id, [matched posting_ids])`` tuples, one per row of ``df`` in
    row order; matched ids are listed in ascending row position.
    """
    n_rows = df.shape[0]
    # Pull the ids out once instead of building a pandas row object per lookup.
    posting_ids = df['posting_id'].tolist()

    out_preds = []
    # range() already covers every row (the last chunk may be shorter), so no extra
    # tail chunk is needed.
    for start in tqdm(range(0, n_rows, chunk)):
        stop = start + chunk
        arr_img = tensor_concat_img[start:stop] @ tensor_concat_img.T
        arr_text = (tfidf_tensor[start:stop] @ tfidf_tensor.T + bert_tensor[start:stop] @ bert_tensor.T) * 0.5

        keep = (
            (arr_img > th_top) | (arr_text > th_top)
            | ((arr_img > th_min) & (arr_text > text_th_max))
            | ((arr_img > img_th_max) & (arr_text > th_min))
            | (arr_img + arr_text > 2 * th_mean)
        )

        # One host transfer for all hits of the chunk instead of one .item() per index.
        matches = [[] for _ in range(arr_img.shape[0])]
        for local_row, col in torch.nonzero(keep).tolist():
            matches[local_row].append(col)

        # Rows of `keep` are chunk-local, so the query id lives at start + local_row;
        # columns already are global row positions.
        out_preds.extend(
            (posting_ids[start + local_row], [posting_ids[col] for col in cols])
            for local_row, cols in enumerate(matches)
        )
    return out_preds

In [34]:
# Image & text union prediction: candidate matches from the combined similarity rules.
concat_rows = text_img_knn(
    df, tensor_concat_img, tfidf_tensor, bert_tensor, 51,
    th_min=0.1, img_th_max=0.74, text_th_max=0.82, th_mean=0.7, th_top=0.9,
)
# Only the prediction lists are kept; they are aligned to df through the index.
df['concat_predictions'] = pd.DataFrame(concat_rows, columns=["index", "pred"])['pred']
del concat_rows
gc.collect()

100%|██████████| 2/2 [00:00<00:00, 113.06it/s]


47

3.2 Concat pred 2 : to improve items with only 1 match

In [35]:
# Candidate list stored as `combo_pred_2`; the post-processing below falls back to it
# for items whose first-pass prediction holds fewer than two matches.
match_1_rows = text_knn_3(df, tensor_concat_img, tfidf_tensor, bert_tensor, 1, 0.5, 0.5, K=2, th=0.6)
df['combo_pred_2'] = pd.DataFrame(match_1_rows, columns=["index", "pred"])['pred']
del match_1_rows
gc.collect()

100%|██████████| 2/2 [00:00<00:00, 342.94it/s]


47

3.3 Concat pred 2 : label propagation using strict thresholds

In [36]:
# Same call as for `combo_pred_2` but with the stricter threshold th=1.7; the result is
# stored as `combo_pred_propa` and consumed by the propagation post-processing step.
propa_rows = text_knn_3(df, tensor_concat_img, tfidf_tensor, bert_tensor, 1, 0.5, 0.5, K=2, th=1.7)
df['combo_pred_propa'] = pd.DataFrame(propa_rows, columns=["index", "pred"])['pred']
del propa_rows
gc.collect()

100%|██████████| 2/2 [00:00<00:00, 337.72it/s]


47

In [37]:
# Drop the references to the three embedding tensors and run the garbage collector.
del tensor_concat_img, tfidf_tensor, bert_tensor; gc.collect()

20

## Post-processing

In [38]:
def combine_predictions_pp(row):
    """Return the sorted, de-duplicated first-pass matches of one row.

    Only ``row['concat_predictions']`` is read. At most its first 50 entries are
    considered, and the result is whatever ``np.unique`` returns for them.
    """
    candidates = row['concat_predictions'][:50]
    return np.unique(candidates)

In [39]:
# First-pass prediction per row: the de-duplicated, size-capped `concat_predictions`.
df['first_pred'] = df.apply(combine_predictions_pp, axis = 1)

#### Post-processing (PP) based on number of matches

In [40]:
# Number of matches each item has after the first pass.
df['nb_first_pred'] = df['first_pred'].map(len)

# if use image + text
# Items with fewer than two matches take the `combo_pred_2` candidates instead.
needs_fallback = df['nb_first_pred'] < 2
df['second_pred'] = np.where(needs_fallback, df['combo_pred_2'], df['first_pred'])
df['nb_second_pred'] = df['second_pred'].map(len)

n_changed = (df['nb_first_pred'] != df['nb_second_pred']).sum()
print(n_changed, 'rows processed')
print('Avg match item before pp :', df['nb_first_pred'].mean(), '| Avg match item after pp :', df['nb_second_pred'].mean())

0 rows processed
Avg match item before pp : 1.0 | Avg match item after pp : 1.0


#### Post-processing (PP) based on symmetry

In [41]:
# Symmetry post-processing, followed by propagation of the strict matches.
# if first A->[], B->[C]
# and second A->[B], B->[C]
# then third A->[B], B->[C, A]
if len(df)>3:
    def get_comp_set(row):
        """Return the ids found in ``second_pred`` but not in ``first_pred``, space-joined."""
        return ' '.join(list(set(row['second_pred']) - set(row['first_pred'])))

    # Items that had one match after the first pass and two after the fallback.
    df_1_to_2 = df[(df.nb_first_pred==1)&(df.nb_second_pred==2)][['first_pred', 'second_pred']]

    # Newly gained id -> every id that gained it. Built with a plain loop so that
    # (a) several items gaining the same id are all kept (a transposed to_dict() keeps
    # only one of them because the column labels collide) and (b) an empty selection
    # simply yields an empty dict.
    dict_pp_comp = {}
    for first_pred, second_pred in zip(df_1_to_2['first_pred'], df_1_to_2['second_pred']):
        comp = get_comp_set({'first_pred': first_pred, 'second_pred': second_pred})
        dict_pp_comp.setdefault(comp, []).extend(first_pred)

    def update_pp(row):
        """Add the back-links recorded in ``dict_pp_comp`` to the row's ``third_pred``."""
        if row['posting_id'] in dict_pp_comp:
            return np.unique(np.concatenate([row['third_pred'], dict_pp_comp[row['posting_id']]]))
        return row['third_pred']

    df['third_pred'] = df['second_pred']
    df['third_pred'] = df.apply(update_pp, axis=1)
    df['nb_third_pred'] = df['third_pred'].apply(len)

    print(len(df[df.nb_second_pred!=df.nb_third_pred]), 'rows processed')
    print('Avg match item before pp :', df.nb_second_pred.mean(), '| Avg match item after pp :', df.nb_third_pred.mean())

    # propagation pp
    # posting_id -> all `combo_pred_propa` lists stored under that id, so the loop below
    # does a dict lookup instead of filtering the whole frame for every row.
    propa_by_id = {}
    for pid, propa in zip(df['posting_id'], df['combo_pred_propa']):
        propa_by_id.setdefault(pid, []).append(propa)

    cnt = 0
    pp_dict_propa = {}
    for item, matches in zip(df['posting_id'], df['third_pred']):
        if len(matches) == 2:
            # Strict matches of both members of the pair; order is irrelevant because the
            # result goes through np.unique below.
            p_matches = np.concatenate([propa for m in set(matches) for propa in propa_by_id.get(m, [])])
            # This branch can only trigger when a posting_id occurs on more than one row.
            if item in pp_dict_propa:
                pp_dict_propa[item] = np.unique(np.concatenate([p_matches, pp_dict_propa[item]]))
            else:
                pp_dict_propa[item] = np.unique(np.concatenate([p_matches, matches]))
        cnt += 1
        if cnt % 5000 == 0: print('Checked :', cnt)

    # posting_id -> third_pred; the first occurrence wins, like `.values[0]` on a filter.
    third_by_id = {}
    for pid, third in zip(df['posting_id'], df['third_pred']):
        third_by_id.setdefault(pid, third)

    def update(i):
        """Return the propagated prediction for id ``i``, or its ``third_pred`` if none."""
        return pp_dict_propa[i] if i in pp_dict_propa else third_by_id[i]

    if len(pp_dict_propa) > 0:
        print(len(pp_dict_propa), 'values to update')
        df['propa_pred'] = df['posting_id'].progress_apply(update)
    else:
        print("Nothing to update")
        df['propa_pred'] = df['third_pred']
    df['nb_propa_pred'] = df['propa_pred'].apply(len)

    print(len(df[df.nb_propa_pred!=df.nb_third_pred]), 'rows processed')
    print('Avg match item before pp :', df.nb_third_pred.mean(), '| Avg match item after pp :', df.nb_propa_pred.mean())

    df['third_pred'] = df['propa_pred']

else:
    # Too few rows for the symmetry / propagation steps: keep the second pass as is.
    df['third_pred'] = df['second_pred']

In [42]:
"""OOF LOGS"""

# all

# average effnetb3 & nfnet_l1 & effnetb4 & effnetb482 + bert + pp_propa
# 0.24(4)/0.8/1.5-0.6-1.7 : 5.87(3554)/5.98(703)/6.00(249)/0.954
# 0.24(4)/0.8/1.6-0.6-1.7 : 5.80(4500)/5.93(880)/5.96(356)/0.950
# 0.22(4)/0.8/1.55-0.6-1.7 : 5.73(4666)/5.86(965)/5.90(378)/0.945 767
# 0.22(4)/0.8/1.6-0.6-1.7 : 5.68(5222)/5.83(1068)/5.88(470)/0.942

###############################

# add concat_pred

# baseline
# 0.26/0.8/1.4-0.5 : 5.64/5.79(5152)/5.84(1648)/0.921 764

# average effnetb3 & nfnet_l0
# 0.26(2)/0.8/1.4-0.5 : 5.66/5.79(4587)/5.84(1453)/0.928 765

# average effnetb3 & nfnet_l1
# 0.24(2)/0.8/1.4-0.5 : 5.68/5.81(4421)/5.85(1330)/0.932 766
# 0.26(2)/0.8/1.4-0.5 : 5.78/5.89(3904)/5.93(1168)/0.938

# average effnetb3 & nfnet_l1 + bert
# 0.24(2)/0.8/1.5-0.6 : 5.69/5.82(4309)/5.85(1015)/0.939 767

# average effnetb3 & nfnet_l1 & effnetb4 + bert
# 0.24(3)/0.8/1.5-0.6 : 5.75/5.87(4053)/5.90(930)/0.942 768

# average effnetb3 & nfnet_l1 & effnetb3_2 & effnetb4 + bert
# 0.24(4)/0.8/1.5-0.6 : 5.78/5.90(4099)/5.93(894)/0.945 768++

# average effnetb3 & nfnet_l1 & effnetb3_2 & effnetb4 + average (tfidf+bert) + bert
# 0.24(4)/1.6/1.5-0.6 : 5.60/5.72(4062)/5.74(730)/0.964
# 0.24(4)/1.5/1.5-0.6 : 5.66/5.76(3455)/5.78(635)/0.966 768+

# average effnetb3 & nfnet_l1 & effnetb5 + bert
# 0.24(3)/0.8/1.5-0.6 : 5.62/5.76(4908)/5.80(1167)/0.932 767

###############################

# union effnetb3 & nfnet_l0
# 0.26/0.28/0.8-0.5 : 5.84/5.95(3690)/5.99(1226)/0.935

# average effnetb3 & nfnet_l0
# 0.26(2)/0.8-0.5 : 5.56/5.73(5373)/5.78(1618)/0.923

# baseline
# 0.26/0.8-0.5 : 5.56/5.73(5871)/5.78(1799)/0.917 763



# 5.861489051094891
# Our final f1 cv score is 0.9793075685070094

'OOF LOGS'

### Manual Debug

Check cosine dist between 2 embeddings

In [43]:
# from numpy import dot
# from numpy.linalg import norm

# df['nb_matches'] = df['matches'].apply(lambda x:len(x.split()))
# df[(df.nb_second_pred==1)&(df.nb_matches==2)][['posting_id', 'matches', 'second_pred']]

In [44]:
# df[(df.nb_third_pred==3)&(df.nb_matches==2)&(df.nb_second_pred==2)][['posting_id', 'f1', 'matches', 'second_pred', 'third_pred']]

In [45]:
# df[df.posting_id=='train_1943466047'].index.values[0]

In [46]:
# ind_a = 116

# # only works when a have 2 matches
# matches_a = df.iloc[ind_a]['matches'].split()
# id_a = df.iloc[ind_a]['posting_id']
# matches_b = list(set(matches_a) - set([id_a]))[0]
# ind_b = df[df.posting_id==matches_b].index.values[0]

# get cosine dist / val
# a = image_embeddings[ind_a]
# b = image_embeddings[ind_b]
# print("Image cosine distance : ", round(1-dot(a, b)/(norm(a)*norm(b)),3))
# a = text_embeddings[ind_a]
# b = text_embeddings[ind_b]
# print("Text cosine value     : ", round(dot(a, b)/(norm(a)*norm(b)),3))
# a = bert_embeddings[ind_a]
# b = bert_embeddings[ind_b]
# print("Bert cosine distance : ", round(1-dot(a, b)/(norm(a)*norm(b)),3))

In [47]:
# model = NearestNeighbors(n_neighbors = 2, metric = 'cosine')
# model.fit(image_embeddings)
# distances, indices = model.kneighbors(image_embeddings)

In [48]:
# distances[ind_a]

In [49]:
# arr = text_tensor[34176] @ text_tensor.T
# arr.sort(descending=True).values[:2]

In [50]:
# model = NearestNeighbors(n_neighbors = 51, metric = 'cosine')
# model.fit(bert_embeddings)
# distances, indices = model.kneighbors(bert_embeddings)

Check distribution

In [51]:
# N = 10
# df_head_N = pd.DataFrame()
# df_head_N['nb_first_pred'] = df.nb_first_pred.value_counts().sort_index().head(N)
# df_head_N['nb_second_pred'] = df.nb_second_pred.value_counts().sort_index().head(N)
# df_head_N['nb_third_pred'] = df.nb_third_pred.value_counts().sort_index().head(N)
# df_head_N.head(N).T


#                 1	       2	 3	   4	       5	 6	     7	     8	    9	    10
# nb_first_pred	2367.0	12380.0	4896.0	3372.0	2218.0	1654.0	1008.0	919.0	790.0	460.0
# nb_second_pred	NaN	    14747.0	4896.0	3372.0	2218.0	1654.0	1008.0	919.0	790.0	460.0
# nb_third_pred	NaN	    14541.0	5042.0	3391.0	2245.0	1661.0	1008.0	922.0	791.0	463.0

In [52]:
# pd.DataFrame(df['matches'].apply(lambda x:len(x.split())).value_counts().sort_index()).head(9).T

In [53]:
# ax = df['nb_third_pred'].value_counts().sort_index().plot.bar(figsize=(18,3))

In [54]:
# ax = df['matches'].apply(lambda x:len(x.split())).value_counts().sort_index().plot.bar(figsize=(18,3))

### Submit

In [55]:
# Serialise the final predictions as space-separated posting ids, then either score
# them against the ground truth (GET_CV) or write the submission file.
pred_strings = df['third_pred'].map(lambda ids: ' '.join(ids))
if GET_CV:
    df['pred_matches'] = pred_strings
    # Average number of predicted matches per item.
    print(df['pred_matches'].map(lambda s: len(s.split())).mean())
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')
else:
    df['matches'] = pred_strings
    df[['posting_id', 'matches']].to_csv('submission.csv', index = False)