In [1]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import math
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader

import transformers
from transformers import get_cosine_schedule_with_warmup

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold

import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
df = pd.read_csv("../input/train-folds/train_folds.csv",index_col=0)
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,path,fold
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,666,../input/shopee-product-matching/train_images/...,3
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",7572,../input/shopee-product-matching/train_images/...,3
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,6172,../input/shopee-product-matching/train_images/...,4
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,10509,../input/shopee-product-matching/train_images/...,3
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,9425,../input/shopee-product-matching/train_images/...,1


In [3]:
transformer_path = 'sentence-transformers/paraphrase-xlm-r-multilingual-v1'
tokenizer = transformers.AutoTokenizer.from_pretrained(transformer_path)

Downloading:   0%|          | 0.00/541 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [4]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f889149bf50>

In [5]:
class Shop(Dataset):
    def __init__(self,df,tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        
        p_id =  self.df.posting_id.iloc[idx]
        
        text = str(self.df.title[idx])
        text = " ".join(text.split())
        
        text_input =  self.tokenizer(text,truncation = True , padding='max_length', 
                                      max_length = 40,return_tensors="pt")
        
        input_ids = text_input["input_ids"][0]
        mask = text_input["attention_mask"][0]
  
        label = self.df.iloc[idx].label_group	
        label =torch.tensor(label, dtype=torch.long)
         
        return input_ids,mask,label,p_id, 

In [6]:
da  = Shop(df,tokenizer)
x,y,j,k = da[900]
x,y,j,k

(tensor([     0,  23096, 135824,  34324, 172153,    568, 152438,  11893, 195798,
            248,  10855,   7285,    341,  91293,      6,  75706,    159, 175909,
            132,   4692,  94449,     56,   4879,      2,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(2576),
 'train_2937206762')

In [7]:
class ArcModule(nn.Module):
    def __init__(self, in_features, out_features, s = 10, m = 0.6):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.kaiming_normal_(self.weight)

        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = torch.tensor(math.cos(math.pi - m))
        self.mm = torch.tensor(math.sin(math.pi - m) * m)
        
      

    def forward(self, inputs, labels=None):
        cos_th = F.linear(inputs, F.normalize(self.weight))
        cos_th = cos_th.clamp(-1, 1) 
        sin_th = torch.sqrt(1.0 - torch.pow(cos_th, 2)+ 1e-8)
        cos_th_m = cos_th * self.cos_m - sin_th * self.sin_m
        # print(type(cos_th), type(self.th), type(cos_th_m), type(self.mm))
        cos_th_m = torch.where(cos_th > self.th, cos_th_m, cos_th - self.mm)

        cond_v = cos_th - self.th
        cond = cond_v <= 0
        cos_th_m[cond] = (cos_th - self.mm)[cond]

        if labels.dim() == 1:
            labels = labels.unsqueeze(-1)
        onehot = torch.zeros(cos_th.size()).to(device)
        labels = labels.type(torch.LongTensor).to(device)
        onehot.scatter_(1, labels, 1.0)
        outputs = onehot * cos_th_m + (1.0 - onehot) * cos_th
        outputs = outputs * self.s
        return outputs

In [8]:
class Model(nn.Module):
    def __init__(self,path,output_size =df.label_group.nunique()):
        super().__init__()
        self.op = output_size
        self.path = path
        self.backbone = transformers.AutoModel.from_pretrained(self.path)
        self.fc1 = nn.Linear(768,224)
        self.do = nn.Dropout(p=0.3)
        self.bn1 = nn.BatchNorm1d(224)
        self.bn2 =  nn.BatchNorm1d(768)
        self.pl = nn.PReLU()
        self.margin = ArcModule(in_features=224, out_features = self.op)
        
        nn.init.kaiming_normal_(self.fc1.weight)
        nn.init.zeros_(self.fc1.bias)
        
        
    def forward(self,input_ids,attention_mask,labels=None):
        x = self.backbone(input_ids=input_ids,attention_mask=attention_mask)
        x = x[0][:,0,:]
        x = self.bn2(x)
        x = self.do(x)
        feat = self.bn1(self.fc1(x))
        feat = F.normalize(feat)
        if labels is not None:
            return self.margin(feat, labels)
        else:
            return feat

In [9]:
model = Model(transformer_path)
model.to(device)

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Model(
  (backbone): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

In [10]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [11]:
def train_one_epoch(train_loader,model,optimizer,criterion,e,epochs,scheduler):
    losses = AverageMeter()
    model.train()
    global_step = 0
    loop = tqdm(enumerate(train_loader),total = len(train_loader))
    
    for step,(input_ids, attention_mask,labels,_) in loop:
        input_ids = input_ids.to(device)
        attention_mask =attention_mask.to(device)    
        labels= labels.to(device)
        #inp_id,mask
        logitss = model(input_ids, attention_mask,labels)
        batch_size = labels.size(0)
        loss  = criterion(logitss,labels)
        
        losses.update(loss.item(), batch_size)
        
        optimizer.zero_grad()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(m.parameters(), 1000 )
        optimizer.step()
        scheduler.step() 
        global_step += 1
        
        loop.set_description(f"Epoch {e+1}/{epochs}")
        loop.set_postfix(loss = loss.item() ,stage = 'train')
        
        
    return losses.avg

In [12]:
def val_one_epoch(loader,model,optimizer,criterion,scheduler):
    losses = AverageMeter()
    model.eval()
    global_step = 0
    loop = tqdm(enumerate(loader),total = len(loader))
    
    for step,(input_ids, attention_mask,labels,_) in loop:
        input_ids = input_ids.to(device)
        attention_mask =attention_mask.to(device)    
        labels= labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            logitss = model(input_ids, attention_mask,labels)
        loss  = criterion(logitss,labels)
        
        
        losses.update(loss.item(), batch_size)
        loop.set_postfix(loss = loss.item(), stage = 'valid')
        
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        
    
        
    return losses.avg

In [13]:
def fit(fold):
    df_train = df[df.fold ==  2 ].reset_index(drop=True)
    df_valid = df[df.fold == fold].reset_index(drop=True)
    
    train_data = Shop(df_train,tokenizer)
    val_data   = Shop(df_valid,tokenizer)
    
    train_loader = DataLoader(train_data,shuffle=True,
                        num_workers=4,
                        batch_size=32,
                        drop_last=True,
                            pin_memory=True)
    
    val_loader = DataLoader(val_data,shuffle=False,
                        num_workers=4,
                            pin_memory=True,
                        batch_size=32)
    criterion= nn.CrossEntropyLoss()
    epochs = 4
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4 , weight_decay = 1e-7)
    
    num_train_steps = math.ceil(len(train_loader))
    warmup_epochs = 1
    num_warmup_steps= num_train_steps * warmup_epochs
    num_training_steps=int(num_train_steps * epochs)
    scheduler = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps = num_warmup_steps,num_training_steps =num_training_steps)
    
    best_acc = 0
    loop = range(epochs)
    for e in loop:
        
        train_loss = train_one_epoch(train_loader,model,optimizer,criterion,e,epochs,scheduler)
        print(f'For epoch {e+1}/{epochs}')
        print(f'average train_loss {train_loss}')
        
        val_loss = val_one_epoch(val_loader,model,optimizer,criterion,scheduler)
        print(f'avarage val_loss { val_loss }')

In [14]:
fit(0)

  0%|          | 0/214 [00:00<?, ?it/s]

For epoch 1/4
average train_loss 14.515458820022156


  0%|          | 0/215 [00:00<?, ?it/s]

avarage val_loss 15.118433164447055


  0%|          | 0/214 [00:00<?, ?it/s]

For epoch 2/4
average train_loss 14.209789084496899


  0%|          | 0/215 [00:00<?, ?it/s]

avarage val_loss 15.125658783907786


  0%|          | 0/214 [00:00<?, ?it/s]

For epoch 3/4
average train_loss 14.713570420987137


  0%|          | 0/215 [00:00<?, ?it/s]

avarage val_loss 15.961426387058003


  0%|          | 0/214 [00:00<?, ?it/s]

For epoch 4/4
average train_loss 14.10091012437767


  0%|          | 0/215 [00:00<?, ?it/s]

avarage val_loss 16.25272433250738


In [15]:
torch.save(model.state_dict(),'bert_T_1_F0.pth') 

In [16]:
test = df[df.fold == 0].reset_index(drop = True)
a = Shop(test,tokenizer)
t_loader =  DataLoader(a,shuffle=False,
                        num_workers=4,
                        batch_size=32,
                        pin_memory=True)
def generate_test_features(test_loader):
    model.eval()
    bar = tqdm(test_loader)
    
    FEAS = []

    with torch.no_grad():
   
        for batch_idx, (input_ids, attention_mask,_,_) in enumerate(bar):

            input_ids = input_ids.to(device)
            attention_mask =attention_mask.to(device)    

            features = model(input_ids, attention_mask)

            FEAS += [features.detach().cpu()]

    FEAS = torch.cat(FEAS).cpu().numpy()
    
    return FEAS
FEAS = generate_test_features(t_loader)

  0%|          | 0/215 [00:00<?, ?it/s]

In [17]:
FEAS.shape

(6851, 224)

In [18]:
from sklearn import metrics
import gc
preds = []
CHUNK = 900
#te = ar[:,1:]
print('Finding similar titles...')
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(test))
    print('chunk',a,'to',b)
    cts = metrics.pairwise_distances(FEAS[a:b],FEAS, metric='cosine')
    cts = 1-cts
    for k in range(b-a):
        IDX = np.where(cts[k,]>0.8)[0]
        o = test.iloc[IDX].posting_id.values
        preds.append(o)

#del te,dx,modelx
_ = gc.collect()

Finding similar titles...
chunk 0 to 900
chunk 900 to 1800
chunk 1800 to 2700
chunk 2700 to 3600
chunk 3600 to 4500
chunk 4500 to 5400
chunk 5400 to 6300
chunk 6300 to 6851


In [19]:
test['preds'] = preds
tmp = test.groupby('image_phash').posting_id.agg('unique').to_dict()
test['preds3'] = test.image_phash.map(tmp)
def getMetric(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score
def combine_for_sub(row):
    x = np.concatenate([row.preds, row.preds3])
    return ' '.join( np.unique(x) )

def combine_for_cv(row):
    x = np.concatenate([row.preds, row.preds3])
    return np.unique(x)
tmp = test.groupby('label_group').posting_id.agg('unique').to_dict()
test['target'] = test.label_group.map(tmp)
test['oof'] = test.apply(combine_for_cv,axis=1)
test['f1'] = test.apply(getMetric('oof'),axis=1)
print('CV Score =', test.f1.mean() )

CV Score = 0.001718186869392665


In [20]:
FEAS

array([[ 0.01874758, -0.05582908,  0.07023361, ...,  0.04048274,
         0.0647586 ,  0.06772909],
       [ 0.01874756, -0.0558291 ,  0.07023355, ...,  0.04048279,
         0.06475856,  0.06772909],
       [ 0.0187476 , -0.0558291 ,  0.07023357, ...,  0.04048274,
         0.0647586 ,  0.06772912],
       ...,
       [ 0.01874755, -0.05582904,  0.07023363, ...,  0.04048271,
         0.06475863,  0.06772912],
       [ 0.01874759, -0.05582911,  0.07023361, ...,  0.04048279,
         0.06475859,  0.0677291 ],
       [ 0.01874762, -0.05582913,  0.07023358, ...,  0.04048279,
         0.06475857,  0.0677291 ]], dtype=float32)