In [1]:
import pandas as pd
import numpy as np
import re
import torch
from torch import nn
import nltk
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util

from transformers import BertModel, BertTokenizer, AutoTokenizer, AutoModel, TFBertModel
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import torchvision.models as models

import tensorflow as tf 
import tensorflow_hub as hub 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tokenization 
import warnings
warnings.filterwarnings("ignore")

import math
import collections
import langid
import fasttext
import string 

nltk.download('stopwords')

In [2]:
training_images_path = '/workspaces/Shopee-Price-Match-Guarantee/00_source_data/shopee-product-matching/train_images'
training_dataset =pd.read_csv('/workspaces/Shopee-Price-Match-Guarantee/00_source_data/shopee-product-matching/train.csv')
testing_dataset = pd.read_csv('/workspaces/Shopee-Price-Match-Guarantee/00_source_data/shopee-product-matching/test.csv')

In [3]:
# remove stop words 
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# remove punctuation
import string
punctuations = string.punctuation

# remove numbers
import re
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

# remove special characters
def remove_special_characters(text):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)
    return text

# remove extra spaces
def remove_extra_spaces(text):
    text = re.sub(' +', ' ', text)
    return text

def word_tokenize(text):
    text = text.split()
    return text

# remove stop words
def remove_stop_words(text):
    text_tokens = word_tokenize(text)
    tokens_without_sw = [word for word in text_tokens if not word in stop_words]
    filtered_sentence = (" ").join(tokens_without_sw)
    return filtered_sentence

# remove all preprocessing
def remove_all_preprocessing(text):
    text = remove_numbers(text)
    text = remove_special_characters(text)
    text = remove_extra_spaces(text)
    text = remove_stop_words(text)
    return text

# apply all preprocessing
training_dataset['title'] = training_dataset['title'].apply(lambda x: remove_all_preprocessing(x))
testing_dataset['title'] = testing_dataset['title'].apply(lambda x: remove_all_preprocessing(x))

In [4]:
# get rid of \
training_dataset['title'] = training_dataset['title'].apply(lambda x: x.replace('\\', ''))
# lower case
training_dataset['title'] = training_dataset['title'].apply(lambda x: x.lower())

In [5]:
bert_model = '../input/bert-base-uncased-220421/bert_base'


In [8]:
import keras


In [9]:
class ArcMarginProduct(keras.layers.Layer):
    '''
    Implements large margin arc distance.

    Reference:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
            blob/master/src/modeling/metric_learning.py
    '''
    def __init__(self, n_classes, s=30, m=0.50, easy_margin=False,
                 ls_eps=0.0, **kwargs):

        super(ArcMarginProduct, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):

        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        super(ArcMarginProduct, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True,
            regularizer=None)

    def call(self, inputs):
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes

        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output

In [6]:
def get_bert_model(mname):
    
    idx = layers.Input((105), dtype="int32", name="input_idx")
    masks = layers.Input((105), dtype="int32", name="input_masks")
    
    nlp = transformers.TFBertModel.from_pretrained(mname)
    bert_out= nlp([idx, masks])[0]
    
    ## fine-tuning
    x = layers.GlobalAveragePooling1D()(bert_out)
    x = layers.Dense(750, activation="swish", name='text-embed')(x)
    
    model_title = Sequential([
        Input(shape=(100,), name='title-input'),
        Embedding(25000, 100, input_length=100, name='title-embed'),
        Dropout(0.2),
        Conv1D(300, 3, padding='valid', activation='relu', strides=1),
        GlobalMaxPool1D(),
        Dense(720, name='title-dense1'), #650 -> 0.81
        Activation('swish', name='title-act1'),
        Dropout(0.2),
        
        Dense(650, name='title-dense2'),
        BatchNormalization(name='title-bn2'),
        Activation('swish', name='title-act2'),
    ], name='title-vec')
    
    margin = ArcMarginProduct(
        n_classes = CLASSES, 
        s = 30, 
        m = 0.7, 
        name='head/arc_margin', 
        dtype='float32'
    )
    
    concatenate = Concatenate(name='concatenate')([x, model_title.output])
    label = Input(shape=(), name='arc-input')
    arc_face = margin([concatenate, label])
    
    output = Dense(CLASSES, activation='softmax', name='output')(arc_face)

    # Compile model
    model = tf.keras.Model(inputs=[idx, masks, model_title.input, label], outputs=[output])
    return model

In [11]:
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
embeddings = model.encode(training_dataset['title'])
print(embeddings)

[[-0.05237605  0.00266704 -0.0351623  ... -0.0220446   0.03120822
   0.0482746 ]
 [ 0.03168393  0.0207246  -0.01772413 ... -0.02363313  0.00672482
  -0.06600535]
 [-0.02553968  0.02782196  0.01990272 ... -0.0465625  -0.01983832
   0.01210797]
 ...
 [-0.00043891  0.0663081  -0.00434749 ...  0.04577914  0.02965026
  -0.03386611]
 [-0.02243138  0.02246639  0.05454137 ...  0.01867248  0.01682098
   0.00408043]
 [-0.03517014  0.02272914  0.00684843 ... -0.00679976 -0.01856173
  -0.00943737]]


In [19]:
def build_model(bert_layer,N_CLASSES,max_len=512):
    tokens = tf.keras.layers.Input(shape=(max_len,),dtype=tf.int32)

    y = tf.keras.layers.Input(shape=(),dtype=tf.int32)
    bert = bert_layer.bert([tokens])
    cls = bert.pooler_output 
    x = tf.keras.layers.BatchNormalization()(cls)
    mar = margin([x,y])
    output = tf.keras.layers.Softmax()(mar)
    model = tf.keras.models.Model(inputs=[tokens,y],outputs=[output])
    return model 
   

In [20]:
N_classes = training_dataset["label_group"].nunique()

In [21]:
max_len= 512

In [22]:
model = build_model(model,N_classes,max_len=max_len)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),loss=tf.keras.losses.cosine_similarity, metrics="accuracy")
model.summary()

AttributeError: 'SentenceTransformer' object has no attribute 'bert'

In [7]:
from sentence_transformers import SentenceTransformer, models
word_embedding_model = models.Transformer('sentence-transformers/distiluse-base-multilingual-cased-v2', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])


Downloading (…)lve/main/config.json: 100%|██████████| 610/610 [00:00<00:00, 550kB/s]
Downloading pytorch_model.bin: 100%|██████████| 539M/539M [00:08<00:00, 65.8MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 531/531 [00:00<00:00, 790kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 996k/996k [00:00<00:00, 37.4MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.96M/1.96M [00:00<00:00, 74.8MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 174kB/s]


In [7]:
bert_version = 'sentence-transformers/distiluse-base-multilingual-cased-v2'
tokenizer = BertTokenizer.from_pretrained(bert_version)
model = BertModel.from_pretrained(bert_version)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at sentence-transformers/distiluse-base-multilingual-cased-v2 were not used when initializing BertModel: ['transformer.layer.5.ffn.lin1.bias', 'transformer.layer.3.attention.out_lin.weight', 'transformer.layer.3.attention.out_lin.bias', 'transformer.layer.5.ffn.lin1.weight', 'transformer.layer.4.attention.k_lin.bias', 'transformer.layer.3.attention.v_lin.weight', 'transformer.layer.2.ffn.lin2.bias', 'transformer.layer.4.output_layer_norm.weight', 'transformer.layer.3.attention.q_lin.weight', 'transformer.layer

In [8]:
training_dataset

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,paper bag victoria secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,double tape m vhb mm x original double foam tape,2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,maling tts canned pork luncheon meat gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,daster batik lengan pendek motif acak campur l...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,nescafe xcxclair latte ml,3648931069
...,...,...,...,...,...
34245,train_4028265689,fff1c07ceefc2c970a7964cfb81981c5.jpg,e3cd72389f248f21,masker bahan kain spunbond non woven gsm ply l...,3776555725
34246,train_769054909,fff401691371bdcb382a0d9075dfea6a.jpg,be86851f72e2853c,mamypoko pants royal soft s popok celana,2736479533
34247,train_614977732,fff421b78fa7284284724baf249f522e.jpg,ad27f0d08c0fcbf0,khanzaacc robot res mm subwoofer bass metal wi...,4101248785
34248,train_3630949769,fff51b87916dbfb6d0f8faa01bee67b8.jpg,e3b13bd1d896c05c,kaldu non msg halal mama kamu ayam kampung sap...,1663538013


In [10]:
from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm


2023-04-13 22:21:31.704802: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
model_name='cahya/bert-base-indonesian-522M'

In [12]:
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 230k/230k [00:00<00:00, 59.8MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 156kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 62.0/62.0 [00:00<00:00, 78.2kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 468/468 [00:00<00:00, 616kB/s]


In [16]:
model = TFBertModel.from_pretrained(model_name)

Downloading tf_model.h5: 100%|██████████| 545M/545M [00:15<00:00, 35.7MB/s] 
Some layers from the model checkpoint at cahya/bert-base-indonesian-522M were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at cahya/bert-base-indonesian-522M.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [14]:
bert_title_vectors = np.zeros((training_dataset.shape[0],768))

In [17]:
for idx,txt in tqdm(enumerate(training_dataset['title'])):
  encoded_input = tokenizer(txt, return_tensors='tf')
  output = model(encoded_input)
  bert_title_vectors[idx]= output['pooler_output']

3722it [14:58,  4.26it/s]

In [9]:
model_name = 'sentence-transformers/distiluse-base-multilingual-cased-v2'
model = SentenceTransformer(model_name)

In [14]:
max_seq_length = 128
encoded_data = []

In [15]:
encoded_titles = []
for title in training_dataset['title']:
    encoded_title = model.encode(title)
    encoded_titles.append(torch.tensor(encoded_title))
input_ids = pad_sequence(encoded_titles, batch_first=True)

In [16]:
attention_masks = torch.ones(input_ids.shape[0], input_ids.shape[1], dtype=torch.long)

In [17]:
batch_size = 32

In [18]:
dataset = TensorDataset(input_ids, attention_masks)

In [19]:
dataloader = DataLoader(dataset, batch_size=batch_size)

In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_function = torch.nn.CrossEntropyLoss()

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [25]:
num_epochs = 10
batch_size = 32
num_training_steps = (len(training_dataset) // batch_size) * num_epochs


In [27]:
model = model.to(device)

In [28]:
num_epochs = 5

In [32]:
attention_masks = torch.ones(input_ids.shape[0], input_ids.shape[1], dtype=torch.long)

In [38]:
num_epochs = 5
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_function = torch.nn.CrossEntropyLoss()
targets = torch.tensor(training_dataset['label_group'].values).to(device)

model.to(device)
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        inputs, masks = batch
        inputs = inputs.to(device)
        masks = masks.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader)}")

IndexError: too many indices for tensor of dimension 2

In [7]:
from tqdm.auto import tqdm
from torchmetrics.text.bert import BERTScore

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(25 * len(eval_dataloader)))


for epoch in range(25):
  model.train()
  for batch in train_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs.loss
      loss.backward()

      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar_train.update(1)

  model.eval()
  for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar_eval.update(1)
    
  print(metric.compute())

NameError: name 'num_training_steps' is not defined