In [1]:
import re
import string
from torch import clamp
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

class TokenSimilarity:
    """Cosine similarity between two texts based on mean-pooled transformer embeddings.

    Usage: create an instance, call `load_pretrained` once, then call `predict`.
    """

    def load_pretrained(self, from_pretrained:str="indobenchmark/indobert-base-p1"):
        """Load the tokenizer and the encoder for the given checkpoint.

        Args:
            from_pretrained: checkpoint name or path forwarded to
                `AutoTokenizer.from_pretrained` / `AutoModel.from_pretrained`.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
        self.model = AutoModel.from_pretrained(from_pretrained)

    def __cleaning(self, text:str) -> str:
        """Strip ASCII punctuation and normalise whitespace.

        Args:
            text: raw input text.

        Returns:
            The text without `string.punctuation` characters, with every run
            of whitespace collapsed to one space and the ends trimmed.
        """
        # clear punctuations
        text = text.translate(str.maketrans('', '', string.punctuation))

        # clear multiple spaces (`\s+` = one or more whitespace characters;
        # the previous pattern `/s+` matched a literal slash and never fired)
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def __process(self, first_token:str, second_token:str):
        """Encode both texts and mean-pool their token embeddings.

        Reads `self.max_length`, `self.truncation` and `self.padding`, which
        `predict` sets before calling this method.

        Args:
            first_token: first (already cleaned) text.
            second_token: second (already cleaned) text.

        Returns:
            A numpy array with one mean-pooled embedding per input text
            (row 0 for `first_token`, row 1 for `second_token`).
        """
        # local import: the file only brings `clamp` into scope from torch
        from torch import no_grad

        inputs = self.tokenizer([first_token, second_token],
                                max_length=self.max_length,
                                truncation=self.truncation,
                                padding=self.padding,
                                return_tensors='pt')

        attention = inputs.attention_mask

        # inference only: skip building the autograd graph
        with no_grad():
            outputs = self.model(**inputs)

        # first element of the model output is used as the token embeddings
        # (the last hidden state); integer indexing is kept so that tuple-style
        # outputs of older transformers versions keep working
        embeddings = outputs[0]

        # add a trailing dimension to the attention mask and expand it to the
        # embeddings' shape so padded positions can be zeroed out
        mask = attention.unsqueeze(-1).expand(embeddings.shape).float()

        masked_embeddings = embeddings * mask

        # mean pooling over dimension 1: sum of the unmasked embeddings
        # divided by the number of unmasked positions
        summed = masked_embeddings.sum(1)
        # clamp guards against division by zero when a row has no
        # attended positions
        counts = clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed/counts

        # return mean pooling as numpy array
        return mean_pooled.detach().numpy()

    def predict(self, first_token:str, second_token:str,
                return_as_embeddings:bool=False, max_length:int=16,
                truncation:bool=True, padding:str="max_length"):
        """Compute the similarity (or the embeddings) of two texts.

        Args:
            first_token: first text.
            second_token: second text.
            return_as_embeddings: when True, return the mean-pooled
                embeddings instead of the similarity.
            max_length: `max_length` passed to the tokenizer.
            truncation: `truncation` passed to the tokenizer.
            padding: `padding` passed to the tokenizer.

        Returns:
            The array of mean-pooled embeddings when `return_as_embeddings`
            is True; otherwise the result of `cosine_similarity` for the two
            embeddings (a nested 1x1 array such as `[[0.87]]`).
        """
        # tokenizer settings are stored on the instance; __process reads them
        self.max_length = max_length
        self.truncation = truncation
        self.padding = padding

        first_token = self.__cleaning(first_token)
        second_token = self.__cleaning(second_token)

        mean_pooled_arr = self.__process(first_token, second_token)
        if return_as_embeddings:
            return mean_pooled_arr

        # calculate similarity
        similarity = cosine_similarity([mean_pooled_arr[0]], [mean_pooled_arr[1]])

        return similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the similarity helper and load the "indobert-base-p2" checkpoint
# (load_pretrained would otherwise default to "indobert-base-p1").
model = TokenSimilarity()
model.load_pretrained('indobenchmark/indobert-base-p2')

In [7]:
# Single-word sample inputs for the pairwise similarity checks below.
str1 = "membeli"
str2 = "menjauh"
str3 = "meminjam"

In [8]:
# Similarity between str1 and str3; predict() returns a nested 1x1 array.
print(model.predict(str1, str3))

[[0.86526513]]


In [6]:
# Similarity between str2 and str3.
# NOTE(review): this cell's execution count (6) is lower than that of the cell
# defining str1..str3 (7), so the recorded output may have been produced with
# earlier values of these variables - re-run on a fresh kernel to confirm.
print(model.predict(str2, str3))

[[1.0000002]]


In [None]:
def get_similarity(text1, text2, **predict_kwargs):
    """Score every whitespace-separated word of `text1` against `text2`.

    Uses the notebook-level `model` (a loaded TokenSimilarity instance).

    Args:
        text1: text that is split into words with `str.split()`.
        text2: text each word is compared with.
        **predict_kwargs: optional keyword arguments forwarded to
            `model.predict`, e.g. `max_length=128` so that a long `text2`
            is not truncated to predict()'s default of 16 tokens.

    Returns:
        dict mapping each word of `text1` to the value returned by
        `model.predict(word, text2, ...)`. A word that occurs several times
        appears once; an empty `text1` yields an empty dict.
    """
    return {
        word: model.predict(word, text2, **predict_kwargs)
        for word in text1.split()
    }

In [None]:
# Compare every word of the short phrase (text1) with a long review text (text2).
# NOTE: predict() is called with its defaults here (max_length=16,
# truncation=True), so the tokenizer is asked to cut text2 down to 16 tokens
# before it is embedded.
text1 = "dijaga reputasinya oleh pemilik produk"
text2 = "Beli 2 kotak isi total 20 buah pie terdapat 5 buah pie yang sudah jamuran. Padahal paket langsung dibuka sesaat setelah diantar kurir dan expire date juga masih 1 bulan lebih. Coba cek ulasan, banyak pembeli juga mengalami kendala pie berjamur. Jelek sekali kontrol kualitas pie susu ini. Kasian reputasi toko ini ikutan rusak gara-gara produk yang . Maaf ya kaka penjual, ini ulasan jujur. Mungkin bisa disampaikan kepada produsen produk agar memperbaiki kontrol kualitas produknya."
get_similarity(text1, text2)

In [None]:
# Mean-pooled embeddings for both texts (row 0 -> text1, row 1 -> text2).
# TokenSimilarity defines no `get_mean_pooled_arr` method; the embeddings are
# exposed through predict(..., return_as_embeddings=True).
x = model.predict(text1, text2, return_as_embeddings=True)
x

In [None]:
# Inspect the type of the first element of `x`.
type(x[0])

In [None]:
import numpy

In [None]:
# Sanity check of cosine_similarity on two identical one-row inputs;
# identical vectors should give a similarity of 1.
a = [[1, 0, 0, 0]]
b = [[1, 0, 0, 0]]
cosine_similarity(a, b)

In [None]:
# Display the toy input `a`.
a