In [None]:
!pip install -q transformers

import re
import torch

from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np

In [None]:

class SynWords:
    """Rank dictionary synonyms of a word by how well they fit a sentence.

    A synonym dictionary is read from a text file into ``self.syn_dict``
    (word -> list of candidate synonyms).  For a query word occurring in a
    sentence, every candidate is substituted into the sentence, the
    sentence is run through the masked-LM model, and the model output at
    the substituted position is compared (cosine similarity) with the
    output at the position of the original word.
    """

    _MODEL_NAME = "HooshvareLab/bert-fa-base-uncased"
    _SYN_FILE = 'Farhang_Motaradef-Motazad.txt'
    # Message returned when the word is unknown or absent from the text.
    _NOT_FOUND = 'معادلی پیدا نشد'
    # Digits, newlines and zero-width non-joiners are stripped from the
    # synonym list of every dictionary line.
    _NOISE = re.compile(r'[\d\n\u200c]')

    def __init__(self):
        # Load the pretrained tokenizer and model, then the dictionary.
        self.tokenizer = AutoTokenizer.from_pretrained(self._MODEL_NAME)
        self.model = AutoModelForMaskedLM.from_pretrained(self._MODEL_NAME)
        self.syn_dict = {}
        self.load_syn_words()

    def load_syn_words(self):
        """Fill ``self.syn_dict`` from the synonym dictionary file.

        Each usable line has the form ``word: syn1، syn2 ... & ...``; only
        the part before the first ``&`` is read.  Lines that do not contain
        exactly one ``:`` before the ``&`` (blank lines, headers, ...) are
        skipped instead of aborting the whole load.  Synonyms are stored
        stripped of surrounding whitespace, and empty entries are dropped.
        """
        with open(self._SYN_FILE, 'r', encoding="utf8") as syn_file:
            for line in syn_file:
                parts = line.split('&')[0].split(':')
                if len(parts) != 2:
                    continue
                word, raw_syns = parts
                word = word.strip()
                if not word:
                    continue
                cleaned = self._NOISE.sub('', raw_syns)
                stripped = (syn.strip() for syn in cleaned.split('،'))
                self.syn_dict[word] = [syn for syn in stripped if syn]

    def _token_vector(self, text, token_index):
        """Return the model output vector for one token of ``text``.

        ``token_index`` is a position in ``self.tokenizer.tokenize(text)``,
        which contains no special tokens.  The encoded model input starts
        with one special token ([CLS] for BERT tokenizers), so the matching
        row of the model output is ``token_index + 1``.
        """
        encoded_input = self.tokenizer(text, return_tensors='pt')
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # NOTE(review): for a masked-LM head, output[0] is presumably the
        # per-token vocabulary logits rather than the encoder hidden state;
        # kept as in the original -- confirm this is the intended vector.
        return model_output[0][0][token_index + 1]

    def find_equivalent_words(self, text, word, topN=2):
        """Return up to ``topN`` synonyms of ``word`` that best fit ``text``.

        Args:
            text: Sentence that contains ``word``.
            word: Query word; must be a key of ``self.syn_dict``.
            topN: Maximum number of synonyms to return.

        Returns:
            The string 'معادلی پیدا نشد' when ``word`` is not in the
            dictionary or its first sub-token does not occur in the
            tokenized ``text``.  Otherwise a list of synonyms ordered from
            most to least similar; it may be shorter than ``topN`` (or
            empty) when few candidates could be scored.
        """
        # Cheap dictionary lookup first, before any model call.
        candidates = self.syn_dict.get(word)
        if candidates is None:
            return self._NOT_FOUND

        word_pieces = self.tokenizer.tokenize(word)
        text_tokens = self.tokenizer.tokenize(text)
        if not word_pieces or word_pieces[0] not in text_tokens:
            return self._NOT_FOUND

        # Reference vector: the query word (first sub-token) in context.
        base_vector = self._token_vector(
            text, text_tokens.index(word_pieces[0]))

        similarities = []
        for syn in candidates:
            syn = syn.strip()
            if not syn:
                continue

            # Substitute the candidate synonym into the sentence.
            candidate_text = text.replace(word, syn)
            candidate_tokens = self.tokenizer.tokenize(candidate_text)
            syn_pieces = self.tokenizer.tokenize(syn)

            # Only candidates that survive tokenization as one whole token
            # are scored (same restriction as the original code).
            if not syn_pieces or syn_pieces[0] not in candidate_tokens:
                continue
            if syn not in candidate_tokens:
                continue

            candidate_vector = self._token_vector(
                candidate_text, candidate_tokens.index(syn_pieces[0]))

            # Cosine similarity between the candidate and reference vectors.
            score = float(torch.nn.functional.cosine_similarity(
                candidate_vector, base_vector, dim=0))
            print(syn, score)
            similarities.append((syn, score))

        # Stable sort keeps dictionary order among equal scores; slicing
        # copes with fewer than topN scored candidates.
        ranked = sorted(similarities, key=lambda pair: pair[1], reverse=True)
        return [syn for syn, _ in ranked[:max(topN, 0)]]

In [None]:
# Build the helper once: the constructor loads the tokenizer, the model and
# the synonym dictionary file, so it is worth reusing across queries.
synWords = SynWords()

In [None]:
# Example query: a Persian sentence and the word in it whose synonyms
# should be ranked.
text = 'تمام هستی‌‌‌ ‌‌‌خود را در راه دفاع از سرزمین فدا کرد.'
word = 'هستی'

In [None]:
# Rank the dictionary synonyms of `word` in the context of `text`
# (default topN=2); the result is a list of synonyms or a "not found" string.
syns = synWords.find_equivalent_words(text,word)