In [185]:
import torch
import random
import nltk
from nltk.corpus import stopwords, wordnet
from transformers import AutoTokenizer

In [198]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nilsgrunefeld/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nilsgrunefeld/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def get_synonym(word, lang="en"):
    supported_languages = ["en", "de", "es", "fr", "it", "ko", "pt", "ru", "zh"]
    print(lang)
    print(lang in supported_languages)
    
    if lang not in supported_languages:
        raise ValueError(f"Unsupported language. Supported languages: {supported_languages}")
    
    if lang == "en":
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                if lemma.name() != word and "_" not in lemma.name():
                    synonyms.append(lemma.name())
        
        if not synonyms:
            return word
        return random.choice(synonyms)
    
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas(lang=lang):
            if lemma.name() != word and "_" not in lemma.name():
                synonyms.append(lemma.name())
    
    # If no synonyms found in the target language, optionally try English as fallback
    if not synonyms:
        # Uncomment below to fall back to English synonyms when none found in target language
        # return get_synonym(word, "en")
        return word
    
    return random.choice(synonyms)


In [337]:
SUPPORTED_LANGUAGES = ["en", "de", "es", "fr", "it", "ko", "pt", "ru", "zh"]

def get_synonym(word, lang='en'):
    if lang not in SUPPORTED_LANGUAGES:
        raise ValueError(f"Unsupported language: {lang}")

    synonyms = set()

    # Retrieve English synsets (WordNet is English-based)
    synsets = wordnet.synsets(word, lang='eng') if lang != 'en' else wordnet.synsets(word)

    for syn in synsets:
        if lang == 'en':
            for lemma in syn.lemmas():
                name = lemma.name()
                if name != word and "_" not in name:
                    synonyms.add(name)
        else:
            for lemma in syn.lemmas(lang=lang):
                name = lemma.name()
                if name != word and "_" not in name:
                    synonyms.add(name)

    return random.choice(list(synonyms)) if synonyms else word


In [188]:
def token_to_word(token, tokenizer):
    return tokenizer.decode([token]).strip()

In [162]:
def replace_tokens_with_synonyms(inputs, tokenizer, device, replacement_prob=0.15):
    stop_words = set(stopwords.words("english"))

    input_ids = inputs["input_ids"].clone()

    for i in range(input_ids.shape[0]):
        for j in range(input_ids.shape[1]):
            if random.random() < replacement_prob:
                token_id = input_ids[i, j].item()
                word = token_to_word(token_id, tokenizer)

                if (
                    word.lower() in stop_words
                    or word.startswith("##")
                    or not word.isalpha()
                ):
                    continue

                synonym = get_synonym(word)

                synonym_tokens = tokenizer(
                    synonym, return_tensors="pt", add_special_tokens=False
                ).to(device)

                if synonym_tokens["input_ids"].shape[1] == 1:
                    input_ids[i, j] = synonym_tokens["input_ids"][0, 0]

    return input_ids

In [163]:
sentence = "The quick brown fox jumps over the lazy dog."
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [164]:
inputs = tokenizer(
    sentence,
    return_tensors="pt",
    add_special_tokens=False,
).to(device)

In [184]:
modified_input_ids = replace_tokens_with_synonyms(inputs, tokenizer, device, replacement_prob=0.5)
modified_sentence = tokenizer.decode(modified_input_ids[0])
print(f"Original: {sentence}")
print(f"Modified: {modified_sentence}")

Original: The quick brown fox jumps over the lazy dog.
Modified: the quick brown fox jumps over the lazy dog.


In [311]:
get_synonym("dog")

[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]
Synset('dog.n.01')
[Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')]
Synset('frump.n.01')
[Lemma('frump.n.01.frump'), Lemma('frump.n.01.dog')]
Synset('dog.n.03')
[Lemma('dog.n.03.dog')]
Synset('cad.n.01')
[Lemma('cad.n.01.cad'), Lemma('cad.n.01.bounder'), Lemma('cad.n.01.blackguard'), Lemma('cad.n.01.dog'), Lemma('cad.n.01.hound'), Lemma('cad.n.01.heel')]
Synset('frank.n.02')
[Lemma('frank.n.02.frank'), Lemma('frank.n.02.frankfurter'), Lemma('frank.n.02.hotdog'), Lemma('frank.n.02.hot_dog'), Lemma('frank.n.02.dog'), Lemma('frank.n.02.wiener'), Lemma('frank.n.02.wienerwurst'), Lemma('frank.n.02.weenie')]
Synset('pawl.n.01')
[Lemma('pawl.n.01.pawl'), Lemma('pawl.n.01.detent'), Lemma('pawl.n.01.click'), Lemma('pawl.n.01.dog')]
Synset('andiron.n.01')
[Lemma('andiron.n.01.andiron'

'cad'

In [316]:
get_synonym("한국어", lang="ko")

'한국어'

In [338]:
# print(f"English synonym for 'good': {get_synonym('good', 'en')}")
print(f"German synonym for 'gut': {get_synonym('gut', 'de')}")
print(f"Spanish synonym for 'bueno': {get_synonym('bueno', 'es')}")
print(f"French synonym for 'bon': {get_synonym('bon', 'fr')}")

WordNetError: Language is not supported.