In [416]:
import torch
import random
import nltk
from nltk.corpus import stopwords, wordnet
from transformers import AutoTokenizer
from deep_translator import GoogleTranslator

In [198]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nilsgrunefeld/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nilsgrunefeld/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def get_synonym(word, lang="en"):
    supported_languages = ["en", "de", "es", "fr", "it", "ko", "pt", "ru", "zh"]
    print(lang)
    print(lang in supported_languages)
    
    if lang not in supported_languages:
        raise ValueError(f"Unsupported language. Supported languages: {supported_languages}")
    
    if lang == "en":
        synonyms = []
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                if lemma.name() != word and "_" not in lemma.name():
                    synonyms.append(lemma.name())
        
        if not synonyms:
            return word
        return random.choice(synonyms)
    
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas(lang=lang):
            if lemma.name() != word and "_" not in lemma.name():
                synonyms.append(lemma.name())
    
    # If no synonyms found in the target language, optionally try English as fallback
    if not synonyms:
        # Uncomment below to fall back to English synonyms when none found in target language
        # return get_synonym(word, "en")
        return word
    
    return random.choice(synonyms)


In [449]:
# Languages supported
LANGUAGES = ["en", "de", "es", "fr", "it", "ko", "pt", "ru", "zh"]

def get_synonym(word, lang="en", tokenizer=None):
    if lang not in LANGUAGES:
        raise ValueError(f"Unsupported language: {lang}")

    try:
        # Translate to English if not already
        word_en = word if lang == "en" else GoogleTranslator(source=lang, target='en').translate(word)

        # Get English synonyms from WordNet
        synsets = wordnet.synsets(word_en)
        synonym_candidates = set()
        for syn in synsets:
            for lemma in syn.lemmas():
                synonym = lemma.name().replace('_', ' ')
                if synonym.lower() != word_en.lower():
                    synonym_candidates.add(synonym)

        if not synonym_candidates:
            return None

        if tokenizer:
            synonyms = []
            for synonym in list(synonym_candidates):
                if lang == "en":
                    if len(tokenizer.tokenize(synonym)) == 1:
                        synonyms.append(synonym)
                else:
                    translated_syn = GoogleTranslator(source='en', target=lang).translate(synonym)
                    if len(tokenizer.tokenize(translated_syn)) == 1:
                        synonyms.append(synonym)
        else:
            synonyms = synonym_candidates
        
        if len(synonyms) == 0:
            return None

        # Choose a random synonym
        chosen_syn = random.choice(list(synonyms))

        # Translate back to original language if needed
        return chosen_syn if lang == "en" else GoogleTranslator(source='en', target=lang).translate(chosen_syn)

    except Exception as e:
        print(f"Error: {e}")
        return None


In [450]:
def token_to_word(token, tokenizer):
    return tokenizer.decode([token]).strip()

In [495]:
def replace_tokens_with_synonyms(inputs, tokenizer, device, lang="en", replacement_prob=0.15):
    stop_words = set(stopwords.words("english"))

    input_ids = inputs["input_ids"].clone()

    for i in range(input_ids.shape[0]):
        for j in range(input_ids.shape[1]):
            if random.random() < replacement_prob:
                token_id = input_ids[i, j].item()
                word = token_to_word(token_id, tokenizer)

                if (
                    word.lower() in stop_words
                    or word.startswith("##")
                    or not word.isalpha()
                ):
                    continue

                synonym = get_synonym(word, lang=lang, tokenizer=tokenizer)
                if not synonym:
                    synonym = word

                synonym_tokens = tokenizer(
                    synonym, return_tensors="pt", add_special_tokens=False
                ).to(device)

                if synonym_tokens["input_ids"].shape[1] == 1:
                    if synonym_tokens["input_ids"][0, 0] != token_id:
                        input_ids[i, j] = synonym_tokens["input_ids"][0, 0]

    return input_ids

In [452]:
sentence = "The quick brown fox jumps over the lazy dog."
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [453]:
inputs = tokenizer(
    sentence,
    return_tensors="pt",
    add_special_tokens=False,
).to(device)

In [454]:
modified_input_ids = replace_tokens_with_synonyms(inputs, tokenizer, device, replacement_prob=0.5)
modified_sentence = tokenizer.decode(modified_input_ids[0])
print(f"Original: {sentence}")
print(f"Modified: {modified_sentence}")

Original: The quick brown fox jumps over the lazy dog.
Modified: the quick brownish confuse jumps over the lazy tail.


In [455]:
sample_it = "Il video mostra un gruppo di ballerini che esegue una coreografia di danza Jazz in un ambiente chiuso, probabilmente uno studio di danza."
sample_zh = "该视频展示了一群舞者在一个封闭的环境中执行爵士舞编舞，可能是一个舞蹈工作室。"
sample_de = "Das Video zeigt eine Gruppe von Tänzern, die in einer geschlossenen Umgebung, wahrscheinlich einem Tanzstudio, eine Jazz-Choreografie ausführen."
sample_fr = "La vidéo montre un groupe de danseurs exécutant une chorégraphie de danse jazz dans un environnement clos, probablement un studio de danse."
sample_es = "El video muestra a un grupo de bailarines realizando una coreografía de danza jazz en un entorno cerrado, probablemente un estudio de danza."
sample_pt = "O vídeo mostra um grupo de dançarinos executando uma coreografia de dança jazz em um ambiente fechado, provavelmente um estúdio de dança."
sample_ru = "В видео показана группа танцоров, исполняющих джазовую хореографию в закрытом помещении, вероятно, в танцевальной студии."
sample_ko = "이 비디오는 아마도 댄스 스튜디오에서 닫힌 환경에서 재즈 댄스 안무를 수행하는 무용수 그룹을 보여줍니다."

lang_samples = {
    "it": sample_it,
    "zh": sample_zh,
    "de": sample_de,
    "fr": sample_fr,
    "es": sample_es,
    "pt": sample_pt,
    "ru": sample_ru,
    "ko": sample_ko
}

In [494]:
lang_choice = "ko"

inputs = tokenizer(
    lang_samples[lang_choice],
    return_tensors="pt",
    add_special_tokens=False,
).to(device)

modified_input_ids = replace_tokens_with_synonyms(inputs, tokenizer, device, lang=lang_choice, replacement_prob=1)
modified_sentence = tokenizer.decode(modified_input_ids[0])
print(f"Original: {lang_samples[lang_choice]}")
print(f"Modified: {modified_sentence}")

Replacing ᄃ with 100
Replacing ᄃ with 100
Original: 이 비디오는 아마도 댄스 스튜디오에서 닫힌 환경에서 재즈 댄스 안무를 수행하는 무용수 그룹을 보여줍니다.
Modified: 이 비디오는 아마도 100ᅢᆫ스 스튜디오에서 [UNK] 환경에서 재즈 100ᅢᆫ스 안무를 수행하는 무용수 그룹을 보여줍니다.


In [491]:
import nltk
from nltk.corpus import wordnet as wn

nltk.download('omw-1.4')  # Required for multilingual WordNet

def get_synonyms_in_language(word, lang='es'):
    synsets = wn.synsets(word, lang=lang)
    synonyms = set()
    print(synonyms)
    for syn in synsets:
        for lemma in syn.lemmas(lang):
            lemma_name = lemma.name().replace('_', ' ')
            if lemma_name.lower() != word.lower():
                synonyms.add(lemma_name)
    return list(synonyms) if synonyms else None


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nilsgrunefeld/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [487]:
wn.langs()

['eng',
 'als',
 'arb',
 'bul',
 'cmn',
 'dan',
 'ell',
 'fin',
 'fra',
 'heb',
 'hrv',
 'isl',
 'ita',
 'ita_iwn',
 'jpn',
 'cat',
 'eus',
 'glg',
 'spa',
 'ind',
 'zsm',
 'nld',
 'nno',
 'nob',
 'pol',
 'por',
 'ron',
 'lit',
 'slk',
 'slv',
 'swe',
 'tha']

In [489]:
get_synonyms_in_language("merde", lang="fra")

set()


['les boules',
 'baiser',
 'foutre',
 'vis',
 'étron',
 'foirer',
 'bousiller',
 'cric',
 'niquer',
 'faire',
 'enculer',
 'putain',
 'rendre',
 'valet',
 'caguer',
 'taureau',
 'baise',
 'gâcher',
 'baisage',
 'chier',
 'repriser',
 'déféquer']