## Installation

In [1]:
# Uncomment to install the dependencies. Requirements with a version
# specifier are quoted: unquoted, the shell would treat `>` as an output
# redirect and silently install an unpinned version.
#!pip install numpy requests nlpaug
#!pip install "torch>=1.6.0" "transformers>=4.11.3" sentencepiece
#!pip install "nltk>=3.4.5"

## Usage

In [2]:
import nlpaug.augmenter.word as naw

In [3]:
def augmenter(action):
    """Create a contextual word-embedding augmenter backed by a Turkish BERT checkpoint.

    Args:
        action: Passed through unchanged as the ``action`` argument of
            ``naw.ContextualWordEmbsAug``.

    Returns:
        A newly constructed ``naw.ContextualWordEmbsAug`` using the
        'dbmdz/bert-base-turkish-cased' model path. A fresh instance is
        built on every call.
    """
    turkish_bert = 'dbmdz/bert-base-turkish-cased'
    contextual_aug = naw.ContextualWordEmbsAug(model_path=turkish_bert, action=action)
    return contextual_aug

In [4]:
# Turkish input sentence to augment (roughly: "I want to register a medication").
text = "İlaç kaydetmek istiyorum"

In [None]:
action = "insert"
# Build the augmenter once, outside the loop: every augmenter(action) call
# constructs a brand-new ContextualWordEmbsAug, so calling it per iteration
# repeated that setup for each of the five samples.
insert_augmenter = augmenter(action)
for i in range(5):
    # Each call produces a new variant of `text`.
    augmented_text = insert_augmenter.augment(text)
    print(augmented_text)

In [None]:
action = "substitute"
# Build the augmenter once, outside the loop: every augmenter(action) call
# constructs a brand-new ContextualWordEmbsAug, so calling it per iteration
# repeated that setup for each of the five samples.
substitute_augmenter = augmenter(action)
for i in range(5):
    # Each call produces a new variant of `text`.
    augmented_text = substitute_augmenter.augment(text)
    print(augmented_text)

## Easy Data Augmentation (EDA) 🚜

The pretrained Turkish Word2Vec model used below is available at https://github.com/akoksal/Turkish-Word2Vec

In [None]:
#!pip install gensim

In [6]:
from gensim.models import KeyedVectors
# Load pretrained word vectors from the file 'trmodel' (relative path, so it
# must be present in the working directory). binary=True selects the binary
# word2vec file format rather than the text format.
word_vectors = KeyedVectors.load_word2vec_format('trmodel', binary=True)

In [7]:
# Turkish example sentence for the EDA demos (roughly: "Standing before me was
# a small, extraordinary person who was looking me over very seriously.").
sample_sentence = "Karşımda beni ciddi ciddi süzen, küçük, eşi görülmedik biri duruyordu."

### Synonym Replacement
Select a word that is not a stop word and replace it with its synonym.

In [8]:
from random import randint

In [9]:
# Tokenize by splitting on single spaces only. Punctuation is not stripped, so
# it stays attached to the neighbouring token (e.g. 'süzen,' and 'duruyordu.').
tokenized_sentence = sample_sentence.split(" ")
tokenized_sentence

['Karşımda',
 'beni',
 'ciddi',
 'ciddi',
 'süzen,',
 'küçük,',
 'eşi',
 'görülmedik',
 'biri',
 'duruyordu.']

In [10]:
# Pick a random token that exists in the word-vector vocabulary and look up its
# nearest neighbour. Indices are drawn without replacement so the search always
# terminates, even when no token is in the vocabulary (the previous
# `while True` / bare `except` would spin forever and also swallowed unrelated
# errors such as KeyboardInterrupt).
candidate_indices = list(range(len(tokenized_sentence)))
while candidate_indices:
    # Remove a uniformly random remaining index so it is never retried.
    random_idx = candidate_indices.pop(randint(0, len(candidate_indices) - 1))
    random_word = tokenized_sentence[random_idx]
    try:
        # similar_by_word returns (word, similarity) pairs, most similar first.
        synonym_word = word_vectors.similar_by_word(random_word)[0][0]
        break
    except KeyError:
        # The token is not in the vocabulary; try another one.
        continue
else:
    # Reached only when every token was tried without success (no `break`).
    raise ValueError("no token of the sentence is present in the word-vector vocabulary")

In [11]:
# Display the selected token next to the replacement found for it.
random_word, synonym_word

('ciddi', 'şiddetli')

In [12]:
# Substitute on a copy so `tokenized_sentence` keeps the original tokens; this
# keeps the cell idempotent and prevents replacements from stacking up when the
# selection and substitution cells are re-run.
augmented_tokens = list(tokenized_sentence)
augmented_tokens[random_idx] = synonym_word
augmented_sentence = " ".join(augmented_tokens)

In [13]:
# Display the original sentence alongside its augmented version.
sample_sentence, augmented_sentence

('Karşımda beni ciddi ciddi süzen, küçük, eşi görülmedik biri duruyordu.',
 'Karşımda beni şiddetli ciddi süzen, küçük, eşi görülmedik biri duruyordu.')