<a href="https://colab.research.google.com/github/mohammedterry/NLP_for_ML/blob/master/NLPAugmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Augmentation

In [0]:
conversation_dataset = [
  ["hi","how are you","fine thanks","great. Take care"],
  ["heya","ugh go away","why are you so mean to me","im just in a bad mood"],
  ["what is your name","im Carla","nice to meet you Carla","likewise"]
]

## Shifting Conversations

Varying the Position of Sentences within a conversation by shifting the text to the left / right is a useful trick for sequence data

In [7]:
def shift_examples(examples):
  return [
    example[idx:] for example in examples for idx in range(
      len(example)
    ) if len(example[idx:]) > 1
  ]
shift_examples(conversation_dataset)

[['hi', 'how are you', 'fine thanks', 'great. Take care'],
 ['how are you', 'fine thanks', 'great. Take care'],
 ['fine thanks', 'great. Take care'],
 ['heya', 'ugh go away', 'why are you so mean to me', 'im just in a bad mood'],
 ['ugh go away', 'why are you so mean to me', 'im just in a bad mood'],
 ['why are you so mean to me', 'im just in a bad mood'],
 ['what is your name', 'im Carla', 'nice to meet you Carla', 'likewise'],
 ['im Carla', 'nice to meet you Carla', 'likewise'],
 ['nice to meet you Carla', 'likewise']]

## Combining Conversations

In [15]:
def combine_examples(examples):
  return [
    first_example + second_example for idx,first_example in enumerate(
        examples
    ) for jdx, second_example in enumerate(
        examples
    ) if idx != jdx
  ]

combine_examples(conversation_dataset)

[['hi',
  'how are you',
  'fine thanks',
  'great. Take care',
  'heya',
  'ugh go away',
  'why are you so mean to me',
  'im just in a bad mood'],
 ['hi',
  'how are you',
  'fine thanks',
  'great. Take care',
  'what is your name',
  'im Carla',
  'nice to meet you Carla',
  'likewise'],
 ['heya',
  'ugh go away',
  'why are you so mean to me',
  'im just in a bad mood',
  'hi',
  'how are you',
  'fine thanks',
  'great. Take care'],
 ['heya',
  'ugh go away',
  'why are you so mean to me',
  'im just in a bad mood',
  'what is your name',
  'im Carla',
  'nice to meet you Carla',
  'likewise'],
 ['what is your name',
  'im Carla',
  'nice to meet you Carla',
  'likewise',
  'hi',
  'how are you',
  'fine thanks',
  'great. Take care'],
 ['what is your name',
  'im Carla',
  'nice to meet you Carla',
  'likewise',
  'heya',
  'ugh go away',
  'why are you so mean to me',
  'im just in a bad mood']]

## Substituting Synonyms (using Wordnet)

In [32]:
import nltk 
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
from nltk.corpus import wordnet as wn
from string import punctuation

In [0]:
def synonyms(word, pos_tag):
  return list(
    {
      lemma.replace("_"," ").replace("-"," ") for synset in wn.synsets(
          _clean_word(word),
          pos_tag,
      ) for lemma in synset.lemma_names()
    }
  )

def _clean_word(word):
  return word.lower().strip(punctuation) 

In [18]:
synonyms("Test.", 'n')

['mental test',
 'trial',
 'mental testing',
 'run',
 'trial run',
 'test',
 'psychometric test',
 'examination',
 'exam',
 'tryout']

In [19]:
synonyms("Test.", 'v')

['test', 'prove', 'quiz', 'try out', 'try', 'examine', 'screen', 'essay']

In [0]:
def _tokenise(sentence):
  return nltk.word_tokenize(sentence)

def _infer_pos_tags(tokens):
  return [
    (
      token,
      _convert_nltk_to_wordnet_tag(nltk_tag)
    ) for token,nltk_tag in nltk.pos_tag(tokens)
  ]

def _convert_nltk_to_wordnet_tag(pos_tag):
  NOUN = "NN"
  VERB = "VB"
  ADJECTIVE = "JJ"
  ADVERB = "RB"
  if pos_tag.startswith(NOUN):
    return "n"
  if pos_tag.startswith(VERB):
    return "v" 
  if pos_tag.startswith(ADVERB):
    return "r"
  if pos_tag.startswith(ADJECTIVE):
    return "a"

In [45]:
_infer_pos_tags(
    tokens = _tokenise("testing, testing, this is a test")
)

[('testing', 'v'),
 (',', None),
 ('testing', 'v'),
 (',', None),
 ('this', None),
 ('is', 'v'),
 ('a', None),
 ('test', 'n')]

In [0]:
def synonymous_examples(examples, include_verbs = False):
  synonymous = []
  for example in examples:
    for idx,sentence in enumerate(example):
      tokens = _tokenise(sentence)
      tagged_words = _infer_pos_tags(tokens)
      for jdx,word_pos in enumerate(tagged_words):
        word, pos_tag = word_pos
        if pos_tag and (include_verbs or pos_tag != "v"):
          for synonym in synonyms(word, pos_tag):
              new_tokens = tokens[:jdx] + [synonym] + tokens[jdx+1:]
              new_sentence = ' '.join(new_tokens)
              new_example = example[:idx] + [new_sentence] + example[idx+1:]
              synonymous.append(new_example)
  return synonymous

In [345]:
synonymous_examples(
  [
    ["This is a little test", "yes it is"],
    ["this is another test", "no it isn't"]
  ]
)

[['This is a slight test', 'yes it is'],
 ['This is a little test', 'yes it is'],
 ['This is a fiddling test', 'yes it is'],
 ['This is a picayune test', 'yes it is'],
 ['This is a small test', 'yes it is'],
 ['This is a footling test', 'yes it is'],
 ['This is a lilliputian test', 'yes it is'],
 ['This is a short test', 'yes it is'],
 ['This is a trivial test', 'yes it is'],
 ['This is a petty test', 'yes it is'],
 ['This is a niggling test', 'yes it is'],
 ['This is a piddling test', 'yes it is'],
 ['This is a minuscule test', 'yes it is'],
 ['This is a piffling test', 'yes it is'],
 ['This is a little mental test', 'yes it is'],
 ['This is a little trial', 'yes it is'],
 ['This is a little mental testing', 'yes it is'],
 ['This is a little run', 'yes it is'],
 ['This is a little trial run', 'yes it is'],
 ['This is a little test', 'yes it is'],
 ['This is a little psychometric test', 'yes it is'],
 ['This is a little examination', 'yes it is'],
 ['This is a little exam', 'yes it is'

In [72]:
len(synonymous_examples(conversation_dataset))

128

In [73]:
len(synonymous_examples(conversation_dataset, include_verbs=True))

254

## Back Translating (aka Spinning)

In [0]:
from textblob import TextBlob

In [0]:
def _spin_text(text, foreign_language): 
  try: 
    spun_text = _clean_word(
      TextBlob(
        TextBlob(text).translate(
          from_lang="en",
          to=foreign_language
        ).raw
      ).translate(
        from_lang=foreign_language,
        to="en"
      ).raw
    )
    return spun_text if spun_text != _clean_word(text) else None
  except:
    return None

In [58]:
_spin_text("i have given you a small test", "es")

"i've done a little test"

In [59]:
_spin_text("that's an entirely different kettle of fish", "ar")

'this kettle is completely different than fish'

In [60]:
def rephrase_examples(examples):
  rephrased_examples = []
  repeat_rephrasings = []
  for example in examples:
    for idx,sentence in enumerate(example):
      sentence_spun_from_spanish = _spin_text(sentence, "es")
      if sentence_spun_from_spanish and sentence_spun_from_spanish not in repeat_rephrasings:
        repeat_rephrasings.append(sentence_spun_from_spanish)
        rephrased_examples.append(
          example[:idx] + [sentence_spun_from_spanish] + example[idx+1:]
        )
      sentence_spun_from_arabic = _spin_text(sentence, "ar")
      if sentence_spun_from_arabic and sentence_spun_from_arabic not in repeat_rephrasings:
        repeat_rephrasings.append(sentence_spun_from_arabic)
        rephrased_examples.append(
          example[:idx] + [sentence_spun_from_arabic] + example[idx+1:]
        )
  return examples + rephrased_examples

rephrase_examples(conversation_dataset)

[['hi', 'how are you', 'fine thanks', 'great. Take care'],
 ['heya', 'ugh go away', 'why are you so mean to me', 'im just in a bad mood'],
 ['what is your name', 'im Carla', 'nice to meet you Carla', 'likewise'],
 ['hi', 'how are you', 'well thanks', 'great. Take care'],
 ['hi', 'how are you', 'good thanks', 'great. Take care'],
 ['hi', 'how are you', 'fine thanks', 'excellent. beware'],
 ['hi', 'how are you', 'fine thanks', 'great. take care of yourself'],
 ['hey', 'ugh go away', 'why are you so mean to me', 'im just in a bad mood'],
 ['heya', 'ugh go away', 'why are you cruel with me', 'im just in a bad mood'],
 ['heya', 'ugh go away', 'why are you so mean to me', "i'm in a bad mood"],
 ['heya',
  'ugh go away',
  'why are you so mean to me',
  "i'm just in a bad mood"],
 ["what's your name", 'im Carla', 'nice to meet you Carla', 'likewise'],
 ['what is your name', "i'm carla", 'nice to meet you Carla', 'likewise'],
 ['what is your name', 'i am carla', 'nice to meet you Carla', 'like

## Inserting words (using BERT)

In [92]:
!pip3 install -U pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 19.4MB/s eta 0:00:01[K     |█████▎                          | 20kB 1.8MB/s eta 0:00:01[K     |████████                        | 30kB 2.6MB/s eta 0:00:01[K     |██████████▋                     | 40kB 1.7MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 2.1MB/s eta 0:00:01[K     |███████████████▉                | 61kB 2.5MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 2.9MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 3.3MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 3.7MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 2.8MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 2.8MB/s eta 0:00:01[K     |██████████████████████

In [0]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

In [97]:
model_name = 'bert-base-uncased'
bert_tokeniser = BertTokenizer.from_pretrained(model_name)
bert_model = BertForMaskedLM.from_pretrained(model_name)

100%|██████████| 407873900/407873900 [00:30<00:00, 13529294.61B/s]


In [0]:
def _format_model_input(text, tokeniser, insert_mask_at_idx):
  tokens = tokeniser.tokenize(
    f"[CLS] {text} [SEP]"
  )
  tokens_with_mask = tokens[:insert_mask_at_idx] + [
    "[MASK]"
  ] + tokens[insert_mask_at_idx:]
  return torch.tensor(
    [
      tokeniser.convert_tokens_to_ids(tokens_with_mask)
    ]
  )

def _format_model_output(model_output, token_idxs, tokeniser, masked_idx):
  tokens = tokeniser.convert_ids_to_tokens(
    token_idxs.tolist()[0]
  )
  tokens[masked_idx] = tokeniser.convert_ids_to_tokens(
    [
      torch.argmax(
        model_output[0, masked_idx]
      ).item()
    ]
  )[0]
  return ' '.join(tokens[1:-1]).replace("##","")

In [0]:
def _insert_mask_and_predict(sentence, model, tokeniser, masked_idx):
  tokens_with_mask_inserted = _format_model_input(
    text = sentence,
    tokeniser = tokeniser,
    insert_mask_at_idx = masked_idx,
  )
  segment_ids = torch.tensor(
    [[0]*len(tokens_with_mask_inserted)]
  )
  with torch.no_grad():
    return _format_model_output(
      model_output = model(
        tokens_with_mask_inserted,
        segment_ids
      ),
      tokeniser = tokeniser,
      token_idxs = tokens_with_mask_inserted,
      masked_idx = masked_idx,
    )

In [0]:
def _insert_words(example):
  new_examples = [example]
  idx = 1
  try:
    while True:
      new_examples.append(
        _insert_mask_and_predict(
          sentence = example,
          model = bert_model,
          tokeniser = bert_tokeniser,
          masked_idx = idx
        )
      )
      idx += 1
  except:
    new_examples.pop()
    return new_examples

In [275]:
_insert_words("i have two drinks")

['i have two drinks',
 '" i have two drinks',
 'i only have two drinks',
 'i have my two drinks',
 'i have two more drinks',
 'i have two drinks .']

In [0]:
def bert_inserted_examples(examples):
  new_examples = []
  for example in examples:
    for idx,sentence in enumerate(example):
      for inserted_sentence in _insert_words(sentence):
        new_examples.append(
          example[:idx] + [inserted_sentence] + example[idx+1:]
        )
  return examples + new_examples

In [354]:
bert_inserted_examples(conversation_dataset)

[['hi', 'how are you', 'fine thanks', 'great. Take care'],
 ['heya', 'ugh go away', 'why are you so mean to me', 'im just in a bad mood'],
 ['what is your name', 'im Carla', 'nice to meet you Carla', 'likewise'],
 ['hi', 'how are you', 'fine thanks', 'great. Take care'],
 ['" hi', 'how are you', 'fine thanks', 'great. Take care'],
 ['hi .', 'how are you', 'fine thanks', 'great. Take care'],
 ['hi', 'how are you', 'fine thanks', 'great. Take care'],
 ['hi', '" how are you', 'fine thanks', 'great. Take care'],
 ['hi', 'how old are you', 'fine thanks', 'great. Take care'],
 ['hi', 'how are you you', 'fine thanks', 'great. Take care'],
 ['hi', 'how are you ?', 'fine thanks', 'great. Take care'],
 ['hi', 'how are you', 'fine thanks', 'great. Take care'],
 ['hi', 'how are you', '" fine thanks', 'great. Take care'],
 ['hi', 'how are you', 'fine , thanks', 'great. Take care'],
 ['hi', 'how are you', 'fine thanks .', 'great. Take care'],
 ['hi', 'how are you', 'fine thanks', 'great. Take care']

## Generating longer conversations (using GPT-2)

In [312]:
!pip3 install gpt-2-simple



In [313]:
from nltk.tokenize import sent_tokenize
import gpt_2_simple as gpt2
model_name = "774M"
gpt2.download_gpt2(model_name=model_name)
gpt2.load_gpt2(
  gpt2.start_tf_sess(), 
  model_name=model_name
)

Fetching checkpoint: 1.05Mit [00:00, 162Mit/s]                                                      
Fetching encoder.json: 1.05Mit [00:00, 92.0Mit/s]                                                   
Fetching hparams.json: 1.05Mit [00:00, 280Mit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 3.10Git [01:03, 48.8Mit/s]                                 
Fetching model.ckpt.index: 1.05Mit [00:00, 373Mit/s]                                                
Fetching model.ckpt.meta: 2.10Mit [00:00, 120Mit/s]                                                 
Fetching vocab.bpe: 1.05Mit [00:00, 113Mit/s]                                                       


In [0]:
def extend_conversations(examples):
  return examples + [
    _extend_conversation(
      '. '.join(example)
    ) for example in examples
  ]

def _extend_conversation(conversation_as_string):
  generated_samples = gpt2.generate(
    sess,
    model_name=model_name,
    prefix=conversation_as_string,
    length=100,
    return_as_list = True
  )
  n = len(
    sent_tokenize(
      conversation_as_string
    )
  )
  return sent_tokenize(
    generated_samples[0]
  )[:n+1]


In [348]:
extend_conversations(conversation_dataset)

[['hi', 'how are you', 'fine thanks', 'great. Take care'],
 ['heya', 'ugh go away', 'why are you so mean to me', 'im just in a bad mood'],
 ['what is your name', 'im Carla', 'nice to meet you Carla', 'likewise'],
 ['hi.', 'how are you.', 'fine thanks.', 'great.', 'Take care.', 'I got it.'],
 ['heya.',
  'ugh go away.',
  'why are you so mean to me.',
  'im just in a bad mood because I lost in the qualifiers.',
  'why are you so mean to me.'],
 ['what is your name.',
  'im Carla.',
  'nice to meet you Carla.',
  'likewise, my name is Carla.',
  "Bill: How'd you get here?"]]

# Pipeline

In [0]:
def augment_dataset(dataset):
  dataset = extend_conversations(dataset)
  dataset = bert_inserted_examples(dataset)
  dataset = rephrase_examples(dataset)
  dataset = synonymous_examples(dataset)
  dataset = shift_examples(dataset)
  return dataset

In [0]:
augmented_dataset = augment_dataset(conversation_dataset)

In [359]:
augmented_dataset

[['how do you do', 'how are you', 'fine thanks', 'great. Take care'],
 ['how are you', 'fine thanks', 'great. Take care'],
 ['fine thanks', 'great. Take care'],
 ['hello', 'how are you', 'fine thanks', 'great. Take care'],
 ['how are you', 'fine thanks', 'great. Take care'],
 ['fine thanks', 'great. Take care'],
 ['Hawaii', 'how are you', 'fine thanks', 'great. Take care'],
 ['how are you', 'fine thanks', 'great. Take care'],
 ['fine thanks', 'great. Take care'],
 ['hullo', 'how are you', 'fine thanks', 'great. Take care'],
 ['how are you', 'fine thanks', 'great. Take care'],
 ['fine thanks', 'great. Take care'],
 ['howdy', 'how are you', 'fine thanks', 'great. Take care'],
 ['how are you', 'fine thanks', 'great. Take care'],
 ['fine thanks', 'great. Take care'],
 ['Aloha State', 'how are you', 'fine thanks', 'great. Take care'],
 ['how are you', 'fine thanks', 'great. Take care'],
 ['fine thanks', 'great. Take care'],
 ["Hawai'i", 'how are you', 'fine thanks', 'great. Take care'],
 ['

In [360]:
len(augmented_dataset)

38293