In [1]:
# Run if working locally: load the autoreload extension in mode 2 so edited
# project modules are re-imported automatically, and load the nb_black extension.
%load_ext autoreload
%autoreload 2
%load_ext nb_black

<IPython.core.display.Javascript object>

In [47]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

# Use the parent of the current working directory as the project root and put
# it first on sys.path so the project imports that follow can be resolved.
config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

from db.dbv2 import Table, AugmentedTable, TrainTestTable

<IPython.core.display.Javascript object>

In [140]:
from src.dataset.gpt_augmentor import Augmentor
from src.dataset.utils import (
    truncate_by_token,
    avg_segment_length_by_char,
    avg_segment_length_by_token,
)

from nltk.tokenize import word_tokenize

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


<IPython.core.display.Javascript object>

In [4]:
# Dataset name shared by every table wrapper constructed below.
dataset_type = "committee"

# Project table wrappers imported from db.dbv2, all built for the same dataset.
table = Table(dataset_type)
augmented_table = AugmentedTable(dataset_type)
train_test_table = TrainTestTable(dataset_type)

<IPython.core.display.Javascript object>

In [5]:
# Keep the raw rows as returned by the table; index 1 of each row is used as
# the sentence text. NOTE(review): index 0 is later passed to
# table.get_segment, so it is presumably an id — confirm against db.dbv2.
target_sentences_original = table.get_target_sentences()
target_sentences = [s[1] for s in target_sentences_original]

<IPython.core.display.Javascript object>

In [6]:
# Take the first 5 target rows for testing. The full rows (not only the text)
# are kept because index 0 of each row is needed to look up its segment.
test_target_sentences = target_sentences_original[:5]

<IPython.core.display.Javascript object>

In [7]:
# Sanity check of the helper: the inputs are 3, 3 and 2 characters long and the
# floored average displayed below is 2.0.
avg_segment_length_by_char(["one", "two", "to"], floor=True)

2.0

<IPython.core.display.Javascript object>

In [8]:
# Build one example segment per test target sentence.
test_segments = []

for target_sentence in test_target_sentences:
    # Index 0 of the target row is the lookup key handed to the table.
    segment_key = target_sentence[0]
    segment = table.get_segment(segment_key)
    # Keep only index 1 of every returned row.
    segment_texts = [row[1] for row in segment]
    test_segments.append(segment_texts)

<IPython.core.display.Javascript object>

In [9]:
# One segment is built per test target sentence, so this should match the
# number of test target sentences.
len(test_segments)

5

<IPython.core.display.Javascript object>

In [10]:
# A single sentence / segment pair for ad-hoc experiments. Both lists are
# derived from target_sentences_original in the same order, so index 0 refers
# to the same target row in each.
test_sentence = target_sentences[0]
test_segment = test_segments[0]

<IPython.core.display.Javascript object>

In [104]:
# NOTE: this replaces the database-derived test_segments built earlier with a
# single hand-written toy segment, so the experiments below run on known input.
# The commented-out entries are alternative toy segments that can be re-enabled.
test_segments = [
    [
        "this is one sentence",
        "about donald trump",
        "and pakistan",
        "i am not a republican",
        "rather I am a democrat",
    ],
    #     [
    #         "welcome to fight club",
    #         "where there are no rules",
    #         "except 1 rule",
    #         "you don't speak about fight club",
    #     ],
    #     ["i am not a republican", "rather I am a democrat", "I love Elon Musk"],
]

<IPython.core.display.Javascript object>

## GTA 1

- Using GPT-2, we take the first truncated portion of the first sentence in a segment and feed it into the model. The output should be the same size as the overall dataset average sentence length.
- We then take that output sentence as the first sentence in the augmented segment
- Using that newly augmented sentence, we feed it into GPT again to generate a new sentence of the same size.
- We repeat this autoregressive process `n` times.
    - For experimentation, we do `n = k/2` where `k` is the average segment size in the dataset.

- On average, the augmented dataset will contain about half as much data as the real dataset.

Note: GPT is autoregressive by default, so instead of re-running GPT on every generated sentence, run it once and multiply `output_tokens` by `n` to get the desired number of sentences. Afterward, the post-processing will need to chop off the initial sentence and split the complete output into its constituent sentences.

In [126]:
def post_augmentation_processing(sentence, real_sentence_chars, n):
    """Drop the leading characters of ``sentence`` and return the remainder.

    Args:
        sentence: Text to trim. In this notebook it is the first sequence
            returned by the GPT-2 augmentor.
        real_sentence_chars: Number of leading characters to discard. The
            callers pass the character length of the prompt.
        n: Accepted for the callers' convenience but not used.

    Returns:
        ``sentence`` without its first ``real_sentence_chars`` characters.
    """
    continuation_start = real_sentence_chars
    generated_part = sentence[continuation_start:]
    return generated_part


# Smoke test: the first 4 characters ("this") are dropped; the third argument
# has no effect on the result.
post_augmentation_processing("this is a sentence.\n\n and another one", 4, 0)

' is a sentence.\n\n and another one'

<IPython.core.display.Javascript object>

In [127]:
# GTA 1 POC
# Both averages below divide by len(test_segments); fail with a clear message
# instead of a ZeroDivisionError when there is nothing to augment.
assert test_segments, "test_segments must contain at least one segment"

# Average sentence length in tokens: per-segment averages (floor=True),
# floor-averaged again across all segments.
dataset_avg_sentence_length = sum(
    [avg_segment_length_by_token(segment, floor=True) for segment in test_segments]
) // len(test_segments)

avg_segment_length = sum([len(t) for t in test_segments]) // len(
    test_segments
)  # avg number of sentences per segment in the dataset
# Multiplier applied to the number of tokens requested from GPT-2 per call.
# NOTE(review): the GTA 1 description says the process is repeated `n` times,
# but the loop below runs `avg_segment_length` times — confirm which is intended.
n = int(avg_segment_length / 2)

max_sent_tokens = 64  # upper bound on the prompt length, in word_tokenize tokens
augmented_segments = []

for segment in test_segments:
    if not segment:
        # Nothing to seed the generation with; keep the output aligned with
        # test_segments by recording an empty augmented segment.
        augmented_segments.append([])
        continue

    augmented_segment = []
    for i in range(0, avg_segment_length):
        # Seed with the real first sentence, afterwards with the most recently
        # generated sentence.
        next_sentence = (
            segment[0] if len(augmented_segment) == 0 else augmented_segment[-1]
        )
        # Cap the prompt at max_sent_tokens tokens. The tokens are re-joined
        # with single spaces, so the prompt may not be character-identical to
        # the text it was built from.
        next_sentence = " ".join(word_tokenize(next_sentence)[:max_sent_tokens])
        next_sentence_length = len(next_sentence)
        # create segment
        augmented_sentence = Augmentor.augment_gpt2_single(
            next_sentence,
            fast=True,
            # request n * (dataset average sentence length) tokens; the prompt
            # length is not added to this budget
            output_tokens=int(n * int(dataset_avg_sentence_length)),
            num_return_sequences=1,
        )

        augmented_sentence = post_augmentation_processing(
            # first returned sequence; assumed to start with the prompt so that
            # dropping len(prompt) characters leaves only new text — TODO confirm
            augmented_sentence[0][0],
            next_sentence_length,
            n,
        )

        augmented_segment.append(augmented_sentence)
        print("augmented_segment", augmented_segment)

    augmented_segments.append(augmented_segment)

completed augmentation...
augmented_segment [' of the book: "The']
completed augmentation...
augmented_segment [' of the book: "The', ' Book of Mormon, by Joseph']
completed augmentation...
augmented_segment [' of the book: "The', ' Book of Mormon, by Joseph', 'Smith.\n\n, by']
completed augmentation...
augmented_segment [' of the book: "The', ' Book of Mormon, by Joseph', 'Smith.\n\n, by', '. L. Sacks']
completed augmentation...
augmented_segment [' of the book: "The', ' Book of Mormon, by Joseph', 'Smith.\n\n, by', '. L. Sacks', ', R. J. P']


<IPython.core.display.Javascript object>

In [128]:
# (tokens requested per GPT-2 call, n)
n * int(dataset_avg_sentence_length), n

(6, 2)

<IPython.core.display.Javascript object>

In [129]:
# Inspect the augmented segments produced by the proof-of-concept loop.
augmented_segments

[[' of the book: "The',
  ' Book of Mormon, by Joseph',
  'Smith.\n\n, by',
  '. L. Sacks',
  ', R. J. P']]

<IPython.core.display.Javascript object>

In [141]:
# Run the same input through Augmentor.gta1 to compare it with the
# proof-of-concept loop.
Augmentor.gta1(test_segments)

completed augmentation...
augmented_segment ['. The whole point of this']
completed augmentation...
augmented_segment ['. The whole point of this', ' is to give the players a']
completed augmentation...
augmented_segment ['. The whole point of this', ' is to give the players a', ' chance to win a big game']
completed augmentation...
augmented_segment ['. The whole point of this', ' is to give the players a', ' chance to win a big game', '.\n\nThe only thing']
completed augmentation...
augmented_segment ['. The whole point of this', ' is to give the players a', ' chance to win a big game', '.\n\nThe only thing', ' I can say is that I']


[['. The whole point of this',
  ' is to give the players a',
  ' chance to win a big game',
  '.\n\nThe only thing',
  ' I can say is that I']]

<IPython.core.display.Javascript object>

## GTA 2

- Using GPT-2, we take the first truncated portion of the first sentence in a segment and feed it into the model. The output should be the same size as the first sentence length (for averaging similar segment sizes).
- That first outputted sentence becomes the target sentence for the augmented segment.
- Using the second sentence in the real segment, we repeat the first step. The second sentence in the augmented segment will be the output of the real second sentence fed into GPT-2.
- Continuing this process, we will be left with an augmented segment the same exact size as the real segment it’s modeled after with hopefully less variance than GTA 1 toward the end of the segments.

Cons:
- Possible disjointedness between augmented sentences: each one is generated from a different real sentence rather than from the previous augmented sentence, so consecutive augmented sentences may vary quite a bit from one another.

In [134]:
# GTA 2 POC
# Both averages below divide by len(test_segments); fail with a clear message
# instead of a ZeroDivisionError when there is nothing to augment.
assert test_segments, "test_segments must contain at least one segment"

# Average sentence length in tokens: per-segment averages (floor=True),
# floor-averaged again across all segments.
dataset_avg_sentence_length = sum(
    [avg_segment_length_by_token(segment, floor=True) for segment in test_segments]
) // len(test_segments)

avg_segment_length = sum([len(t) for t in test_segments]) // len(
    test_segments
)  # avg number of sentences per segment in the dataset
# Multiplier applied to the number of tokens requested from GPT-2 per call.
n = int(avg_segment_length / 2)

max_sent_tokens = 64  # upper bound on the prompt length passed to truncate_by_token
augmented_segments = []

for segment in test_segments:
    augmented_segment = []
    # Every real sentence is its own prompt, so the augmented segment ends up
    # with exactly as many sentences as the real one.
    for sentence in segment:
        sentence = truncate_by_token(sentence, max_sent_tokens)
        sentence_length = len(sentence)

        augmented_sentence = Augmentor.augment_gpt2_single(
            sentence,
            fast=True,
            # request n * (dataset average sentence length) tokens; the prompt
            # length is not added to this budget
            output_tokens=int(n * int(dataset_avg_sentence_length)),
            num_return_sequences=1,
        )

        augmented_sentence = post_augmentation_processing(
            # first returned sequence; assumed to start with the prompt so that
            # dropping len(prompt) characters leaves only new text — TODO confirm
            augmented_sentence[0][0],
            sentence_length,
            n,
        )

        augmented_segment.append(augmented_sentence)
        print("sentence", sentence, "augmented_sentence", augmented_sentence)

    augmented_segments.append(augmented_segment)

completed augmentation...
sentence this is one sentence augmented_sentence  that is almost impossible to translate
completed augmentation...
sentence about donald trump augmented_sentence  trumpson, I think.
completed augmentation...
sentence and pakistan augmented_sentence ) to the US.

completed augmentation...
sentence i am not a republican augmented_sentence . If I am a democrat
completed augmentation...
sentence rather I am a democrat augmented_sentence ) and I'm not going


<IPython.core.display.Javascript object>

In [135]:
# Show the real segments next to their GTA 2 augmentations.
test_segments, augmented_segments

([['this is one sentence',
   'about donald trump',
   'and pakistan',
   'i am not a republican',
   'rather I am a democrat']],
 [[' that is almost impossible to translate',
   ' trumpson, I think.',
   ') to the US.\n',
   '. If I am a democrat',
   ") and I'm not going"]])

<IPython.core.display.Javascript object>