In [1]:
# Run if working locally
# autoreload in mode 2 lets edits to imported project modules be picked up
# live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Loads the nb_black extension for this session
# (presumably auto-formats cells with black — confirm).
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

# The project root is taken to be the parent of the current working directory;
# store it on the shared config module for other code to read.
config.root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Put the root first on sys.path so the project imports below resolve.
sys.path.insert(0, config.root_path)

from db.dbv2 import Table, AugmentedTable, TrainTestTable

<IPython.core.display.Javascript object>

In [62]:
from src.dataset.gpt_augmentor import Augmentor
from src.dataset.utils import (
    truncate_by_token,
    avg_segment_length_by_char,
    avg_segment_length_by_token,
)

from nltk.tokenize import word_tokenize

<IPython.core.display.Javascript object>

In [7]:
# Name of the dataset every table handle below is opened for.
dataset_type = "committee"

# One handle per table class, constructed in this order for the same dataset.
table, augmented_table, train_test_table = (
    table_cls(dataset_type) for table_cls in (Table, AugmentedTable, TrainTestTable)
)

<IPython.core.display.Javascript object>

In [12]:
# Full rows as returned by the table; kept intact because a later cell reads
# the first field of each row to look up its segment.
target_sentences_original = table.get_target_sentences()
# Second field of every row — presumably the sentence text (TODO confirm schema).
target_sentences = [row[1] for row in target_sentences_original]

<IPython.core.display.Javascript object>

In [19]:
# Take the first 5 target sentences for testing
# (slices the full rows, not the text-only list, so each entry keeps its
# first field for the segment lookup done later).
test_target_sentences = target_sentences_original[:5]

<IPython.core.display.Javascript object>

In [28]:
# Quick check of the helper: the character lengths 3, 3 and 2 average to
# about 2.67, and with floor=True the displayed result is 2.0.
avg_segment_length_by_char(["one", "two", "to"], floor=True)

2.0

<IPython.core.display.Javascript object>

In [77]:
# Build one list of sentences per test target: fetch the segment using the
# row's first field, then keep the second field of every row that comes back.
# NOTE(review): field 0 looks like an id and field 1 the text — confirm.
test_segments = [
    [row[1] for row in table.get_segment(target[0])]
    for target in test_target_sentences
]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


<IPython.core.display.Javascript object>

In [78]:
# Number of segments collected; one per test target sentence is expected.
len(test_segments)

5

<IPython.core.display.Javascript object>

In [79]:
# First target sentence and first segment, pulled out as a single example.
test_sentence, test_segment = target_sentences[0], test_segments[0]

<IPython.core.display.Javascript object>

In [70]:
# Hand-written toy segments (lists of sentence strings) for the GPT-2
# experiments below.
# NOTE(review): this rebinds `test_segments`, replacing the segments that were
# built from the table earlier in the notebook; `test_segment` still refers to
# the earlier data. Consider a distinct name if both are needed.
test_segments = [
    ["this is one sentence", "about donald trump", "and pakistan"],
    [
        "welcome to fight club",
        "where there are no rules",
        "except 1 rule",
        "you don't speak about fight club",
    ],
]

<IPython.core.display.Javascript object>

In [None]:
# GTA 1 POC
# Proof of concept: seed GPT-2 with the truncated first sentence of each
# segment and request an output sized by the dataset-wide average length.

# A segment with no sentences cannot supply a seed (segment[0] would raise
# IndexError), so such segments are left out of both the average and the loop.
non_empty_segments = [segment for segment in test_segments if segment]
if not non_empty_segments:
    # Fail with a clear message instead of a bare ZeroDivisionError below.
    raise ValueError("test_segments has no non-empty segments to augment")

# Floor-divided mean of the per-segment values reported by
# avg_segment_length_by_token.
dataset_avg_sentence_length = sum(
    avg_segment_length_by_token(segment, floor=True)
    for segment in non_empty_segments
) // len(non_empty_segments)

max_sent_tokens = 64
augmented_sentences = []

for segment in non_empty_segments:
    first_sentence = truncate_by_token(segment[0], max_sent_tokens)
    # Kept for inspection in a later cell; after the loop it holds the value
    # for the last segment only.
    first_sentence_length = len(first_sentence)

    augmented_sentence = Augmentor.augment_gpt2_single(
        first_sentence,
        fast=True,
        # The requested size is the dataset average alone; the seed sentence's
        # own length is not added.
        # NOTE(review): confirm whether output_tokens is meant to include the prompt.
        output_tokens=int(dataset_avg_sentence_length),
        num_return_sequences=2,
    )

    augmented_sentences.append(augmented_sentence)

In [None]:
# Value left over from the final iteration of the loop above: len() of the
# last segment's truncated first sentence.
first_sentence_length

In [None]:
# One entry per processed segment; each entry is whatever
# Augmentor.augment_gpt2_single returned for num_return_sequences=2.
augmented_sentences

## GTA 1

- Using GPT-2, we take the first truncated portion of the first sentence in a segment and feed it into the model. The output should be the same size as the overall dataset average sentence length.
- We then take that output sentence as the first sentence in the augmented segment
- Using that newly augmented sentence, we feed it into GPT again to generate a new sentence of the same size.
- We repeat this autoregressive process `n` times.
    - For experimentation, we do `n = k/2` where `k` is the average segment size in the dataset.

- On average, the augmented dataset will contain about half as much data as the real dataset.

In [None]:
# Token bounds: sentences must have more than `min_sent_tokens` word tokens to
# be kept, and kept sentences are truncated using `max_sent_tokens`.
min_sent_tokens = 8
max_sent_tokens = 64

# Filter by word-token count first, then truncate what survives.
cleaned_target_sentences = [
    truncate_by_token(sentence, max_sent_tokens)
    for sentence in target_sentences
    if len(word_tokenize(sentence)) > min_sent_tokens
]

# Only the first two cleaned sentences are augmented here.
augmented_segments = Augmentor.augment_gpt2(
    cleaned_target_sentences[:2],
    fast=True,
    # a segment is assumed to hold at most 5 sentences, hence 5x the per-sentence cap
    max_seq_word_length=5 * max_sent_tokens,
    verbose=True,
)

## GTA 2

- Using GPT-2, we take the first truncated portion of the first sentence in a segment and feed it into the model. The output should be the same size as the first sentence length (for averaging similar segment sizes).
- That first outputted sentence becomes the target sentence for the augmented segment.
- Using the second sentence in the real segment, we repeat the first step: the second sentence in the augmented segment is the output of feeding the real second sentence into GPT-2.
- Continuing this process, we are left with an augmented segment exactly the same size as the real segment it’s modeled after, hopefully with less variance than GTA 1 toward the end of the segments.