In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.insert(0, config.root_path)

In [3]:
from db.db import DB, AugmentedDB

dataset_type = "wiki"

In [4]:
db = DB(dataset_type)
augmented_db = AugmentedDB(dataset_type)

In [5]:
def get_data(db, pct=0.1, max_seg_size=256):
    segments = db.get_random_segments_pct(pct_data=pct, max_segment_size=max_seg_size)
    
    return segments

In [6]:
regular_segments = db.get_random_segments_pct(pct_data=1, max_segment_size=10)
augmented_segments = augmented_db.get_random_segments_pct(pct_data=1, max_segment_size=10)

In [14]:
len(regular_segments), len(augmented_segments)

(10, 2544)

## Tokenizing

In [49]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
encoding = tokenizer.encode_plus(text,
                                 add_special_tokens = True,
                                 truncation = True,
                                 padding = "max_length",
                                 return_attention_mask = True,
                                 return_tensors = "pt")

## GPT Augmentor

In [17]:
sys.path.append('../')
from src.dataset.gpt_augmentor import Augmentor

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [18]:
result = Augmentor.augment_gpt2(["I enjoy walking with my cute dog", "I enjoy walking with my cute dog"], fast=True)

In [19]:
result

[["I enjoy walking with my cute dog. It's so good to have the chance to walk with a dog. But I have not been able to find a puppy that I would want to adopt or get the chance to look for.\n\nI am",
  "I enjoy walking with my cute dog. I like seeing him, I don't like having my dog go through me, but when we walk together that makes for a wonderful bonding moment. I appreciate the interaction, I just don't understand how it would",
  'I enjoy walking with my cute dog and playing with our kids," said David J. Smith, director of the Humane Society of the US.\n\n"So as a result, I\'ve got more work in my time," he said.\n\n'],
 ["I enjoy walking with my cute dog. It's so good to have the chance to walk with a dog. But I have not been able to find a puppy that I would want to adopt or get the chance to look for.\n\nI am",
  "I enjoy walking with my cute dog. I like seeing him, I don't like having my dog go through me, but when we walk together that makes for a wonderful bonding moment. I app

## Insert GPT Augmented Data

In [7]:
from db.db import DB, AugmentedDB
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

In [8]:
dataset_type = "wiki"

In [9]:
db = DB(dataset_type)
augmented_db = AugmentedDB(dataset_type)

In [10]:
min_sent_tokens = 8
max_sent_tokens = 64

In [11]:
target_sentences = db.get_target_sentences()
target_sentences = [s[1] for s in target_sentences]
cleaned_target_sentences = []
for s in target_sentences:
    if len(word_tokenize(s)) > min_sent_tokens:
        shortened_sentence = " ".join(word_tokenize(s)[:max_sent_tokens])
        cleaned_target_sentences.append(shortened_sentence)

In [12]:
augmented_segments = Augmentor.augment_gpt2(cleaned_target_sentences[:2], 
                                            fast=True, 
                                            # multiply by 5 to account for 5 as a max segment
                                            max_seq_word_length=max_sent_tokens*5, 
                                            verbose=True)

NameError: name 'Augmentor' is not defined

In [126]:
for options in augmented_segments:
    for option in options:
        segment = nltk.tokenize.sent_tokenize(option)
        
        augmented_db.create_segment(segment)

## Dataset Testing

In [95]:
from src.dataset.albert import AlbertDataset

In [97]:
dataset.num_samples

795

In [98]:
sentences, tokenized_sentences, labels = dataset.process()

In [99]:
len(sentences)

795

### Experiment Testing

In [17]:
import copy

config_overrides = {
        "bert_type": ["ldabert"],
        "dataset_type": ["clinical"],
        "final_dropout": [0.5, 0.8],
        "dense_neurons": [64, 128, 256],
        "pct_data": [1],
        "augment_pct": [0.1, 0.25, 0.5, 1],
        "epochs": [1000]
    }

experiments = [{},]

for k,v in config_overrides.items():
    new_values = len(v)
    current_exp_len = len(experiments)
    for _ in range(new_values-1):
       experiments.extend(copy.deepcopy(experiments[:current_exp_len]))
    for validx in range(len(v)):
       for exp in experiments[validx*current_exp_len:(validx+1)*current_exp_len]:
            exp[k] = v[validx]

print(experiments)

[{'bert_type': 'ldabert', 'dataset_type': 'clinical', 'final_dropout': 0.5, 'dense_neurons': 64, 'pct_data': 1, 'augment_pct': 0.1, 'epochs': 1000}, {'bert_type': 'ldabert', 'dataset_type': 'clinical', 'final_dropout': 0.8, 'dense_neurons': 64, 'pct_data': 1, 'augment_pct': 0.1, 'epochs': 1000}, {'bert_type': 'ldabert', 'dataset_type': 'clinical', 'final_dropout': 0.5, 'dense_neurons': 128, 'pct_data': 1, 'augment_pct': 0.1, 'epochs': 1000}, {'bert_type': 'ldabert', 'dataset_type': 'clinical', 'final_dropout': 0.8, 'dense_neurons': 128, 'pct_data': 1, 'augment_pct': 0.1, 'epochs': 1000}, {'bert_type': 'ldabert', 'dataset_type': 'clinical', 'final_dropout': 0.5, 'dense_neurons': 256, 'pct_data': 1, 'augment_pct': 0.1, 'epochs': 1000}, {'bert_type': 'ldabert', 'dataset_type': 'clinical', 'final_dropout': 0.8, 'dense_neurons': 256, 'pct_data': 1, 'augment_pct': 0.1, 'epochs': 1000}, {'bert_type': 'ldabert', 'dataset_type': 'clinical', 'final_dropout': 0.5, 'dense_neurons': 64, 'pct_data':

In [18]:
import pandas as pd

pd.DataFrame.from_dict(experiments)

Unnamed: 0,bert_type,dataset_type,final_dropout,dense_neurons,pct_data,augment_pct,epochs
0,ldabert,clinical,0.5,64,1,0.1,1000
1,ldabert,clinical,0.8,64,1,0.1,1000
2,ldabert,clinical,0.5,128,1,0.1,1000
3,ldabert,clinical,0.8,128,1,0.1,1000
4,ldabert,clinical,0.5,256,1,0.1,1000
5,ldabert,clinical,0.8,256,1,0.1,1000
6,ldabert,clinical,0.5,64,1,0.25,1000
7,ldabert,clinical,0.8,64,1,0.25,1000
8,ldabert,clinical,0.5,128,1,0.25,1000
9,ldabert,clinical,0.8,128,1,0.25,1000
