In [2]:
from datasets import load_dataset, DatasetDict

dataset = load_dataset('text',data_files={'train': ["../extracted_text/kumar_and_clark/kumar_and_clark_top_1.txt","../extracted_text/kumar_and_clark/kumar_and_clark_top_2.txt"], 'test': "../extracted_text/kumar_and_clark/kumar_and_clark_top_3.txt"})
print(dataset)
print("example :")
print(dataset['train'][0])

Found cached dataset text (/dccstor/cgdial/ojasgramo/cache/huggingface/datasets/text/default-9d77d126da39b7b3/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2528
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})
example :
{'text': 'CHAPTER CONTENTS '}


In [3]:
context_length = 512
stride = 256

In [4]:
from transformers import AutoTokenizer, GPT2Model

tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
outputs = tokenizer(
    dataset["train"][:]["text"],
    truncation=True,
    # padding=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    stride=stride,
    return_length=True,
    padding=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
# print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
# print(f"attention mask :\n {outputs['attention_mask']}")

In [5]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
        padding=True,
        stride=stride
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
            
    # print(input_batch)

    # padded_batch = [stride*[tokenizer.pad_token_id] + input_batch[0][:stride]]
    # padded_batch += input_batch
    # print("input_batch")
    # print(input_batch)
    # print("padded_batch")
    # print(padded_batch)
    # print(input_batch[0])
    return {"input_ids": padded_batch}


tokenized_datasets = dataset.map(
    tokenize, batched=True, remove_columns=dataset["train"].column_names
)
tokenized_datasets.set_format("torch")
tokenized_datasets

Loading cached processed dataset at /dccstor/cgdial/ojasgramo/cache/huggingface/datasets/text/default-ad26ae56ab5a592f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-f5516df2c58a8e9e.arrow
Loading cached processed dataset at /dccstor/cgdial/ojasgramo/cache/huggingface/datasets/text/default-ad26ae56ab5a592f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-1d3fe60dad49f47e.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1376
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 270
    })
})

In [6]:
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

In [7]:
train_dataset

Dataset({
    features: ['input_ids'],
    num_rows: 1376
})

In [8]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)
test_dataloader = DataLoader(test_dataset, batch_size=1)

### Accelerate training loop

In [9]:
from accelerate import Accelerator
from accelerate.utils import set_seed
from accelerate import notebook_launcher

In [10]:
from tqdm.auto import tqdm

In [11]:
import torch
from transformers import AutoModelForCausalLM
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

In [12]:
from torch.nn import CrossEntropyLoss

def causallm_loss(inputs, logits):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()

    preds = shift_logits.view(-1, shift_logits.size(-1))
    targets = shift_labels.view(-1)
    targets = targets.clone()
    targets[:stride-1] = -100
    

    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduction='sum')
    loss = loss_fct(preds, targets)
    # print(loss)
    return loss

In [13]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = 'false'

In [14]:
model_name = "gpt2-xl"

In [15]:
step_losses = []
epoch_losses = []
best = 1

In [16]:
def training_loop(mixed_precision="fp16"):
    
    model_name = "bloom-1b1"
    
    accelerator = Accelerator(mixed_precision = mixed_precision)
    accelerator.print("accelerator initialised")
    
    set_seed(42)
    accelerator.print("seed set")
    model = AutoModelForCausalLM.from_pretrained(f"bigscience/{model_name}")
    accelerator.print("model loaded")

    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)
    test_dataloader = DataLoader(test_dataset, batch_size=1)
    accelerator.print("dataloaders initialised")
    
    train_dataloader, test_dataloader, model, optimizer = accelerator.prepare(
        train_dataloader, test_dataloader, model, optimizer
    )
    
    num_epochs = 10
    warm_up_steps = num_epochs//5 * len(train_dataloader)
    training_steps = 4*num_epochs//5 * len(train_dataloader)

    accelerator.print("scheduler initialised")
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer, num_warmup_steps=warm_up_steps, num_training_steps=training_steps
    )
    
    # Training conditions

    checkpoint = True
    load_checkpoint = False
    evaluate = False
    
    if load_checkpoint:
        model.load_state_dict(torch.load(f'../model/trained_models/{model_name}_multidoc2dial_epoch{epoch}.pth'))

    progress_bar = tqdm(range(training_steps))
    step_losses = []
    epoch_losses = []
    best = 1
    
    model.train()
    accelerator.print("training started")
    for epoch in range(num_epochs):
        for step,batch in enumerate(train_dataloader, start = 1):
            # batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(batch['input_ids']).logits
            loss = causallm_loss(batch['input_ids'],logits)
            # loss.backward()
            accelerator.backward(loss)
            step_losses.append([step,loss.item()])

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        epoch_losses.append(sum(step_losses[-1]))
        if epoch_losses[-1] < epoch_losses[best-1]:
            best = len(epoch_losses)
        
        if(checkpoint):
            torch.save(model.state_dict(),f'../model/trained_models/{model_name}_multidoc2dial_epoch{epoch+1}.pth')        
                
    accelerator.print("training ended")
    accelerator.print(epoch_losses)
    with open("../model/trained_models/logs.txt","w")as f:
        f.write(f"best = {best}\n" + str(epoch_losses))
    # torch.save(model.state_dict(),f'../model/trained_models/{model_name}_harrison_respiratory.pth')
    accelerator.print("best saved")

In [19]:
notebook_launcher(training_loop, num_processes = 1)

Launching training on one GPU.
accelerator initialised
seed set
model loaded
dataloaders initialised
scheduler initialised


  0%|          | 0/11008 [00:00<?, ?it/s]

training started
training ended
[1764.7094421386719, 1554.6121978759766, 1504.0621337890625, 1617.4124298095703, 1533.3802337646484, 1427.6160354614258, 1381.1657495498657, 1376.7595613598824, 1376.4812118709087, 1376.5097506046295]
best saved


In [1]:
a = "[aa,v,b,d]"
a.strip("[]").split(",")

['aa', 'v', 'b', 'd']