In [1]:
import transformers
print(transformers.__version__)

4.24.0


## Generating inference on bloom model

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")

model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")

KeyboardInterrupt: 

In [2]:
prompt = "what is a tiger?"
inputs = tokenizer(prompt, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[ 25915,    632,    267, 144230,     34]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


In [3]:
print(inputs['input_ids'])

tensor([[ 25915,    632,    267, 144230,     34]])


In [4]:
outputs = model.generate(inputs['input_ids'])
print(outputs)



tensor([[ 25915,    632,    267, 144230,     34,    982,    603,   1502,   3548,
           9313,    267,   9160,  10683,     15,    982,   7555,    368,  21380,
             17,   1503]])


In [5]:
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['what is a tiger?”\n\n“It’s a big cat,” said the boy. “']


## Using dummy dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
dataset["train"][100]

Found cached dataset yelp_review_full (/u/ojasgramo/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [30]:
# bert tokenizer

from transformers import AutoTokenizer

# tokenizer_bloom = AutoTokenizer.from_pretrained("bigscience/bloom-3b")

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function_bloom(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets_bert = dataset.map(tokenize_function, batched=True)

Loading cached processed dataset at /u/ojasgramo/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-8804bdd2bc004fb8.arrow
Loading cached processed dataset at /u/ojasgramo/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-747c038866a29cbb.arrow


In [31]:
# bloom tokenizer

from transformers import AutoTokenizer

tokenizer_bloom = AutoTokenizer.from_pretrained("bigscience/bloom-3b")

# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function_bloom(examples):
    return tokenizer_bloom(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function_bloom, batched=True)

Loading cached processed dataset at /u/ojasgramo/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-d8b2e38283473510.arrow


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [29]:
len(tokenized_datasets['train'][0]['input_ids'])

512

In [32]:
len(tokenized_datasets_bert['train'][0]['input_ids'])

512

In [1]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

NameError: name 'tokenized_datasets' is not defined

In [34]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bigscience/bloom-3b", num_labels=5)
# model = model.to("cuda")
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Using Trainer class

In [35]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [36]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [37]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="../model/trained_model", evaluation_strategy="epoch", num_train_epochs = 1)

In [38]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

#### Preparing data (one time)

In [24]:
with open("../extracted_text/sample_train.txt","r") as f:
    text = f.read()
    
with open("../extracted_text/sample_train.txt","w") as f:
    f.write(text.replace('\n',' '))

In [26]:
with open("../extracted_text/sample_test.txt","r") as f:
    text = f.read()
    
with open("../extracted_text/sample_test.txt","w") as f:
    f.write(text.replace('\n',' '))

In [138]:
import numpy as np

with open("../extracted_text/Harrison/Harrison_text_top3.txt","r") as f:
    text = f.read()
    
rat = int(np.floor(0.8*len(text)))
train_text, test_text = text[:rat], text[rat:]

In [None]:
with open("../extracted_text/Harrison/train_data.txt","w") as f:
    f.write(train_text)
    
with open("../extracted_text/Harrison/test_data.txt","w") as f:
    f.write(test_text)

### Using training loop

#### Setting-up dataset

##### Sample data

In [2]:
from datasets import load_dataset, DatasetDict

dataset = load_dataset('text',data_files={'train': "../extracted_text/sample/sample_train.txt", 'test': "../extracted_text/sample/sample_test.txt"})
print(dataset)
# print("example :")
# print(dataset['train'][0])

Found cached dataset text (/u/ojasgramo/.cache/huggingface/datasets/text/default-6ed7dd85d24ac69b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})


##### Med Textbook data

In [1]:
from datasets import load_dataset, DatasetDict

dataset = load_dataset('text',data_files={'train': "../extracted_text/Harrison/train_data.txt", 'test': "../extracted_text/Harrison/test_data.txt"})
print(dataset)
print("example :")
print(dataset['train'][0])

Found cached dataset text (/u/ojasgramo/.cache/huggingface/datasets/text/default-0dbdec9f8a09eed9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1052
    })
    test: Dataset({
        features: ['text'],
        num_rows: 281
    })
})
example :
{'text': 'Edward T. Naureckas, Julian Solway '}


##### Multidoc2dial dataset

In [None]:
from datasets import load_dataset, DatasetDict

dataset = load_dataset('text',data_files={'train': "../extracted_text/multidoc2dial/title_doc_train.txt", 'test': "../extracted_text/multidoc2dial/title_doc_test.txt"})
print(dataset)
print("example :")
print(dataset['train'][0])

Found cached dataset text (/dccstor/cgdial/ojasgramo/cache/huggingface/datasets/text/default-ad26ae56ab5a592f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 413
    })
    test: Dataset({
        features: ['text'],
        num_rows: 75
    })
})
example :


In [22]:
context_length = 512
stride = 256

In [41]:
from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
# tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b1")

In [56]:
from transformers import AutoTokenizer, GPT2Model

tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
outputs = tokenizer(
    dataset["train"][:]["text"],
    truncation=True,
    # padding=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    stride=stride,
    return_length=True,
    # padding=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
# print(f"attention mask :\n {outputs['attention_mask']}")

In [58]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
        padding=True,
        stride=stride
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
            
    # print(input_batch)

    padded_batch = [stride*[tokenizer.pad_token_id] + input_batch[0][:stride]]
    padded_batch += input_batch
    # print("input_batch")
    # print(input_batch)
    # print("padded_batch")
    # print(padded_batch)
    # print(input_batch[0])
    return {"input_ids": padded_batch}


tokenized_datasets = dataset.map(
    tokenize, batched=True, remove_columns=dataset["train"].column_names
)
tokenized_datasets.set_format("torch")
tokenized_datasets

Map:   0%|          | 0/413 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1376
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 270
    })
})

In [59]:
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

In [60]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)
test_dataloader = DataLoader(test_dataset, batch_size=1)

#### Setting-up model

In [6]:
from transformers import AutoModelForCausalLM

# model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-3b")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")

In [7]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [8]:
from transformers import get_scheduler

num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [9]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
device

device(type='cuda')

In [10]:
from torch.nn import CrossEntropyLoss

def causallm_loss(inputs, logits):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()

    preds = shift_logits.view(-1, shift_logits.size(-1))
    targets = shift_labels.view(-1)
    targets = targets.clone()
    targets[:stride-1] = -100
    

    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduction='sum')
    loss = loss_fct(preds, targets)
    # print(loss)
    return loss

#### debugging

In [20]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x148cbb7bb6a0>

In [14]:
for i in train_dataloader:
    print(i)
    break

{'input_ids': tensor([[ 72378,  76754,  13748,   5123,    461,  49441,    396,  17552,  44392,
          46737,    445,     17, 209711,   4543,  32955,     15,  89663,  11550,
           5872,   1387,  25357,  16231,    461,    368, 145268,   5431,   2240,
           1025,  58331,    655,  25266,    530,  94362,  27848, 122867,   2240,
          33483,  16708,  13473,   5299,  25266,    530,  40943,   6738,     15,
           2131,  85263,    262,  49885,    461, 145268,  57382,   5299,  25266,
            530,   9119,     17,   3904,   4451,  39825,    361,    368,  51544,
         203971,  17019,     15,   4618,  25266, 137096,   6147, 203971,  25178,
          29826,   2222,    772,  16437,    632,  50363,   1485, 203971,  25178,
           9119,   1331,    660,  47005,  53863,  56609,    461, 161320,    376,
         207123,    530, 186975,  13953,     15,  19483,   2131, 145268]])}


In [29]:
it = iter(train_dataloader)

In [30]:
elem = next(it)
elem

{'input_ids': tensor([[ 11894,   4351,   2162,     37, 106163,  21576,   7011,  27379,   4100,
          142143,   1541,   1306,    267,  97751,  25402,    461,    368,   8390,
           21419,    461,   7011,  27379,     17,   4100, 142143,   1541,   1306,
           55738,    322,  89908, 201582,  24595,    461, 145268,  71321,     15,
           11762,  24279,  67454,   2022,  15323,     15, 197097,     15,  38281,
          113563,     15,    530,     18,    280,   7458,    361,    368,  15966,
             530,  12095,    461,   1999,    363,    440,     17,  12941,   2742,
             791,   2742,   1130,    722,  85736,   1331,   3390,  54533,    461,
           99775,     15,  11762, 104077,     15,   2670,  28617,   1779,     15,
             530,  63439, 149630,     17,   1387, 180761,  10546, 227900,    461,
          149042,   1541,    632,    267,  21419,    461,    267,  14951, 149042,
             529]])}

In [35]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [36]:
inputs = elem['input_ids']
inputs = inputs.to(device)
logits = model(inputs).logits
# loss = causallm_loss(inputs,logits)
logits = logits.to(device)

In [37]:
inputs

tensor([[ 11894,   4351,   2162,     37, 106163,  21576,   7011,  27379,   4100,
         142143,   1541,   1306,    267,  97751,  25402,    461,    368,   8390,
          21419,    461,   7011,  27379,     17,   4100, 142143,   1541,   1306,
          55738,    322,  89908, 201582,  24595,    461, 145268,  71321,     15,
          11762,  24279,  67454,   2022,  15323,     15, 197097,     15,  38281,
         113563,     15,    530,     18,    280,   7458,    361,    368,  15966,
            530,  12095,    461,   1999,    363,    440,     17,  12941,   2742,
            791,   2742,   1130,    722,  85736,   1331,   3390,  54533,    461,
          99775,     15,  11762, 104077,     15,   2670,  28617,   1779,     15,
            530,  63439, 149630,     17,   1387, 180761,  10546, 227900,    461,
         149042,   1541,    632,    267,  21419,    461,    267,  14951, 149042,
            529]], device='cuda:0')

In [38]:
inputs[...,1:]

tensor([[  4351,   2162,     37, 106163,  21576,   7011,  27379,   4100, 142143,
           1541,   1306,    267,  97751,  25402,    461,    368,   8390,  21419,
            461,   7011,  27379,     17,   4100, 142143,   1541,   1306,  55738,
            322,  89908, 201582,  24595,    461, 145268,  71321,     15,  11762,
          24279,  67454,   2022,  15323,     15, 197097,     15,  38281, 113563,
             15,    530,     18,    280,   7458,    361,    368,  15966,    530,
          12095,    461,   1999,    363,    440,     17,  12941,   2742,    791,
           2742,   1130,    722,  85736,   1331,   3390,  54533,    461,  99775,
             15,  11762, 104077,     15,   2670,  28617,   1779,     15,    530,
          63439, 149630,     17,   1387, 180761,  10546, 227900,    461, 149042,
           1541,    632,    267,  21419,    461,    267,  14951, 149042,    529]],
       device='cuda:0')

In [39]:
logits[..., :-1,:]

tensor([[[-2.1324, -2.4718,  6.0218,  ..., -2.3309, -2.3309, -2.3310],
         [-1.9499, -2.7147,  3.8685,  ..., -2.0518, -2.0515, -2.0517],
         [-2.3274, -2.8437,  5.7214,  ..., -2.4403, -2.4397, -2.4397],
         ...,
         [-1.5610, -1.6484,  4.1014,  ..., -1.7611, -1.7608, -1.7608],
         [-1.5903, -2.2754,  5.3868,  ..., -1.8332, -1.8329, -1.8329],
         [ 0.1187,  0.3553,  2.9793,  ...,  0.0454,  0.0453,  0.0454]]],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [40]:
shift_labels = inputs[..., 1:].contiguous()
shift_logits = logits[..., :-1, :].contiguous()

In [41]:
inputs

tensor([[ 11894,   4351,   2162,     37, 106163,  21576,   7011,  27379,   4100,
         142143,   1541,   1306,    267,  97751,  25402,    461,    368,   8390,
          21419,    461,   7011,  27379,     17,   4100, 142143,   1541,   1306,
          55738,    322,  89908, 201582,  24595,    461, 145268,  71321,     15,
          11762,  24279,  67454,   2022,  15323,     15, 197097,     15,  38281,
         113563,     15,    530,     18,    280,   7458,    361,    368,  15966,
            530,  12095,    461,   1999,    363,    440,     17,  12941,   2742,
            791,   2742,   1130,    722,  85736,   1331,   3390,  54533,    461,
          99775,     15,  11762, 104077,     15,   2670,  28617,   1779,     15,
            530,  63439, 149630,     17,   1387, 180761,  10546, 227900,    461,
         149042,   1541,    632,    267,  21419,    461,    267,  14951, 149042,
            529]], device='cuda:0')

In [42]:
inputs.shape

torch.Size([1, 100])

In [43]:
logits.shape

torch.Size([1, 100, 250880])

In [44]:
shift_labels

tensor([[  4351,   2162,     37, 106163,  21576,   7011,  27379,   4100, 142143,
           1541,   1306,    267,  97751,  25402,    461,    368,   8390,  21419,
            461,   7011,  27379,     17,   4100, 142143,   1541,   1306,  55738,
            322,  89908, 201582,  24595,    461, 145268,  71321,     15,  11762,
          24279,  67454,   2022,  15323,     15, 197097,     15,  38281, 113563,
             15,    530,     18,    280,   7458,    361,    368,  15966,    530,
          12095,    461,   1999,    363,    440,     17,  12941,   2742,    791,
           2742,   1130,    722,  85736,   1331,   3390,  54533,    461,  99775,
             15,  11762, 104077,     15,   2670,  28617,   1779,     15,    530,
          63439, 149630,     17,   1387, 180761,  10546, 227900,    461, 149042,
           1541,    632,    267,  21419,    461,    267,  14951, 149042,    529]],
       device='cuda:0')

In [45]:
shift_labels.shape

torch.Size([1, 99])

In [46]:
stride

50

In [47]:
preds = shift_logits.view(-1, shift_logits.size(-1))
targets = shift_labels.view(-1)
targets[:stride-1] = -100

In [107]:
preds

tensor([[ 28.6269,  29.0666,  34.2686,  ...,  18.2569,  18.2567,  18.2531],
        [316.8571, 322.1927, 330.4165,  ..., 191.7483, 191.7484, 191.7424],
        [335.4773, 340.5488, 348.1035,  ..., 201.8398, 201.8396, 201.8339],
        ...,
        [396.2276, 398.3240, 411.2265,  ..., 205.2723, 205.2722, 205.2675],
        [403.6878, 405.6578, 419.6461,  ..., 205.8794, 205.8795, 205.8745],
        [393.0273, 395.1537, 412.2231,  ..., 203.2219, 203.2215, 203.2169]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [108]:
targets

tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,  46777,    488,    769,   6738,   8875,
            17,  53148,   2194,     15,    613,    368,  51544,    427,    722,
          6084,  35173,    361,    683, 198485,   9119,     15,    368,  40943,
          9119, 191462,    461,    267,   8885, 203971, 217020,   6591,    722,
         93740,    427,   3776,  21361,  12725,     17,  12298,     15,    661,
           267,  75162,    461,  60596,    361,  45683, 119623,    530,   2730],
       device='cuda:0')

In [55]:
loss_fct = CrossEntropyLoss(reduction='none')

In [56]:
loss = loss_fct(preds,targets)
loss

tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 7.3755e+00, 1.1336e-01, 2.3923e+00, 4.8047e+00, 1.4806e+00,
        4.7544e+00, 7.3098e-03, 7.0966e-01, 2.3397e-02, 8.1592e-03, 1.1834e+00,
        2.5144e+00, 2.2242e+00, 4.4376e+00, 2.5823e-03, 1.9017e-03, 7.0492e-01,
        1.0569e+00, 1.5093e-02, 1.9259e+00, 2.9703e+00, 1.4238e+00, 2.5263e+00,
        1.1498e+00, 2.3886e+00, 7.7971e-

In [57]:
loss = loss.sum()

In [73]:
loss_beg = loss
loss_beg

tensor(291.8333, device='cuda:0', grad_fn=<SumBackward0>)

In [58]:
loss

tensor(92.2099, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
inputs

tensor([[ 60596,    361,  45683, 119623,    530,   2730,    573,    525,  10393,
           4657, 117826,    661,   6355,    661,    368,  22886,    461,  80396,
             15,    261,  25559,  22754, 129673,     15,    530,  83076,    322,
          62920,   1485,    368, 110350,  29826,     15,    368, 203971,  17019,
          47030,    361,   3808,  20122,  72053,   1541,    530,  21361,  86452,
           6582,    361,  13923,     17,   5070,    368, 145268,   5431,    427,
          53103,    361,  58331,   4128,  25266,    530, 196208,   7011,     21,
             15,    718,   6591,    722,  11045,    427,  72053,    655,    368,
          51544, 158454,   2194,    530,  20495,    427,  42442,   4323, 203971,
          25178,   9119,     30,    718,   6591,  13842,    613,  21361,  12725,
            461,    368,  11559, 203971, 217020,    361,    267,  33365]])

In [54]:
logits

tensor([[[ 2.3112,  3.1807, 11.7497,  ...,  2.7595,  2.7589,  2.7596],
         [ 0.7225,  2.5615, 10.3917,  ...,  1.7751,  1.7744,  1.7742],
         [ 0.4262,  1.4538, 10.9861,  ...,  1.8934,  1.8941,  1.8931],
         ...,
         [-1.0838,  0.2338, 10.2983,  ...,  1.8735,  1.8717,  1.8732],
         [-1.4234, -0.3629, 10.2829,  ...,  1.4542,  1.4536,  1.4548],
         [-0.5848,  0.1232, 12.1440,  ...,  1.9798,  1.9792,  1.9798]]],
       grad_fn=<UnsafeViewBackward0>)

In [53]:
shift_logits

tensor([[[ 2.3112,  3.1807, 11.7497,  ...,  2.7595,  2.7589,  2.7596],
         [ 0.7225,  2.5615, 10.3917,  ...,  1.7751,  1.7744,  1.7742],
         [ 0.4262,  1.4538, 10.9861,  ...,  1.8934,  1.8941,  1.8931],
         ...,
         [ 0.2009,  1.9065, 11.9122,  ...,  1.7456,  1.7432,  1.7440],
         [-1.0838,  0.2338, 10.2983,  ...,  1.8735,  1.8717,  1.8732],
         [-1.4234, -0.3629, 10.2829,  ...,  1.4542,  1.4536,  1.4548]]],
       grad_fn=<SliceBackward0>)

In [56]:
shift_labels

tensor([[   361,  45683, 119623,    530,   2730,    573,    525,  10393,   4657,
         117826,    661,   6355,    661,    368,  22886,    461,  80396,     15,
            261,  25559,  22754, 129673,     15,    530,  83076,    322,  62920,
           1485,    368, 110350,  29826,     15,    368, 203971,  17019,  47030,
            361,   3808,  20122,  72053,   1541,    530,  21361,  86452,   6582,
            361,  13923,     17,   5070,    368, 145268,   5431,    427,  53103,
            361,  58331,   4128,  25266,    530, 196208,   7011,     21,     15,
            718,   6591,    722,  11045,    427,  72053,    655,    368,  51544,
         158454,   2194,    530,  20495,    427,  42442,   4323, 203971,  25178,
           9119,     30,    718,   6591,  13842,    613,  21361,  12725,    461,
            368,  11559, 203971, 217020,    361,    267,  33365]])

In [60]:
preds = shift_logits.view(-1, shift_logits.size(-1))

In [68]:
targets = shift_labels.view(-1)

In [61]:
shift_logits.shape

torch.Size([1, 97, 250880])

In [62]:
preds.shape

torch.Size([97, 250880])

In [63]:
shift_labels.shape

torch.Size([1, 97])

In [69]:
targets.shape

torch.Size([97])

In [67]:
loss_fct = CrossEntropyLoss(reduce=False)



In [70]:
loss = loss_fct(preds,targets)

In [71]:
loss.shape

torch.Size([97])

#### Accelerator

In [20]:
from accelerate import Accelerator

accelerator = Accelerator('fp16')

In [21]:
train_dataloader, test_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, test_dataloader, model, optimizer
)

#### Training

In [26]:
# Training conditions

checkpoint_steps = 300
checkpoint = True
load_checkpoint = False
evaluate = False

In [27]:
if load_checkpoint:
    model.load_state_dict(torch.load('../model/trained_models/bloom-560m_harrison_respiratory.pth'))

#### Training loop

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
losses = []

model.train()
for epoch in range(num_epochs):
    for step,batch in enumerate(train_dataloader, start = 1):
        # batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(batch['input_ids']).logits
        loss = causallm_loss(batch['input_ids'],logits)
        # loss.backward()
        accelerator.backward(loss)
        losses.append([step,loss.item()])

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        
        if(checkpoint & step%checkpoint_steps == 0):
            torch.save(model.state_dict(),'../model/trained_models/bloom-560m_harrison_respiratory_{}.pth'.format(step))

In [None]:
torch.save(model.state_dict(),'../model/trained_models/bloom-560m_harrison_respiratory.pth')

#### Accelerate training loop

In [61]:
from accelerate import Accelerator
from accelerate.utils import set_seed

In [62]:
from tqdm.auto import tqdm

In [63]:
import torch
from transformers import AutoModelForCausalLM
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

In [64]:
from torch.nn import CrossEntropyLoss

def causallm_loss(inputs, logits):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()

    preds = shift_logits.view(-1, shift_logits.size(-1))
    targets = shift_labels.view(-1)
    targets = targets.clone()
    targets[:stride-1] = -100
    

    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduction='sum')
    loss = loss_fct(preds, targets)
    # print(loss)
    return loss

In [65]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = 'false'

In [68]:
model_name = "bloom-1b1"
model = AutoModelForCausalLM.from_pretrained(f"bigscience/{model_name}")

In [66]:
model_name = "gpt2-xl"
model = GPT2Model.from_pretrained(model_name)

Downloading pytorch_model.bin:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

In [67]:
step_losses = []
epoch_losses = []
best = 1

In [68]:
def training_loop(mixed_precision="fp16"):
    
    model_name = "bloom-1b1"
    
    accelerator = Accelerator(mixed_precision = mixed_precision)
    accelerator.print("accelerator initialised")
    
    set_seed(42)
    accelerator.print("seed set")
    model = AutoModelForCausalLM.from_pretrained(f"bigscience/{model_name}")
    accelerator.print("model loaded")

    optimizer = AdamW(model.parameters(), lr=5e-5)
    
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)
    test_dataloader = DataLoader(test_dataset, batch_size=1)
    accelerator.print("dataloaders initialised")
    
    train_dataloader, test_dataloader, model, optimizer = accelerator.prepare(
        train_dataloader, test_dataloader, model, optimizer
    )
    
    num_epochs = 10
    warm_up_steps = num_epochs//5 * len(train_dataloader)
    training_steps = 4*num_epochs//5 * len(train_dataloader)

    accelerator.print("scheduler initialised")
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer, num_warmup_steps=warm_up_steps, num_training_steps=training_steps
    )
    
    # Training conditions

    checkpoint = True
    load_checkpoint = False
    evaluate = False
    
    if load_checkpoint:
        model.load_state_dict(torch.load(f'../model/trained_models/{model_name}_multidoc2dial_epoch{epoch}.pth'))

    progress_bar = tqdm(range(training_steps))
    step_losses = []
    epoch_losses = []
    best = 1
    
    model.train()
    accelerator.print("training started")
    for epoch in range(num_epochs):
        for step,batch in enumerate(train_dataloader, start = 1):
            # batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(batch['input_ids']).logits
            loss = causallm_loss(batch['input_ids'],logits)
            # loss.backward()
            accelerator.backward(loss)
            step_losses.append([step,loss.item()])

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        epoch_losses.append(sum(step_losses[-1]))
        if epoch_losses[-1] < epoch_losses[best-1]:
            best = len(epoch_losses)
        
        if(checkpoint):
            torch.save(model.state_dict(),f'../model/trained_models/{model_name}_multidoc2dial_epoch{epoch+1}.pth')        
                
    accelerator.print("training ended")
    with open("../model/trained_models/logs.txt","w")as f:
        f.write(f"best = {best}\n" + epoch_losses)
    # torch.save(model.state_dict(),f'../model/trained_models/{model_name}_harrison_respiratory.pth')

In [69]:
from accelerate import notebook_launcher

In [None]:
notebook_launcher(training_loop, num_processes = 2)

Launching training on 2 GPUs.
accelerator initialised
seed set
model loaded
dataloaders initialised
scheduler initialised


  0%|          | 0/5504 [00:00<?, ?it/s]

training started


##### Debugging

In [None]:
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    print(batch['input_ids'])
    # outputs = model(**batch)
    # print(outputs)

#### Testing

In [74]:
model = torch.load("../model/trained_models/bloom-3b_sample-data.pth")

In [100]:
import evaluate

metric = evaluate.load("bleu")
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(batch["input_ids"], labels=batch["input_ids"])
        print(outputs.loss.sum())
        

#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
metric.compute()

tensor(2.7807)
tensor(3.4755)


In [None]:
model.eval()
losses = []
for step, batch in enumerate(train_dataloader):
    with torch.no_grad():
        outputs = model(batch["input_ids"], labels=batch["input_ids"])
        print(outputs)

    # losses.append(accelerator.gather(outputs.loss))

### Inference

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [2]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
model = torch.load("../model/trained_models/bloom-560m_sample-data.pth")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [6]:
prompt = "How to make lungs efficient in gas exchange"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(device)

In [7]:
outputs = model.generate(**inputs, max_new_tokens = 100)

In [8]:
outputs

tensor([[  7572,    427,   5219, 193711,  35173,    361,   9119,  40688,     15,
            718,   6591,    722,  11045,    427,  72053,    655,    368,  51544,
            361,    267,  33365,    861,  85263,    262,    368,  49885,    461,
          57382,   5299,    368,  51544,    530,    368,   6738,     17,   3904,
           4451,  39825,    361,    368,  51544,  29826,     15,   4618,    368,
         145268,   5431,    632,  71941,    461,    267,  16852,    461,  53863,
             15,  53863,     15,  53863,     15,  53863,     15,  53863,     15,
          53863,     15,  53863,     15,  53863,     15,  53863,     15,  53863,
             15,  53863,     15,  53863,     15,  53863,     15,  53863,     15,
          53863,     15,  53863,     15,  53863,     15,  53863,     15,  53863,
             15,  53863,     15,  53863,     15,  53863,     15,  53863,     15,
          53863,     15,  53863,     15,  53863,     15,  53863,     15,  53863]],
       device='cuda:0')

In [9]:
output_text = tokenizer.batch_decode(outputs, skip_special_tokens = True)
output_text

['How to make lungs efficient in gas exchange, it must be able to ventilate the lung in a manner that facilitates the diffusion of gases between the lung and the air. This process occurs in the lung wall, where the respiratory system is composed of a series of thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin, thin']

In [1]:
import torch

In [4]:
a = torch.tensor([1,2,3,4,5])
b = a.clone()
b[2] = 34
a

tensor([1, 2, 3, 4, 5])

In [43]:
from transformers import BertTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
tokenizer.add_tokens(["custom_token1", "custom_token2"], special_tokens=True)
tokenizer.add_tokens(["custom_token3", "custom_token4"], special_tokens=False)

print(tokenizer.tokenize("This is a custom_token1 example sentence."))
print(tokenizer.tokenize("This is a custom_token2 example sentence."))
print(tokenizer.tokenize("This is a custom_token3 example sentence."))
print(tokenizer.tokenize("This is a custom_token4 example sentence."))

['This', 'Ġis', 'Ġa', 'Ġ', 'custom_token1', 'Ġexample', 'Ġsentence', '.']
['This', 'Ġis', 'Ġa', 'Ġ', 'custom_token2', 'Ġexample', 'Ġsentence', '.']
['This', 'Ġis', 'Ġa', 'Ġ', 'custom_token3', 'Ġexample', 'Ġsentence', '.']
['This', 'Ġis', 'Ġa', 'Ġ', 'custom_token4', 'Ġexample', 'Ġsentence', '.']


In [50]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")
# tokenizer.add_tokens(["custom_token1", "custom_token2"], special_tokens=True)
# tokenizer.add_tokens(["custom_token3", "custom_token4"], special_tokens=False)

In [14]:
print(tokenizer.batch_decode(tokenizer("This is a custom_token1 example sentence.")['input_ids']))

['This', ' is', ' a', ' custom', '_', 'token', '1', ' example', ' sentence', '.']


In [73]:
tokenizer.add_tokens(["custom_token3", "custom_token4"], special_tokens=True)

0

In [54]:
print(tokenizer.batch_decode(tokenizer("This is a custom_token3 example sentence.")['input_ids'], skip_special_tokens=True))

['This', ' is', ' a', ' ', '', ' example', ' sentence', '.']


In [34]:
tokenizer('custom_token3')

{'input_ids': [250680], 'attention_mask': [1]}

In [67]:
tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")

In [74]:
tokenizer.get_special_tokens_mask(tokenizer('custom_token3')['input_ids'],already_has_special_tokens=True)

[0]