<a href="https://colab.research.google.com/github/maryamastero/AI_in_Health/blob/master/finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: !pip install transformer!pip istall dataset

!pip install transformers
!pip install datasets


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

In [3]:
import numpy as np
import torch
from matplotlib import pyplot as pyplot
import random
from tqdm.auto import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW, get_scheduler
from datasets import load_dataset, load_metric



In [4]:
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)
device = torch.device('cuda_0' if torch.cuda.is_available() else 'cpu')
print('Device available:', device)

Device available: cpu


In [5]:
raw_data = load_dataset('glue', 'wnli')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/635 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/71 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/146 [00:00<?, ? examples/s]

In [6]:
raw_data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 635
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 71
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 146
    })
})

In [7]:
raw_data['train'][0]

{'sentence1': 'I stuck a pin through a carrot. When I pulled the pin out, it had a hole.',
 'sentence2': 'The carrot had a hole.',
 'label': 1,
 'idx': 0}

In [8]:
raw_data['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_entailment', 'entailment'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [9]:
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
def Tokenize_function(example):
  return tokenizer(example['sentence1'], example['sentence2'], truncation=True)

In [11]:
tokenized_data = raw_data.map(Tokenize_function,batched=True)

Map:   0%|          | 0/635 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

In [12]:
tokenized_data = tokenized_data.remove_columns(['idx','sentence1','sentence2'])
tokenized_data = tokenized_data.rename_column('label','labels')
tokenized_data.set_format('pt')
tokenized_data['train'].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [13]:
raw_data['train'].column_names

['sentence1', 'sentence2', 'label', 'idx']

In [14]:
tokenized_data['train'][0], raw_data['train'][0]

({'labels': tensor(1),
  'input_ids': tensor([  101,  1045,  5881,  1037,  9231,  2083,  1037, 25659,  1012,  2043,
           1045,  2766,  1996,  9231,  2041,  1010,  2009,  2018,  1037,  4920,
           1012,   102,  1996, 25659,  2018,  1037,  4920,  1012,   102]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
          1, 1, 1, 1, 1]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1])},
 {'sentence1': 'I stuck a pin through a carrot. When I pulled the pin out, it had a hole.',
  'sentence2': 'The carrot had a hole.',
  'label': 1,
  'idx': 0})

In [15]:
train_data = torch.utils.data.DataLoader(tokenized_data['train'],
                                         shuffle = True,
                                         batch_size = 8,
                                         collate_fn = data_collator)
val_data = torch.utils.data.DataLoader(tokenized_data['validation'],
                                         batch_size = 8,
                                         collate_fn = data_collator)
test_data = torch.utils.data.DataLoader(tokenized_data['test'],
                                         batch_size = 8,
                                         collate_fn = data_collator)

In [16]:
for batch in train_data:
  [print('{:>20} : {}'.format(k,v.shape)) for k,v in batch.items()]
  break

              labels : torch.Size([8])
           input_ids : torch.Size([8, 49])
      token_type_ids : torch.Size([8, 49])
      attention_mask : torch.Size([8, 49])


In [17]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
outputs= model(**batch)

In [19]:
print(outputs.loss,outputs.logits.shape)

tensor(0.6737, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [20]:
EPOCHS = 3
NUM_TRAINING_STEPS = EPOCHS * len(train_data)
print(NUM_TRAINING_STEPS)
optimizer = AdamW(model.parameters(), lr = 5e-5)
lr_scheduler = get_scheduler('linear',
                             optimizer= optimizer,
                             num_warmup_steps = 0,
                             num_training_steps = NUM_TRAINING_STEPS)

240




In [21]:
model.to(device)
device

device(type='cpu')

In [22]:
progress_bar = tqdm(range(NUM_TRAINING_STEPS))
model.train()
for epoch in range(EPOCHS):
  for batch in train_data:
    batch = {k:v.to(device) for k,v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  0%|          | 0/240 [00:00<?, ?it/s]

In [30]:
metric = load_metric('glue', 'wnli')
model.eval()
for batch in val_data:
  batch = {k:v.to(device) for k, v in batch.items()}
  print(batch['labels'], batch['labels'].shape)
  with torch.no_grad():
    outputs = model(**batch)
  logits = outputs.logits
  preds = torch.argmax(logits, dim= -1)
  metric.add_batch(predictions=preds, references=batch['labels'])

metric.compute()


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


tensor([0, 1, 0, 1, 1, 0, 1, 1]) torch.Size([8])
tensor([0, 0, 0, 1, 0, 0, 0, 0]) torch.Size([8])
tensor([1, 0, 0, 0, 0, 0, 0, 1]) torch.Size([8])
tensor([0, 1, 0, 1, 1, 1, 1, 0]) torch.Size([8])
tensor([1, 1, 0, 1, 0, 0, 1, 1]) torch.Size([8])
tensor([0, 0, 0, 1, 0, 0, 1, 0]) torch.Size([8])
tensor([1, 0, 0, 1, 0, 0, 1, 0]) torch.Size([8])
tensor([1, 0, 1, 1, 0, 0, 1, 1]) torch.Size([8])
tensor([0, 1, 1, 0, 1, 0, 0]) torch.Size([7])


{'accuracy': 0.5633802816901409}

In [33]:
preds= []
model.eval()
for batch in test_data:
  batch['labels'] = torch.ones(len(batch['labels'])).type(torch.int64)
  batch = {k:v.to(device) for k,v in batch.items()}

  with torch.no_grad():
    outputs = model(**batch)
  logits = outputs.logits
  yhat = torch.argmax(logits, dim=-1)
  preds.append(yhat)

In [34]:
preds

[tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0])]