In [1]:
!pip install pytorch-pretrained-bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm , trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
#Getting CoLA dataset from its official source
!wget https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
!unzip cola_public_1.1.zip 
!cp ./cola_public/raw/in_domain_train.tsv .
!cp ./cola_public/raw/out_of_domain_dev.tsv .


In [5]:
df = pd.read_csv('in_domain_train.tsv', delimiter='\t', header=None, names=['source', 'label', 'label_notes', 'sentence'])

In [6]:

sentences = df.sentence.values
sentences = ["[CLS]" + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values


In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
tokens = [tokenizer.tokenize(sent) for sent in sentences] #tokenized sentences


In [8]:
MAX_LEN = 256
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokens], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokens]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype ="long", truncating="post",padding ="post")
attention_masks = []
for seq in input_ids:
  seq_mask = [float(i > 0) for i in seq]
  attention_masks.append(seq_mask)
     

In [10]:
tr_inputs, val_inputs, tr_labels, val_labels = train_test_split(input_ids, labels, random_state=2023, test_size=0.1)
tr_masks, val_masks , _, _ = train_test_split(attention_masks, input_ids, random_state=2023, test_size=0.1)
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_labels = torch.tensor(tr_labels)
val_labels = torch.tensor(val_labels)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [11]:

batch_size = 16
tr_data = TensorDataset(tr_inputs, tr_masks, tr_labels)
tr_sampler = RandomSampler(tr_data)
tr_dataloader = DataLoader(tr_data, sampler=tr_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [12]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=Fa

In [13]:
# Hyperparameters

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

optimizer_grouped_parameters = [
    {   'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    },
    {   'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
        'weight_decay_rate': 0.00
    }]

optimizer = BertAdam(optimizer_grouped_parameters, lr=1e-5, warmup=0.1)



In [14]:

# calculates the accuracy of our predictions vs labels

def flat_accuracy(preds, labels):
  pred_flat  = np.argmax(preds , axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat)/len(labels_flat)

In [15]:
# Training Loop
train_loss_set = []
epochs = 4

for _ in trange(epochs, desc="Epoch"):
  model.train()
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  for step, batch in enumerate(tr_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask , b_labels = batch
    optimizer.zero_grad()
    loss = model(b_input_ids, token_type_ids=None, 
                 attention_mask=b_input_mask, labels=b_labels)
    
    train_loss_set.append(loss.item())
    loss.backward()
    optimizer.step()
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
  print("Train loss: {}".format(tr_loss/nb_tr_steps))

model.eval()

# Tracking variables
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in val_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
    logits = model(b_input_ids, token_type_ids =None, attention_mask=b_input_mask)
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  tmp_eval_accuracy = flat_accuracy(logits, label_ids)
  eval_accuracy += tmp_eval_accuracy
  nb_eval_steps += 1

print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1485.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Epoch:  25%|██▌       | 1/4 [06:39<19:58, 399.63s/it]

Train loss: 0.4855660913577942


Epoch:  50%|█████     | 2/4 [13:16<13:16, 398.17s/it]

Train loss: 0.2556514017314896


Epoch:  75%|███████▌  | 3/4 [19:53<06:37, 397.69s/it]

Train loss: 0.13768620685850447


Epoch: 100%|██████████| 4/4 [26:30<00:00, 397.70s/it]

Train loss: 0.08910816649493922





Validation Accuracy: 0.8472222222222222


In [17]:
df = pd.read_csv("out_of_domain_dev.tsv", delimiter='\t', header=None,
  names=['source', 'label', 'label_notes', 'sentence'])

# Create sentence) and label lists
sentences = df.sentence.values
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

tokens = [tokenizer.tokenize(sent) for sent in sentences]
MAX_LEN = 256

input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokens], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokens]

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype ="long", truncating="post",padding ="post")

attention_masks = []
for seq in input_ids:
  seq_mask = [float(i > 0) for i in seq]
  attention_masks.append(seq_mask)

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

batch_size = 16

prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)



In [18]:
# Prediction on the test set
model.eval()

# Tracking variables
predictions, true_labels = [], []

# Predict
for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask , b_labels = batch
  # Telling the model not to compute or store gradients,
  # saving memory and speeding up prediction
  with torch.no_grad():
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to("cpu").numpy()
  
  predictions.append(logits)
  true_labels.append(label_ids)

In [21]:
# https://pytorch.org/tutorials/beginner/saving_loading_models.html

torch.save(model.state_dict(), 'bert-base-uncased-GED.pth')

In [22]:
from google.colab import drive
drive.mount('/content/drive')
!cp bert-base-uncased-GED.pth './drive/My Drive/Colab Notebooks'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
