In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 24.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [None]:
from google.colab import drive
import pandas as pd
import sys
# Mount Google Drive; the project folder below holds both the data and the
# project-local modules imported later in the notebook.
drive.mount("/content/gdrive")
PROJECT_DIR = '/content/gdrive/MyDrive/Colab Notebooks/SNLP projekt'
# Guarded so that re-running this cell does not keep appending duplicates.
if PROJECT_DIR not in sys.path:
  sys.path.append(PROJECT_DIR)
# Parse only the first 50k rows (instead of reading the whole file and
# discarding the rest), then drop rows that contain missing values.
df = pd.read_csv(PROJECT_DIR + "/data/train.csv", nrows=50_000)
df = df.dropna()

Mounted at /content/gdrive


In [None]:
# Group the two questions of every row into a [question1, question2] list.
# Zipping the two columns gives the same per-row lists as a row-wise
# DataFrame.apply(axis=1) without paying for one Python-level row object per row.
df['sentence_pair'] = pd.Series(
  [[q1, q2] for q1, q2 in zip(df["question1"], df["question2"])],
  index=df.index,
  dtype=object,
)

In [None]:
# --- Notebook-wide configuration ---
RANDOM_SEED = 42                # seed handed to the train/val/test splits
MODEL_NAME = "bert-base-cased"  # checkpoint name used to load the tokenizer
MAX_LEN = 50                    # tokens per question; based on exploratory data analysis
BATCH_SIZE = 40                 # initial guess

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
from Sbert import Sbert


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)
# Create a proper PyTorch dataset

class Quora_questions_dataset(Dataset):
  """Map-style dataset that tokenizes question pairs on access.

  Args:
    sentence_pairs: indexable collection whose items unpack into two sentences.
    targets: indexable collection of labels, aligned with ``sentence_pairs``.
    tokenizer: callable invoked as ``tokenizer(text, max_length=...,
      padding="max_length", truncation=True, return_attention_mask=True)``
      whose result is indexed with ``"input_ids"`` and ``"attention_mask"``.
    max_len: length every sentence is padded/truncated to by the tokenizer.
  """

  def __init__(self, sentence_pairs, targets, tokenizer, max_len):
    self.sentence_pairs = sentence_pairs
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of sentence pairs in the dataset."""
    return len(self.sentence_pairs)

  def _encode(self, sentence):
    """Tokenize one sentence, padded/truncated to ``self.max_len``."""
    # Uses the instance's max_len (not a module-level constant) so the value
    # passed to the constructor is the one that actually takes effect.
    return self.tokenizer(
      sentence,
      max_length=self.max_len,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
    )

  def __getitem__(self, item):
    """Return the encoded pair at position ``item``.

    Returns:
      dict with
        "sentence_pair": the raw pair as stored,
        "input_ids": int tensor stacking the two sentences' token ids,
        "attention_masks": int tensor stacking the two attention masks,
        "targets": the label as a long tensor.
    """
    sent1, sent2 = self.sentence_pairs[item]
    target = self.targets[item]
    encoding1 = self._encode(sent1)
    encoding2 = self._encode(sent2)
    return {
        "sentence_pair" : self.sentence_pairs[item],
        "input_ids": torch.tensor([encoding1["input_ids"], encoding2["input_ids"]], dtype = torch.int),
        "attention_masks": torch.tensor([encoding1["attention_mask"], encoding2["attention_mask"]], dtype = torch.int),
        "targets": torch.tensor(target, dtype=torch.long)
    }

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
  """Wrap a dataframe of question pairs in a ``DataLoader``.

  Args:
    df: dataframe providing ``sentence_pair`` and ``is_duplicate`` columns.
    tokenizer: tokenizer handed to ``Quora_questions_dataset``.
    max_len: padded/truncated sequence length handed to the dataset.
    batch_size: number of pairs per batch.
    shuffle: reshuffle the data at every epoch. Defaults to ``False`` (the
      previous behaviour); pass ``True`` for a training split so batches are
      not presented in the same order each epoch.
    num_workers: number of loader worker processes (default 2, as before).

  Returns:
    A ``DataLoader`` over a ``Quora_questions_dataset`` built from ``df``.
  """
  ds = Quora_questions_dataset(
    sentence_pairs=df.sentence_pair.to_numpy(),
    targets=df.is_duplicate.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

cuda


In [None]:
# 90% of the rows go to training; the remaining 10% is held out and then
# halved into validation and test. Both splits use the same seed.
df_train, df_holdout = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
# One shared tokenizer; one loader per split, built in train/val/test order.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
train_data_loader, val_data_loader, test_data_loader = (
  create_data_loader(split_df, tokenizer, MAX_LEN, BATCH_SIZE)
  for split_df in (df_train, df_val, df_test)
)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# RANDOM_SEED was otherwise only applied to the data split. Seed PyTorch as well,
# before the model is built, so that random weight initialisation and other
# torch-driven randomness repeat across kernel restarts.
torch.manual_seed(RANDOM_SEED)
model = Sbert(n_classes = 2)
model = model.to(device)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one optimisation pass over ``data_loader``.

  Args:
    model: module called as ``model(input_ids=..., attention_masks=...)`` and
      returning per-class scores of shape [batch, n_classes].
    data_loader: iterable of batches (dicts) with the keys ``"input_ids"``,
      ``"attention_masks"`` and ``"targets"``.
    loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
    optimizer: optimiser stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor (correct
    predictions / ``n_examples``); mean_loss is the mean of the per-batch
    losses, or NaN when the loader yields no batches.
  """
  model = model.train()
  losses = []
  # Tensor accumulator (rather than a Python int) so the final ``.double()``
  # also works when the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  for d in data_loader:
    # As produced by Quora_questions_dataset: [batch, 2, max_len].
    input_ids = d["input_ids"].to(device)
    attention_masks = d["attention_masks"].to(device)
    targets = d["targets"].to(device)
    # Clear gradients before the backward pass so anything left over from an
    # earlier, interrupted step cannot leak into this update.
    optimizer.zero_grad()
    outputs = model(
      input_ids=input_ids,
      attention_masks=attention_masks
    )
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)
    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())
    loss.backward()
    # Cap the global gradient norm at 1.0 before the parameter update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
  mean_loss = np.mean(losses) if losses else float("nan")
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without computing gradients.

  Args:
    model: module called as ``model(input_ids=..., attention_masks=...)`` and
      returning per-class scores of shape [batch, n_classes].
    data_loader: iterable of batches (dicts) with the keys ``"input_ids"``,
      ``"attention_masks"`` and ``"targets"``.
    loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor (correct
    predictions / ``n_examples``); mean_loss is the mean of the per-batch
    losses, or NaN when the loader yields no batches.
  """
  model = model.eval()
  losses = []
  # Tensor accumulator (rather than a Python int) so the final ``.double()``
  # also works when the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_masks = d["attention_masks"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_masks=attention_masks
      )
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)
      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())
  mean_loss = np.mean(losses) if losses else float("nan")
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
from transformers import  AdamW, get_linear_schedule_with_warmup
# --- Training setup ---
EPOCHS = 10
# The scheduler is stepped once per batch, so the schedule has to span every
# batch of every epoch.
total_steps = EPOCHS * len(train_data_loader)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(
  model.parameters(),
  lr=2e-5,
  weight_decay=0,
  correct_bias=False,
)
# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
from collections import defaultdict
# Per-epoch metrics, keyed by 'train_acc' / 'train_loss' / 'val_acc' / 'val_loss'.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  epoch_number = epoch + 1
  print(f'Epoch {epoch_number}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train))
  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  epoch_metrics = (
    ('train_acc', train_acc),
    ('train_loss', train_loss),
    ('val_acc', val_acc),
    ('val_loss', val_loss),
  )
  for metric_name, metric_value in epoch_metrics:
    history[metric_name].append(metric_value)

  # Checkpoint whenever validation accuracy improves on the best seen so far;
  # the file name carries the 1-based epoch number.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state' + str(epoch_number) + '.bin')
    best_accuracy = val_acc

Epoch 1/10
----------
Train loss 0.565517544137107 accuracy 0.7058666666666668
Val   loss 0.5229958073487357 accuracy 0.736

Epoch 2/10
----------
Train loss 0.45169462214575873 accuracy 0.7891111111111111
Val   loss 0.5368749764230516 accuracy 0.7312000000000001

Epoch 3/10
----------
Train loss 0.3213871528572506 accuracy 0.8637333333333334
Val   loss 0.6448699817771003 accuracy 0.7224

Epoch 4/10
----------


KeyboardInterrupt: ignored

In [None]:
# The recorded accuracies are tensors; move each to the CPU and unwrap it into
# a plain Python float so it can be plotted.
accs = [acc.cpu().item() for acc in history["train_acc"]]
vals = [acc.cpu().item() for acc in history["val_acc"]]

In [None]:
import matplotlib.pyplot as plt

# Build the x-axis from the epochs that actually completed. The history can be
# shorter than EPOCHS (e.g. when training is interrupted), and a fixed
# range(1, EPOCHS + 1) would then mismatch the y-values and make plt.plot raise.
plt.plot(list(range(1, len(accs) + 1)), accs, label='train accuracy')
plt.plot(list(range(1, len(vals) + 1)), vals, label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

In [None]:
# Accuracy on the held-out test split; the loss returned alongside it is not used.
test_acc, _ = eval_model(model, test_data_loader, loss_fn, device, len(df_test))
test_acc.item()