In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Mount Google Drive at /content/gdrive. The training CSV is read from, and the
# per-epoch checkpoints are written to, paths under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip3 install datasets
!pip3 install transformers

In [None]:
# Reference: https://github.com/ThaddeusSegura/BERT_on_SQuAD/blob/master/SE_classification.ipynb

In [None]:
import pandas as pd
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig, get_linear_schedule_with_warmup
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm  # for showing progress bar
from datasets import load_dataset
import torch

# Select the compute device: CUDA GPU if available, then Apple Metal (MPS),
# otherwise the CPU.
# `torch.backends.mps` is missing on older PyTorch builds, so it is looked up
# defensively rather than accessed directly (which would raise AttributeError).
_mps_backend = getattr(torch.backends, 'mps', None)

if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [None]:
# Training data from Google Drive; later cells read its `question`,
# `context` and `answerable` columns.
trainset = pd.read_csv(
    "/content/gdrive/MyDrive/classifier_bert/train-classifier-v2.0.csv"
)

# Tokenizer for the same pretrained checkpoint as the model below.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Pretrained cased BERT-base (12 layers) with a two-class sequence
# classification head: label 0 = false, label 1 = true.
# Attention maps and hidden states are not returned; only the classifier
# output is needed.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Place the model on the selected `device`; the training loop moves each batch
# to the same device before the forward pass.
model.to(device)    # put the model on the device

In [None]:
# Tokenize every (question, context) pair in one batched tokenizer call.
# `encode_plus` is deprecated in transformers in favour of calling the
# tokenizer object directly; the produced ids and masks are the same as the
# former per-row loop.
max_seq_len = 512  # every example is padded or truncated to this many tokens

# Target labels, taken from the `answerable` column of the training table.
labels = trainset.answerable.values

encoded = tokenizer(
    trainset['question'].tolist(),
    trainset['context'].tolist(),
    add_special_tokens=True,      # add `[CLS]` and `[SEP]`
    truncation=True,              # cut pairs that exceed max_seq_len
    padding='max_length',         # pad shorter pairs up to max_seq_len
    max_length=max_seq_len,
    return_attention_mask=True,   # 1 for real tokens, 0 for padding
)

# Lists of per-example token ids / attention masks (one inner list per row).
input_ids_list = encoded['input_ids']
attention_mask_list = encoded['attention_mask']




In [None]:
# Convert the tokenizer output to tensors and wrap them in a DataLoader that
# serves randomly ordered mini-batches.
input_ids = torch.tensor(input_ids_list)
attention_masks = torch.tensor(attention_mask_list)

# Cast labels to int64 explicitly: the classification loss expects integer
# class indices, and if the `answerable` column was parsed as booleans the
# inferred dtype would be bool instead.
labels = torch.tensor(labels, dtype=torch.long)

# Each dataset item is the triple (input_ids, attention_mask, label).
data_3_elements = TensorDataset(input_ids, attention_masks, labels)

batch_size = 32

dataloader = DataLoader(
    data_3_elements,                         # the training samples
    sampler=RandomSampler(data_3_elements),  # draw samples in random order
    batch_size=batch_size,
)

In [None]:
# Data preparation is complete; set up the optimizer and learning-rate schedule.

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay is set to 0.0 explicitly because that was the default of the
# transformers implementation (PyTorch's own default is 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8,
    weight_decay=0.0,
)

epoch = 4  # number of passes over the training data

# The scheduler is stepped once per batch, so the schedule length is
# (batches per epoch) * (number of epochs).
total_steps = len(dataloader) * epoch

# Linear decay of the learning rate to zero over `total_steps`, with no warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Training loop: fine-tune the classifier for `epoch` passes over the data and
# save a checkpoint to Google Drive after every pass.
for i in range(epoch):
    # Running sum of per-batch losses, kept as a plain Python float so the
    # autograd graph of each batch is not kept alive through the accumulator.
    total_loss = 0.0
    model.train()   # enable training mode
    loop = tqdm(dataloader, desc=f'Epoch {i}')
    for batch in loop:
        # Each batch is (input_ids, attention_mask, labels), in the order the
        # tensors were packed into the TensorDataset.
        batch_ids = batch[0].to(device)
        batch_mask = batch[1].to(device)
        batch_label = batch[2].to(device)

        model.zero_grad()   # clear gradients left over from the previous step

        # NOTE(review): token_type_ids=None means no segment ids are passed, so
        # question and context tokens are not distinguished by segment —
        # confirm this is intended.
        output = model(input_ids=batch_ids,
                       token_type_ids=None,
                       attention_mask=batch_mask,
                       labels=batch_label)

        loss = output['loss']
        batch_loss = loss.item()    # detach to a Python float for logging/averaging
        total_loss += batch_loss

        loss.backward()     # backward pass to get gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # clip gradient norm to 1.0
        optimizer.step()
        scheduler.step()    # advance the learning-rate schedule once per batch
        loop.set_postfix(loss=batch_loss)

    avg_train_loss = total_loss / len(dataloader)
    print("Epochs:" + str(i) + ", loss:" + str(avg_train_loss))   # average loss over the epoch

    # One checkpoint directory per epoch, suffixed with the epoch index.
    model.save_pretrained('/content/gdrive/MyDrive/classifier_bert/bert_qa_classifier_pt_'+ str(i))
