## Importing Libraries

In [37]:
# Import Hugging Face Library

!pip install transformers
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup



## Checking for GPU

In [3]:
# GPU device name as reported by TensorFlow. The PyTorch code in the visible
# cells does not read it; it is kept in case a later cell does.
device_name = tf.test.gpu_device_name()

# Pick the device PyTorch should run on. `device` is assigned on both branches
# so that later cells can rely on it whether or not a GPU is present.
if torch.cuda.is_available():
  device = torch.device("cuda")
  print('There are %d GPU(s) available.' % torch.cuda.device_count())
  print('We will use the GPU: ', torch.cuda.get_device_name(0))
else:
  device = torch.device("cpu")
  print('No GPU available')


There are 1 GPU(s) available.
We will use the GPU:  Tesla P4


## Loading the Data

In [4]:
# Read the tab-separated training file. header=None means no row of the file
# is treated as a header, so the four column names are supplied explicitly.
df = pd.read_csv(
    'in_domain_train.tsv',
    sep='\t',
    header=None,
    names=['source', 'label', 'notes', 'sentence'],
)

# Show five randomly chosen rows as a quick sanity check.
df.sample(5)

Unnamed: 0,source,label,notes,sentence
8089,ad03,1,,Can I keep the screwdriver just like a carpent...
402,bc01,0,*,What the hell do you wonder how to say?
4156,ks08,1,,Fifteen dollars in a week is much.
6923,m_02,1,,Alan made the loaf with strong white flour.
964,bc01,0,*,John tries to meet not Mary.


## Tokenizing

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Raw sentences, one per row of df (never filtered).
text = df.sentence.values

# Sentences and labels are walked together: when a sentence cannot be encoded
# its label is dropped with it. Otherwise `input_ids` would be shorter than
# `labels` and the two would no longer line up row by row.
input_ids = []
kept_labels = []
skipped_rows = 0

for sentence, label in zip(text, df.label.values):
    try:
        encoded_text = tokenizer.encode(sentence, add_special_tokens=True)
    except Exception:
        # Best-effort: skip rows the tokenizer rejects. `Exception` (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit through.
        skipped_rows += 1
        continue

    input_ids.append(encoded_text)
    kept_labels.append(label)

# Labels for exactly the sentences that were encoded, in the same order.
labels = np.asarray(kept_labels, dtype=df.label.values.dtype)

if skipped_rows:
    print('Skipped %d of %d sentences that could not be tokenized.' % (skipped_rows, len(text)))

## Checking the Maximum Length

In [7]:
# Report the length of the longest encoded sequence in input_ids.
print('Maximum token length is ', max(map(len, input_ids)))


Maximum token length is  47


## Adding Padding to tokens

In [16]:
# The longest encoded sentence is 47 tokens, so a fixed length of 64 leaves
# some headroom. Sequences are filled with zeros at the end ('post' padding)
# and, if ever longer than max_len, cut at the end ('post' truncating).
max_len = 64

input_ids = pad_sequences(
    input_ids,
    maxlen=max_len,
    dtype='long',
    value=0,
    padding='post',
    truncating='post',
)

print('Example of a padded sequence: ', input_ids[0])

Example of a padded sequence:  [  101  2256  2814  2180  1005  1056  4965  2023  4106  1010  2292  2894
  1996  2279  2028  2057 16599  1012   102     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


In [0]:
# One mask per padded sequence: 1 where the token id is positive, 0 elsewhere
# (0 is the value used for padding).
attention_masks = [
    [1 if token_id > 0 else 0 for token_id in sequence]
    for sequence in input_ids
]

## Train/Test Split and Data Loaders

In [0]:
# Split token ids, attention masks and labels in ONE call. train_test_split
# applies the same shuffle to every array it is given, so the three stay
# aligned by construction instead of depending on two separate calls being
# kept in sync through a duplicated random_state and test_size.
# 20% of the rows are held out; random_state fixes the shuffle.
x_Train, x_Test, mask_Train, mask_Test, y_Train, y_Test = train_test_split(
    input_ids, attention_masks, labels, random_state=1000, test_size=0.2
)

In [0]:
# Wrap every split in a torch tensor, pair by pair: token ids, labels, masks.
x_Train, x_Test = torch.tensor(x_Train), torch.tensor(x_Test)
y_Train, y_Test = torch.tensor(y_Train), torch.tensor(y_Test)
mask_Train, mask_Test = torch.tensor(mask_Train), torch.tensor(mask_Test)

In [0]:
# The model is fed in mini-batches, which DataLoader produces from a dataset.
# A batch size of 16 or 32 is the usual advice for fine-tuning BERT.
batch_size = 32

# Training set: each item is (token ids, attention mask, label); batches are
# drawn in random order.
train_data = TensorDataset(x_Train, mask_Train, y_Train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Held-out set: read in stored order. Shuffling brings nothing when the data is
# only evaluated, and a fixed order keeps predictions aligned with x_Test /
# y_Test and makes evaluation runs repeatable.
test_data = TensorDataset(x_Test, mask_Test, y_Test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


## Training the model

In [35]:
# Load pre-trained BERT (uncased, base size) with a sequence-classification
# head for two classes. Attention weights and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

# Move the model to the GPU when one is present and stay on the CPU otherwise;
# an unconditional model.cuda() raises on a machine without CUDA.
# As the last expression of the cell this also displays the model structure.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
# AdamW over every parameter of the model, with a learning rate of 3e-5.
optimizer = AdamW(model.parameters(), lr=3e-5)

In [0]:
# Number of passes over the training data.
epochs = 4

# Total number of training batches across all epochs.
steps = epochs * len(train_dataloader)

# Linear learning-rate decay over `steps` steps, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=steps,
)