In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m82.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [10]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
import random
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.utils.class_weight import compute_class_weight
from torch.optim import SGD

class BERT_Arch(nn.Module):
    """Two-class classification head stacked on top of a supplied encoder.

    The encoder's second output is passed through three ReLU + dropout
    hidden layers (768 -> 512 -> 256 -> 128) and a final 2-unit layer followed
    by log-softmax, so the output is suitable for ``nn.NLLLoss``.

    Args:
        bert: encoder module called as ``bert(sent_id, attention_mask=mask)``.
            Its output must be indexable, with a ``(batch, 768)`` tensor at
            position 1 (for a Hugging Face ``BertModel`` this is presumably
            the pooled [CLS] representation -- confirm against the model docs).
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        # 768 must match the width of the encoder output fed into fc1.
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities of shape ``(batch, 2)``.

        Args:
            sent_id: token-id tensor forwarded to the encoder.
            mask: attention-mask tensor forwarded to the encoder.
        """
        # Index instead of tuple-unpacking: unpacking only works when the
        # encoder returns a plain tuple (return_dict=False). A dict-style
        # output object would yield its *keys* when unpacked, whereas integer
        # indexing works for both tuple and dict-style outputs.
        outputs = self.bert(sent_id, attention_mask=mask)
        cls_hs = outputs[1]

        x = cls_hs
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(self.relu(hidden_layer(x)))
        return self.softmax(self.fc4(x))

def train():
  """Run one training epoch over ``train_dataloader``.

  Reads the module-level ``model``, ``train_dataloader``, ``device``,
  ``cross_entropy`` and ``optimizer``. For every batch it zeroes the
  gradients, runs a forward/backward pass, clips the gradient norm to 1.0
  and steps the optimizer.

  Returns:
    ``(avg_loss, total_preds)``: the mean of the per-batch losses, and a NumPy
    array holding every batch's model output concatenated along axis 0.
  """
  model.train()
  num_batches = len(train_dataloader)
  running_loss = 0
  batch_outputs = []

  for step, batch in enumerate(train_dataloader):
    # progress line every 50 batches, skipping the very first batch
    if step != 0 and step % 50 == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

    # move the three tensors of the batch onto the target device
    sent_id, mask, labels = (tensor.to(device) for tensor in batch)

    model.zero_grad()
    preds = model(sent_id, mask)
    loss = cross_entropy(preds, labels)
    running_loss += loss.item()

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    # keep the outputs on the CPU, detached from the graph
    batch_outputs.append(preds.detach().cpu().numpy())

  avg_loss = running_loss / num_batches
  total_preds = np.concatenate(batch_outputs, axis=0)
  return avg_loss, total_preds

# function for evaluating the model
def evaluate():
  """Compute the loss and model outputs over ``val_dataloader``.

  Reads the module-level ``model``, ``val_dataloader``, ``device`` and
  ``cross_entropy``. The model is switched to eval mode and every forward
  pass runs under ``torch.no_grad()``.

  Returns:
    ``(avg_loss, total_preds)``: the mean of the per-batch losses, and a NumPy
    array holding every batch's model output concatenated along axis 0.
  """
  print("\nEvaluating...")
  model.eval()

  total_loss = 0
  total_preds = []

  for step, batch in enumerate(val_dataloader):
    # Progress line every 50 batches. The previous version also computed an
    # unused elapsed time from names that are not defined in this notebook
    # (format_time, time, t0), which raised NameError at batch 50.
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

    # push the batch to the target device
    batch = [t.to(device) for t in batch]

    sent_id, mask, labels = batch

    with torch.no_grad():
      preds = model(sent_id, mask)
      loss = cross_entropy(preds, labels)
      total_loss = total_loss + loss.item()
      preds = preds.detach().cpu().numpy()
      total_preds.append(preds)

  avg_loss = total_loss / len(val_dataloader)
  total_preds = np.concatenate(total_preds, axis=0)

  return avg_loss, total_preds

def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, every CUDA device's generator is seeded as well.

    Args:
        seed: integer seed handed to each random number generator.
    """
    # CPU-side generators, in the same order as before: random, numpy, torch
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


if __name__ == '__main__':
  # NOTE: everything below deliberately stays at module scope instead of being
  # wrapped in a main() function: train() and evaluate() read model,
  # train_dataloader, val_dataloader, device, cross_entropy and optimizer as
  # globals.
  set_seed(5)

  # ---- load train and test files ----
  # only the 'tweet' and 'sarcastic' columns of the training data are used
  raw_df = pd.read_csv("modified.csv")
  # .copy() so the column assignment below writes to an independent frame
  # rather than to a slice of raw_df
  df = raw_df.loc[:, ['tweet', 'sarcastic']].copy()
  test_df = pd.read_csv("task_A_En_test.csv")

  # ---- preprocess the data ----
  # emojis were causing some issues, so to be safe the tweet column is
  # converted to type string; the test text gets the same treatment so both
  # inputs reach the tokenizer as plain Python strings
  df['tweet'] = df['tweet'].astype(str)
  test_text = test_df['text'].astype(str)

  # ---- data split ----
  # stratified split into 80% training and 20% validation (test_size=0.2).
  # NOTE(review): the earlier comment here said 90%/10%; if that was the
  # intent, change test_size to 0.1.
  train_text, valid_text, train_labels, valid_labels = train_test_split(
    df['tweet'],
    df['sarcastic'],
    random_state=2018,
    test_size=0.2,
    stratify=df['sarcastic'],
  )

  # ---- BERT model / BERT tokenizer ----
  # return_dict=False makes the encoder return a plain tuple
  bert = AutoModel.from_pretrained('bert-base-uncased', return_dict=False)
  tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

  # every tweet is padded / truncated to this many tokens
  max_seq_len = 25

  # ---- tokenize data ----
  # padding='max_length' replaces the deprecated pad_to_max_length=True
  encode_kwargs = dict(
    max_length=max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
  )
  tokens_train = tokenizer.batch_encode_plus(train_text.tolist(), **encode_kwargs)
  tokens_val = tokenizer.batch_encode_plus(valid_text.tolist(), **encode_kwargs)
  tokens_test = tokenizer.batch_encode_plus(test_text.tolist(), **encode_kwargs)

  # ---- convert integer sequences to tensors ----
  train_seq = torch.tensor(tokens_train['input_ids'])
  train_mask = torch.tensor(tokens_train['attention_mask'])
  train_y = torch.tensor(train_labels.tolist())

  val_seq = torch.tensor(tokens_val['input_ids'])
  val_mask = torch.tensor(tokens_val['attention_mask'])
  val_y = torch.tensor(valid_labels.tolist())

  test_seq = torch.tensor(tokens_test['input_ids'])
  test_mask = torch.tensor(tokens_test['attention_mask'])
  test_y = torch.tensor(test_df['sarcastic'].tolist())

  # ---- data loaders ----
  batch_size = 32
  train_data = TensorDataset(train_seq, train_mask, train_y)   # wrap tensors
  train_sampler = RandomSampler(train_data)                    # shuffled order for training
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  val_data = TensorDataset(val_seq, val_mask, val_y)
  val_sampler = SequentialSampler(val_data)                    # fixed order for validation
  val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

  # the test set is also fed in batches so it never has to fit on the device
  # in a single forward pass; sequential order keeps rows aligned with test_df
  test_data = TensorDataset(test_seq, test_mask)
  test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)

  # ---- freeze all encoder parameters ----
  # only the classification head is trained
  for param in bert.parameters():
    param.requires_grad = False

  # ---- create the model ----
  # training is faster on GPU: use it when available, otherwise use the CPU
  if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'The current device is {torch.cuda.get_device_name(0)}')
  else:
    print('CUDA is not available. Using CPU...')
    device = torch.device("cpu")

  model = BERT_Arch(bert)
  model = model.to(device)

  # ---- class weights ----
  # determined by the label distribution of the training split
  class_wts = compute_class_weight(class_weight="balanced", classes=np.unique(train_labels), y=train_labels)

  # convert the class weights to a tensor on the target device
  weights = torch.tensor(class_wts, dtype=torch.float)
  weights = weights.to(device)

  # ---- loss function ----
  # NLLLoss pairs with the LogSoftmax output of BERT_Arch
  cross_entropy = nn.NLLLoss(weight=weights)

  # ---- hyperparameters + optimizer ----
  # SGD was chosen as the optimizer.
  # NOTE(review): in the recorded run of this cell both training and
  # validation loss stay between 0.693 and 0.695 for all 10 epochs, i.e.
  # about ln(2), which is what a two-class model that has learned nothing
  # produces. The optimizer / learning rate / fully frozen encoder are worth
  # revisiting.
  epochs = 10
  optimizer = SGD(model.parameters(), lr=0.009, momentum=0.9)

  # ---- train the model ----
  # the weights with the lowest validation loss are saved to saved_weights.pt
  path = 'saved_weights.pt'
  best_valid_loss = float('inf')

  train_losses = []
  valid_losses = []

  for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss, _ = train()

    valid_loss, _ = evaluate()

    if valid_loss < best_valid_loss:
      best_valid_loss = valid_loss
      torch.save(model.state_dict(), path)

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

  # ---- load best weights ----
  # restore the best checkpoint written during training
  model.load_state_dict(torch.load(path, map_location=device))

  # ---- predictions for the test data ----
  # eval mode is set explicitly so dropout is off regardless of which mode the
  # last call to train()/evaluate() left the model in
  model.eval()
  test_outputs = []
  with torch.no_grad():
    for seq_batch, mask_batch in test_dataloader:
      batch_out = model(seq_batch.to(device), mask_batch.to(device))
      test_outputs.append(batch_out.detach().cpu().numpy())

  # predicted label = class with the highest log-probability
  preds = np.argmax(np.concatenate(test_outputs, axis=0), axis=1)

  # ---- output ----
  # write a file that matches the test file, with the 'sarcastic' column
  # replaced by the model-generated labels. It is used as input for the
  # evaluation script, is called 'generated_labels.csv' and is written to the
  # current working directory.
  test_df_copy = test_df.copy()
  test_df_copy['sarcastic'] = preds

  test_df_copy.to_csv('generated_labels.csv', index=False, sep=',')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


There are 1 GPU(s) available.
The current device is Tesla T4

 Epoch 1 / 10

Evaluating...

Training Loss: 0.695
Validation Loss: 0.694

 Epoch 2 / 10

Evaluating...

Training Loss: 0.693
Validation Loss: 0.694

 Epoch 3 / 10

Evaluating...

Training Loss: 0.695
Validation Loss: 0.694

 Epoch 4 / 10

Evaluating...

Training Loss: 0.694
Validation Loss: 0.693

 Epoch 5 / 10

Evaluating...

Training Loss: 0.693
Validation Loss: 0.693

 Epoch 6 / 10

Evaluating...

Training Loss: 0.693
Validation Loss: 0.693

 Epoch 7 / 10

Evaluating...

Training Loss: 0.694
Validation Loss: 0.693

 Epoch 8 / 10

Evaluating...

Training Loss: 0.695
Validation Loss: 0.695

 Epoch 9 / 10

Evaluating...

Training Loss: 0.694
Validation Loss: 0.693

 Epoch 10 / 10

Evaluating...

Training Loss: 0.694
Validation Loss: 0.693
