In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [2]:
pip install -U torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.1
    Uninstalling torchtext-0.15.1:
      Successfully uninstalled torchtext-0.15.1
Successfully installed sentencepiece-0.1.98 torchtext-0.6.0


In [3]:
import random
import re as regexpression
import string
import sys

import nltk
import pandas as pd
import torch

from transformers import RobertaTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

<h3>Helper functions</h3>

In [4]:
def get_tweet_length(example):
  """Return the length of the `clean_tweet` attribute of `example`."""
  tweet = example.clean_tweet
  return len(tweet)

def create_checkpoint(path, model, valid_loss):
  """Write the model's state dict and its validation loss to `path`.

  The file holds a dict with the keys 'model_state_dict' and 'valid_loss'.
  """
  torch.save(
      {'model_state_dict': model.state_dict(), 'valid_loss': valid_loss},
      path
  )

def boot_from_checkpoint(path, model, map_location=None):
  """Load a checkpoint written by `create_checkpoint` into `model`.

  Args:
    path: location of the checkpoint file.
    model: module whose parameters are overwritten with the stored state dict.
    map_location: device the stored tensors are mapped to. When omitted, the
      device of the model's first parameter is used (CPU if the model has no
      parameters), so the function does not depend on a notebook-level
      `device` variable being defined before it is called.

  Returns:
    The validation loss stored in the checkpoint.
  """
  if map_location is None:
    first_parameter = next(model.parameters(), None)
    map_location = first_parameter.device if first_parameter is not None else torch.device("cpu")

  # NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
  checkpoint = torch.load(path, map_location=map_location)
  model.load_state_dict(checkpoint['model_state_dict'])

  return checkpoint['valid_loss']

In [5]:
# One seed for every random number generator the notebook relies on.
SEED = 91212

# torchtext 0.6.0 draws its dataset splits and batch shuffling from Python's `random`
# module, so seeding torch alone does not make the train/validation/test split repeatable.
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(SEED)

<h3>Loading data</h3>

In [6]:
# Load the labelled training tweets; the later cells read its `tweet` and `sarcastic` columns.
extra_df = pd.read_csv("train.En.csv")
extra_df.head()  # last expression of the cell, so the first rows are rendered as a preview

Unnamed: 0,tweet,sarcastic
0,i never heard if it all might be raining snowi...,1
1,<user> hold on a minute . are you saying all b...,1
2,"Spiritfarer is so good, I bought it several mo...",0
3,This time last year 🌈💛 there may not be a para...,0
4,got yourself a particularly bloody screening i...,1


<h3>Preprocessing the data</h3>
<ul>
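<li>Removes @mentions (an "@" together with the handle that follows it) from tweets</li>
<li>Removes any URLs that start with "http" or "https" from tweets</li>
<li>Removes decimal numbers (for example "3.14" or ".5") from tweets</li>
<li>Converts tweets to lowercase</li>
<li>Collapses repeated whitespace into single spaces and strips leading/trailing whitespace</li>
<li>Drops duplicate tweets and trims each tweet to a maximum length of 516 whitespace-separated words</li>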
</ul>

In [7]:
# Regular expressions whose matches are replaced by a single space in every tweet.
patterns = [
    r'@\S+',       # @mentions: the "@" and the handle that follows it
    r'http\S+',    # URLs starting with http / https
    r'\d*\.\d+'    # decimal numbers such as "3.14" or ".5"
]

def clean_tweet_text(tweet):
  """Return `tweet` with all `patterns` removed, lowercased and with whitespace collapsed."""
  cleaned = str(tweet)
  for pattern in patterns:
    cleaned = regexpression.sub(pattern, ' ', cleaned)

  # split() + join() collapses runs of whitespace and strips both ends
  return ' '.join(cleaned.lower().split())

# A missing tweet would otherwise be turned into the literal text "nan" by str()
# and be fed to the model as if it were a real tweet.
extra_df = extra_df.dropna(subset=["tweet"])

extra_df = extra_df.assign(clean_tweet=extra_df["tweet"].map(clean_tweet_text))
extra_df = extra_df.drop_duplicates(subset=["clean_tweet"])

# Keep at most the first 516 whitespace-separated words of every cleaned tweet.
extra_df = extra_df.assign(
    clean_tweet=extra_df["clean_tweet"].map(lambda text: " ".join(text.split()[:516]))
)
extra_df.to_csv("preprocessed_data.csv")

<h3>GPU / CPU setup</h3>

In [8]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if not use_gpu:
    print('No GPU is available, using CPU.')
else:
    print("Using GPU")
    # ask cuDNN for deterministic kernels and switch off its auto-tuner
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

Using GPU


<h3>Initialize tokenizer</h3>
<p>Using fields to represent the labels and tweets.</p>

In [9]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

max_seq_len = 256
batch_size = 16

# vocabulary ids of the tokenizer's padding and unknown tokens
padding_indices = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
unkown_indices = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Field for the label column: one value per example that is used as-is (no vocabulary).
label_representation = Field(
    batch_first=True,          # batch is the first dimension
    use_vocab=False,           # labels are already numeric
    sequential=False           # a single value, not a sequence
)

# Field for the tweet text: the RoBERTa tokenizer produces the token ids directly.
text_representation = Field(
    tokenize=tokenizer.encode, # text -> list of token ids
    use_vocab=False,           # ids come from the tokenizer, so torchtext builds no vocabulary
    fix_length=max_seq_len,    # every sequence is padded or cut to exactly this many ids
    pad_token=padding_indices,
    unk_token=unkown_indices,
    batch_first=True,
    include_lengths=False      # all sequences share one length, so lengths are not returned
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

<h3>Split the data</h3>
<p>We convert the data into iterators, which are used to iterate over batches during pretraining and training. We also split the data into 65% training, 25% validation and 10% test data.</p>

In [10]:
# CSV column name -> (attribute name on every example, Field that processes the column)
fields = {
    'clean_tweet': ('clean_tweet', text_representation),
    'sarcastic': ('sarcastic', label_representation)
}

full_dataset = TabularDataset(
    path="preprocessed_data.csv",
    format='CSV',
    fields=fields,
    skip_header=False
)

# 65% training / 25% validation / 10% test, keeping the label distribution in every part
train_data, validation_data, test_data = full_dataset.split(
    split_ratio=[0.65, 0.25, 0.1],
    stratified=True,
    strata_field='sarcastic'
)

# One iterator for the training data and one for the validation data.
# NOTE(review): in torchtext 0.6.0 `sort=True` appears to take precedence over
# `shuffle=True`, which would mean batches are served in length order and never
# shuffled - confirm that this is intended.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, validation_data),
    batch_size=batch_size,
    device=device,
    shuffle=True,
    sort_key=get_tweet_length,              # length of an example, used to order examples
    sort=True,
    sort_within_batch=False
)

# The held-out split is iterated in its stored order.
test_iterator = Iterator(
    test_data,
    batch_size=batch_size,
    device=device,
    train=False,
    shuffle=False,
    sort=False
)

<h3>Model Architecture</h3>
<ul>
<li>We decided to add three linear layers and three dropout layers on top of the RoBERTa model.</li>
<li>The dropout layers randomly set some of the values passing through them to zero during training, which helps prevent overfitting.</li>
</ul>

In [11]:
class ROBERTA(torch.nn.Module):
    """RoBERTa encoder followed by a three-layer classification head with two outputs.

    The attribute `batch_normalization` holds a LayerNorm; the name is part of
    the state dict keys, so it is kept as it is.
    """

    def __init__(self):
        super().__init__()

        # return_dict=False makes the encoder return a plain tuple
        self.roberta = AutoModel.from_pretrained('roberta-base', return_dict=False)

        self.dropout_1 = torch.nn.Dropout(0.4)
        self.linear_1 = torch.nn.Linear(768, 58)
        self.batch_normalization = torch.nn.LayerNorm(58)

        self.dropout_2 = torch.nn.Dropout(0.4)
        self.linear_2 = torch.nn.Linear(58, 32)

        self.dropout_3 = torch.nn.Dropout(0.4)
        self.linear_3 = torch.nn.Linear(32, 2)

    def forward(self, input_ids, attention_mask):
        """Return the two-column output of the head for a batch of token ids."""
        # only the second element of the encoder's output tuple is used
        encoder_output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden = encoder_output[1]

        hidden = self.linear_1(self.dropout_1(hidden))
        hidden = torch.tanh(self.batch_normalization(hidden))
        hidden = self.linear_2(self.dropout_2(hidden))
        return self.linear_3(self.dropout_3(hidden))


<h3>Pretrain the model</h3>
<p>Trains on the dataset using `train_iterator` and evaluates the model using `valid_iterator`, while the RoBERTa weights are frozen. We use the CrossEntropyLoss loss function and update the model's parameters using the optimizer. A scheduler is used to adjust the learning rate during training. Pretraining stops after the requested number of epochs (5 by default) or once the training loss has been below the validation loss at 3 evaluations, whichever comes first.</p>

In [12]:
def pretrain(model, optimizer, train_iterator, valid_iterator, scheduler = None, evaluation_period = None, epochs=5):
    """Train the model while the parameters of `model.roberta` are frozen.

    Args:
        model: network with a `roberta` sub-module; called with `input_ids` and `attention_mask`.
        optimizer: optimizer that updates the model's parameters.
        train_iterator: yields the training batches.
        valid_iterator: yields the validation batches.
        scheduler: optional learning-rate scheduler, stepped after every optimizer step.
        evaluation_period: number of training batches between two evaluations.
            Defaults to `len(train_iterator)` of the iterator passed in, i.e. once per epoch.
        epochs: maximum number of passes over `train_iterator`.

    Stops early once the training loss has been below the validation loss at
    three evaluations. The RoBERTa parameters are made trainable again on exit,
    also when an exception interrupts the loop.
    """
    # Resolved here rather than in the signature, so that defining the function
    # does not require a notebook-level `train_iterator` to exist already.
    if evaluation_period is None:
        evaluation_period = len(train_iterator)

    tolerance = 0
    max_tolerance = 3
    criterion = torch.nn.CrossEntropyLoss()

    # freeze parameters
    for param in model.roberta.parameters():
        param.requires_grad = False

    try:
        model.train()                                                           # switch model to training mode

        training_loss = 0.0
        period = 0

        for epoch in range(epochs):
            for batch in train_iterator:
                (tweet, label), _ = batch

                mask = (tweet != padding_indices).type(torch.uint8)             # ignore padding tokens

                # forward pass and calculate loss
                y_pred = model(input_ids=tweet, attention_mask=mask)
                loss = criterion(y_pred, label)

                # backward pass and optimization
                loss.backward()                                                 # compute gradients
                optimizer.step()                                                # update parameters
                if scheduler is not None:
                    scheduler.step()                                            # update learning rate
                optimizer.zero_grad()                                           # reset gradients
                training_loss = training_loss + loss.item()
                period = period + 1

                # evaluate model on validation set
                if period % evaluation_period == 0:
                    model.eval()

                    validation_loss = 0.0
                    with torch.no_grad():
                        for valid_batch in valid_iterator:
                            (valid_tweet, valid_label), _ = valid_batch
                            valid_mask = (valid_tweet != padding_indices).type(torch.uint8)
                            valid_pred = model(input_ids=valid_tweet, attention_mask=valid_mask)
                            validation_loss += criterion(valid_pred, valid_label).item()

                    # average both losses over the batches they were summed over
                    training_loss = training_loss / evaluation_period
                    validation_loss = validation_loss / len(valid_iterator)

                    if training_loss < validation_loss:
                        tolerance = tolerance + 1

                    model.train()

                    print('Epoch [{}/{}], tolerance [{}/{}], training loss: {:.4f}, validation loss: {:.4f}'
                          .format(epoch + 1, epochs, tolerance, max_tolerance, training_loss, validation_loss))

                    training_loss = 0.0

                    # stop pretraining if we are overfitting too much
                    if tolerance >= max_tolerance:
                        break

            if tolerance >= max_tolerance:
                break
    finally:
        # unfreeze the encoder even if the loop above raised
        for param in model.roberta.parameters():
            param.requires_grad = True

<h3>Train the model</h3>
<p>Iterates through the training data, calculates the loss, updates the model's parameters using the optimizer, and steps the scheduler, which adjusts the learning rate. The model is evaluated on the validation set at a fixed interval. If the validation loss is the best found so far, we save the model by creating a checkpoint. Training stops early once the training loss has been below the validation loss at 3 evaluations.</p>

In [13]:
def train(model, optimizer, train_iterator, valid_iterator, scheduler = None, num_epochs = 5, evaluation_period = None, checkpoint_path = 'model.pkl'):
    """Train the whole model and checkpoint it whenever the validation loss improves.

    Args:
        model: network called with `input_ids` and `attention_mask`.
        optimizer: optimizer that updates the model's parameters.
        train_iterator: yields the training batches.
        valid_iterator: yields the validation batches.
        scheduler: optional learning-rate scheduler, stepped after every optimizer step.
        num_epochs: maximum number of passes over `train_iterator`.
        evaluation_period: number of training batches between two evaluations.
            Defaults to `len(train_iterator)` of the iterator passed in, i.e. once per epoch.
        checkpoint_path: file the best model is written to.

    Stops early once the training loss has been below the validation loss at
    three evaluations.
    """
    # Resolved here rather than in the signature, so that defining the function
    # does not require a notebook-level `train_iterator` to exist already.
    if evaluation_period is None:
        evaluation_period = len(train_iterator)

    tolerance = 0
    max_tolerance = 3
    training_loss = 0.0
    best_validation_loss = sys.float_info.max
    period = 0
    criterion = torch.nn.CrossEntropyLoss()

    model.train()

    for epoch in range(num_epochs):
        for batch in train_iterator:
            (tweet, label), _ = batch

            mask = (tweet != padding_indices).type(torch.uint8)                 # ignore padding tokens

            # forward pass and calculate the loss
            y_pred = model(input_ids=tweet, attention_mask=mask)
            loss = criterion(y_pred, label)

            # backward pass and optimization
            loss.backward()                                                     # compute gradients
            optimizer.step()                                                    # update parameters
            if scheduler is not None:
                scheduler.step()                                                # adjust learning rate
            optimizer.zero_grad()                                               # reset gradients

            # update training loss and period
            training_loss = training_loss + loss.item()
            period = period + 1

            # evaluate model on validation set
            if period % evaluation_period == 0:
                model.eval()

                validation_loss = 0.0
                with torch.no_grad():
                    for valid_batch in valid_iterator:
                        (valid_tweet, valid_label), _ = valid_batch
                        valid_mask = (valid_tweet != padding_indices).type(torch.uint8)
                        valid_pred = model(input_ids=valid_tweet, attention_mask=valid_mask)
                        validation_loss += criterion(valid_pred, valid_label).item()

                # average both losses over the batches they were summed over
                training_loss = training_loss / evaluation_period
                validation_loss = validation_loss / len(valid_iterator)

                if training_loss < validation_loss:
                    tolerance = tolerance + 1

                print('epoch [{}/{}], tolerance [{}/{}], training loss: {:.4f}, validation loss: {:.4f}'
                      .format(epoch+1, num_epochs, tolerance, max_tolerance, training_loss, validation_loss))

                # create checkpoint if has best validation loss
                if best_validation_loss > validation_loss:
                    best_validation_loss = validation_loss
                    create_checkpoint(checkpoint_path, model, best_validation_loss)

                # back to training mode before a possible early exit, so the
                # model is never handed back in evaluation mode
                model.train()
                training_loss = 0.0

                # stop training if we are overfitting too much
                if tolerance >= max_tolerance:
                    break

        if tolerance >= max_tolerance:
            break

<h3>Pretraining the model</h3>

In [14]:
pretrain_epochs = 7
steps_per_epoch = len(train_iterator)

model = ROBERTA().to(device)

optimizer = AdamW(model.parameters(), lr=1e-4)

# learning-rate schedule: one epoch of warmup, planned for `pretrain_epochs` epochs in total
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=steps_per_epoch,
  num_training_steps=steps_per_epoch * pretrain_epochs
)

pretrain(
  model=model,
  optimizer=optimizer,
  scheduler=scheduler,
  train_iterator=train_iterator,
  valid_iterator=valid_iterator,
  epochs=pretrain_epochs
)

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch [1/7], tolerance [0/3], training loss: 0.7124, validation loss: 0.6933
Epoch [2/7], tolerance [0/3], training loss: 0.7041, validation loss: 0.6902
Epoch [3/7], tolerance [0/3], training loss: 0.7015, validation loss: 0.6901
Epoch [4/7], tolerance [0/3], training loss: 0.6960, validation loss: 0.6902
Epoch [5/7], tolerance [0/3], training loss: 0.6958, validation loss: 0.6901
Epoch [6/7], tolerance [0/3], training loss: 0.6945, validation loss: 0.6897
Epoch [7/7], tolerance [0/3], training loss: 0.6955, validation loss: 0.6894


<h3>Training the model</h3>

In [15]:
train_epochs = 8
steps_per_epoch = len(train_iterator)

# fresh optimizer with a smaller learning rate than the one used for pretraining
optimizer = AdamW(model.parameters(), lr=2e-5)

# learning-rate schedule: two epochs of warmup, planned for `train_epochs` epochs in total
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=steps_per_epoch * 2,
    num_training_steps=steps_per_epoch * train_epochs
)

train(
  model=model,
  optimizer=optimizer,
  scheduler=scheduler,
  train_iterator=train_iterator,
  valid_iterator=valid_iterator,
  num_epochs=train_epochs
)

epoch [1/8], tolerance [0/3], training loss: 0.6648, validation loss: 0.6146
epoch [2/8], tolerance [0/3], training loss: 0.5879, validation loss: 0.5578
epoch [3/8], tolerance [1/3], training loss: 0.5136, validation loss: 0.5283
epoch [4/8], tolerance [2/3], training loss: 0.4200, validation loss: 0.6971
epoch [5/8], tolerance [3/3], training loss: 0.3217, validation loss: 0.5432


<h3>Evaluation functions</h3>
<ul>
<li>The `model_predict` function returns the true labels and the predicted labels given an iterator that goes over the test dataset.</li>
</ul>

In [16]:
def model_predict(model, test_loader):
  """Run the model over every batch of `test_loader` in evaluation mode.

  Returns a tuple `(y_true, y_pred)` of two lists: the labels read from the
  batches and the index of the largest model output for every example.
  """
  true_labels, predicted_labels = [], []

  model.eval()
  with torch.no_grad():
    for batch in test_loader:
      (source, target), _ = batch

      # positions holding the padding id are masked out
      mask = (source != padding_indices).type(torch.uint8)
      output = model(source, attention_mask=mask)

      predicted_labels += output.argmax(dim=-1).tolist()
      true_labels += target.tolist()

  return true_labels, predicted_labels

<h3>Setting up the test set iterator</h3>

In [18]:
def clean_test_tweet(text):
  """Apply the cleaning used for the training tweets to one test tweet."""
  cleaned = str(text)
  for pattern in patterns:
    cleaned = regexpression.sub(pattern, ' ', cleaned)
  return ' '.join(cleaned.lower().split()[:516])

# The model was trained on cleaned tweets (mentions, URLs and decimals removed,
# lowercased), so the test tweets get the same treatment before tokenization.
# No rows are dropped, which keeps predictions aligned with the rows of the original file.
test_df = pd.read_csv("task_A_En_test.csv")
test_df["text"] = test_df["text"].map(clean_test_tweet)
test_df.to_csv("preprocessed_test.csv", index=False)

fields = {'text' : ('text', text_representation), 'sarcastic' : ('sarcastic', label_representation)}
test_data = TabularDataset(path="preprocessed_test.csv", format='CSV', fields=fields)

# fixed order: no shuffling and no sorting
test_iter = Iterator(test_data, batch_size=batch_size, device=device, train=False, shuffle=False, sort=False)

<h3>Creating the model</h3>
<p>Here we create the model and load into it the best checkpoint found during training.</p>

In [19]:
# A freshly built network receives the weights stored in 'model.pkl'.
model = ROBERTA().to(device)
boot_from_checkpoint('model.pkl', model)

# true and predicted labels for every example served by the test iterator
y_true, y_pred = model_predict(model, test_iter)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
generated_labels_df = pd.read_csv('task_A_En_test.csv')

# Predictions are matched to rows by position, so a different count means they
# would be attributed to the wrong tweets.
if len(y_pred) != len(generated_labels_df):
  raise ValueError(
      "expected {} predictions, got {}".format(len(generated_labels_df), len(y_pred))
  )

# one column assignment instead of one .loc write per row
generated_labels_df['sarcastic'] = y_pred

generated_labels_df.to_csv('generated_labels.csv', index=False, sep=',')