In [None]:
!pip install transformers

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import json
from sklearn.model_selection import train_test_split

In [None]:
# The Huggingface model trainer uses Dataset objects extended from the PyTorch Dataset type
# This custom subclass returns each datum in the format the trainer will be looking for
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields one example as a dict of tensors.

    Each item contains one entry per key of ``encodings`` (the value indexed
    at ``idx``) plus a ``'labels'`` entry built from ``labels[idx]``.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            row per example. Rows may be plain Python sequences or tensors.
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a tensor that does not share storage with its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning on every
            # call; clone().detach() is the recommended equivalent copy.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as ``{field: tensor, ..., 'labels': tensor}``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the length of ``labels``."""
        return len(self.labels)

In [None]:
# Read in data
def read_jsonl(path):
  """Load a JSON Lines file into a list holding one parsed object per non-blank line."""
  # Explicit UTF-8 so decoding does not depend on the platform's default locale encoding
  with open(path, encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing empty line), which json.loads would reject
    return [json.loads(jline) for jline in f if jline.strip()]


train_data = read_jsonl('train.jsonl')
test_data = read_jsonl('test.jsonl')

# initialize the BERT pretrained model (with associated tokenizer)
# num_labels=2 spells out the binary classification head (this is also the library default)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [7]:
# Strip surrounding whitespace and remove a few tokens that are meaningless due to sanitization
def process_text(text):
  """Return ``text`` with every '@USER' and '<URL>' marker deleted and outer whitespace trimmed."""
  cleaned = text
  # Delete the markers in a fixed order, then trim what is left at both ends
  for marker in ('@USER', '<URL>'):
    cleaned = cleaned.replace(marker, '')
  return cleaned.strip()


# Also replace the label text with 0/1 and prepend the most immediate context tweet to the response
# This ended up helping accuracy significantly
def build_input_text(example):
  """Join the cleaned last context entry and the cleaned response with a single space."""
  return process_text(example['context'][-1]) + ' ' + process_text(example['response'])


# Fixed seed so the train/validation split is the same on every run
SPLIT_SEED = 42

train_labels = [1 if x['label'] == 'SARCASM' else 0 for x in train_data]
train_text = [build_input_text(x) for x in train_data]
test_text = [build_input_text(x) for x in test_data]

# Create training, validation sets, along with the final test set to make predictions against
train_text, val_text, train_labels, val_labels = train_test_split(
    train_text, train_labels, test_size=0.05, random_state=SPLIT_SEED)

# The BERT model takes in encoded values standing in for the text
train_encoding = tokenizer(train_text, return_tensors='pt', padding=True, truncation=True)
val_encoding = tokenizer(val_text, return_tensors='pt', padding=True, truncation=True)
test_encoding = tokenizer(test_text, return_tensors='pt', padding=True, truncation=True)

train_ds = CustomDataset(train_encoding, train_labels)
val_ds = CustomDataset(val_encoding, val_labels)
# The test set gets all-zero placeholder labels because CustomDataset requires a label per example
test_ds = CustomDataset(test_encoding, [0] * len(test_data))

In [4]:
# Train the BERT model with our situational data. 5 epochs ended up enough to beat the baseline
def compute_metrics(eval_pred):
    """Return the accuracy of an evaluation pass.

    The predicted class is the argmax of ``eval_pred.predictions`` over the
    last axis, compared element-wise against ``eval_pred.label_ids``.
    NOTE: the Trainer also calls this from ``predict``; accuracy computed
    against placeholder labels is meaningless and should be ignored.
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()

# Score the held-out validation split; without this call eval_dataset is never used
val_metrics = trainer.evaluate()
train_result, val_metrics

In [None]:
# The trainer returns raw logits; softmax turns them into per-class probabilities
pred = trainer.predict(test_ds)
pred_tensor = torch.tensor(pred.predictions)
probs = torch.softmax(pred_tensor, dim=-1).tolist()

# Write predictions to desired format for competition:
# one "twitter_<n>,<label>" line per test example, numbered from 1
with open('answer.txt', 'w') as f:
  f.writelines(
      'twitter_' + str(row_number) + ',' + ('SARCASM' if row[0] < row[1] else 'NOT_SARCASM') + '\n'
      for row_number, row in enumerate(probs, start=1)
  )