In [1]:
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

import csv
import pandas as pd

import nltk
import torch

In [2]:
# Training split: tab-separated, no header row. Only the first three columns
# are read (id, label, message); quoting is disabled so quote characters
# inside tweets are kept literally. Rows with any missing field are dropped.
tweets_train = pd.read_csv(
    'data/twitter-train.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    names=['id', 'label', 'message'],
    usecols=[0, 1, 2],
    index_col=0,
    dtype={'label': 'category'},
).dropna()

In [3]:
# Development (validation) split, read with the same layout as the training
# file: three tab-separated columns, no header, quoting disabled. Rows with
# any missing field are dropped.
tweets_valid = pd.read_csv(
    'data/twitter-dev.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    names=['id', 'label', 'message'],
    usecols=[0, 1, 2],
    index_col=0,
    dtype={'label': 'category'},
).dropna()

In [4]:
# Test split: same three-column, header-less TSV layout as the other splits.
# No rows are dropped here, so every id in the file is kept.
tweets_test = pd.read_csv(
    'data/SemEval2017-task4-test.subtask-A.english.txt',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    names=['id', 'label', 'message'],
    usecols=[0, 1, 2],
    index_col=0,
    dtype={'label': 'category'},
)

In [5]:
# (rows, columns) of the train, validation and test frames, in that order.
tuple(frame.shape for frame in (tweets_train, tweets_valid, tweets_test))

((50333, 2), (20632, 2), (12284, 2))

In [6]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)

# Cap the sequence length explicitly. Without truncation, a tweet that
# tokenizes to more positions than the model's position-embedding table holds
# fails inside the forward pass with an index error. 128 is the maximum length
# given on the BERTweet model card -- NOTE(review): confirm if the checkpoint
# is ever swapped. max_length is passed explicitly rather than relying on the
# tokenizer's own model_max_length.
MAX_TOKENS = 128
tokenize_kwargs = {'padding': True, 'truncation': True, 'max_length': MAX_TOKENS}

# padding=True pads every sequence of a split to that split's longest sequence.
train_encodings = tokenizer(list(tweets_train['message']), **tokenize_kwargs)
valid_encodings = tokenizer(list(tweets_valid['message']), **tokenize_kwargs)
test_encodings = tokenizer(list(tweets_test['message']), **tokenize_kwargs)

# Class-name -> integer id. Each label is wrapped in a one-element list, so a
# single example's label has shape (1,), exactly as before.
label2idx = {'neutral': 0, 'positive': 1, 'negative': 2}
train_labels = [[label2idx[label]] for label in tweets_train['label']]
valid_labels = [[label2idx[label]] for label in tweets_valid['label']]
test_labels = [[label2idx[label]] for label in tweets_test['label']]

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [7]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-example encodings with their labels.

    Args:
        encodings: mapping from feature name to an indexable collection with
            one entry per example (anything supporting ``.items()`` whose
            values support ``value[idx]``).
        labels: indexable collection with one label entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding feature plus a ``'labels'``
        tensor built from ``labels[idx]``.
        """
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# One TweetDataset per split, each pairing that split's encodings with its labels.
train_dataset, valid_dataset, test_dataset = (
    TweetDataset(train_encodings, train_labels),
    TweetDataset(valid_encodings, valid_labels),
    TweetDataset(test_encodings, test_labels),
)

In [8]:
from transformers import Trainer, TrainingArguments, set_seed

training_args = TrainingArguments(
    output_dir='./checkpoints',      # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=96,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # log the training loss every 100 steps
)

# Seed the RNGs *before* the model is built. The sequence-classification head
# is not part of the pretrained checkpoint and is freshly initialised by
# from_pretrained; without a seed at this point its starting weights (and so
# the whole run) differ on every execution. Reusing training_args.seed keeps a
# single source of truth for the seed.
set_seed(training_args.seed)

# Three output classes, matching the three entries of label2idx.
bertweet = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=3)

trainer = Trainer(
    model=bertweet,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                     # training arguments, defined above
    train_dataset=train_dataset,            # training dataset
    eval_dataset=valid_dataset,             # evaluation dataset
)

trainer.train()

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

Step,Training Loss
100,1.0315
200,0.7292
300,0.6398
400,0.6293
500,0.6032
600,0.5457
700,0.5129
800,0.5163
900,0.5258
1000,0.497


TrainOutput(global_step=2625, training_loss=0.3965902489253453, metrics={'train_runtime': 2075.2549, 'train_samples_per_second': 1.265, 'total_flos': 21184912967634000, 'epoch': 5.0})

In [9]:
# Evaluate the fine-tuned model on the validation split; the returned metrics
# dict is displayed as the cell output.
trainer.evaluate(eval_dataset=valid_dataset)

{'eval_loss': 0.09052638709545135,
 'eval_runtime': 34.2841,
 'eval_samples_per_second': 601.796,
 'epoch': 5.0}

In [10]:
outputs = trainer.predict(test_dataset)

# Predicted class id per test example: argmax over the class axis (dim=1) of
# the raw logits. The earlier softmax was dropped because it was called without
# an explicit `dim` (deprecated, emits a warning) and is redundant anyway:
# softmax is monotonic per row, so it never changes which class is largest.
predictions = torch.as_tensor(outputs.predictions).argmax(dim=1).tolist()

  


In [11]:
# Integer class id -> class name (the inverse of the label encoding used for training).
idx2label = {0: 'neutral', 1: 'positive', 2: 'negative'}

# Write one "<tweet id>\t<predicted label>" line per test tweet, pairing ids
# and predictions positionally.
with open('outputs/bertweet-predictions.txt', 'w') as predictions_file:
    predictions_file.writelines(
        f'{tweet_id}\t{idx2label[pred]}\n'
        for tweet_id, pred in zip(list(tweets_test.index), predictions)
    )

In [13]:
# Run the SemEval-2017 Task 4 subtask A scorer (Perl script) on the gold test
# labels and the predictions file written above. Requires `perl` on PATH.
# NOTE(review): the meaning of the three numbers it prints is defined by the
# scorer script, not by this notebook -- confirm against the script.
!perl SemEval2017_task4_test_scorer_subtaskA.pl data/SemEval2017_task4_subtaskA_test_english_gold.txt outputs/bertweet-predictions.txt

bertweet-predictions.txt	0.724	0.720	0.708	
