In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import emoji as em
import torch
import spacy
import json

from spacy.lang.en import English

from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support

from torch.utils.data import Dataset, SubsetRandomSampler, DataLoader

from transformers import AutoTokenizer, AdamW, get_scheduler, AutoModelForSequenceClassification

import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below — consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

import logging

# Only let ERROR-level (and above) records through for this transformers logger.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

In [2]:
# Fix the NumPy and PyTorch (CPU and CUDA) seeds.
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Request deterministic cuDNN kernels and switch off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(device)

In [3]:
# Input files, all relative to the notebook's working directory:
# the task-A training split, the task-A gold test split, and the emoticon lookup JSON.
train_dataset_path = "../input/tweetseval3/dataset/SemEval2018-T3-train-taskA_emoji.txt"
test_dataset_path = "../input/tweetseval3/dataset/SemEval2018-T3_gold_test_taskA_emoji.txt"
emojis_path  = "../input/tweetseval3/dataset/emoticons.json"

In [None]:
# Copy bert_test_preprocessed.txt to new_bert_test_preprocessed.txt, dropping every
# space-separated token of the form ":name:" (starts and ends with a colon).
# NOTE(review): this cell has no execution count and uses absolute /content paths,
# unlike the ../input paths used by the rest of the notebook — confirm it is still needed.
import re  # not used in this cell; kept in case cells outside this view rely on it

# `with` guarantees both files are flushed and closed, even if the loop raises.
with open('/content/bert_test_preprocessed.txt', 'r', encoding='utf8') as f1, \
        open('/content/new_bert_test_preprocessed.txt', 'w', encoding='utf8') as f2:
    for line in f1:
        for word in line.split(" "):
            is_alias = (len(word) > 1
                        and word.startswith(':')
                        and (word.endswith(':') or word.endswith(':\n')))
            if is_alias:
                # When the dropped token was the last one on the line, keep the line
                # break so this line is not merged with the following one.
                if word.endswith('\n'):
                    f2.write('\n')
                continue

            if word.endswith('\n'):
                f2.write(word)
            else:
                f2.write(word + ' ')

In [4]:
# The same header-to-short-name mapping is applied to both tab-separated files.
_column_renames = {"Tweet index": "index", "Label": "label", "Tweet text": "text"}

train_df = pd.read_csv(train_dataset_path, sep="\t").rename(columns=_column_renames)
test_df = pd.read_csv(test_dataset_path, sep="\t").rename(columns=_column_renames)

test_df.head()

In [5]:
plt.style.use("ggplot")

# Number of whitespace-separated tokens per tweet, stored as a new "length" column.
train_df["length"] = train_df["text"].map(lambda tweet: len(tweet.split()))

fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(train_df["length"], ax=ax)
ax.set_title("Frequency of documents of a given length", fontsize=14)
ax.set_xlabel("length", fontsize=14)

In [6]:
def load_dict(filepath):
    """Load a JSON file and return the decoded object.

    The file is read as UTF-8 and is closed before returning, including when
    decoding fails.

    Args:
        filepath: path of the JSON file to read.

    Returns:
        The decoded JSON value.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(filepath, "r", encoding="utf8") as file:
        return json.load(file)

# Tokenizer constructed from the English vocab only; no other rules are passed in.
# NOTE(review): presumably this means it splits on whitespace only — confirm against spaCy docs.
spacy_tokenizer = spacy.tokenizer.Tokenizer(English().vocab)
# Lookup table read from the emoticons JSON file; `preprocess` checks tokens against its keys.
emoji_dict = load_dict(emojis_path)

def preprocess(text):
    """Normalise a tweet token by token and return it as one space-joined string.

    Tokens come from the module-level ``spacy_tokenizer``. The first matching rule wins:
      * a token longer than one character that contains '@' becomes '@user';
      * a token containing 'http' (case-insensitive) becomes 'http';
      * a token that is a key of ``emoji_dict`` is replaced by its mapped value,
        which is then passed through ``em.demojize`` with empty delimiters;
      * any other token is kept unchanged.
    """
    def normalise(raw):
        if len(raw) > 1 and '@' in raw:
            return '@user'
        if 'http' in raw.lower():
            return 'http'
        if raw in emoji_dict:
            return em.demojize(emoji_dict[raw], delimiters=("", ""))
        return raw

    return " ".join(normalise(piece.text) for piece in spacy_tokenizer(text))

In [7]:
# Normalise every tweet in both frames; this overwrites the "text" column in place.
train_df["text"] = train_df["text"].apply(preprocess)
test_df["text"] = test_df["text"].apply(preprocess)

test_df.head()

In [8]:
class SarcasticSentenceDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item for sequence classification.

    Args:
        sentences: sequence of input strings.
        labels: sequence of integer class labels, same length as ``sentences``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=..., padding=..., return_tensors="pt", max_length=...)``
            that returns a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: length passed to the tokenizer as ``max_length``.

    Raises:
        ValueError: if ``sentences`` and ``labels`` differ in length.
    """

    def __init__(self, sentences, labels, tokenizer, max_len=128):
        if len(sentences) != len(labels):
            raise ValueError("Sentences and labels should have the same number of elements.")

        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index: int):
        """Tokenize sentence ``index`` and return ``input_ids``, ``attention_mask`` and ``labels``."""
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        inputs = self.tokenizer(self.sentences[index],
                                truncation=True,
                                padding="max_length",
                                return_tensors="pt",
                                max_length=self.max_len)

        return {
            # squeeze(0) removes only the leading batch dimension added by
            # return_tensors="pt"; a bare squeeze() would also drop the sequence
            # dimension when it has length 1.
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[index], dtype=torch.long)
        }

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

In [9]:
from tqdm.auto import tqdm
import copy


def train(model,
          train_loader,
          eval_loader,
          device,
          lr=5e-5,
          num_epochs=5,
          batch_size=8,
          max_patience=3):
    """Fine-tune ``model`` with early stopping on the evaluation F1 score.

    After every epoch the model is scored on ``eval_loader``. The weights of the
    epoch with the highest F1 are kept and loaded back into the model before it
    is returned. If no epoch reaches an F1 above 0, the initial weights are restored.

    Args:
        model: model called as ``model(**batch)`` whose output exposes ``.loss``.
        train_loader: iterable of dict batches used for optimisation.
        eval_loader: iterable of dict batches scored after each epoch.
        device: device every batch tensor is moved to.
        lr: learning rate handed to the optimizer.
        num_epochs: maximum number of epochs.
        batch_size: unused; kept so existing callers keep working. The batch size
            is whatever the loaders produce.
        max_patience: stop after this many consecutive epochs without an F1
            improvement. Must be at least 1.

    Returns:
        The same ``model`` object, with the best weights loaded.

    Raises:
        ValueError: if ``max_patience`` is smaller than 1.
    """
    if max_patience < 1:
        raise ValueError("max_patience should be at least 1.")

    num_training_steps = len(train_loader) * num_epochs
    optimizer, lr_scheduler = setup_optimizer_and_scheduler(model,
                                                         lr,
                                                         0,
                                                         num_training_steps)

    best_f1 = 0
    best_epoch = -1
    best_params = copy.deepcopy(model.state_dict())
    patience = 0

    # The context manager closes the bar on early stopping and on errors alike.
    with tqdm(range(num_training_steps)) as progress_bar:
        for epoch in range(num_epochs):
            model.train()
            for batch in train_loader:
                batch = {k: v.to(device) for k, v in batch.items()}

                outputs = model(**batch)
                loss = outputs.loss
                loss.backward()

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                progress_bar.update(1)

            metrics = evaluate(model, eval_loader, device)
            print(f"valid accuracy: {metrics['accuracy']}\n"
                  f"valid precision: {metrics['precision']}\n"
                  f"valid recall: {metrics['recall']}\n"
                  f"valid f1: {metrics['f1']}\n")

            if metrics["f1"] > best_f1:
                best_f1 = metrics["f1"]
                best_epoch = epoch
                # Snapshot the weights; state_dict() alone would alias live tensors.
                best_params = copy.deepcopy(model.state_dict())
                patience = 0
            else:
                patience += 1

            print(f"patience: {patience}\n")
            if patience >= max_patience:
                break

    print(f"best epoch: {best_epoch}\n"
          f"best f1: {best_f1}\n")

    model.load_state_dict(best_params)
    return model


def setup_optimizer_and_scheduler(model, lr, num_warmup_steps, num_training_steps):
    """Create an AdamW optimizer over ``model``'s parameters and a "linear" scheduler for it.

    Args:
        model: object exposing ``parameters()``.
        lr: learning rate given to the optimizer.
        num_warmup_steps: forwarded to ``get_scheduler``.
        num_training_steps: forwarded to ``get_scheduler``.

    Returns:
        An ``(optimizer, scheduler)`` tuple.
    """
    adamw = AdamW(model.parameters(), lr=lr)
    linear_schedule = get_scheduler(name="linear",
                                    optimizer=adamw,
                                    num_warmup_steps=num_warmup_steps,
                                    num_training_steps=num_training_steps)
    return adamw, linear_schedule


def evaluate(model, eval_loader, device):
    """Run ``model`` over ``eval_loader`` without gradients and score its predictions.

    The model is put in eval mode. For each batch, the predicted class is the index
    of the largest logit along dimension 1, and the reference labels are read from
    the batch's ``"labels"`` entry. Both are collected into Python lists and passed
    to ``compute_metrics``, whose result is returned.
    """
    model.eval()
    gold, predicted = [], []

    with torch.no_grad():
        for raw_batch in eval_loader:
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            logits = model(**on_device).logits

            predicted += logits.max(dim=1).indices.cpu().numpy().tolist()
            gold += on_device["labels"].cpu().numpy().tolist()

    return compute_metrics(gold, predicted)


def compute_metrics(y_true, y_pred):
    """Return accuracy, binary-averaged precision/recall/F1 and the confusion matrix.

    Args:
        y_true: reference labels.
        y_pred: predicted labels.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``confusion_matrix``.
    """
    scores = precision_recall_fscore_support(y_true, y_pred, average="binary")

    result = {"accuracy": accuracy_score(y_true, y_pred)}
    # The fourth element (support) is not reported.
    result["precision"], result["recall"], result["f1"] = scores[:3]
    result["confusion_matrix"] = confusion_matrix(y_true, y_pred)
    return result

In [10]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Wrap each dataframe's text/label columns in a dataset that tokenizes on access.
train_dataset = SarcasticSentenceDataset(train_df["text"].tolist(),
                                         train_df["label"].tolist(),
                                         tokenizer)
test_dataset = SarcasticSentenceDataset(test_df["text"].tolist(),
                                        test_df["label"].tolist(),
                                        tokenizer)

# Sanity check: show the first training example next to its encoded form.
item = train_dataset[0]
first_example_report = (f"sentence: {train_df['text'][0]}\n"
                        f"ids: {item['input_ids']}\n"
                        f"attention_mask: {item['attention_mask']}\n"
                        f"label: {item['labels']}")
print(first_example_report)

In [11]:
# Load "roberta-base" for sequence classification with two labels and move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)

In [12]:
# Hold out part of the training data for early stopping / checkpoint selection.
# Passing the test set as `eval_loader` would let test data choose the checkpoint
# and inflate the test metrics reported afterwards.
VALID_FRACTION = 0.1
shuffled_indices = np.random.permutation(len(train_dataset)).tolist()
num_valid = max(1, int(VALID_FRACTION * len(train_dataset)))
valid_indices = shuffled_indices[:num_valid]
train_indices = shuffled_indices[num_valid:]

model = train(model,
              # SubsetRandomSampler restricts each loader to its split and draws the
              # indices in random order, so training batches are shuffled every epoch.
              DataLoader(train_dataset, batch_size=8,
                         sampler=SubsetRandomSampler(train_indices)),
              DataLoader(train_dataset, batch_size=8,
                         sampler=SubsetRandomSampler(valid_indices)),
              device,
              num_epochs=20,
              lr=5e-5)

In [13]:
# Score the trained model on the test split and draw its confusion matrix.
test_loader = DataLoader(test_dataset, batch_size=8)
metrics = evaluate(model, test_loader, device)

test_report = (f"test accuracy: {metrics['accuracy']}\n"
               f"test precision: {metrics['precision']}\n"
               f"test recall: {metrics['recall']}\n"
               f"test f1: {metrics['f1']}\n")
print(test_report)

sns.heatmap(metrics["confusion_matrix"], annot=True, cmap='Blues', fmt="d")

In [14]:
# Save only the model's state dict (not the whole model object) to model.pth,
# relative to the current working directory.
torch.save(model.state_dict(), "model.pth")
print("model params saved")