In [1]:
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

In [2]:
import pickle
def serializeObject(object_,file_name):
    """Pickle ``object_`` into the file at ``file_name``.

    Parameters
    ----------
    object_ : object
        Any picklable Python object.
    file_name : str or path-like
        Destination path. It is opened in binary write mode, so an existing
        file at that path is overwritten.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the file cannot be opened for writing.
    pickle.PicklingError, AttributeError, TypeError
        If ``object_`` cannot be pickled.
    """
    # The context manager guarantees the handle is closed even when
    # pickle.dump raises; the manual open()/close() pair leaked it in that case.
    with open(file_name, 'wb') as file_object:
        # Protocol 2 is kept as-is so files written here stay readable by
        # whatever already consumes them.
        pickle.dump(object_, file_object, protocol=2)
def deserializeObject(file_name):
    """Load and return the object stored in the pickle file ``file_name``.

    Parameters
    ----------
    file_name : str or path-like
        Path of a file containing a single pickled object.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    OSError
        If the file cannot be opened for reading.
    pickle.UnpicklingError, EOFError
        If the file content is not a complete, valid pickle.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file. Only call this on files produced by a trusted source.
    # The context manager closes the handle even if unpickling fails, which
    # the manual open()/close() pair did not.
    with open(file_name, 'rb') as file_object:
        return pickle.load(file_object)
def read_file(file):
    """Read a text file and return its lines as a list of stripped strings.

    Each line has leading and trailing whitespace (including the newline)
    removed. Blank lines are kept as empty strings, so the result has one
    entry per line of the file.
    """
    with open(file, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

In [3]:
def read_dataset(tweet_file,label_file):
    """Load one dataset split from a text file and a label file.

    Both files are read line by line with ``read_file``. Every line of
    ``label_file`` is parsed with ``int`` and the results are packed into a
    NumPy ``int64`` array.

    Returns
    -------
    tuple
        ``(tweets, labels)`` where ``tweets`` is a list of stripped strings
        and ``labels`` is a 1-D ``np.int64`` array.
    """
    tweets = read_file(tweet_file)
    label_values = list(map(int, read_file(label_file)))
    labels = np.array(label_values, dtype=np.int64)
    return tweets, labels

In [5]:
# Alternative root kept for reference: root_address = 'offensive/'
root_address = './data/hate/'

# Each split is stored as a pair of files under root_address:
# <split>_text.txt (one tweet per line) and <split>_labels.txt (one integer per line).
tweets_train, labels_train = read_dataset(
    f'{root_address}train_text.txt',
    f'{root_address}train_labels.txt',
)
tweets_test, labels_test = read_dataset(
    f'{root_address}test_text.txt',
    f'{root_address}test_labels.txt',
)
tweets_validation, labels_validation = read_dataset(
    f'{root_address}val_text.txt',
    f'{root_address}val_labels.txt',
)


In [6]:

model_name = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode the training and validation tweets, in that order. Each split is
# tokenized in its own call with padding=True, so padding is applied per split.
train_encodings, val_encodings = (
    tokenizer(split_tweets, padding=True)
    for split_tweets in (tweets_train, tweets_validation)
)

In [7]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is an indexable sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor, and the label is added
        under ``"labels"`` as a one-element tensor.
        """
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so the tensor has shape (1,) rather than ().
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

# Wrap the tokenized splits together with their labels as torch datasets.
train_dataset = Dataset(encodings=train_encodings, labels=labels_train)
valid_dataset = Dataset(encodings=val_encodings, labels=labels_validation)

In [10]:

# Local directory of the saved model to evaluate.
# NOTE(review): the path suggests a twitter-roberta-base checkpoint trained on
# fasttext-augmented hate data; confirm against the training run.
save_dir = 'results/hate/twitter_roberta_base/augmented_wise/fasttext_0.2/model/model_best'
# Load the sequence-classification model from that directory.
model = AutoModelForSequenceClassification.from_pretrained(save_dir)


In [66]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy for a prediction object.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (an array whose last axis is reduced with argmax to get
    the predicted class).

    Returns
    -------
    dict
        ``{'accuracy': <value from sklearn's accuracy_score>}``.
    """
    true_labels, predicted_classes = pred.label_ids, pred.predictions.argmax(-1)
    return dict(accuracy=accuracy_score(true_labels, predicted_classes))

In [67]:
from sklearn.metrics import confusion_matrix
def classification_aacuracy(pred,labels):
    """Return overall accuracy plus per-class accuracy for binary 0/1 labels.

    Parameters
    ----------
    pred : array-like
        Predicted class ids (0 or 1).
    labels : array-like
        Reference class ids (0 or 1), same length as ``pred``.

    Returns
    -------
    tuple
        ``(accuracy_all, accuracy_0, accuracy_1)``. ``accuracy_all`` is the
        share of correct predictions; ``accuracy_0`` / ``accuracy_1`` are the
        shares of correctly predicted examples among those whose reference
        label is 0 / 1. A per-class value is ``nan`` when that class has no
        reference examples (0/0 division).
    """
    # Pin the label set so the matrix is always 2x2. Without it, a set where
    # only one class occurs in both `labels` and `pred` yields a 1x1 matrix
    # and indexing accuracy_per_class[1] raises IndexError.
    cm = confusion_matrix(labels, pred, labels=[0, 1])
    # Rows are reference labels, so diagonal / row-sum is per-class accuracy.
    accuracy_per_class = cm.diagonal() / cm.sum(axis=1)
    accuracy_all = cm.trace() / cm.sum()
    return accuracy_all, accuracy_per_class[0], accuracy_per_class[1]

In [68]:
def get_prediction_single_tweet(text):
    """Predict the class id of a single text with the module-level model.

    Uses the module-level ``tokenizer`` and ``model``. The caller is
    responsible for putting the model in eval mode beforehand.

    Parameters
    ----------
    text : str
        One input text. The result is the argmax over the flattened
        probability tensor, so it is only a class id for a single input.

    Returns
    -------
    int
        Index of the highest-probability class.
    """
    # Send the inputs to whichever device the model lives on instead of a
    # hard-coded "cuda"; identical when the model is on the GPU, and it also
    # works when the model was left on the CPU.
    inputs = tokenizer(text, padding=True, return_tensors="pt").to(model.device)
    # Inference only: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        outputs = model(**inputs)
    # outputs[0] holds the logits; softmax over the class axis.
    probs = outputs[0].softmax(1)
    return torch.argmax(probs).item()

In [69]:
def get_prediction_set(text_set,labels):
    """Predict every text in ``text_set`` and score the result against ``labels``.

    Each text is classified one at a time with
    ``get_prediction_single_tweet``. As a side effect, the list of predicted
    class ids is stored in the module-level variable ``predicted``.

    Returns
    -------
    tuple
        Whatever ``classification_aacuracy`` returns for the predictions and
        ``labels``.
    """
    global predicted
    collected = []
    for tweet in text_set:
        collected.append(get_prediction_single_tweet(tweet))
    # Publish to the global only after every prediction has succeeded.
    predicted = collected
    return classification_aacuracy(np.array(predicted), labels)

In [70]:
# Move the model to the GPU and switch it to evaluation mode before predicting.
model.cuda().eval()

# Score the test split: overall accuracy, then accuracy on class 0 and class 1.
test_acc, class_negative, class_positive = get_prediction_set(
    tweets_test,
    labels_test,
)
print(test_acc, class_negative, class_positive)

0.8476744186046512 0.9225806451612903 0.6541666666666667
