In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import json, pickle, gzip
from gensim.models import Word2Vec
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import classification_report

In [2]:
# Mapping used to look up a post's raw text by its id.
# NOTE(review): pickle can execute arbitrary code while loading; only read this file
# from a trusted source.
# The context manager closes the file handle that a bare open() call would leak.
with open("../../data/non-graph/posts_id_2_text.pkl", "rb") as posts_file:
    post_id_2_text = pickle.load(posts_file)

In [3]:
from transformers import AutoTokenizer, AutoModel

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# Checkpoint name shared by the tokenizer here and the encoder built later.
model_name = "bert-base-uncased"
# model_name = "Hate-speech-CNERG/bert-base-uncased-hatexplain"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(device)

cuda:0


In [4]:
class Classifier(nn.Module):
    """Two-layer feed-forward head mapping feature vectors to class log-probabilities.

    Attributes:
        inp_dim: Expected size of axis 1 of the input passed to ``forward``.
        out_dim: Number of output classes.
        fc1: Linear layer from ``inp_dim`` features to 128 hidden units.
        fc2: Linear layer from the 128 hidden units to ``out_dim`` scores.
    """

    def __init__(self, inp_dim, out_logits):
        """Build the two linear layers.

        Args:
            inp_dim: Number of input features per sample.
            out_logits: Number of classes to score.
        """
        super().__init__()

        self.inp_dim, self.out_dim = inp_dim, out_logits

        # fc1 is created before fc2 so seeded parameter initialisation stays reproducible.
        self.fc1 = nn.Linear(inp_dim, 128)
        self.fc2 = nn.Linear(128, out_logits)

    def forward(self, x):
        """Return log-probabilities over the classes for each row of ``x``.

        Args:
            x: Tensor whose axis 1 has size ``inp_dim``.

        Returns:
            Tensor of log-softmax scores, normalised along axis 1.

        Raises:
            AssertionError: If ``x.shape[1]`` differs from ``inp_dim``.
        """
        assert x.shape[1] == self.inp_dim

        hidden = self.fc1(x).relu()
        scores = self.fc2(hidden)
        return scores.log_softmax(dim=1)

In [5]:
def get_batches(post_ids, y, device, batch_size, shuffle=True, *,
                max_length=None, id_to_text=None, tokenize_fn=None):
    """Yield ``(token_ids, labels)`` mini-batches of tokenized posts.

    Args:
        post_ids: Indexable sequence of string ids. The last four characters of
            each id are dropped and the remainder is parsed as an integer key
            into ``id_to_text`` (presumably a ``_gab`` suffix; confirm against
            the data).
        y: NumPy array of integer labels aligned with ``post_ids``.
        device: Torch device the yielded tensors are moved to.
        batch_size: Maximum number of samples per batch; the final batch may be
            smaller.
        shuffle: If true, visit the samples in a random order drawn from
            ``np.random``; otherwise keep the original order.
        max_length: Fixed sequence length to pad/truncate to. Defaults to the
            notebook-level ``max_len``.
        id_to_text: Mapping from integer post id to text. Defaults to the
            notebook-level ``post_id_2_text``.
        tokenize_fn: Callable with the Hugging Face tokenizer call signature
            returning a mapping with an ``'input_ids'`` entry. Defaults to the
            notebook-level ``tokenizer``.

    Yields:
        Tuple of an int32 tensor of shape ``(batch, max_length)`` holding token
        ids and an int64 tensor of shape ``(batch,)`` holding labels.
    """
    # Fall back to the notebook globals only when no explicit override is given,
    # so existing positional calls behave as before.
    if max_length is None:
        max_length = max_len
    if id_to_text is None:
        id_to_text = post_id_2_text
    if tokenize_fn is None:
        tokenize_fn = tokenizer

    n_samples = y.shape[0]
    if shuffle:
        order = np.random.permutation(np.arange(n_samples))
    else:
        order = np.arange(n_samples)

    for start in range(0, n_samples, batch_size):
        batch_idxs = order[start:start + batch_size]
        # Build the buffer as 2-D integers from the start. The previous
        # (batch, 1, max_len) float buffer followed by squeeze() dropped the
        # batch axis whenever a batch held a single sample.
        token_ids = np.zeros((batch_idxs.shape[0], max_length), dtype=np.int32)
        for row, sample_idx in enumerate(batch_idxs):
            post_id = int(post_ids[sample_idx][:-4])
            encoded = tokenize_fn(id_to_text[post_id], max_length=max_length,
                                  padding="max_length", truncation=True)
            token_ids[row] = encoded['input_ids']
        yield torch.from_numpy(token_ids).to(device),\
                torch.as_tensor(y[batch_idxs], dtype=torch.long).to(device)

In [7]:
def label_train(epoch_num, data, labels, device, optimizer1, optimizer2, criterion, enc, clf, batch_size=32, verbose=False):
    """Run one training epoch and return the loss recorded for every batch.

    Args:
        epoch_num: Epoch index, used only in the printed messages.
        data: Array of post ids handed to ``get_batches``.
        labels: Array of labels aligned with ``data``.
        device: Torch device the batches are placed on.
        optimizer1: Optimizer stepped second after each backward pass.
        optimizer2: Optimizer stepped first after each backward pass.
        criterion: Loss callable taking ``(classifier_output, targets)``.
        enc: Encoder module whose output exposes a ``'pooler_output'`` entry.
        clf: Classifier module applied to the pooled encoder output.
        batch_size: Number of samples per batch.
        verbose: If true, print the loss after every iteration.

    Returns:
        List of per-batch loss values as Python floats.
    """
    print("Training | Epoch:", epoch_num)
    enc.train()
    clf.train()

    # Ceil-divide the sample count so the progress bar includes a final partial batch.
    full_batches, leftover = divmod(data.shape[0], batch_size)
    total_steps = full_batches + (1 if leftover else 0)
    progress = tqdm(get_batches(data, labels, device, batch_size), total=total_steps)

    losses = []
    for step, (inputs, targets) in enumerate(progress, start=1):
        pooled = enc(inputs)['pooler_output']
        loss = criterion(clf(pooled), targets)

        optimizer1.zero_grad()
        optimizer2.zero_grad()
        loss.backward()
        optimizer2.step()
        optimizer1.step()

        loss_value = loss.item()
        losses.append(loss_value)
        if verbose:
            print("Epoch:", epoch_num, "| Iter:", step, "| Loss:", round(loss_value, 4))

    # Both modules are left in evaluation mode once the epoch is finished.
    clf.eval()
    enc.eval()
    return losses

In [6]:
def get_ids_and_labels(split):
    """Collect the ids containing 'gab' in one split together with their majority labels.

    Args:
        split: Key into the notebook-level ``pid`` mapping.

    Returns:
        Tuple ``(ids, labels)`` of NumPy arrays: the selected post ids and, for
        each, the ``label_dict`` code of the label most annotators chose.
    """
    ids, labels = [], []
    for post_id in pid[split]:
        if 'gab' not in post_id:
            continue
        votes = Counter(annotator["label"] for annotator in data[post_id]["annotators"])
        # most_common(1) returns [(label, count)]; keep only the label.
        majority_label = votes.most_common(1)[0][0]
        ids.append(post_id)
        labels.append(label_dict[majority_label])
    return np.array(ids), np.array(labels)

In [32]:
def evaluate(epoch_num, data, labels, device, enc, clf, batch_size=64):
    """Print a classification report for ``enc`` + ``clf`` on the given samples.

    Args:
        epoch_num: Epoch index, used only in the printed message.
        data: Array of post ids handed to ``get_batches``.
        labels: Array of labels aligned with ``data``.
        device: Torch device the batches are placed on.
        enc: Encoder module whose output exposes a ``'pooler_output'`` entry.
        clf: Classifier module applied to the pooled encoder output.
        batch_size: Number of samples per batch.

    Returns:
        List of the true labels in the (unshuffled) order they were evaluated.
        The predictions are only reported, not returned.
    """
    print("Evaluating | Epoch:", epoch_num)
    enc.eval()
    clf.eval()
    y_preds = []
    y_tests = []
    total = data.shape[0] // batch_size + (1 if data.shape[0] % batch_size else 0)
    for batch in tqdm(get_batches(data, labels, device, batch_size, False), total=total):
        # Run the classifier under no_grad as well as the encoder; previously only
        # the encoder was covered, so an autograd graph was built for every batch.
        with torch.no_grad():
            out = clf(enc(batch[0])['pooler_output'])
        y_tests.extend(batch[1].cpu().numpy())
        y_preds.extend(out.argmax(dim=1).cpu().numpy())
    print(classification_report(y_tests, y_preds))
    return y_tests

In [8]:
# Locations of the annotated dataset and of the train/val/test id divisions.
data_path = '../../data/commons/dataset.json'
pid_path = '../../data/commons/post_id_divisions.json'

with open(pid_path) as f:
    pid = json.load(f)
with open(data_path) as f:
    data = json.load(f)

# Integer code assigned to each annotation label.
label_dict = {name: code for code, name in enumerate(('normal', 'offensive', 'hatespeech'))}

# One (ids, labels) pair per split, computed in train/val/test order.
(train_ids, train_labels), (val_ids, val_labels), (test_ids, test_labels) = (
    get_ids_and_labels(split_name) for split_name in ("train", "val", "test")
)

In [9]:
# Fixed token length every post is padded or truncated to.
max_len = 128
encoder = AutoModel.from_pretrained(model_name).to(device)
# Size the head from the encoder's configured hidden size and the label map rather
# than hard-coding 768 and 3, so it stays consistent if model_name or label_dict changes.
classifier = Classifier(encoder.config.hidden_size, len(label_dict)).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
# map_location remaps tensors saved from a GPU session onto the active device, so the
# checkpoints also load when `device` has fallen back to the CPU.
encoder.load_state_dict(torch.load("encoder_bert_user_class.pth", map_location=device))
classifier.load_state_dict(torch.load("classifier_bert_user_class.pth", map_location=device))

<All keys matched successfully>

In [33]:
# Score the restored models on the test split; -1 is passed as the epoch label
# and only appears in the printed message.
ys = evaluate(
    epoch_num=-1,
    data=test_ids,
    labels=test_labels,
    device=device,
    enc=encoder,
    clf=classifier,
    batch_size=32,
)

Evaluating | Epoch: -1


100%|██████████| 33/33 [00:02<00:00, 11.67it/s]

              precision    recall  f1-score   support

           0       0.63      0.60      0.62       216
           1       0.55      0.58      0.56       305
           2       0.82      0.80      0.81       515

    accuracy                           0.69      1036
   macro avg       0.67      0.66      0.66      1036
weighted avg       0.70      0.69      0.70      1036






In [34]:
# Emit every true label on one line, each preceded by ", ", with no trailing newline.
print(*(f", {y}" for y in ys), sep="", end="")

, 2, 1, 2, 0, 2, 1, 2, 1, 2, 2, 2, 2, 2, 0, 0, 2, 1, 2, 2, 2, 0, 1, 2, 2, 2, 2, 0, 0, 1, 2, 2, 0, 2, 1, 1, 0, 2, 2, 0, 2, 1, 0, 0, 1, 1, 2, 2, 2, 2, 0, 2, 2, 2, 1, 1, 2, 0, 2, 1, 2, 2, 2, 2, 2, 1, 0, 2, 0, 1, 2, 2, 2, 1, 0, 2, 0, 2, 2, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, 1, 0, 2, 0, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 0, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 0, 1, 2, 0, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 1, 0, 1, 2, 1, 0, 1, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 1, 0, 2, 2, 2, 0, 2, 1, 1, 1, 1, 2, 1, 0, 0, 2, 0, 1, 1, 0, 2, 1, 1, 2, 0, 2, 2, 2, 0, 2, 0, 1, 1, 2, 2, 1, 1, 0, 1, 2, 2, 2, 2, 2, 0, 2, 1, 0, 0, 1, 0, 1, 1, 1, 1, 2, 1, 0, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 0, 1, 1, 1, 0, 1, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 1, 1, 2, 0, 2, 0, 2, 2, 1, 0, 2, 2, 2, 0, 2, 0, 2, 1, 0, 2,