In [None]:
# Fraction of the training split to keep; it is applied further down where
# train_ids / train_labels are sliced to their first int(len * p) entries.
# The commented values are alternative settings.
# p = 0.02
# p = 0.05
# p = 0.1
# p = 0.2
p = 1.0

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import json, pickle, gzip
from gensim.models import Word2Vec
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import classification_report

In [3]:
# Shell escape: show the GPUs visible on this machine and their current usage.
! nvidia-smi

Sun Nov 14 02:11:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| 37%   62C    P8    19W / 250W |    360MiB / 11018MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:03:00.0 Off |                  N/A |
| 36%   40C    P8    12W / 175W |     10MiB /  7982MiB |      0%      Default |
|       

In [4]:
# Lookup from integer post id to the post's text (it is indexed with
# int(post_id) and the result is passed to the tokenizer further down).
# A context manager is used so the file handle is closed right after loading.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
with open("../../data/non-graph/posts_id_2_text.pkl", "rb") as f:
    post_id_2_text = pickle.load(f)

In [5]:
# post2user: post id -> user id.  user2posts: user id -> collection of post ids.
# Context managers make sure both file handles are closed after loading.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
with open("../../data/non-graph/post_id_2_user_id.pkl", "rb") as f:
    post2user = pickle.load(f)
with open("user_id_posts_idx_list.pkl", "rb") as f:
    user2posts = pickle.load(f)

# Make sure every post in post2user is also listed under its author.
# Each affected user's posts are collected in a set once and sorted once at the
# end, instead of rebuilding and re-sorting the whole list for every post.
merged_user_posts = {}
for post_id in tqdm(post2user):
    user_id = post2user[post_id]
    if user_id not in merged_user_posts:
        merged_user_posts[user_id] = set(user2posts[user_id])
    merged_user_posts[user_id].add(post_id)

# Users that never appear in post2user keep their original entry untouched.
for user_id, post_ids in merged_user_posts.items():
    user2posts[user_id] = sorted(post_ids)

100%|██████████| 10423/10423 [00:10<00:00, 1029.78it/s]


In [6]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
from transformers import AutoTokenizer, AutoModel
# Hugging Face model id used for the tokenizer below; the commented lines are
# alternative ids.
model_name="bert-base-uncased"
# model_name="GroNLP/hateBERT"
# model_name="Hate-speech-CNERG/bert-base-uncased-hatexplain"
# Only the tokenizer is created from model_name in this cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(device)

cuda:0


In [7]:
class Classifier(nn.Module):
    """Two-layer feed-forward head producing class log-probabilities.

    Args:
        inp_dim: size of each input feature vector.
        hid_dim: width of the hidden layer.
        out_logits: number of output classes.
    """

    def __init__(self, inp_dim, hid_dim, out_logits):
        super().__init__()

        # Stored so forward() can validate the feature size of its input.
        self.inp_dim, self.out_dim = inp_dim, out_logits

        # The attribute names fc1 / fc2 define the state_dict keys.
        self.fc1 = nn.Linear(inp_dim, hid_dim)
        self.fc2 = nn.Linear(hid_dim, out_logits)

    def forward(self, x):
        """Map x of shape (batch, inp_dim) to log-softmax scores of shape (batch, out_logits)."""
        assert x.shape[1] == self.inp_dim

        hidden = nn.functional.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return nn.functional.log_softmax(logits, dim=1)

In [8]:
def get_contrastive_batches(post_ids, y, device, batch_size):
    """Yield shuffled mini-batches of tokenised posts for contrastive training.

    For every anchor post, 15 posts are tokenised and stacked along axis 1:

        slot 0      the anchor post itself
        slot 1      a post drawn from the anchor's own class (class-level positive)
        slots 2-5   two posts drawn from each of the two other classes
        slot 6      a post drawn from the anchor's author (user-level positive)
        slots 7-14  one post from each of 8 distinct other users

    Draws are made with ``np.random.choice`` from pools that may contain the
    anchor itself, and class-level negatives may repeat.

    Reads the module-level names ``max_len``, ``tokenizer``, ``post_id_2_text``,
    ``post2user``, ``user2posts`` and ``train_class_ids``.

    Args:
        post_ids: array of post-id strings; the last four characters of each
            entry are stripped to obtain the numeric id.
        y: numpy array of integer labels in {0, 1, 2}, aligned with post_ids.
        device: target passed to ``Tensor.to``.
        batch_size: maximum number of anchors per batch.

    Yields:
        (tokens, labels): an int32 tensor of shape (batch, 15, max_len) and an
        int64 tensor of shape (batch,).

    Raises:
        ValueError: from ``np.random.choice`` when a sampling pool is empty or
            when fewer than 8 users other than the anchor's author exist.
    """
    num_of_batches = y.shape[0] // batch_size + (1 if y.shape[0] % batch_size else 0)
    shuffled_idxs = np.random.permutation(np.arange(y.shape[0]))
    for i in range(num_of_batches):
        batch_idxs = shuffled_idxs[i*batch_size:(i+1)*batch_size]
        # Token ids are integers, so the buffer is allocated as int32 directly.
        X = np.zeros((batch_idxs.shape[0], 15, max_len), dtype=np.int32)
        for dim1, batch_idx in enumerate(batch_idxs):
            label = y[batch_idx]
            post_id = post_ids[batch_idx][:-4]

            # Slot 0: anchor, slot 1: class-level positive.
            batch_post_ids = [post_id]
            batch_post_ids.append(np.random.choice(train_class_ids[label])[:-4])

            # Slots 2-5: two negatives from each of the other two classes.
            # (A list is used throughout; calling .add on it would raise.)
            for negative_class in [c for c in range(3) if c != label]:
                for _ in range(2):
                    batch_post_ids.append(np.random.choice(train_class_ids[negative_class])[:-4])

            # Slot 6: user-level positive.
            user_id = post2user[int(post_id)]
            batch_post_ids.append(str(np.random.choice(user2posts[user_id])))

            # Slots 7-14: one post from each of 8 distinct other users.
            # Sampling positions without replacement keeps the original key
            # objects and fails loudly when fewer than 8 other users exist.
            negative_user_ids = [userid for userid in user2posts if userid != user_id]
            chosen = np.random.choice(len(negative_user_ids), size=8, replace=False)
            for position in chosen:
                other_posts = user2posts[negative_user_ids[position]]
                batch_post_ids.append(str(np.random.choice(other_posts)))

            for dim2, sample_id in enumerate(batch_post_ids):
                text = post_id_2_text[int(sample_id)]
                ids = tokenizer(text, max_length=max_len, padding="max_length", truncation=True)['input_ids']
                X[dim1, dim2] = ids
        yield torch.from_numpy(X).to(device),\
                torch.LongTensor(y[batch_idxs]).to(device)

In [9]:
def get_batches(post_ids, y, device, batch_size, shuffle=True):
    """Yield mini-batches of tokenised posts with their labels.

    Reads the module-level names ``max_len``, ``tokenizer`` and
    ``post_id_2_text``.

    Args:
        post_ids: array of post-id strings; the last four characters of each
            entry are stripped to obtain the numeric id.
        y: numpy array of integer labels aligned with post_ids.
        device: target passed to ``Tensor.to``.
        batch_size: maximum number of posts per batch.
        shuffle: when True the posts are visited in a random order, otherwise
            in the order given.

    Yields:
        (tokens, labels): an int32 tensor of shape (batch, max_len) and an
        int64 tensor of shape (batch,). The token tensor is always 2-D, even
        when the final batch holds a single post.
    """
    num_of_batches = y.shape[0] // batch_size + (1 if y.shape[0] % batch_size else 0)
    if shuffle:
        shuffled_idxs = np.random.permutation(np.arange(y.shape[0]))
    else:
        shuffled_idxs = np.arange(y.shape[0])
    for i in range(num_of_batches):
        batch_idxs = shuffled_idxs[i*batch_size:(i+1)*batch_size]
        # Allocate (batch, max_len) directly: no singleton axis to squeeze away
        # (an unqualified squeeze would also drop a batch axis of size 1), and
        # no detour through float32 for integer token ids.
        X = np.zeros((batch_idxs.shape[0], max_len), dtype=np.int32)
        for row, batch_idx in enumerate(batch_idxs):
            post_id = int(post_ids[batch_idx][:-4])
            text = post_id_2_text[post_id]
            ids = tokenizer(text, max_length=max_len, padding="max_length", truncation=True)['input_ids']
            X[row] = ids
        yield torch.from_numpy(X).to(device),\
                torch.LongTensor(y[batch_idxs]).to(device)

In [10]:
def contrastive_train(epoch_num, data, labels, device, optimizers, criterion, enc, clf, batch_size=32, verbose=False):
    """Run one epoch of contrastive training of the encoder.

    Each batch from ``get_contrastive_batches`` holds 15 token sequences per
    anchor. Slot 0 is encoded with gradients; slots 1-5 and 6-14 are encoded
    under ``torch.no_grad`` and scored against the anchor by dot product. For
    each of the two groups, ``criterion`` is applied to the log-softmax of the
    scores with target index 0 (the first slot of the group). The two losses
    are summed and back-propagated.

    Args:
        epoch_num: epoch number, used for logging only.
        data: array of post ids handed to ``get_contrastive_batches``.
        labels: array of labels handed to ``get_contrastive_batches``.
        device: device on which score and target tensors are placed.
        optimizers: list of optimizers; all are zeroed, then stepped in
            reverse order.
        criterion: loss taking (log-probabilities, target indices).
        enc: encoder whose output provides ``'pooler_output'``.
        clf: classifier; it is only switched to eval mode at the end.
        batch_size: number of anchors per batch.
        verbose: when True, print the losses of every iteration.

    Returns:
        List with the summed loss of every batch, as Python floats.
    """
    print("Training | Epoch:", epoch_num)
    enc.train()
    # clf.train()

    def group_loss(tokens, anchor, first_slot, last_slot):
        # Score the anchor against slots [first_slot, last_slot); index 0 of the
        # group is the target.
        n_rows = tokens.shape[0]
        scores = torch.zeros(n_rows, last_slot - first_slot).to(device)
        for slot in range(first_slot, last_slot):
            with torch.no_grad():
                candidate = enc(tokens[:,slot])['pooler_output']
            scores[:, slot - first_slot] = torch.mul(anchor, candidate).sum(dim=1)
        target = torch.zeros(n_rows, dtype=int).to(device)
        return criterion(torch.log_softmax(scores, dim=1), target)

    n_batches = data.shape[0]//batch_size+(1 if data.shape[0]%batch_size else 0)
    batch_stream = tqdm(get_contrastive_batches(data, labels, device, batch_size), total=n_batches)

    losses = []
    for step, batch in enumerate(batch_stream, start=1):
        tokens = batch[0]
        # given actual post
        anchor = enc(tokens[:,0])['pooler_output']
        # given class level contrastive samples
        loss1 = group_loss(tokens, anchor, 1, 6)
        # given user level contrastive samples
        loss2 = group_loss(tokens, anchor, 6, 15)
        batch_loss = loss1 + loss2

        for opt in optimizers:
            opt.zero_grad()
        batch_loss.backward()
        for opt in reversed(optimizers):
            opt.step()

        losses.append(batch_loss.item())
        if verbose:
            print("Epoch:", epoch_num, "| Iter:", step, "| Loss1:", round(loss1.item(), 4),\
                  "| Loss2:", round(loss2.item(), 4), "| Total Loss:", round(batch_loss.item(), 4))
    enc.eval()
    clf.eval()
    return losses

In [11]:
def label_finetune(epoch_num, data, labels, device, optimizer, criterion, enc, clf, batch_size=32, verbose=False):
    """Run one epoch of supervised fine-tuning of encoder and classifier.

    Args:
        epoch_num: epoch number, used for logging only.
        data: array of post ids handed to ``get_batches``.
        labels: array of labels handed to ``get_batches``.
        device: device the batches are moved to.
        optimizer: a single optimizer, or a list/tuple of optimizers. All are
            zeroed before the backward pass and stepped in reverse order.
        criterion: loss taking (classifier output, target labels).
        enc: encoder whose output provides ``'pooler_output'``.
        clf: classifier applied to the pooled encoder output.
        batch_size: number of posts per batch.
        verbose: when True, print the loss of every iteration.

    Returns:
        List with the loss of every batch, as Python floats.
    """
    # Use the optimizers that were passed in rather than a notebook-level
    # global; a bare optimizer is wrapped so both call styles work.
    if isinstance(optimizer, (list, tuple)):
        optimizers = list(optimizer)
    else:
        optimizers = [optimizer]

    print("Training | Epoch:", epoch_num)
    enc.train()
    clf.train()
    losses = []
    for i, batch in enumerate(tqdm(get_batches(data, labels, device, batch_size),
                                    total=data.shape[0]//batch_size+(1 if data.shape[0]%batch_size else 0))):
        inp = enc(batch[0])['pooler_output']
        out = clf(inp)
        loss = criterion(out, batch[1])
        for opt in optimizers:
            opt.zero_grad()
        loss.backward()
        for opt in reversed(optimizers):
            opt.step()
        losses.append(loss.item())
        if verbose:
            print("Epoch:", epoch_num, "| Iter:", i+1, "| Loss:", round(loss.item(), 4))
    enc.eval()
    clf.eval()
    return losses

In [12]:
def evaluate(epoch_num, data, labels, device, enc, clf, batch_size=64, shuffle=True):
    """Predict labels for a split, print a classification report, return the predictions.

    Args:
        epoch_num: epoch number, used for logging only.
        data: array of post ids handed to ``get_batches``.
        labels: array of gold labels handed to ``get_batches``.
        device: device the batches are moved to.
        enc: encoder whose output provides ``'pooler_output'``.
        clf: classifier applied to the pooled encoder output.
        batch_size: number of posts per batch.
        shuffle: forwarded to ``get_batches``; with True the returned
            predictions follow the shuffled visiting order, not the input order.

    Returns:
        List of predicted class indices, one per post, in visiting order.
    """
    print("Evaluating | Epoch:", epoch_num)
    for module in (enc, clf):
        module.eval()

    n_batches = data.shape[0]//batch_size+(1 if data.shape[0]%batch_size else 0)
    batch_stream = get_batches(data, labels, device, batch_size, shuffle=shuffle)

    y_tests, y_preds = [], []
    for batch in tqdm(batch_stream, total=n_batches):
        with torch.no_grad():
            scores = clf(enc(batch[0])['pooler_output'])
        # Gold labels are collected in the same order as the predictions so the
        # report stays consistent even when the batches are shuffled.
        y_tests.extend(batch[1].cpu().numpy())
        y_preds.extend(scores.argmax(dim=1).cpu().numpy())

    print(classification_report(y_tests, y_preds, digits=4))
    return y_preds

In [13]:
def get_ids_and_labels(split):
    """Collect the 'gab' post ids of a split together with their majority-vote labels.

    Reads the module-level names ``pid`` (split name -> post ids), ``data``
    (post id -> record with an ``"annotators"`` list) and ``label_dict``
    (label name -> integer).

    Args:
        split: key into ``pid``.

    Returns:
        (ids, labels): two numpy arrays of equal length. ``labels`` holds the
        integer code of the most common annotator label of each post.
    """
    gab_ids = []
    majority_labels = []
    for post_key in pid[split]:
        # Only ids containing 'gab' are kept.
        if 'gab' not in post_key:
            continue
        votes = Counter(annotator["label"] for annotator in data[post_key]["annotators"])
        top_label, _ = votes.most_common(1)[0]
        gab_ids.append(post_key)
        majority_labels.append(label_dict[top_label])
    return np.array(gab_ids), np.array(majority_labels)

In [14]:
# Locations of the annotated dataset and of the split definition.
data_path = '../../data/commons/dataset.json'
pid_path = '../../data/commons/post_id_divisions.json'

# pid is indexed by split name ("train" / "val" / "test") below and yields the
# post ids of that split.
with open(pid_path) as f:
    pid = json.load(f)
# data is indexed by post id; each record carries an "annotators" list whose
# entries have a "label" field (see get_ids_and_labels).
with open(data_path) as f:
    data = json.load(f)

# Integer encoding of the three label names.
label_dict = {'normal': 0, 'offensive': 1, 'hatespeech': 2}

# Arrays of 'gab' post ids and their majority-vote labels for each split.
train_ids, train_labels = get_ids_and_labels("train")
val_ids, val_labels = get_ids_and_labels("val")
test_ids, test_labels = get_ids_and_labels("test")

In [15]:
# Keep only the first fraction `p` of the training split (no shuffling here).
# Both slices use the length of train_labels before it is truncated, so ids and
# labels stay aligned.
# NOTE: with p < 1, re-running this cell truncates the already-truncated arrays again.
train_ids = train_ids[:int(len(train_labels)*p)]
train_labels = train_labels[:int(len(train_labels)*p)]

In [16]:
# train_class_ids[c] holds the sorted training post ids whose label is c, for
# the three label codes 0, 1 and 2.
train_class_ids = [
    sorted(list(train_ids[train_labels==class_idx]))
    for class_idx in range(3)
]

In [21]:
# Model and optimisation setup. Every later cell (batching, fine-tuning,
# evaluation) reads these names, so they must be defined when the notebook is
# run top to bottom on a fresh kernel.
max_len = 128
encoder = AutoModel.from_pretrained("../../../../mlm/checkpoint-201500").to(device)
classifier = Classifier(768, 128, 3).to(device)
optimizer1 = optim.Adam(encoder.parameters(), lr=1e-5)
optimizer2 = optim.Adam(classifier.parameters(), lr=1e-4)
optimizers = [optimizer1, optimizer2]
criterion = nn.NLLLoss()

# Contrastive pre-training is left disabled; saved weights are loaded below instead.
# aux_losses = []
# for i in range(5):
#     aux_losses.extend(contrastive_train(i+1, train_ids, train_labels, device, optimizers, criterion, encoder, classifier, 32))
# plt.plot(aux_losses)
# plt.show()
# torch.save(encoder.state_dict(), "encoder_hxbert_user_class_NLL.pth")

# map_location lets the checkpoints load when `device` is the CPU fallback.
# encoder.load_state_dict(torch.load("encoder_hxbert_user_class_NLL.pth"))
encoder.load_state_dict(torch.load("encoder_hxbert_user_class.pth", map_location=device))
classifier.load_state_dict(torch.load("classifier_hxbert_user_class.pth", map_location=device))

<All keys matched successfully>

In [19]:
# Supervised fine-tuning: one pass over the training posts, followed by an
# evaluation. Relies on encoder, classifier, optimizers and criterion already
# existing in the kernel.
# NOTE(review): the evaluation after each epoch uses the test split rather than
# val_ids / val_labels — confirm this is intended.
losses = []
for i in range(1):
    losses.extend(label_finetune(i+1, train_ids, train_labels, device, optimizers, criterion, encoder, classifier, 32))
    evaluate(i+1, test_ids, test_labels, device, encoder, classifier, batch_size=32)
# plt.plot(losses)
# plt.show()

Training | Epoch: 1


100%|██████████| 262/262 [01:10<00:00,  3.74it/s]


Evaluating | Epoch: 1


100%|██████████| 33/33 [00:02<00:00, 11.16it/s]

              precision    recall  f1-score   support

           0     0.5917    0.6574    0.6228       216
           1     0.6250    0.4262    0.5068       305
           2     0.7653    0.8738    0.8160       515

    accuracy                         0.6969      1036
   macro avg     0.6607    0.6525    0.6485      1036
weighted avg     0.6878    0.6969    0.6847      1036






In [None]:
# torch.save(encoder.state_dict(), "encoder_hatebert_user_class.pth")
# torch.save(classifier.state_dict(), "classifier_hatebert_user_class.pth")

In [22]:
# shuffle=False keeps the predictions in test_ids order, so they can be
# compared element-wise with test_labels afterwards.
test_y_hx = evaluate(1, test_ids, test_labels, device, encoder, classifier, batch_size=48, shuffle=False)

Evaluating | Epoch: 1


100%|██████████| 22/22 [00:02<00:00,  8.16it/s]

              precision    recall  f1-score   support

           0     0.6650    0.6065    0.6344       216
           1     0.5813    0.5508    0.5657       305
           2     0.7873    0.8408    0.8131       515

    accuracy                         0.7066      1036
   macro avg     0.6779    0.6660    0.6711      1036
weighted avg     0.7011    0.7066    0.7030      1036






In [23]:
# List the test posts on which exactly one of the two prediction lists is
# correct: the mask is True where test_y_hx is right and test_y_h is wrong, or
# the other way round.
# NOTE(review): test_y_h is not defined in the cells shown here; it has to exist
# in the kernel already (presumably predictions of another model in test_ids
# order — confirm).
hx_is_right = np.array(test_y_hx) == test_labels
h_is_wrong = np.array(test_y_h) != test_labels

i = 1  # running number of the printed rows
for idx, val in enumerate(hx_is_right == h_is_wrong):
    if val:
        post_id = test_ids[idx]
        print(i, "|", test_labels[idx], "|",  post_id_2_text[int(post_id[:-4])])
        i += 1

1 | 2 | i hope there are not many of my fellow asians in sweden high chance of falling victim to islamic rape gangs in that country
2 | 2 | jew faggot milo is king of the alt right great it not the alt right anymore
3 | 1 | imagine being so disconnected that youd think a dickies work shirt and some black pants are comparable to a fur suit imagine crying about optics while writing for a site that screams nigger in every title like a <number> year old on xbox love who d just found 4 chan
4 | 2 | america is supposed to be home to free speech except there is one problem most of the platforms are privately owned by jews and the jews could not care less about your free speech wake up white people the jews control what you see and hear be vigilant
5 | 2 | if they are seeking asylum from a terrible mussie country why then are they working so hard to turn these gracious countries into the very mussie countries they claim to be fleeing we all know that answer immediately
6 | 1 | <user> they resi

134 | 0 | roflol as if a bunch of queers has any concept of what a good christian or any kind of christian is or does or would do my sides
135 | 0 | an illegal immigrant that came here to commit crimes is not the same as someone who comes to work abide by our american laws live in peace deport zarate justicefor kate
136 | 1 | and a closet muslim like obama jerks off whenever the rads go boom
137 | 0 | when their still putting the dead in body bags but all you can think about is the backlash islamophobia the moslem community might face
138 | 1 | hey nazi dirtbag is your black bull keeping you occupied
139 | 2 | canfam next make sure that everyone you know sees video footage of taharrush gamea from europe and let them know europe has imported the muslim gang rape game and that the euro press has totally blacked out this epidemic of horrifically violent gang rape and beatings of european women
140 | 1 | the question i ask myself most often is why the negro race keeps running away from the