In [1]:
# load libraries
import torch
import torch.nn as nn
import torch.optim as optim

import pickle
import json
import numpy as np
from collections import Counter
from gensim.models import Word2Vec
from tqdm import tqdm
from sklearn.metrics import classification_report, f1_score

device = torch.device("cuda:0")

In [2]:
! nvidia-smi

Tue Sep 28 10:21:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.40.04    Driver Version: 418.40.04    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  On   | 00000000:3B:00.0 Off |                    0 |
| N/A   66C    P0   187W / 250W |  10283MiB / 16280MiB |    100%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  On   | 00000000:D8:00.0 Off |                    0 |
| N/A   55C    P0   125W / 250W |  16177MiB / 16280MiB |    100%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [3]:
# load word2vec
word2vec_model = Word2Vec.load("../../data/commons/word2vec.model")

In [4]:
# load node -> neighbor list
post_id_2_neigbor_list = pickle.load(open("../../data/non-graph/neighbors_list_dict.pkl", "rb"))

In [5]:
# load node -> user mapping
post_id_2_user = pickle.load(open("../../data/non-graph/post_id_2_user_id.pkl", "rb"))

In [6]:
# load user -> posts list
user_id_2_post_ids_list = pickle.load(open("../../data/non-graph/user_id_posts_idx_list.pkl", "rb"))

In [7]:
# load text corresponding to post
post_id_2_text = pickle.load(open("../../data/non-graph/posts_id_2_text.pkl", "rb"))

In [8]:
def get_ids_and_labels(split):
    ids = [i for i in pid[split] if 'gab' in i]
    labels = [label_dict[Counter([j["label"] for j in data[i]["annotators"]]).most_common(1)[0][0]] for i in pid[split] if 'gab' in i]
    return np.array(ids), np.array(labels)

In [9]:
# load train/val/test split
data_path = '../../data/commons/dataset.json'
pid_path = '../../data/commons/post_id_divisions.json'

with open(pid_path) as f:
    pid = json.load(f)
with open(data_path) as f:
    data = json.load(f)

label_dict = {'normal': 0, 'offensive': 1, 'hatespeech': 2}

train_ids, train_labels = get_ids_and_labels("train")
val_ids, val_labels = get_ids_and_labels("val")
test_ids, test_labels = get_ids_and_labels("test")

In [10]:
def generate_batch(
    post_ids, post_labels, post_id_2_neigbor_list,\
    post_id_2_user, user_id_2_post_ids_list,\
    post_id_2_text, device, batch_size=32,\
    num_neighbors=2, num_user_posts=5, dim=200,\
    max_len=70, shuffle=True, mode="shuffle"
):
    if shuffle:
        shuffled_idxs = np.random.permutation(np.arange(len(post_ids)))
    else:
        shuffled_idxs = np.arange(len(post_ids))
    num_of_batches = (len(post_ids)//batch_size)+(1 if len(post_ids)%batch_size else 0)
    for i in range(num_of_batches):
        batch_idxs = shuffled_idxs[i*batch_size:(i+1)*batch_size]
        X = np.zeros((batch_size, 1 + num_neighbors + num_user_posts, max_len, dim), dtype=np.float32)
        y = post_labels[batch_idxs]
        for dim1, batch_idx in enumerate(batch_idxs):
            post_id = int(post_ids[batch_idx][:-4])
            batch_post_ids = [post_id]
            # batch_post_ids = []
            sampled_neighbors = np.random.choice(post_id_2_neigbor_list[post_id], num_neighbors)
            user_id = post_id_2_user[post_id]
            sampled_user_posts = np.random.choice(user_id_2_post_ids_list[user_id], num_user_posts)
            batch_post_ids.extend([id for id in sampled_neighbors])
            batch_post_ids.extend([id for id in sampled_user_posts])
            for dim2, post_id in enumerate(batch_post_ids):
                vecs = []
                for word in post_id_2_text[post_id].split():
                    try:
                        vecs.append(word2vec_model.wv.get_vector(word))
                    except:
                        pass
                sent_len = len(vecs)
                if sent_len >= max_len:
                    vecs = vecs[:max_len]
                else:
                    pad_len = max_len - sent_len
                    pad_vecs = []
                    for _ in range(pad_len):
                        pad_vecs.append(np.zeros((dim,)))
                    vecs = pad_vecs + vecs
                X[dim1, dim2] = np.array(vecs)
        yield torch.tensor(X[:y.shape[0]]).to(device), torch.LongTensor(y).to(device)

In [11]:
class Encoder(nn.Module):

    def __init__(self, seq_len, inp_dim):
        super(Encoder, self).__init__()

        self.seq_len = seq_len
        self.inp_dim = inp_dim

        self.rnn = nn.LSTM(inp_dim, inp_dim, 2, batch_first=True, bidirectional=False)
    
    def forward(self, x):
        assert x.shape[1] == self.seq_len
        assert x.shape[2] == self.inp_dim

        x, _ = self.rnn(x)
        return x

In [12]:
class Classifier(nn.Module):
    
    def __init__(self, inp_dim, out_logits):
        super(Classifier, self).__init__()

        self.inp_dim = inp_dim
        self.out_dim = out_logits

        self.fc = nn.Linear(inp_dim, out_logits)
    
    def forward(self, x):
        assert x.shape[1] == self.inp_dim

        x = self.fc(x)
        return torch.softmax(x, dim=1)

In [13]:
class Network(nn.Module):

    def __init__(self, num_seq, seq_len, inp_dim, out_logits, device):
        super(Network, self).__init__()

        self.num_seq = num_seq
        self.seq_len = seq_len
        self.inp_dim = inp_dim
        self.out_dim = out_logits
        self.device = device

        self.enc = Encoder(seq_len, inp_dim)
        # self.enc1 = Encoder(seq_len, inp_dim)
        # self.enc2 = Encoder(seq_len, inp_dim)
        self.clf = Classifier(seq_len*inp_dim*num_seq, out_logits)

    def forward(self, x):

        xs = torch.zeros(x.shape[0], self.num_seq*self.seq_len*self.inp_dim).to(self.device)
        for i in range(self.num_seq):
            xs[:,i*self.seq_len*self.inp_dim:(i+1)*self.seq_len*self.inp_dim] =\
                                        self.enc(x[:,i]).reshape(-1, self.seq_len*self.inp_dim)
        x = self.clf(xs)
        return x

In [14]:
def train(epoch_num, device, optimizer, criterion, model, batch_size=32, verbose=False):
    print("Training | Epoch:", epoch_num)
    model.train()
    if not verbose:
        for i, batch in enumerate(tqdm(generate_batch(train_ids, train_labels, post_id_2_neigbor_list,\
                                             post_id_2_user, user_id_2_post_ids_list,\
                                             post_id_2_text, device, batch_size, 0, 15),
                                    total=len(train_ids)//batch_size+(1 if len(train_ids)%batch_size else 0))):
            out = model(batch[0])
            loss = criterion(out, batch[1])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    else:
        for i, batch in enumerate(generate_batch(train_ids, train_labels, post_id_2_neigbor_list,\
                                                 post_id_2_user, user_id_2_post_ids_list,\
                                                 post_id_2_text, device, batch_size, 0, 15)):
            out = model(batch[0])
            loss = criterion(out, batch[1])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("Epoch:", epoch_num, "| Iter:", i+1, "| Loss:", round(loss.item(), 4))
    model.eval()

In [20]:
def evaluate(epoch_num, device, model, batch_size=64, full=False):
    print("Evaluating | Epoch:", epoch_num)
    y_preds = []
    y_tests = []
    for batch in generate_batch(test_ids, test_labels, post_id_2_neigbor_list,\
                                 post_id_2_user, user_id_2_post_ids_list,\
                                 post_id_2_text, device, batch_size, 0, 15):
        out = model(batch[0])
        for y in batch[1].cpu().numpy():
            y_tests.append(y)
        for y in out.argmax(dim=1).cpu().numpy():
            y_preds.append(y)
    if full:
        print(classification_report(y_tests, y_preds))
    else:
        print(round(f1_score(y_tests, y_preds, average="macro"), 4), round(f1_score(y_tests, y_preds, average="micro"), 4))

In [18]:
model = Network(16, 70, 200, 3, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=3e-5)
criterion = nn.CrossEntropyLoss()

for epoch in range(20):
    train(epoch+1, device, optimizer, criterion, model, 64, verbose=False)
    evaluate(epoch+1, device, model, 64)

  0%|          | 0/131 [00:00<?, ?it/s]

Training | Epoch: 1


100%|██████████| 131/131 [01:53<00:00,  1.16it/s]


Evaluating | Epoch: 1


  0%|          | 0/131 [00:00<?, ?it/s]

0.6048 0.6506
Training | Epoch: 2


100%|██████████| 131/131 [01:52<00:00,  1.16it/s]


Evaluating | Epoch: 2


  0%|          | 0/131 [00:00<?, ?it/s]

0.6069 0.6622
Training | Epoch: 3


100%|██████████| 131/131 [01:58<00:00,  1.10it/s]


Evaluating | Epoch: 3


  0%|          | 0/131 [00:00<?, ?it/s]

0.5947 0.6554
Training | Epoch: 4


100%|██████████| 131/131 [01:27<00:00,  1.50it/s]


Evaluating | Epoch: 4


  0%|          | 0/131 [00:00<?, ?it/s]

0.6043 0.6564
Training | Epoch: 5


100%|██████████| 131/131 [01:29<00:00,  1.47it/s]


Evaluating | Epoch: 5


  0%|          | 0/131 [00:00<?, ?it/s]

0.6121 0.6622
Training | Epoch: 6


100%|██████████| 131/131 [01:26<00:00,  1.51it/s]


Evaluating | Epoch: 6


  0%|          | 0/131 [00:00<?, ?it/s]

0.6281 0.668
Training | Epoch: 7


100%|██████████| 131/131 [01:27<00:00,  1.49it/s]


Evaluating | Epoch: 7


  0%|          | 0/131 [00:00<?, ?it/s]

0.6105 0.6564
Training | Epoch: 8


100%|██████████| 131/131 [01:28<00:00,  1.47it/s]


Evaluating | Epoch: 8


  0%|          | 0/131 [00:00<?, ?it/s]

0.5866 0.6477
Training | Epoch: 9


100%|██████████| 131/131 [01:24<00:00,  1.54it/s]


Evaluating | Epoch: 9


  0%|          | 0/131 [00:00<?, ?it/s]

0.6197 0.667
Training | Epoch: 10


100%|██████████| 131/131 [01:26<00:00,  1.52it/s]


Evaluating | Epoch: 10
0.6184 0.6622


In [21]:
evaluate(1, device, model, 64, True)

Evaluating | Epoch: 1
              precision    recall  f1-score   support

           0       0.64      0.53      0.58       216
           1       0.53      0.52      0.52       305
           2       0.76      0.82      0.79       515

    accuracy                           0.67      1036
   macro avg       0.64      0.62      0.63      1036
weighted avg       0.67      0.67      0.67      1036



In [22]:
torch.save(model.state_dict(), "1+Rand_15_63_67.pth")

In [44]:
t = model.state_dict()['clf.fc.weight'][0].cpu().numpy()
t2 = np.abs(t.reshape(16,70,-1)[:,-20:,:].reshape(16,-1)).mean(axis=1)
print(t2 / t2.sum())
t = model.state_dict()['clf.fc.weight'][1].cpu().numpy()
t2 = np.abs(t.reshape(16,70,-1)[:,-20:,:].reshape(16,-1)).mean(axis=1)
print(t2 / t2.sum())
t = model.state_dict()['clf.fc.weight'][2].cpu().numpy()
t2 = np.abs(t.reshape(16,70,-1)[:,-20:,:].reshape(16,-1)).mean(axis=1)
print(t2 / t2.sum())

[0.32450047 0.04666138 0.04523772 0.04329785 0.046474   0.04490394
 0.04590655 0.04632129 0.04313574 0.04249322 0.04831655 0.04731193
 0.04191757 0.04403957 0.04657401 0.04290824]
[0.29215732 0.04692858 0.04612512 0.0466164  0.04958789 0.04821777
 0.04744578 0.04680758 0.04759061 0.04788237 0.0482173  0.04694767
 0.04516486 0.04593274 0.04684975 0.04752821]
[0.29011226 0.0483922  0.0452545  0.04575239 0.04779587 0.04903829
 0.04844868 0.04670522 0.04592193 0.04775837 0.04891448 0.04624263
 0.04636228 0.04830711 0.04816526 0.04682856]
