In [1]:
# load libraries
import torch
import torch.nn as nn
import torch.optim as optim

import pickle
import json
import numpy as np
from collections import Counter
from gensim.models import Word2Vec
from tqdm import tqdm
from sklearn.metrics import classification_report, f1_score

device = torch.device("cuda:0")

In [2]:
# load word2vec
word2vec_model = Word2Vec.load("../../data/commons/word2vec.model")

In [3]:
# load text corresponding to post
post_id_2_text = pickle.load(open("../../data/non-graph/posts_id_2_text.pkl", "rb"))

In [4]:
# loading timeline list
post_id_2_left_list = pickle.load(open("../../data/non-graph/post_id_2_left_timestamp_post_ids.pkl", "rb"))
post_id_2_right_list = pickle.load(open("../../data/non-graph/post_id_2_right_timestamp_post_ids.pkl", "rb"))

In [5]:
def get_ids_and_labels(split):
    ids = [i for i in pid[split] if 'gab' in i]
    labels = [label_dict[Counter([j["label"] for j in data[i]["annotators"]]).most_common(1)[0][0]] for i in pid[split] if 'gab' in i]
    return np.array(ids), np.array(labels)

In [6]:
# load train/val/test split
data_path = '../../data/commons/dataset.json'
pid_path = '../../data/commons/post_id_divisions.json'

with open(pid_path) as f:
    pid = json.load(f)
with open(data_path) as f:
    data = json.load(f)

label_dict = {'normal': 0, 'offensive': 1, 'hatespeech': 2}

train_ids, train_labels = get_ids_and_labels("train")
val_ids, val_labels = get_ids_and_labels("val")
test_ids, test_labels = get_ids_and_labels("test")

In [7]:
def generate_batch(
    post_ids, post_labels, post_id_2_text,\
    post_id_2_left_list, post_id_2_right_list,\
    device, batch_size=32, dim=200,\
    max_len=70, shuffle=True, mode="shuffle"
):
    if shuffle:
        shuffled_idxs = np.random.permutation(np.arange(len(post_ids)))
    else:
        shuffled_idxs = np.arange(len(post_ids))
    num_of_batches = (len(post_ids)//batch_size)+(1 if len(post_ids)%batch_size else 0)
    for i in range(num_of_batches):
        batch_idxs = shuffled_idxs[i*batch_size:(i+1)*batch_size]
        X = np.zeros((batch_size, 15, max_len, dim), dtype=np.float32)
        y = post_labels[batch_idxs]
        for dim1, batch_idx in enumerate(batch_idxs):
            post_id = int(post_ids[batch_idx][:-4])
            # batch_post_ids = [post_id] + post_id_2_left_list[post_id] + post_id_2_right_list[post_id]
            batch_post_ids = post_id_2_left_list[post_id] + post_id_2_right_list[post_id]
            for dim2, post_id in enumerate(batch_post_ids):
                vecs = []
                for word in post_id_2_text[post_id].split():
                    try:
                        vecs.append(word2vec_model.wv.get_vector(word))
                    except:
                        pass
                sent_len = len(vecs)
                if sent_len >= max_len:
                    vecs = vecs[:max_len]
                else:
                    pad_len = max_len - sent_len
                    pad_vecs = []
                    for _ in range(pad_len):
                        pad_vecs.append(np.zeros((dim,)))
                    vecs = pad_vecs + vecs
                X[dim1, dim2] = np.array(vecs)
        yield torch.tensor(X[:y.shape[0]]).to(device), torch.LongTensor(y).to(device)

In [8]:
class Encoder(nn.Module):

    def __init__(self, seq_len, inp_dim):
        super(Encoder, self).__init__()

        self.seq_len = seq_len
        self.inp_dim = inp_dim

        self.rnn = nn.LSTM(inp_dim, inp_dim, 2, batch_first=True, bidirectional=False)
    
    def forward(self, x):
        assert x.shape[1] == self.seq_len
        assert x.shape[2] == self.inp_dim

        x, _ = self.rnn(x)
        return x

In [9]:
class Classifier(nn.Module):
    
    def __init__(self, inp_dim, out_logits):
        super(Classifier, self).__init__()

        self.inp_dim = inp_dim
        self.out_dim = out_logits

        self.fc = nn.Linear(inp_dim, out_logits)
    
    def forward(self, x):
        assert x.shape[1] == self.inp_dim

        x = self.fc(x)
        return torch.softmax(x, dim=1)

In [10]:
class Network(nn.Module):

    def __init__(self, num_seq, seq_len, inp_dim, out_logits, device):
        super(Network, self).__init__()

        self.num_seq = num_seq
        self.seq_len = seq_len
        self.inp_dim = inp_dim
        self.out_dim = out_logits
        self.device = device

        self.enc = Encoder(seq_len, inp_dim)
        # self.enc1 = Encoder(seq_len, inp_dim)
        # self.enc2 = Encoder(seq_len, inp_dim)
        self.clf = Classifier(seq_len*inp_dim*num_seq, out_logits)

    def forward(self, x):

        xs = torch.zeros(x.shape[0], self.num_seq*self.seq_len*self.inp_dim).to(self.device)
        for i in range(self.num_seq):
            xs[:,i*self.seq_len*self.inp_dim:(i+1)*self.seq_len*self.inp_dim] =\
                                        self.enc(x[:,i]).reshape(-1, self.seq_len*self.inp_dim)
        x = self.clf(xs)
        return x

In [11]:
def train(epoch_num, device, optimizer, criterion, model, batch_size=32, verbose=False):
    print("Training | Epoch:", epoch_num)
    model.train()
    if not verbose:
        for i, batch in enumerate(tqdm(generate_batch(train_ids, train_labels, post_id_2_text,\
                                                      post_id_2_left_list, post_id_2_right_list,\
                                                      device, batch_size),
                                    total=len(train_ids)//batch_size+(1 if len(train_ids)%batch_size else 0))):
            out = model(batch[0])
            loss = criterion(out, batch[1])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    else:
        for i, batch in enumerate(generate_batch(train_ids, train_labels, post_id_2_text,\
                                                 post_id_2_left_list, post_id_2_right_list,\
                                                 device, batch_size)):
            out = model(batch[0])
            loss = criterion(out, batch[1])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print("Epoch:", epoch_num, "| Iter:", i+1, "| Loss:", round(loss.item(), 4))
    model.eval()

In [12]:
def evaluate(epoch_num, device, model, batch_size=64, full=False):
    model.eval()
    print("Evaluating | Epoch:", epoch_num)
    y_preds = []
    y_tests = []
    for batch in generate_batch(test_ids, test_labels, post_id_2_text,\
                                post_id_2_left_list, post_id_2_right_list,\
                                device, batch_size):
        out = model(batch[0])
        for y in batch[1].cpu().numpy():
            y_tests.append(y)
        for y in out.argmax(dim=1).cpu().numpy():
            y_preds.append(y)
    if full:
        print(classification_report(y_tests, y_preds))
    else:
        print(round(f1_score(y_tests, y_preds, average="macro"), 4), round(f1_score(y_tests, y_preds, average="micro"), 4))

In [13]:
def evaluate_train(epoch_num, device, model, batch_size=64, full=False):
    model.eval()
    print("Evaluating | Epoch:", epoch_num)
    y_preds = []
    y_tests = []
    for batch in generate_batch(train_ids, train_labels, post_id_2_text,\
                                post_id_2_left_list, post_id_2_right_list,\
                                device, batch_size):
        out = model(batch[0])
        for y in batch[1].cpu().numpy():
            y_tests.append(y)
        for y in out.argmax(dim=1).cpu().numpy():
            y_preds.append(y)
    if full:
        print(classification_report(y_tests, y_preds))
    else:
        print(round(f1_score(y_tests, y_preds, average="macro"), 4), round(f1_score(y_tests, y_preds, average="micro"), 4))

In [15]:
# model = Network(15, 70, 200, 3, device).to(device)
model = Network(14, 70, 200, 3, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

for epoch in range(20):
    train(epoch+1, device, optimizer, criterion, model, 256, verbose=False)
    evaluate(epoch+1, device, model, 512)

  0%|          | 0/33 [00:00<?, ?it/s]

Training | Epoch: 1


100%|██████████| 33/33 [00:25<00:00,  1.32it/s]


Evaluating | Epoch: 1


  0%|          | 0/33 [00:00<?, ?it/s]

0.2567 0.4981
Training | Epoch: 2


100%|██████████| 33/33 [00:25<00:00,  1.27it/s]


Evaluating | Epoch: 2


  0%|          | 0/33 [00:00<?, ?it/s]

0.2736 0.4952
Training | Epoch: 3


100%|██████████| 33/33 [00:25<00:00,  1.30it/s]


Evaluating | Epoch: 3


  0%|          | 0/33 [00:00<?, ?it/s]

0.2679 0.4971
Training | Epoch: 4


100%|██████████| 33/33 [00:25<00:00,  1.28it/s]


Evaluating | Epoch: 4


  0%|          | 0/33 [00:00<?, ?it/s]

0.2832 0.4759
Training | Epoch: 5


100%|██████████| 33/33 [00:25<00:00,  1.30it/s]


Evaluating | Epoch: 5


  0%|          | 0/33 [00:00<?, ?it/s]

0.2851 0.4817
Training | Epoch: 6


100%|██████████| 33/33 [00:26<00:00,  1.26it/s]


Evaluating | Epoch: 6


  0%|          | 0/33 [00:00<?, ?it/s]

0.2945 0.4759
Training | Epoch: 7


100%|██████████| 33/33 [00:25<00:00,  1.30it/s]


Evaluating | Epoch: 7


  0%|          | 0/33 [00:00<?, ?it/s]

0.3136 0.4681
Training | Epoch: 8


100%|██████████| 33/33 [00:25<00:00,  1.31it/s]


Evaluating | Epoch: 8


  0%|          | 0/33 [00:00<?, ?it/s]

0.3108 0.4701
Training | Epoch: 9


100%|██████████| 33/33 [00:24<00:00,  1.35it/s]


Evaluating | Epoch: 9


  0%|          | 0/33 [00:00<?, ?it/s]

0.2733 0.4836
Training | Epoch: 10


100%|██████████| 33/33 [00:25<00:00,  1.28it/s]


Evaluating | Epoch: 10


  0%|          | 0/33 [00:00<?, ?it/s]

0.3157 0.4797
Training | Epoch: 11


100%|██████████| 33/33 [00:25<00:00,  1.29it/s]


Evaluating | Epoch: 11


  0%|          | 0/33 [00:00<?, ?it/s]

0.3042 0.4778
Training | Epoch: 12


100%|██████████| 33/33 [00:25<00:00,  1.31it/s]


Evaluating | Epoch: 12


  0%|          | 0/33 [00:00<?, ?it/s]

0.2965 0.4788
Training | Epoch: 13


100%|██████████| 33/33 [00:25<00:00,  1.31it/s]


Evaluating | Epoch: 13


  0%|          | 0/33 [00:00<?, ?it/s]

0.336 0.4595
Training | Epoch: 14


100%|██████████| 33/33 [00:26<00:00,  1.27it/s]


Evaluating | Epoch: 14


  0%|          | 0/33 [00:00<?, ?it/s]

0.3681 0.4739
Training | Epoch: 15


100%|██████████| 33/33 [00:27<00:00,  1.19it/s]


Evaluating | Epoch: 15


  0%|          | 0/33 [00:00<?, ?it/s]

0.323 0.4759
Training | Epoch: 16


100%|██████████| 33/33 [00:28<00:00,  1.15it/s]


Evaluating | Epoch: 16


  0%|          | 0/33 [00:00<?, ?it/s]

0.3334 0.4836
Training | Epoch: 17


100%|██████████| 33/33 [00:29<00:00,  1.12it/s]


Evaluating | Epoch: 17


  0%|          | 0/33 [00:00<?, ?it/s]

0.3468 0.4807
Training | Epoch: 18


100%|██████████| 33/33 [00:29<00:00,  1.14it/s]


Evaluating | Epoch: 18


  0%|          | 0/33 [00:00<?, ?it/s]

0.3413 0.4836
Training | Epoch: 19


100%|██████████| 33/33 [00:28<00:00,  1.15it/s]


Evaluating | Epoch: 19


  0%|          | 0/33 [00:00<?, ?it/s]

0.3399 0.4846
Training | Epoch: 20


100%|██████████| 33/33 [00:28<00:00,  1.16it/s]


Evaluating | Epoch: 20
0.3702 0.4884


In [None]:
# for epoch in range(1):
#     train(epoch+1, device, optimizer, criterion, model, 64, verbose=False)
#     evaluate(epoch+1, device, model, 64)

In [17]:
evaluate_train(1, device, model, 512, True)

Evaluating | Epoch: 1
              precision    recall  f1-score   support

           0       0.78      0.15      0.25      1641
           1       0.60      0.34      0.43      2546
           2       0.57      0.91      0.70      4190

    accuracy                           0.59      8377
   macro avg       0.65      0.47      0.46      8377
weighted avg       0.62      0.59      0.53      8377



In [18]:
evaluate(1, device, model, 512, True)

Evaluating | Epoch: 1
              precision    recall  f1-score   support

           0       0.48      0.13      0.21       216
           1       0.35      0.22      0.27       305
           2       0.52      0.80      0.63       515

    accuracy                           0.49      1036
   macro avg       0.45      0.38      0.37      1036
weighted avg       0.46      0.49      0.44      1036



In [None]:
# torch.save(model.state_dict(), "Timeline_14_62_66.pth")

In [None]:
# t = model.state_dict()['clf.fc.weight'][0].cpu().numpy()
# t2 = np.abs(t.reshape(15,70,-1)[:,-20:,:].reshape(15,-1)).mean(axis=1)
# print(t2 / t2.sum())
# t = model.state_dict()['clf.fc.weight'][1].cpu().numpy()
# t2 = np.abs(t.reshape(15,70,-1)[:,-20:,:].reshape(15,-1)).mean(axis=1)
# print(t2 / t2.sum())
# t = model.state_dict()['clf.fc.weight'][2].cpu().numpy()
# t2 = np.abs(t.reshape(15,70,-1)[:,-20:,:].reshape(15,-1)).mean(axis=1)
# print(t2 / t2.sum())