In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import json, pickle
from gensim.models import Word2Vec
from tqdm import tqdm
import numpy as np
# import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import classification_report

In [2]:
# Load a saved gensim Word2Vec model from disk; get_batches looks up word
# vectors through word2vec_model.wv.
word2vec_model = Word2Vec.load("../../data/commons/word2vec.model")

In [3]:
# Lookup table from post id to post text; get_batches indexes it with the
# integer post id and splits the stored text on whitespace.
# NOTE: pickle.load can execute arbitrary code, so only load files from a
# trusted source.
# The context manager closes the file handle once the object is loaded
# (a bare open() inside pickle.load() leaves it open).
with open("../../data/non-graph/posts_id_2_text.pkl", "rb") as f:
    post_id_2_text = pickle.load(f)

In [4]:
class Encoder(nn.Module):
    """Two-layer, unidirectional, batch-first LSTM with hidden size == input size.

    Args:
        seq_len: Length required on axis 1 of every input passed to ``forward``.
        inp_dim: Size required on axis 2 of every input; also used as the
            LSTM hidden size, so the output keeps the input's feature width.
    """

    def __init__(self, seq_len, inp_dim):
        super().__init__()

        self.seq_len, self.inp_dim = seq_len, inp_dim

        self.rnn = nn.LSTM(
            input_size=inp_dim,
            hidden_size=inp_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=False,
        )

    def forward(self, x):
        """Return the LSTM output for every time step of ``x``.

        Raises:
            AssertionError: If axis 1 of ``x`` is not ``seq_len`` or axis 2 is
                not ``inp_dim``.
        """
        assert x.shape[1] == self.seq_len
        assert x.shape[2] == self.inp_dim

        # The final (hidden, cell) state is not needed, only the per-step outputs.
        step_outputs, _final_state = self.rnn(x)
        return step_outputs

In [5]:
class Classifier(nn.Module):
    """Single linear layer followed by a softmax over axis 1.

    Args:
        inp_dim: Size required on axis 1 of every input passed to ``forward``.
        out_logits: Number of output units of the linear layer (stored as
            ``out_dim``).
    """

    def __init__(self, inp_dim, out_logits):
        super().__init__()

        self.inp_dim, self.out_dim = inp_dim, out_logits

        self.fc = nn.Linear(in_features=inp_dim, out_features=out_logits)

    def forward(self, x):
        """Return softmax-normalised scores for ``x``.

        Raises:
            AssertionError: If axis 1 of ``x`` is not ``inp_dim``.
        """
        assert x.shape[1] == self.inp_dim

        scores = self.fc(x)
        # NOTE(review): the returned values are already normalised; a loss that
        # applies its own log-softmax (e.g. nn.CrossEntropyLoss) would normalise
        # them a second time -- confirm this is intended.
        return scores.softmax(dim=1)

In [6]:
class Network(nn.Module):
    """Encoder followed by a Classifier applied to the flattened encoder output.

    Args:
        seq_len: Sequence length passed to the Encoder.
        inp_dim: Per-step feature size passed to the Encoder.
        out_logits: Number of classifier outputs (stored as ``out_dim``).
    """

    def __init__(self, seq_len, inp_dim, out_logits):
        super().__init__()

        self.seq_len, self.inp_dim, self.out_dim = seq_len, inp_dim, out_logits

        # The classifier consumes every time step at once, so its input width
        # is seq_len * inp_dim.
        self.enc = Encoder(seq_len, inp_dim)
        self.clf = Classifier(seq_len * inp_dim, out_logits)

    def forward(self, x):
        """Encode ``x``, flatten each sample to one row, and classify it."""
        flat_width = self.seq_len * self.inp_dim
        encoded = self.enc(x)
        return self.clf(encoded.reshape(-1, flat_width))

In [8]:
def get_batches(post_ids, y, device, batch_size, shuffle=True):
    """Yield ``(inputs, targets)`` mini-batches of word-embedded posts.

    Each post's text is split on whitespace and every word known to
    ``word2vec_model`` is replaced by its vector (unknown words are skipped).
    The resulting sequence is truncated to ``max_len`` vectors, or left-padded
    with zero vectors up to ``max_len``.

    Reads the module-level names ``post_id_2_text``, ``word2vec_model``,
    ``max_len`` and ``dim``; they must be defined before the generator runs.

    Args:
        post_ids: Indexable of post-id strings. The last four characters of
            each id are dropped and the remainder is parsed as the integer key
            into ``post_id_2_text``.
        y: NumPy array of labels aligned with ``post_ids``; its first axis
            gives the number of samples.
        device: Torch device the yielded tensors are moved to.
        batch_size: Maximum number of samples per batch; the last batch may be
            smaller.
        shuffle: If True, visit the samples in a random order drawn from
            NumPy's global RNG; otherwise in their given order.

    Yields:
        A tuple ``(X, labels)``: a float tensor of shape
        ``(batch, max_len, dim)`` and a long tensor of shape ``(batch,)``.
    """
    num_samples = y.shape[0]
    if shuffle:
        order = np.random.permutation(np.arange(num_samples))
    else:
        order = np.arange(num_samples)

    for start in range(0, num_samples, batch_size):
        batch_idxs = order[start:start + batch_size]
        # Allocate (batch, max_len, dim) directly. The previous version built a
        # 4-D array and called squeeze() with no axis, which also removed the
        # batch axis whenever a batch held a single sample.
        X = np.zeros((batch_idxs.shape[0], max_len, dim), dtype=np.float32)
        for row, sample_idx in enumerate(batch_idxs):
            post_id = int(post_ids[sample_idx][:-4])
            vecs = []
            for word in post_id_2_text[post_id].split():
                try:
                    vecs.append(word2vec_model.wv.get_vector(word))
                except KeyError:
                    # Word is not in the embedding vocabulary: skip it, but let
                    # any other error surface instead of being swallowed.
                    continue
            vecs = vecs[:max_len]
            if vecs:
                # X is already zero-filled, so writing the vectors at the tail
                # is equivalent to left-padding with zero vectors.
                X[row, max_len - len(vecs):] = np.array(vecs)
        yield torch.FloatTensor(X).to(device),\
                torch.LongTensor(y[batch_idxs]).to(device)

In [8]:
def train(epoch_num, data, labels, device, optimizer, criterion, model, batch_size=32, verbose=False):
    """Run one optimisation pass over ``data`` and leave the model in eval mode.

    Args:
        epoch_num: Epoch number, used only in the printed messages.
        data: Array of post ids, forwarded to ``get_batches``; its first axis
            gives the number of samples.
        labels: Labels aligned with ``data``, forwarded to ``get_batches``.
        device: Torch device the batches are moved to.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable ``criterion(model_output, targets)`` returning the
            loss to back-propagate.
        model: Module being trained.
        batch_size: Number of samples per batch.
        verbose: If True, print the loss after every batch.
    """
    print("Training | Epoch:", epoch_num)
    model.train()

    # Ceiling division so that a trailing partial batch is counted by tqdm.
    full_batches, leftover = divmod(data.shape[0], batch_size)
    n_batches = full_batches + (1 if leftover else 0)
    batch_stream = get_batches(data, labels, device, batch_size)

    for step, (inputs, targets) in enumerate(tqdm(batch_stream, total=n_batches), start=1):
        loss = criterion(model(inputs), targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if verbose:
            print("Epoch:", epoch_num, "| Iter:", step, "| Loss:", round(loss.item(), 4))

    model.eval()

In [14]:
def evaluate(epoch_num, data, labels, device, model, batch_size=64):
    """Predict a class for every sample, print a classification report, and
    return the predictions.

    Samples are visited in their given order (no shuffling), so the returned
    predictions line up with ``labels``. The model's train/eval mode is not
    changed by this function.

    Args:
        epoch_num: Epoch number, used only in the printed message.
        data: Array of post ids, forwarded to ``get_batches``.
        labels: Labels aligned with ``data``.
        device: Torch device the batches are moved to.
        model: Module whose output has one score per class on axis 1.
        batch_size: Number of samples per batch.

    Returns:
        List with the predicted class index of every sample.
    """
    print("Evaluating | Epoch:", epoch_num)
    y_preds = []
    y_tests = []
    # Inference only: disabling autograd avoids building and keeping a
    # computation graph for every batch.
    with torch.no_grad():
        for inputs, targets in get_batches(data, labels, device, batch_size, shuffle=False):
            out = model(inputs)
            y_tests.extend(targets.cpu().numpy())
            y_preds.extend(out.argmax(dim=1).cpu().numpy())
    print(classification_report(y_tests, y_preds))
    return y_preds

In [10]:
def get_ids_and_labels(split):
    """Collect the post ids containing 'gab' in ``split`` with their majority labels.

    Reads the module-level names ``pid``, ``data`` and ``label_dict``. For each
    selected post, the label given most often by its annotators (as returned
    by ``Counter.most_common``) is mapped to an integer through ``label_dict``.

    Args:
        split: Key into ``pid`` naming the split to read.

    Returns:
        Tuple ``(ids, labels)`` of NumPy arrays in the order the ids appear in
        ``pid[split]``.
    """
    ids = []
    labels = []
    for post_id in pid[split]:
        if 'gab' not in post_id:
            continue
        votes = Counter(annotator["label"] for annotator in data[post_id]["annotators"])
        majority_label = votes.most_common(1)[0][0]
        ids.append(post_id)
        labels.append(label_dict[majority_label])
    return np.array(ids), np.array(labels)

In [11]:
# Locations of the annotated dataset and of the train/val/test post-id lists.
data_path = '../../data/commons/dataset.json'
pid_path = '../../data/commons/post_id_divisions.json'

# Read both files as UTF-8 explicitly instead of relying on the platform's
# default text encoding.
with open(pid_path, encoding="utf-8") as f:
    pid = json.load(f)
with open(data_path, encoding="utf-8") as f:
    data = json.load(f)

# Integer class index for each annotation label string.
label_dict = {'normal': 0, 'offensive': 1, 'hatespeech': 2}

# get_ids_and_labels reads pid, data and label_dict, so it must run after them.
train_ids, train_labels = get_ids_and_labels("train")
val_ids, val_labels = get_ids_and_labels("val")
test_ids, test_labels = get_ids_and_labels("test")

In [16]:
# This setup used to be commented out, so the cell only worked when device,
# model, max_len and dim were still alive in the kernel from an earlier run.
# Defining them here lets the notebook run top to bottom on a fresh kernel.

# Sequence length and embedding size; get_batches reads both as module-level
# names, so they must be defined before any batch is built.
max_len = 70
dim = 200

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Network(max_len, dim, 3).to(device)

optimizer = optim.Adam(model.parameters(), lr=3e-5)
# NOTE(review): Classifier.forward already applies a softmax, while
# nn.CrossEntropyLoss applies its own log-softmax -- confirm this is intended
# before re-enabling training.
criterion = nn.CrossEntropyLoss()

# Restore the saved weights; map_location lets a checkpoint written on a GPU
# load on a CPU-only machine.
model.load_state_dict(torch.load("simple_lstm_model.pth", map_location=device))

for epoch in range(1):
    # Training is disabled for this run; uncomment the next line to train
    # before evaluating.
    # train(epoch+1, train_ids, train_labels, device, optimizer, criterion, model, 128)
    evaluate(epoch+1, test_ids, test_labels, device, model, 512)

Evaluating | Epoch: 1
              precision    recall  f1-score   support

           0       0.60      0.56      0.58       216
           1       0.55      0.52      0.53       305
           2       0.77      0.81      0.79       515

    accuracy                           0.67      1036
   macro avg       0.64      0.63      0.63      1036
weighted avg       0.67      0.67      0.67      1036



In [66]:
# Write the model's parameters (its state_dict) to simple_lstm_model.pth in the
# current working directory.
torch.save(model.state_dict(), "simple_lstm_model.pth")