# Preprocessing the Data

In [None]:
from tqdm import tqdm_notebook as tqdm
import preprocessor as p
import numpy as np
import pandas
from math import log
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys
import re
from collections import defaultdict, Counter

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
import torch.nn.functional as F

In [None]:
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score,f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [None]:
import matplotlib.pyplot as plt
import random

In [None]:
def preprocess(text):
    """Tokenize a tweet and return its whitespace-separated tokens.

    Configures the tweet preprocessor ``p`` for URLs, mentions, emojis and
    hashtags, runs ``p.tokenize`` on ``text`` and splits the result on
    whitespace.
    """
    handled_options = (p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.HASHTAG)
    p.set_options(*handled_options)
    tokenized = p.tokenize(text)
    return tokenized.split()

In [None]:
# Load the labelled tweets; later cells read the `tweet` and `class` columns.
data = pd.read_csv('../HS_labeled_data.csv')
data

In [None]:
def preprocess(text):
    """Return the tokens of ``text`` after tweet-preprocessor tokenization.

    The preprocessor ``p`` is set up for URLs, mentions, emojis and hashtags
    before ``p.tokenize`` is applied; the tokenized string is then split on
    whitespace.
    """
    p.set_options(
        p.OPT.URL,
        p.OPT.MENTION,
        p.OPT.EMOJI,
        p.OPT.HASHTAG,
    )
    tokenized_text = p.tokenize(text)
    return tokenized_text.split()

In [None]:
def indexer(split_text):
    """Map a list of tokens to their vocabulary indices.

    Every token is lower-cased and looked up in the module-level ``word2idx``
    mapping; tokens that are not in the mapping get the index of ``'_UNK'``.
    """
    lowered_tokens = (token.lower() for token in split_text)
    return [
        word2idx[token] if token in word2idx else word2idx['_UNK']
        for token in lowered_tokens
    ]

In [None]:
# train_test_split hands back slices of `data`; take explicit copies so the
# column assignments made on `train` and `valid` in later cells work on
# independent frames instead of triggering pandas' SettingWithCopyWarning.
train, valid = (part.copy() for part in train_test_split(data))

In [None]:
# Display the training split for a quick visual check.
train

In [None]:
# Tokenize every training tweet after lower-casing and stripping it.
train['clean_text'] = train['tweet'].apply(lambda tweet: preprocess(tweet.lower().strip()))

# Count the lower-cased tokens of the whole training split.
token_counts = Counter()
for tokens in tqdm(train['clean_text'].values):
    token_counts.update(token.lower() for token in tokens)

# Vocabulary ordered from the most to the least frequent token, with the
# reserved '_PAD' and '_UNK' entries in front (indices 0 and 1).
words = ['_PAD', '_UNK'] + [token for token, _ in token_counts.most_common()]

# Forward and reverse lookup tables over the vocabulary.
word2idx = {}
idx2word = {}
for position, token in enumerate(words):
    word2idx[token] = position
    idx2word[position] = token

# Per-tweet index sequence, token count and class label.
train['sentence2idx'] = train['clean_text'].apply(indexer)
train['length'] = train['clean_text'].apply(len)
train['label'] = train['class']

In [None]:
# Plot the token count of each training tweet, in row order.
fig = plt.figure(figsize=(8, 5))
ax = plt.plot(train['length'].to_numpy())
plt.show()

In [None]:
# Tokenize the validation tweets exactly like the training tweets
# (lower-case and strip before tokenizing) so both splits go through the
# same text pipeline.
valid['clean_text'] = valid.tweet.apply(lambda x: preprocess(x.lower().strip()))

# Index with the existing `word2idx` vocabulary; `indexer` maps tokens that
# are missing from it to '_UNK'.
valid['sentence2idx'] = valid.clean_text.apply(indexer)
valid['length'] = valid.clean_text.apply(len)
valid['label'] = valid['class']

In [None]:
class VectorizeData(Dataset):
    """Torch ``Dataset`` serving fixed-length index sequences from a DataFrame.

    Each item is an ``(X, y, lens)`` tuple: the ``sentence2idx`` sequence
    zero-padded or truncated to ``maxlen``, the matching ``label`` value and
    the matching ``length`` value (taken as stored, i.e. not capped at
    ``maxlen``).

    Args:
        df: DataFrame providing the ``sentence2idx``, ``label`` and ``length``
            columns. It is not modified.
        maxlen: Length of every returned index sequence.
    """

    def __init__(self, df, maxlen=30):
        self.maxlen = maxlen
        # Work on a private copy: adding `padded_text` must not alter the
        # caller's frame (and would warn when `df` is a slice of another one).
        self.df = df.copy()
        self.df['padded_text'] = self.df.sentence2idx.apply(self.pad_data)
        # Plain lists give positional access in __getitem__, independent of
        # the DataFrame's index labels.
        self.padded_text = list(self.df.padded_text)
        self.labels = list(self.df.label)
        self.lengths = list(self.df.length)

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(padded indices, label, length)`` for position ``idx``."""
        return self.padded_text[idx], self.labels[idx], self.lengths[idx]

    def pad_data(self, s):
        """Return ``s`` as an int64 array of length ``maxlen``.

        Sequences longer than ``maxlen`` are truncated; shorter ones are
        right-padded with zeros.
        """
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        clipped = s[:self.maxlen]
        padded[:len(clipped)] = clipped
        return padded

In [None]:
# Wrap both splits in VectorizeData. Despite the `_loader` suffix these are
# Dataset objects; they are handed to DataLoader in the following cells.
train_loader = VectorizeData(train)
valid_loader = VectorizeData(valid)

In [None]:
# Training batches of up to 100 samples, reshuffled on every pass;
# the print shows the number of batches per pass.
tl = DataLoader(dataset=train_loader, batch_size=100, shuffle=True)
print(len(tl))

In [None]:
# Validation batches of up to 100 samples in fixed order;
# the print shows the number of batches per pass.
vl = DataLoader(dataset=valid_loader, batch_size=100, shuffle=False)
print(len(vl))

In [None]:
# Sanity check: print the index and contents of the first training batch only.
# zip() stops as soon as range(1) is exhausted, so a single batch is drawn.
for i, samples in zip(range(1), tl):
    print(i)
    print(samples)

In [None]:
# Sanity check: print the index and contents of the first validation batch only.
# zip() stops as soon as range(1) is exhausted, so a single batch is drawn.
for i, samples in zip(range(1), vl):
    print(i)
    print(samples)

## RecModel

In [None]:
class RecAttnArch(nn.Module):
    """Recurrent sequence classifier with an attention layer on top.

    Pipeline: embedding -> LSTM/GRU/RNN (optionally bidirectional) ->
    attention -> linear layer.

    Attention variants (``attnType``):
        * ``'dot'``: each timestep output is scored against a learned query
          vector, the scores are softmax-normalised over the time axis and
          the weighted outputs are summed into one context vector.
        * ``'self'``: multiplicative self-attention ``softmax(H W H^T)``
          between timesteps. The attended outputs of all timesteps are
          flattened before the linear layer, so the sequence length has to
          be known at construction time (``seq_len``).

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the recurrent layer (per direction).
        output_dim: Number of output units of the final linear layer.
        num_layers: Number of stacked recurrent layers.
        bidir: Whether the recurrent layer is bidirectional.
        rnnType: ``'lstm'``, ``'gru'`` or ``'rnn'``.
        attnType: ``'dot'`` or ``'self'``.
        device: Device the embedded input is moved to before the recurrent
            layer.
        seq_len: Sequence length used to size the linear layer for
            ``attnType='self'``. The default of 30 is the value that used to
            be hard-coded.

    Raises:
        ValueError: If ``rnnType`` or ``attnType`` is not one of the
            supported values.
    """

    _RNN_TYPES = ('lstm', 'gru', 'rnn')
    _ATTN_TYPES = ('dot', 'self')

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, bidir, rnnType, attnType, device, seq_len=30):
        super(RecAttnArch, self).__init__()

        # Fail early: an unknown type would otherwise leave `recNN` / `fc`
        # undefined and only surface as an AttributeError in forward().
        if rnnType not in self._RNN_TYPES:
            raise ValueError(f"rnnType must be one of {self._RNN_TYPES}, got {rnnType!r}")
        if attnType not in self._ATTN_TYPES:
            raise ValueError(f"attnType must be one of {self._ATTN_TYPES}, got {attnType!r}")

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.device = device
        self.rnnType = rnnType
        self.attnType = attnType
        self.bidirectional = bidir
        self.seq_len = seq_len

        self.numDirs = 2 if self.bidirectional else 1

        self.emb = nn.Embedding(self.vocab_size, embedding_dim)

        if self.rnnType == 'lstm':
            self.recNN = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=self.bidirectional)
        elif self.rnnType == 'gru':
            self.recNN = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=self.bidirectional)
        else:
            self.recNN = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True, nonlinearity='tanh', bidirectional=self.bidirectional)

        # Width of one timestep of the recurrent output.
        attn_dim = self.numDirs * self.hidden_dim

        # Both attention parameters are always created, whichever variant is
        # active. The float cast happens *inside* nn.Parameter so the
        # registered attribute is always the Parameter itself.
        self.query_vector = nn.Parameter(torch.rand(attn_dim, 1).float())  # dot attention

        self.attnWgtMatrixSize = [attn_dim, attn_dim]
        self.attnWgtMatrix = nn.Parameter(torch.randn(self.attnWgtMatrixSize).float())  # multiplicative attention

        # Normalises dot-attention scores over the time axis (dim 1).
        self.softmax = nn.Softmax(dim=1)

        if self.attnType == 'dot':
            self.fc = nn.Linear(attn_dim, output_dim)
        else:
            # Self-attention output is flattened over all timesteps.
            self.fc = nn.Linear(attn_dim * self.seq_len, output_dim)

    def _encode(self, x):
        """Embed ``x`` and return the recurrent layer's per-timestep outputs."""
        embs = self.emb(x)
        # Collapse any extra index axes into (batch, seq, embedding_dim).
        embs = embs.view(x.size(0), -1, self.embedding_dim).to(self.device)
        # No initial state is passed: torch's recurrent layers then start
        # from zeros, which is what the explicit zero h0/c0 used to provide.
        out, _ = self.recNN(embs)
        return out

    def _dot_attention(self, out):
        """Return ``out`` with each timestep scaled by its dot-attention weight."""
        attn_weights = self.softmax(out.matmul(self.query_vector))
        return out.mul(attn_weights)

    def _self_attention_scores(self, out):
        """Return row-normalised ``softmax(out @ W @ out^T)`` attention scores."""
        keyMatrix = out.permute(0, 2, 1)
        attnScores = torch.bmm(torch.matmul(out, self.attnWgtMatrix), keyMatrix)
        return F.softmax(attnScores, dim=2)

    def forward(self, x, encMode=False):
        """Classify a batch of index sequences.

        Args:
            x: Tensor of token indices whose first axis is the batch.
            encMode: If true, return the attention output that feeds the
                linear layer instead of the linear layer's output.

        Returns:
            The linear layer's output, or the pre-linear features when
            ``encMode`` is true.
        """
        out = self._encode(x)

        if self.attnType == 'dot':
            # Sum the weighted timesteps into a single context vector.
            fc_out = torch.sum(self._dot_attention(out), dim=1)
        else:
            hidden_matrix = torch.bmm(self._self_attention_scores(out), out)
            # Flatten (seq, features) so the linear layer sees every timestep.
            fc_out = hidden_matrix.view(-1, hidden_matrix.size(1) * hidden_matrix.size(2))

        if encMode:
            return fc_out
        return self.fc(fc_out)

    def getScores(self, x):
        """Return the attention-stage output for ``x``.

        For ``attnType='self'`` this is the (batch, seq, seq) matrix of
        attention scores. For ``attnType='dot'`` it is the recurrent output
        multiplied by the attention weights (not the bare weights).
        """
        out = self._encode(x)

        if self.attnType == 'dot':
            return self._dot_attention(out)
        return self._self_attention_scores(out)
        

### The optimal combination seems to be a single-layer GRU with 50 hidden units

In [None]:
# Hyper-parameters of the recurrent attention classifier.
vocab_size = len(words)
embedding_dim = 256
n_hidden = 50
n_out = 4
num_layers = 1
rnnType = 'gru'
bidir = True

# Keep using the third GPU when it exists, but fall back to the first GPU on
# machines exposing fewer than three devices, and to the CPU without CUDA.
if torch.cuda.is_available():
    device = 'cuda:2' if torch.cuda.device_count() > 2 else 'cuda:0'
else:
    device = 'cpu'

model = RecAttnArch(vocab_size, embedding_dim, n_hidden, n_out, num_layers, bidir, rnnType, 'self', device)
model.to(device)
model.float()

In [None]:
# Re-apply device placement and the float cast; both calls return the module,
# so the chained expression still displays the model as the cell output.
model.to(device).float()

In [None]:
def _evaluate(model, loader, criterion, seq_dim, device):
    """Run ``model`` over ``loader`` without gradient tracking.

    Returns:
        A ``(labels, predictions, probabilities, mean_loss)`` tuple: the true
        and predicted class ids as flat lists, the per-batch softmax outputs
        as a list of tensors, and the mean per-batch loss.
    """
    labels, preds, probs, losses = [], [], [], []

    model.eval()
    with torch.no_grad():  # no autograd graph is needed for evaluation
        for batch_text, batch_label, _ in loader:
            batch_text = batch_text.view(-1, seq_dim, 1).to(device)
            batch_label = batch_label.to(device)

            logits = model(batch_text)
            losses.append(criterion(logits, batch_label).item())

            batch_probs = torch.softmax(logits, 1)
            probs.append(batch_probs)
            preds += torch.max(batch_probs, 1)[1].cpu().numpy().tolist()
            labels += batch_label.cpu().numpy().tolist()
    model.train()

    return labels, preds, probs, float(np.mean(losses))


optimizer = torch.optim.Adagrad(model.parameters())
criterion = torch.nn.CrossEntropyLoss()

seq_dim = 30
num_epochs = 50
eval_every = 50  # validate every `eval_every` epochs ...

# Metric histories; one entry is appended to each list per validation pass.
train_losses_iterwise = []
recall_iterwise = []
precision_iterwise = []
accuracy_iterwise = []
f1score_iterwise = []
val_losses_iterwise = []

for epoch in tqdm(range(num_epochs)):
    train_losses = []
    for i, (text, label, lengths) in enumerate(tl):
        text = text.view(-1, seq_dim, 1).to(device)
        label = label.to(device)

        optimizer.zero_grad()
        outputs = model(text)

        loss = criterion(outputs, label)
        # .item() stores a plain float instead of a tensor.
        train_losses.append(loss.item())

        loss.backward()
        optimizer.step()

    # ... and always after the final epoch, so the fully trained model is
    # evaluated too (with 50 epochs, `epoch % 50 == 0` alone only fires after
    # the very first epoch).
    if epoch % eval_every == 0 or epoch == num_epochs - 1:
        allLabels, allPreds, probPreds, val_loss = _evaluate(model, vl, criterion, seq_dim, device)

        valacc = accuracy_score(allLabels, allPreds)
        recscore = recall_score(allLabels, allPreds, average='macro')
        precscore = precision_score(allLabels, allPreds, average='macro')
        f1score = f1_score(allLabels, allPreds, average='macro')
        cr = classification_report(allLabels, allPreds)
        print(f'acc: {valacc} F1 {f1score}')
        print(cr)

        train_losses_iterwise.append(np.mean(train_losses))
        val_losses_iterwise.append(val_loss)
        accuracy_iterwise.append(valacc)
        recall_iterwise.append(recscore)
        precision_iterwise.append(precscore)
        f1score_iterwise.append(f1score)

In [None]:
# Draw the self-attention score matrix of every sequence in the batch
# currently bound to `text`, one heatmap per sequence.
plt.rcParams['figure.figsize'] = [10, 10]  # loop-invariant, so set it once

with torch.no_grad():  # plotting only: no autograd graph needed
    for sample in text:
        # getScores expects a batch axis; the (1, seq, seq) result is
        # squeezed back to a single seq x seq matrix for matshow.
        wgts = model.getScores(sample.reshape(1, seq_dim, 1)).cpu().reshape(seq_dim, seq_dim)

        plt.matshow(wgts, cmap='hot')
        plt.show()