# Preprocessing the Data

In [None]:
from tqdm import tqdm_notebook as tqdm
import preprocessor as p
import numpy as np
import pandas
from math import log
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys
import re
from collections import defaultdict, Counter

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
import torch.nn.functional as F

In [None]:
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score,f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [None]:
import matplotlib.pyplot as plt
import random

In [None]:
def preprocess(text):
    """Tokenize ``text`` with the ``preprocessor`` module and split it into a list.

    The module is (re)configured with the URL, MENTION, EMOJI and HASHTAG
    options on every call; the string returned by ``p.tokenize`` is then
    split on whitespace.
    """
    enabled_options = (p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.HASHTAG)
    p.set_options(*enabled_options)
    tokenized = p.tokenize(text)
    return tokenized.split()

In [None]:
# Load the labelled tweets; later cells read the `tweet` and `class` columns.
data = pandas.read_csv('../HS_labeled_data.csv')
# Bare expression so the frame is displayed when the cell runs.
data

In [None]:
def preprocess(text):
    """Return the whitespace-separated tokens produced by ``p.tokenize``.

    Re-declares the helper defined in an earlier cell with the same
    behaviour: the ``preprocessor`` module is set to the URL, MENTION, EMOJI
    and HASHTAG options before ``text`` is tokenized.
    """
    p.set_options(
        p.OPT.URL,
        p.OPT.MENTION,
        p.OPT.EMOJI,
        p.OPT.HASHTAG,
    )
    tokens = p.tokenize(text).split()
    return tokens

In [None]:
def indexer(split_text):
    """Map each token of ``split_text`` to its vocabulary index.

    Tokens are lower-cased before the lookup in the module-level
    ``word2idx``; tokens missing from it map to the index of ``'_UNK'``.

    Returns:
        A list of indices, one per input token, in the original order.
    """
    lowered = (token.lower() for token in split_text)
    return [
        word2idx[token] if token in word2idx else word2idx['_UNK']
        for token in lowered
    ]

In [None]:
# train_test_split returns row selections of `data`. Later cells add columns
# to both parts, so take explicit copies: the assignments then act on
# independent frames and cannot trigger pandas' SettingWithCopyWarning.
# NOTE(review): no random_state is passed, so the split differs between runs.
train, valid = (part.copy() for part in train_test_split(data))

In [None]:
# Bare expression so the training split is displayed when the cell runs.
train

In [None]:
# Tokenize the lower-cased, stripped training tweets.
train['clean_text'] = train['tweet'].apply(lambda tweet: preprocess(tweet.lower().strip()))

# Count every lower-cased token across the training set.
token_counts = Counter()
for tokens in tqdm(train['clean_text'].values):
    token_counts.update(map(str.lower, tokens))

# Vocabulary sorted with the most frequently occurring words first; the
# padding and unknown markers are prepended so they take indices 0 and 1.
words = ['_PAD', '_UNK'] + sorted(token_counts, key=token_counts.get, reverse=True)

# Lookup tables in both directions.
word2idx = {token: index for index, token in enumerate(words)}
idx2word = dict(enumerate(words))

# Per-tweet index sequence, token count and label.
train['sentence2idx'] = train['clean_text'].apply(indexer)
train['length'] = train['clean_text'].apply(len)
train['label'] = train['class']

In [None]:
# Tokenize the validation tweets exactly like the training tweets: lower-case
# and strip first, so `clean_text` has the same casing in both splits.
valid['clean_text'] = valid.tweet.apply(lambda x: preprocess(x.lower().strip()))

# Index sequence (unknown tokens map to '_UNK' inside indexer), token count
# and label for each validation tweet.
valid['sentence2idx'] = valid.clean_text.apply(indexer)
valid['length'] = valid.clean_text.apply(len)
valid['label'] = valid['class']

In [None]:
class VectorizeData(Dataset):
    """Dataset over a frame of index-encoded sentences.

    ``df`` must provide ``sentence2idx``, ``label`` and ``length`` columns.
    Each item is ``(padded_indices, label, length)`` where ``padded_indices``
    is an int64 array of exactly ``maxlen`` entries (zero padded on the right
    or truncated) and ``length`` is the value stored in the frame.
    """

    def __init__(self, df, maxlen=30):
        self.maxlen = maxlen
        self.df = df
        # Side effect: the padded arrays are written back to the frame as a
        # new `padded_text` column.
        self.df['padded_text'] = self.df['sentence2idx'].apply(self.pad_data)
        # Plain lists give positional access regardless of the frame's index.
        self.padded_text = list(self.df['padded_text'])
        self.labels = list(self.df['label'])
        self.lengths = list(self.df['length'])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.padded_text[idx], self.labels[idx], self.lengths[idx]

    def pad_data(self, s):
        """Return ``s`` truncated or right-padded with zeros to ``maxlen`` entries."""
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        clipped = s[:self.maxlen]
        padded[:len(clipped)] = clipped
        return padded

In [None]:
# Despite the `_loader` suffix these are VectorizeData datasets (default
# maxlen of 30); the DataLoaders are built from them in the following cells.
train_loader = VectorizeData(train)
valid_loader = VectorizeData(valid)

In [None]:
# Shuffled training batches of 100 samples; print the number of batches.
tl = DataLoader(dataset=train_loader, batch_size=100, shuffle=True)
print(len(tl))

In [None]:
# Validation batches of 100 samples in dataset order; print the number of batches.
vl = DataLoader(dataset=valid_loader, batch_size=100, shuffle=False)
print(len(vl))

In [None]:
# Sanity check: print the index and contents of the first training batch only.
for i, samples in enumerate(tl):
    print(i, samples, sep='\n')
    break

In [None]:
# Sanity check: print the index and contents of the first validation batch only.
for i, samples in enumerate(vl):
    print(i, samples, sep='\n')
    break

## PMI

In [None]:
def computePMIMatrix(listOfTokenizedSentences):
    """Estimate sentence-level word and word-pair probabilities.

    A word (or ordered pair of words, including a word paired with itself)
    counts at most once per sentence; counts are divided by the number of
    sentences.

    Args:
        listOfTokenizedSentences: sequence of token sequences.

    Returns:
        ``(wordProbs, pairProbs)`` where ``wordProbs[w]`` is the fraction of
        sentences containing ``w`` and ``pairProbs[w1][w2]`` the fraction
        containing both. Both are defaultdicts that yield 0 for unseen keys.
    """
    numSentences = len(listOfTokenizedSentences)
    wordCounts = defaultdict(int)

    print('Calculating Word Probabilities')
    for tokenizedSent in tqdm(listOfTokenizedSentences):
        for word in set(tokenizedSent):
            wordCounts[word] += 1

    # Turn document frequencies into probabilities in place.
    for word in wordCounts:
        wordCounts[word] /= numSentences

    pairwiseCounts = defaultdict(lambda: defaultdict(int))

    print('Calculating PairWise Probabilities')
    for tokenizedSent in tqdm(listOfTokenizedSentences):
        uniqueWords = set(tokenizedSent)
        increment = 1 / numSentences
        for first in uniqueWords:
            row = pairwiseCounts[first]
            for second in uniqueWords:
                row[second] += increment

    return wordCounts, pairwiseCounts

In [None]:
# a: per-word probabilities, b: pairwise probabilities, both estimated on the
# training tokens; PPMI() reads these two module-level names.
a,b = computePMIMatrix(list(train['clean_text']))

In [None]:
def PPMI(w1,w2):
    """Return the positive pointwise mutual information of two words.

    Computes ``max(0, log p(w1, w2) - (log p(w1) + log p(w2)))`` with natural
    logarithms, reading the probabilities from the module-level ``a`` (single
    words) and ``b`` (pairs).

    Returns 0 when either word or the pair has no probability mass. The
    lookups use ``.get`` so that querying unseen words does not insert new
    keys into the defaultdicts, and there is no blanket ``except`` that could
    hide unrelated errors.
    """
    row = b.get(w1)
    joint = row.get(w2, 0) if row else 0
    p1 = a.get(w1, 0)
    p2 = a.get(w2, 0)

    # log() is undefined for non-positive values; treat them as "no association".
    if joint <= 0 or p1 <= 0 or p2 <= 0:
        return 0

    return max(0, log(joint) - (log(p1) + log(p2)))

In [None]:
def normalize_adj(adj):
    """Symmetrically normalize an adjacency matrix.

    Each entry is scaled by the inverse square roots of the row sums of both
    endpoints. Rows whose sum is zero get a scaling factor of 0.

    Returns:
        The normalized matrix as a scipy COO sparse matrix.
    """
    coo = sp.coo_matrix(adj)
    degrees = np.array(coo.sum(1))
    inv_sqrt_degrees = np.power(degrees, -0.5).flatten()
    # A zero row sum produces inf above; zero it out so the row stays empty.
    inv_sqrt_degrees[np.isinf(inv_sqrt_degrees)] = 0.
    scaling = sp.diags(inv_sqrt_degrees)
    column_scaled = coo.dot(scaling)
    return column_scaled.transpose().dot(scaling).tocoo()

def preprocess_adj(adj):
    """Add self-loops to ``adj``, normalize symmetrically and return a dense array."""
    identity = sp.eye(adj.shape[0])
    with_self_loops = adj + identity
    return normalize_adj(with_self_loops).toarray()

def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials of the rescaled graph Laplacian up to order k.

    The Laplacian ``I - normalize_adj(adj)`` is rescaled with its
    largest-magnitude eigenvalue to ``2 / lambda_max * L - I`` and the
    recurrence ``T_i = 2 * L_scaled * T_{i-1} - T_{i-2}`` is applied.

    Args:
        adj: square adjacency matrix (anything ``normalize_adj`` accepts that
            also has a ``shape`` attribute).
        k: highest polynomial order.

    Returns:
        A list of dense arrays ``[T_0, T_1, ..., T_k]``. ``T_0`` and ``T_1``
        are always included, so the list has at least two entries.
    """
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    identity = sp.eye(adj.shape[0])
    laplacian = identity - normalize_adj(adj)
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    # Convert to CSR once; the recurrence below reuses it on every step.
    scaled_laplacian = sp.csr_matrix(
        (2. / largest_eigval[0]) * laplacian - identity)

    t_k = [identity.toarray(), scaled_laplacian.toarray()]
    for _ in range(2, k + 1):
        t_k.append(2 * scaled_laplacian.dot(t_k[-1]) - t_k[-2])

    return t_k

## RecModel

In [None]:
class RecArch(nn.Module):
    """Recurrent text classifier: embedding -> LSTM/GRU/RNN -> linear head.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the recurrent layer (per direction).
        output_dim: number of output logits.
        num_layers: number of stacked recurrent layers.
        bidir: whether the recurrent layer is bidirectional.
        rnnType: one of ``'lstm'``, ``'gru'`` or ``'rnn'``.
        device: device the embedded input is moved to before the recurrence.

    Raises:
        ValueError: if ``rnnType`` is not one of the supported names.
    """

    _RNN_TYPES = ('lstm', 'gru', 'rnn')

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, bidir, rnnType,device):
        super(RecArch, self).__init__()

        # Fail early: otherwise `recNN` is never created and the first
        # forward pass dies with an AttributeError.
        if rnnType not in self._RNN_TYPES:
            raise ValueError(
                "rnnType must be one of {}, got {!r}".format(self._RNN_TYPES, rnnType))

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.device = device
        self.rnnType = rnnType
        self.bidirectional = bidir
        self.numDirs = 2 if self.bidirectional else 1

        self.emb = nn.Embedding(self.vocab_size, embedding_dim)

        if self.rnnType == 'lstm':
            self.recNN = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                                 batch_first=True, bidirectional=self.bidirectional)
        elif self.rnnType == 'gru':
            self.recNN = nn.GRU(embedding_dim, hidden_dim, num_layers,
                                batch_first=True, bidirectional=self.bidirectional)
        else:
            self.recNN = nn.RNN(embedding_dim, hidden_dim, num_layers,
                                batch_first=True, nonlinearity='tanh',
                                bidirectional=self.bidirectional)

        self.fc = nn.Linear(self.numDirs*hidden_dim,output_dim)

    def encode(self,x):
        """Return the recurrent output at the last time step.

        ``x`` holds token indices; its first dimension is treated as the
        batch and everything else is flattened into the sequence dimension.
        The result has shape ``(batch, numDirs * hidden_dim)``.
        """
        embs = self.emb(x)
        embs = embs.view(x.size(0),-1,self.embedding_dim).to(self.device)

        # The initial hidden (and cell) state defaults to zeros when omitted,
        # which is what the explicit zero tensors used to provide.
        out, _ = self.recNN(embs)

        # NOTE(review): the last time step is taken regardless of the true
        # sentence length, so for right-padded inputs it has seen padding.
        return out[:, -1, :]

    def forward(self,x):
        """Return classification logits of shape ``(batch, output_dim)``."""
        return self.fc(self.encode(x))

### Optimal combination seems to be with GRU of 50 units and 1 layer

In [None]:
# Hyper-parameters of the recurrent text encoder.
vocab_size = len(words)
embedding_dim = 256
n_hidden = 50
n_out = 3
num_layers = 1
rnnType = 'gru'
bidir = False

if torch.cuda.is_available():
    # Keep using the second GPU when there is one, but fall back to the first
    # GPU on single-GPU machines, where 'cuda:1' is an invalid device.
    device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
else:
    device = 'cpu'

In [None]:
# Build the recurrent classifier from the hyper-parameters defined in the previous cell.
model = RecArch(vocab_size,embedding_dim,n_hidden,n_out,num_layers,bidir,rnnType,device)
# Move its parameters to the selected device.
model = model.to(device)
# Cast floating-point parameters to float32; the call returns the module,
# so the cell also displays it.
model.float()

# Pretraining Text Encoder

In [None]:
# Pre-train the recurrent classifier with cross-entropy on the 3 classes.
optimizer = torch.optim.Adagrad(model.parameters(),lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Must equal the padded length produced by VectorizeData (maxlen=30).
seq_dim = 30
num_epochs = 200

# Histories, appended to once per evaluation round (every 50 epochs).
train_losses_iterwise = []
recall_iterwise = []
precision_iterwise = []
accuracy_iterwise = []
f1score_iterwise = []
val_losses_iterwise = []  # kept for compatibility; no validation loss is computed

for epoch in tqdm(range(num_epochs)):
    train_losses = []
    val_losses = []
    for i, (text,label,lengths) in enumerate(tl):
        text = text.view(-1, seq_dim, 1).to(device)
        label = label.to(device)

        optimizer.zero_grad()
        outputs = model(text)

        loss = criterion(outputs, label)
        # Store a plain float so the history does not hold on to tensors.
        train_losses.append(loss.item())

        loss.backward()
        optimizer.step()

    if epoch % 50 == 0:
        allLabels = []
        allPreds = []
        probPreds = []

        # Evaluation needs no gradients; without no_grad every stored
        # probability tensor would keep its autograd graph alive.
        with torch.no_grad():
            for i, (text,label,lengths) in enumerate(vl):
                text = text.view(-1, seq_dim, 1).to(device)
                label = label.to(device)

                predicted = torch.softmax(model(text), 1)
                probPreds.append(predicted)
                predicted = torch.max(predicted, 1)[1].cpu().numpy().tolist()

                allLabels += label.cpu().numpy().tolist()
                allPreds += predicted

        valacc = accuracy_score(allLabels, allPreds)
        recscore = recall_score(allLabels, allPreds,average='macro')
        precscore = precision_score(allLabels, allPreds,average='macro')
        f1score = f1_score(allLabels, allPreds,average='macro')
        cr = classification_report(allLabels, allPreds)
        print(cr)

        # Record the metrics instead of discarding them.
        train_losses_iterwise.append(np.mean(train_losses))
        accuracy_iterwise.append(valacc)
        recall_iterwise.append(recscore)
        precision_iterwise.append(precscore)
        f1score_iterwise.append(f1score)

# Training Conv Net

In [None]:
# Rebuild both loaders with one sentence per batch: the graph model below
# builds one adjacency matrix per sentence. Both loaders are shuffled here.
tl = DataLoader(dataset=train_loader, batch_size=1, shuffle=True)
vl = DataLoader(dataset=valid_loader, batch_size=1, shuffle=True)

In [None]:
def computeAdjMatrix(text):
    """Build the normalized adjacency matrix of one sentence graph.

    ``text`` is flattened to a list of token indices and mapped back to words
    through ``idx2word``. Word-to-word edges carry their PPMI score; one
    additional node is appended that is connected to every node (and itself)
    with weight 1. The (n + 1) x (n + 1) matrix is passed through
    ``preprocess_adj`` and returned as a dense array.
    """
    tokens = [idx2word[index] for index in text.reshape(-1).tolist()]

    # One row per word: PPMI against every word, plus the link to the extra node.
    matrix = [[PPMI(source, target) for target in tokens] + [1] for source in tokens]
    # Row of the extra node itself.
    matrix.append([1] * (len(tokens) + 1))

    return preprocess_adj(np.array(matrix))

In [None]:
class GraphConvLayer(nn.Module):
    """Graph convolution with a learned re-weighting of the adjacency matrix.

    Computes ``relu((A_hat @ attn) @ (X @ weight))`` where ``attn`` is a
    ``(seq_dim, seq_dim)`` parameter and ``weight`` an ``(in_size, out_size)``
    parameter.
    """

    def __init__(self, in_size, out_size,seq_dim):
        super(GraphConvLayer,self).__init__()

        self.attn = nn.Parameter(torch.FloatTensor(seq_dim, seq_dim))
        self.weight = nn.Parameter(torch.FloatTensor(in_size, out_size))

        # Zero-mean normal init, weight first and attn second.
        # NOTE(review): 2 / (fan_in + fan_out) is handed to normal_ as its
        # second argument (the std) - confirm whether a square root was intended.
        self.weight.data.normal_(0, 2. / (out_size + in_size))
        self.attn.data.normal_(0, 2. / (seq_dim + seq_dim))

    def forward(self,X,A_hat):
        """Return the activated node features for inputs ``X`` and adjacency ``A_hat``."""
        projected = torch.mm(X, self.weight)
        return F.relu(torch.mm(self.getScores(A_hat), projected))

    def getScores(self,A_hat):
        """Return the re-weighted adjacency ``A_hat @ attn``."""
        return torch.mm(A_hat, self.attn)

In [None]:
class GraphConvNet(nn.Module):
    """Two stacked GraphConvLayers followed by a linear classification head.

    Args:
        feature_dim: size of the node features (kept by both graph layers).
        seq_dim: number of graph nodes, i.e. the side of the adjacency matrix.
        num_classes: number of output logits per node (default 3).
    """

    def __init__(self,feature_dim,seq_dim,num_classes=3):
        super(GraphConvNet, self).__init__()
        self.graphlayer1 = GraphConvLayer(feature_dim,feature_dim,seq_dim)
        self.graphlayer2 = GraphConvLayer(feature_dim,feature_dim,seq_dim)
        # Sized from feature_dim instead of a hard-coded 50, so the head
        # matches the graph layers for any feature size.
        self.fc = nn.Linear(feature_dim,num_classes)

    def forward(self,X,A_hat):
        """Return per-node logits of shape ``(seq_dim, num_classes)``.

        ``A_hat`` may be a tensor or an array; it is converted to float32 on
        the device of ``X`` without copy-constructing a tensor from a tensor.
        """
        A_hat = torch.as_tensor(A_hat, dtype=torch.float32, device=X.device)
        X1 = self.graphlayer1(X,A_hat)
        X2 = self.graphlayer2(X1,A_hat)

        return self.fc(X2)

### Custom GCN with Fixed Adj Matrix

In [None]:
# 50 matches the encoder's hidden size (n_hidden); 31 graph nodes = 30 token
# positions (seq_dim) plus the extra node appended by computeAdjMatrix.
gcnModel = GraphConvNet(50,31).to(device)
# Initialise the graph model's classification head from the pre-trained text model's head.
gcnModel.fc.load_state_dict(model.fc.state_dict())

In [None]:
seq_dim = 30

# Only the graph model's parameters are optimised; the text encoder `model`
# is used to produce node features.
optimizer = optim.Adam(gcnModel.parameters(), lr=0.02)
criterion = torch.nn.CrossEntropyLoss()

for epoch in tqdm_notebook(range(5)):
    train_losses = []
    # The reshapes below assume one sentence per batch (the loaders were
    # rebuilt with batch_size=1).
    for i, (text,label,lengths) in tqdm_notebook(enumerate(tl),total=len(tl)):
        # One encoding per token position (each token as a length-1 sequence) ...
        textencs = model.encode(text.reshape(seq_dim,-1,1).to(device))
        # ... plus one encoding of the whole sentence as the extra, last node.
        sentenc = model.encode(text.reshape(-1, seq_dim, 1).to(device))
        embeds = torch.cat([textencs,sentenc])
        label = label.to(device)

        adj_matrix = torch.tensor(computeAdjMatrix(text)).to(device)

        optimizer.zero_grad()
        outputs = gcnModel(embeds,adj_matrix)

        # The sentence is classified from the last (sentence) node.
        loss = criterion(outputs[-1].reshape(1,-1), label)

        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
    print(np.average(train_losses))

    allLabels = []
    allPreds = []
    probPreds = []

    with torch.no_grad():
        for i, (text,label,lengths) in enumerate(vl):
            textencs = model.encode(text.reshape(seq_dim,-1,1).to(device))
            sentenc = model.encode(text.reshape(-1, seq_dim, 1).to(device))
            embeds = torch.cat([textencs,sentenc])
            label = label.to(device)

            # Build the graph of THIS validation sentence; previously the
            # adjacency matrix of the last training sample was reused here.
            adj_matrix = torch.tensor(computeAdjMatrix(text)).to(device)

            outputs = gcnModel(embeds,adj_matrix)
            predicted = torch.softmax(outputs[-1].reshape(1,-1),1)
            predicted = torch.max(predicted, 1)[1].cpu().numpy().tolist()
            allLabels += label.cpu().numpy().tolist()
            allPreds += predicted

    valacc = accuracy_score(allLabels, allPreds)
    f1score = f1_score(allLabels, allPreds,average='macro')
    cr = classification_report(allLabels, allPreds)
    print(f'acc: {valacc} f1 {f1score}')

In [None]:
# Display the token indices of the last sentence processed by the loop above.
text

In [None]:
# Row-wise softmax over the second graph layer's re-weighted adjacency scores,
# computed for the `adj_matrix` left over from the loop above.
bigf = torch.softmax(gcnModel.graphlayer2.getScores(adj_matrix.float()).cpu().detach(),1)

In [None]:
# Use a 10 x 10 inch figure for the heat map.
plt.rcParams['figure.figsize'] = [10, 10]

# Visualise the normalized score matrix computed in the previous cell.
plt.matshow(bigf, cmap='hot')
plt.show()

# Identifying Bias-Sensitive Words (BSWs)

In [None]:
# Hand-written list of identity terms (sexual orientation, gender, ethnicity /
# nationality, religion, age, disability).
# NOTE(review): presumably taken from Dixon et al., judging by the name - confirm
# the source. The list is not referenced by any other cell shown in this notebook.
dixon_bsws = ['lesbian', 'gay', 'bisexual', 'transgender', 'trans', 'queer', 'lgbt', 'lgbtq', 'homosexual', 'straight', 'heterosexual', 'male', 'female', 'nonbinary',
'african', 'african american', 'black', 'white', 'european', 'hispanic', 'latino', 'latina', 'latinx', 'mexican', 'canadian', 'american', 'asian', 'indian',
'middle eastern', 'chinese', 'japanese', 'christian', 'muslim', 'jewish', 'buddhist', 'catholic', 'protestant', 'sikh', 'taoist', 'old', 'older', 'young',
'younger', 'teenage', 'millenial', 'middle aged', 'elderly', 'blind', 'deaf', 'paralyzed']

In [None]:
# Score every vocabulary word on its own: encode the single token, classify it
# with the graph model's head, and keep the words whose most likely class is
# class 0 with probability above 0.7.
bsws = {}
for word in tqdm(word2idx):
    vect = torch.tensor([word2idx[word]]).to(device)
    scores = torch.softmax(gcnModel.fc(model.encode(vect)), 1)

    top_score = torch.max(scores)
    if top_score > 0.7 and torch.argmax(scores) == 0:
        bsws[word] = top_score.item()

In [None]:
# Display the selected words together with their class-0 probability.
bsws

In [None]:
def testModelClassification(sentence):
    """Return the recurrent model's raw logits for a single sentence.

    The sentence is lower-cased, stripped and tokenized with ``preprocess``.
    Tokens are converted with ``indexer`` so that out-of-vocabulary words map
    to ``'_UNK'`` instead of raising ``KeyError``.

    Returns:
        A ``(1, n_classes)`` tensor of logits (no softmax applied).
    """
    tokens = preprocess(sentence.lower().strip())
    token_ids = torch.tensor(indexer(tokens)).reshape(1,-1).to(device)
    return model(token_ids)

In [None]:
def testGraphClassification(sentence):
    """Run the graph model on a single sentence.

    The sentence is tokenized like the training data, converted with
    ``indexer`` (out-of-vocabulary words map to ``'_UNK'``) and truncated or
    right-padded with ``'_PAD'`` to the number of token nodes the graph model
    was built for. Without this the adjacency matrix would not match the
    model's fixed-size ``attn`` parameter.

    Returns:
        ``(outputs, adj_matrix)``: per-node logits, whose last row belongs to
        the sentence node, and the adjacency matrix that was used.
    """
    tokens = preprocess(sentence.lower().strip())

    # attn is (nodes, nodes) with nodes = token positions + 1 sentence node.
    seq_len = gcnModel.graphlayer1.attn.size(0) - 1
    token_ids = indexer(tokens)[:seq_len]
    token_ids = token_ids + [word2idx['_PAD']] * (seq_len - len(token_ids))
    text = torch.tensor(token_ids).reshape(1,-1).to(device)

    # Same node features as in training: one encoding per token position,
    # followed by the encoding of the whole sentence.
    textencs = model.encode(text.reshape(seq_len,-1,1))
    sentenc = model.encode(text.reshape(-1, seq_len, 1))
    embeds = torch.cat([textencs,sentenc])

    adj_matrix = torch.tensor(computeAdjMatrix(text)).to(device)

    outputs = gcnModel(embeds,adj_matrix)

    return outputs, adj_matrix

In [None]:
def SOAC(listOfTokenizedSentences, listOfLabels):
    """Collect term and document frequencies, overall and for labels 0 and 2.

    Args:
        listOfTokenizedSentences: sequence of token sequences.
        listOfLabels: label of each sentence, aligned by position.

    Returns:
        ``(tfs, dfs, df_pos, df_neg)``: total occurrences per word, number of
        sentences containing the word, and the number of such sentences with
        label 0 and with label 2. All four are defaultdicts that yield 0 for
        unseen words.
    """
    tfs = defaultdict(int)
    dfs = defaultdict(int)
    df_pos = defaultdict(int)
    df_neg = defaultdict(int)

    for position, sentence in enumerate(listOfTokenizedSentences):
        for token, occurrences in Counter(sentence).items():
            tfs[token] += occurrences
            dfs[token] += 1

            label = listOfLabels[position]
            if label == 0:
                df_pos[token] += 1
            if label == 2:
                df_neg[token] += 1

    return tfs, dfs, df_pos, df_neg

In [None]:
# Frequency statistics over the training tokens, using the `class` column as labels.
tfs,dfs,df_pos,df_neg = SOAC(list(train['clean_text']),list(train['class']))

In [None]:
def getSOAC_BSWs(tfs,dfs,df_pos,df_neg,threshold):
    """Select words that are frequent and appear in more label-0 than label-2 sentences.

    A word is kept when its total frequency in ``tfs`` exceeds ``threshold``
    and ``df_pos[word] > df_neg[word]``. ``dfs`` is accepted but not used.

    Returns:
        The selected words as a list, in the iteration order of ``tfs``.
    """
    return [
        token
        for token in list(tfs.keys())
        if tfs[token] > threshold and df_pos[token] > df_neg[token]
    ]

In [None]:
# Rebinds `bsws` (previously a dict of model-selected words) to the list of
# frequency-selected words with total frequency above 50.
bsws = getSOAC_BSWs(tfs,dfs,df_pos,df_neg,50)

### Bias Examples

In [None]:
# Logits of the recurrent model for a single identity term.
testModelClassification('woman')

In [None]:
# Logits of the recurrent model for a short neutral sentence.
testModelClassification('kat is a woman')

In [None]:
# Same sentence through the graph model; keep both the per-node outputs and
# the adjacency matrix for inspection in the next cells.
out,adj = testGraphClassification('kat is a woman')

In [None]:
# Display the adjacency matrix returned above.
adj

In [None]:
# Display the graph model's per-node outputs returned above.
out

In [None]:
# Same template as the previous example with a different name.
testModelClassification('alice is a woman')

In [None]:
# Direct forward pass on a hand-tokenized sentence. The words are looked up in
# word2idx without a fallback, so each one must be in the vocabulary.
model(torch.tensor([word2idx[x] for x in ['can','you','throw','that','garbage','please']]).reshape(1,-1).to(device))

# Pinned Bias Metrics

In [None]:
def pinned_bias(listOfProbabilities, threshold_type, num_classes=3):
    """Average absolute deviation of probabilities from a pinned reference value.

    Args:
        listOfProbabilities: sequence of probabilities, one per word.
        threshold_type: selects the reference value:
            ``'mean'`` - reference 0, i.e. the mean absolute probability;
            ``'sym'``  - reference ``1 / num_classes``;
            ``'asym'`` - reference ``min(p, 0.5)``, so only the part of each
            probability above 0.5 contributes.
        num_classes: number of classes, used by ``'sym'`` only.

    Returns:
        The sum of absolute deviations divided by the number of probabilities.

    Raises:
        ValueError: if ``threshold_type`` is not one of the three names above
            (this used to surface as an UnboundLocalError).
    """
    probs = np.asarray(listOfProbabilities, dtype=float)

    if threshold_type == 'mean':
        deviation = probs
    elif threshold_type == 'sym':
        deviation = probs - 1 / num_classes
    elif threshold_type == 'asym':
        deviation = probs - np.minimum(probs, 0.5)
    else:
        raise ValueError(
            "threshold_type must be 'mean', 'sym' or 'asym', got {!r}".format(threshold_type))

    return np.absolute(deviation).sum() / len(listOfProbabilities)

In [None]:
# Class-0 probability assigned by the graph model to every selected word.
# The sentence node (last row) is the one used for training and prediction,
# and pinned_bias works on probabilities, so apply a softmax to that row
# instead of reading a raw logit of the first word node.
hatefulProbsOfBSWs = [torch.softmax(testGraphClassification(word)[0][-1], 0)[0].item() for word in bsws]

In [None]:
# Report the pinned bias of the graph model under each threshold type, one per line.
for threshold_type in ('mean', 'sym', 'asym'):
    print(pinned_bias(hatefulProbsOfBSWs, threshold_type))

In [None]:
# Class-0 probability assigned by the recurrent model to every selected word.
# testModelClassification returns raw logits, so convert them with a softmax
# before they are fed to the probability-based pinned_bias metric.
hatefulProbsOfBSWs = [torch.softmax(testModelClassification(word), 1)[0][0].item() for word in bsws]

In [None]:
# Report the pinned bias of the recurrent model under each threshold type, one per line.
for threshold_type in ('mean', 'sym', 'asym'):
    print(pinned_bias(hatefulProbsOfBSWs, threshold_type))