# Word Embedding Association Tests (WEAT)

Here we will implement and apply the WEAT to evaluate the indirect bias of our word embeddings. You can evaluate either the standard 50-dimensional GloVe embeddings provided by the authors of the GloVe paper or your own embeddings (e.g. from Session_1.3).  
Before you can apply WEAT you need to complete the function `word_attribute_association`.

In [None]:
!pip install git+https://github.com/millawell/bias-ml-dh.git#subdirectory=material/notebooks/bias_ml_dh_utils
!pip install --upgrade tqdm

In [None]:
%load_ext autoreload
%autoreload 2
#WEAT Word Embedding Association Tests
import torch as tr
import numpy as np
import spacy
from tqdm.notebook import tqdm
import itertools
# Load the small English pipeline and reuse the tokenizer it already built.
# `nlp.Defaults.create_tokenizer` was removed in spaCy v3; `nlp.tokenizer`
# is available in both v2 and v3.
nlp = spacy.load("en_core_web_sm")
tokenizer = nlp.tokenizer
import bias_ml_dh_utils as utils

In [None]:
# Dimensionality of the pre-trained vectors; it is substituted into the
# dataset identifier below (50 -> "glove.6B.50d").
embedding_dim = 50
embedding_identifier = "glove.6B.{}d".format(embedding_dim)

# NOTE(review): presumably fetches the dataset and returns its local path --
# confirm against bias_ml_dh_utils.download_dataset.
embedding_path = utils.download_dataset(embedding_identifier)

# NOTE(review): `vocab` is later used for membership tests and set
# intersection with word lists; its exact type and the layout of
# `embedding_matrix` are defined by utils.create_embedding_matrix -- confirm.
embedding_matrix, vocab = utils.create_embedding_matrix(embedding_path)

### Compute p-value for WEAT

In [None]:
def word_attribute_association(w, A, B, vocab):
    """Return the association s(w, A, B) of the word(s) in w with A versus B.

    The quantity to compute is

        s(w, A, B) = mean_a cos(w, a) - mean_b cos(w, b)

    i.e. the average cosine similarity between w and the attribute words in A
    minus the average cosine similarity between w and the attribute words in B.

    Parameters:
        w: list of target words (test_statistic passes a one-element list).
        A: list of attribute words of the first attribute set.
        B: list of attribute words of the second attribute set.
        vocab: vocabulary belonging to the embedding matrix.

    Raises:
        NotImplementedError: until the exercise steps below are completed.
    """
    #Here we want to compute the cosine similarity between the word w and A/B respectively
    #and subtract the average cosine similarity over B from the average cosine similarity over A, i.e.:
    #s(w,A,B) = mean_a cos(w,a) - mean_b cos(w,b)

    #Step 1: Create embedding_matrices for A, B and w.
    #Hint: You can use utils.lookup_embeddings

    #Step 2: Compute the cosine similarity (normalised dot product) for (w,A) and (w,B)

    #Step 3: Return the difference of the average cosine similarity: mean_a cos(w,a) - mean_b cos(w,b)

    # A body made only of comments is a syntax error and prevents the whole
    # cell (including test_statistic and calculate_pvalue) from being defined.
    # Fail loudly instead until the three steps above are implemented.
    raise NotImplementedError(
        "word_attribute_association is an exercise: implement steps 1-3 "
        "and remove this raise."
    )

def test_statistic(A,B,X,Y, vocab):
    """Return the WEAT test statistic for target sets X, Y and attributes A, B.

    The statistic is the summed association of every word in X minus the
    summed association of every word in Y:

        sum_x s(x, A, B) - sum_y s(y, A, B)

    Each target word is handed to word_attribute_association wrapped in a
    one-element list.
    """
    x_total = sum(
        word_attribute_association([x_word], A, B, vocab) for x_word in X
    )
    # Accumulate the negated Y associations so the final step is an addition.
    negated_y_total = sum(
        -word_attribute_association([y_word], A, B, vocab) for y_word in Y
    )
    return x_total + negated_y_total

def calculate_pvalue(A,B,X,Y,vocab,alpha=0.05):
    """Run the one-sided WEAT permutation test and return its p-value.

    Words missing from vocab are dropped from all four sets. The union of the
    remaining target words is then split in every possible way into a subset
    of size len(union)//2 and its complement; the p-value is the fraction of
    splits whose test statistic is strictly larger than the statistic of the
    given split (X, Y). The statistic is the same one test_statistic computes:
    sum_x s(x, A, B) - sum_y s(y, A, B).

    Parameters:
        A, B: lists of attribute words.
        X, Y: lists of target words; X is the set assumed to be more
            associated with A, Y the set assumed to be more associated with B.
        vocab: vocabulary of the embeddings, used for the membership filter
            and passed on to word_attribute_association.
        alpha: significance level used only for the printed verdict.

    Returns:
        The p-value as a float.

    Raises:
        ValueError: if any of the four word sets is empty after removing
            out-of-vocabulary words (the test would otherwise report a
            meaningless p-value of 0.0 as "significant").
    """
    #check out-of-vocab words
    # dict.fromkeys de-duplicates while keeping the caller's word order, so
    # the printed lists and the float summation order are reproducible.
    A = [word for word in dict.fromkeys(A) if word in vocab]
    B = [word for word in dict.fromkeys(B) if word in vocab]
    X = [word for word in dict.fromkeys(X) if word in vocab]
    Y = [word for word in dict.fromkeys(Y) if word in vocab]

    for set_name, words in (("A", A), ("B", B), ("X", X), ("Y", Y)):
        if not words:
            raise ValueError(
                "Word set {} is empty after removing out-of-vocabulary "
                "words.".format(set_name)
            )

    union = list(dict.fromkeys(X + Y))
    subset_size = len(union)//2

    # The association of a word does not depend on the partition, so compute
    # it once per word instead of once per word per partition.
    association = {
        word: word_attribute_association([word], A, B, vocab)
        for word in union
    }

    def partition_statistic(first, second):
        return (sum(association[word] for word in first)
                - sum(association[word] for word in second))

    test_stat_orig = partition_statistic(X, Y)

    # combinations over distinct words are already unique; the list is only
    # materialised so that tqdm knows the total for its progress bar.
    partitions = list(itertools.combinations(union, subset_size))

    larger = 0
    total = 0

    for subset in tqdm(partitions):
        total += 1
        chosen = set(subset)
        Yi = [word for word in union if word not in chosen]
        if partition_statistic(subset, Yi) > test_stat_orig:
            larger += 1

    p_value = larger/float(total)
    if p_value<alpha:
        print("The difference between the attributes {} and {} \nfor the given target words is significant.".format(A,B))
    else:
        print("The difference between the attributes {} and {} \nfor the given target words is not significant.".format(A,B))

    return p_value

### Try it out yourself!

In [None]:
############################################################################
# The test is one-sided and therefore not symmetric in X and Y:
# put the target words you assume are more associated with A into X
# and the target words you assume are more associated with B into Y.
############################################################################
# Attribute word sets A and B.
A = ['female', 'woman']
B = ['male', 'man']

# Target word sets X and Y. The commented-out lines are alternative,
# smaller target sets you can swap in.
X = ['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives']
# X = ['nurse','teacher','librarian']
Y = ['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career']
# Y = ['programmer','engineer','scientist']

# Prints a significance verdict (default alpha=0.05) and returns the p-value.
p = calculate_pvalue(A,B,X,Y, vocab)
print(p)