In [7]:
import io
import importlib
import numpy as np
from tqdm import tqdm
import precog_utils
from collections import defaultdict
import torch

importlib.reload(precog_utils)

<module 'precog_utils' from '/Users/ojasvasaxena/Desktop/Research/Precog_Hiring/precog_utils.py'>

In [8]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())  # header (vocab size, dim)
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        # convert to numpy array
        data[tokens[0]] = np.array(list(map(float , tokens[1:])))
    return data

# load embeddings
fasttext_en = load_vectors("Embeddings/fasttext_en.vec")  # English FastText .vec file

In [9]:
w_nurse = precog_utils.get_fasttext_embedding("nurse" , fasttext_en)
w_man = precog_utils.get_fasttext_embedding("man" , fasttext_en)
w_woman = precog_utils.get_fasttext_embedding("woman" , fasttext_en)

# Example: Checking association between 'nurse' and gendered words
print(f"Similarity(nurse, woman): {precog_utils.cosine_similarity(w_nurse , w_woman):.4f}")
print(f"Similarity(nurse, man): {precog_utils.cosine_similarity(w_nurse , w_man):.4f}")

Similarity(nurse, woman): 0.5777
Similarity(nurse, man): 0.4437


In [10]:
w_harm = precog_utils.get_fasttext_embedding("Dalit" , fasttext_en)
w_comp1 = precog_utils.get_fasttext_embedding("dirty" , fasttext_en)
w_comp2 = precog_utils.get_fasttext_embedding("clean" , fasttext_en)

# Example: Checking association between 'nurse' and gendered words
print(f"{precog_utils.cosine_similarity(w_harm , w_comp1):.4f}")
print(f"{precog_utils.cosine_similarity(w_harm , w_comp2):.4f}")

0.2145
0.1669


In [11]:
nouns = ["caste", "Brahmin", "Dalit", "tribal", "Shudra", "Savarna", "Avarna"]

qualities = ["pure", "dirty", "appealing" , "repulsive" , "peaceful", "violent", "intelligent", "uneducated", "rich", "poor", "successful", "failure", "honest", "criminals" , "beautiful" , "ugly"]

pos_adj = ["pure" , "appealing" , "peaceful" , "intelligent" , "rich" , "successful" , "honest" , "beautiful"]
neg_adj = ['criminals' , 'dirty' , 'failure' , 'poor' , 'repulsive' , 'ugly' , 'uneducated' , 'violent']

In [12]:
pos_adj_associations = defaultdict()
for w_noun in nouns:
    print("---")
    print(w_noun)
    if w_noun in fasttext_en:
        pos_adj_associations[w_noun] = []
    else:
        print("no word vector exists in FastText-En")
    
    similarities = []
    for q in pos_adj:
        similarities.append( (q , precog_utils.cosine_similarity( fasttext_en.get(w_noun) , fasttext_en.get(q))) )

    pos_adj_associations[w_noun] = sorted(similarities , key=lambda x:x[1] , reverse=True)

    print([item[0] for item in pos_adj_associations[w_noun]][:3])

---
caste
['pure', 'intelligent', 'rich']
---
Brahmin
['pure', 'rich', 'intelligent']
---
Dalit
['rich', 'pure', 'intelligent']
---
tribal
['pure', 'peaceful', 'beautiful']
---
Shudra
['rich', 'honest', 'pure']
---
Savarna
['pure', 'peaceful', 'rich']
---
Avarna
['beautiful', 'pure', 'rich']


In [13]:
neg_adj_associations = defaultdict()
for w_noun in nouns:
    print("---")
    print(w_noun)
    if w_noun in fasttext_en:
        neg_adj_associations[w_noun] = []
    else:
        print("no word vector exists in FastText-En")
    
    similarities = []
    for q in neg_adj:
        similarities.append( (q , precog_utils.cosine_similarity( fasttext_en.get(w_noun) , fasttext_en.get(q))) )

    neg_adj_associations[w_noun] = sorted(similarities , key=lambda x:x[1] , reverse=True)

    print([item[0] for item in neg_adj_associations[w_noun]][:3])

---
caste
['uneducated', 'criminals', 'poor']
---
Brahmin
['uneducated', 'poor', 'criminals']
---
Dalit
['poor', 'uneducated', 'criminals']
---
tribal
['violent', 'uneducated', 'dirty']
---
Shudra
['uneducated', 'poor', 'dirty']
---
Savarna
['uneducated', 'violent', 'criminals']
---
Avarna
['uneducated', 'repulsive', 'poor']


### T-Test
>`Null Hypothesis (H0) = The positive adjectives are used similarly for word A  and  word B . That is, their mean cosine similarities are statistically equal: muA = muB`

>`Alternate Hypothesis (H1) = The positive adjectives are used differently for word A  and  word B . That is, their mean cosine similarities are statistically unequal: statistically, muA ≠ muB`

>`If  p < 0.05 , reject  H0 , meaning H1 is significant.`

In [14]:
brahmin_sim_pos = [item[1] for item in pos_adj_associations["Brahmin"]]
dalit_sim_pos = [item[1] for item in pos_adj_associations["Dalit"]]

brahmin_sim_pos = [item[1] for item in pos_adj_associations["Brahmin"]]
tribal_sim_pos = [item[1] for item in pos_adj_associations["tribal"]]

savarna_sim_pos = [item[1] for item in pos_adj_associations["Savarna"]]
avarna_sim_pos = [item[1] for item in pos_adj_associations["Avarna"]]

In [15]:
from scipy.stats import ttest_rel

t_stat, p_value = ttest_rel(brahmin_sim_pos , dalit_sim_pos)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

t_stat, p_value = ttest_rel(brahmin_sim_pos , tribal_sim_pos)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

t_stat, p_value = ttest_rel(savarna_sim_pos , avarna_sim_pos)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

Paired t-test: t=1.8690, p=0.1038
Paired t-test: t=-6.0057, p=0.0005
Paired t-test: t=-3.7913, p=0.0068


In [16]:
brahmin_sim_neg = [item[1] for item in neg_adj_associations["Brahmin"]]
dalit_sim_neg = [item[1] for item in neg_adj_associations["Dalit"]]

brahmin_sim_neg = [item[1] for item in neg_adj_associations["Brahmin"]]
tribal_sim_neg = [item[1] for item in neg_adj_associations["tribal"]]

savarna_sim_neg = [item[1] for item in neg_adj_associations["Savarna"]]
avarna_sim_neg = [item[1] for item in neg_adj_associations["Avarna"]]

# print(f"{np.mean(brahmin_sim_neg):.4f} , {np.mean(dalit_sim_neg):.4f}")


In [17]:
from scipy.stats import ttest_rel

t_stat, p_value = ttest_rel(brahmin_sim_neg , dalit_sim_neg)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

t_stat, p_value = ttest_rel(brahmin_sim_neg , tribal_sim_neg)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

t_stat, p_value = ttest_rel(savarna_sim_neg , avarna_sim_neg)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

Paired t-test: t=3.2406, p=0.0142
Paired t-test: t=-4.4625, p=0.0029
Paired t-test: t=-2.3878, p=0.0483


# USING BERT

In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline

unmasker = pipeline('fill-mask', model='bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a mod

In [18]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [19]:
def bert_embedding(token):
    encoded_input = tokenizer(token, return_tensors='pt')

    with torch.no_grad():
        output = model(**encoded_input)

    token_embeddings = output.last_hidden_state

    word_embedding = token_embeddings[:, 1, :]
    # print(word_embedding.reshape(-1).shape)
    
    return word_embedding.reshape(-1)

In [22]:
pos_adj_associations_bert = defaultdict()
for w_noun in nouns:
    print("---")
    print(w_noun)
    pos_adj_associations_bert[w_noun] = []
    
    similarities = []
    for q in pos_adj:
        similarities.append( (q , precog_utils.cosine_similarity( bert_embedding(w_noun) , bert_embedding(q) )) )

    pos_adj_associations_bert[w_noun] = sorted(similarities , key=lambda x:x[1] , reverse=True)

    print([item[0] for item in pos_adj_associations_bert[w_noun]][:3])
    # break

---
caste
['appealing', 'intelligent', 'pure']
---
Brahmin
['beautiful', 'successful', 'honest']
---
Dalit
['honest', 'intelligent', 'beautiful']
---
tribal
['rich', 'peaceful', 'pure']
---
Shudra
['pure', 'successful', 'peaceful']
---
Savarna
['successful', 'beautiful', 'pure']
---
Avarna
['pure', 'beautiful', 'peaceful']


In [23]:
neg_adj_associations_bert = defaultdict()
for w_noun in nouns:
    print("---")
    print(w_noun)
    neg_adj_associations_bert[w_noun] = []
    
    similarities = []
    for q in neg_adj:
        similarities.append( (q , precog_utils.cosine_similarity( bert_embedding(w_noun) , bert_embedding(q) )) )

    neg_adj_associations_bert[w_noun] = sorted(similarities , key=lambda x:x[1] , reverse=True)

    print([item[0] for item in neg_adj_associations_bert[w_noun]][:3])
    # break

---
caste
['violent', 'dirty', 'failure']
---
Brahmin
['uneducated', 'repulsive', 'failure']
---
Dalit
['uneducated', 'repulsive', 'failure']
---
tribal
['dirty', 'ugly', 'criminals']
---
Shudra
['repulsive', 'uneducated', 'ugly']
---
Savarna
['repulsive', 'criminals', 'ugly']
---
Avarna
['uneducated', 'repulsive', 'poor']


## T-Test

In [27]:
bert_brahmin_sim_pos = [item[1] for item in pos_adj_associations_bert["Brahmin"]]
bert_dalit_sim_pos = [item[1] for item in pos_adj_associations_bert["Dalit"]]

bert_brahmin_sim_pos = [item[1] for item in pos_adj_associations_bert["Brahmin"]]
bert_tribal_sim_pos = [item[1] for item in pos_adj_associations_bert["tribal"]]

bert_savarna_sim_pos = [item[1] for item in pos_adj_associations_bert["Savarna"]]
bert_avarna_sim_pos = [item[1] for item in pos_adj_associations_bert["Avarna"]]

In [25]:
from scipy.stats import ttest_rel

t_stat, p_value = ttest_rel(bert_brahmin_sim_pos , bert_dalit_sim_pos)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

t_stat, p_value = ttest_rel(bert_brahmin_sim_pos , bert_tribal_sim_pos)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

t_stat, p_value = ttest_rel(bert_savarna_sim_pos , bert_avarna_sim_pos)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

Paired t-test: t=2.8378, p=0.0251
Paired t-test: t=-33.6624, p=0.0000
Paired t-test: t=14.7410, p=0.0000


In [28]:
bert_brahmin_sim_neg = [item[1] for item in neg_adj_associations_bert["Brahmin"]]
bert_dalit_sim_neg = [item[1] for item in neg_adj_associations_bert["Dalit"]]

bert_brahmin_sim_neg = [item[1] for item in neg_adj_associations_bert["Brahmin"]]
bert_tribal_sim_neg = [item[1] for item in neg_adj_associations_bert["tribal"]]

bert_savarna_sim_neg = [item[1] for item in neg_adj_associations_bert["Savarna"]]
bert_avarna_sim_neg = [item[1] for item in neg_adj_associations_bert["Avarna"]]

# print(f"{np.mean(brahmin_sim_neg):.4f} , {np.mean(dalit_sim_neg):.4f}")

In [29]:
from scipy.stats import ttest_rel

t_stat, p_value = ttest_rel(bert_brahmin_sim_neg , bert_dalit_sim_neg)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

t_stat, p_value = ttest_rel(bert_brahmin_sim_neg , bert_tribal_sim_neg)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

t_stat, p_value = ttest_rel(bert_savarna_sim_neg , bert_avarna_sim_neg)
print(f"Paired t-test: t={t_stat:.4f}, p={p_value:.4f}")

Paired t-test: t=1.7699, p=0.1201
Paired t-test: t=-18.2670, p=0.0000
Paired t-test: t=2.8454, p=0.0249
