# Reproduction of the paper's results

## setup virtual environment

1. open new terminal and install python version manager:

2. download old python version from web: https://www.python.org/downloads/release/python-375rc1/

3. also in Terminal, run these 4 commands to setup virtual environment:

4. Afterwards, restart JupyterLab and select "Python 3.7 (myenv-python37)" as kernel

## Uninstall libraries

In [1]:
%pip uninstall --yes transformers
%pip uninstall --yes torch
%pip uninstall --yes pandas
%pip uninstall --yes numpy
%pip uninstall --yes scikit-learn
%pip uninstall --yes cython

Found existing installation: transformers 3.0.2
Uninstalling transformers-3.0.2:
  Successfully uninstalled transformers-3.0.2
Note: you may need to restart the kernel to use updated packages.
Found existing installation: torch 1.5.1
Uninstalling torch-1.5.1:
  Successfully uninstalled torch-1.5.1
Note: you may need to restart the kernel to use updated packages.
Found existing installation: pandas 1.0.5
Uninstalling pandas-1.0.5:
  Successfully uninstalled pandas-1.0.5
Note: you may need to restart the kernel to use updated packages.
Found existing installation: numpy 1.19.2
Uninstalling numpy-1.19.2:
  Successfully uninstalled numpy-1.19.2
Note: you may need to restart the kernel to use updated packages.
Found existing installation: scikit-learn 0.23.1
Uninstalling scikit-learn-0.23.1:
  Successfully uninstalled scikit-learn-0.23.1
Note: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


## Install libraries with software versions stated in "requirements.txt"

In [2]:
%pip install --upgrade pip setuptools wheel
%pip install transformers==3.0.2
%pip install torch==1.5.1
%pip install pandas==1.0.5
%pip install numpy==1.19.2
%pip install scikit-learn==0.23.1
%pip install --upgrade jupyter ipywidgets
%jupyter nbextension enable --py widgetsnbextension

Note: you may need to restart the kernel to use updated packages.
Collecting transformers==3.0.2
  Using cached transformers-3.0.2-py3-none-any.whl (769 kB)
Collecting numpy (from transformers==3.0.2)
  Using cached numpy-1.21.6-cp37-cp37m-macosx_10_9_x86_64.whl (16.9 MB)
Installing collected packages: numpy, transformers
Successfully installed numpy-1.21.6 transformers-3.0.2
Note: you may need to restart the kernel to use updated packages.
Collecting torch==1.5.1
  Using cached torch-1.5.1-cp37-none-macosx_10_9_x86_64.whl (80.5 MB)
Installing collected packages: torch
Successfully installed torch-1.5.1
Note: you may need to restart the kernel to use updated packages.
Collecting pandas==1.0.5
  Using cached pandas-1.0.5-cp37-cp37m-macosx_10_9_x86_64.whl (10.0 MB)
Installing collected packages: pandas
Successfully installed pandas-1.0.5
Note: you may need to restart the kernel to use updated packages.
Collecting numpy==1.19.2
  Using cached numpy-1.19.2-cp37-cp37m-macosx_10_9_x86_64.whl

UsageError: Line magic function `%jupyter` not found.


## Functions to be added by us before running author's code

In [1]:
#this function is needed because the delimiter "," is also apparent in some queries
def process_row(row):
    if row['label'] not in ['m', 'n', 'f']:
        row["query"] = row["query"] + row["label"]
        row["label"] = row["other"]
    return row

## Author's code for training

In [None]:
def process_row(row):
    if row['label'] not in ['m', 'n', 'f']:
        row["query"] = row["query"] + row["label"]
        row["label"] = row["other"]
    return row
# importing libraries
import numpy as np
import torch
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
import pandas as pd
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

#Defining some key variables for preprocessing step
class_names = ['Female', 'Male' , 'Neutral']
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 33
TRAIN_BATCH_SIZE = 1
VALID_BATCH_SIZE = 1
EPOCHS = 8
LEARNING_RATE = 2e-5
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

#Dataset
class GenderBiasDataset(Dataset):

    def __init__(self, queries, targets, tokenizer, max_len):
        self.queries = queries
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, index):
        query_text = str(self.queries[index])
        target = self.targets[index]
         
        encoding = self.tokenizer.encode_plus(
            query_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        return {
                'query': query_text,
                'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
                'targets': torch.tensor(target, dtype=torch.long)
        }
        
#Dataloader
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GenderBiasDataset(
        queries = df['query'].to_numpy(),
        targets = df['label'].to_numpy(),
        tokenizer  =tokenizer,
        max_len = max_len
    )
    return DataLoader(
        ds,
        batch_size = batch_size,
        num_workers = 1
    )

#Training function
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels = targets
        )
        _, preds = torch.max(outputs[1], dim=1)  # the second return value is logits
        loss = outputs[0] #the first return value is loss
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)

#Evaluation function - used when adopting K-fold
def eval_model(model, data_loader, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels = targets
            )
        _, preds = torch.max(outputs[1], dim=1)
        loss = outputs[0]
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)

#Prediction function - used to calculate the accuracy of the model when true labels are available
def get_predictions(model, data_loader):
    model = model.eval()
    query_texts = []
    predictions = []
    prediction_probs = []
    real_values = []
    with torch.no_grad():
        for d in data_loader:
            texts = d["query"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            	labels = targets
              )
        _, preds = torch.max(outputs[1], dim=1)
        query_texts.extend(texts)
        predictions.extend(preds)
        prediction_probs.extend(outputs[1])
        real_values.extend(targets)
    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return query_texts, predictions, prediction_probs, real_values


#Fine-Tuning the BERT on the Dataset
result = open("BERT_Tuninig_results.txt", "w")

#=======NOTE: THE FOLLOWING SECTION HAS BEEN CHANGED===============
df = pd.read_csv("./data/queries_gender_annotated.csv", names = ["index", "query", "label", "other"])
df['label'] = df['label'].astype(str)
df = df.apply(process_row, axis=1)
df.drop(columns=["index", "other"], inplace=True)
df = df[df['label'].isin(['m', 'n', 'f'])]
labelEncoder = LabelEncoder()
df['label'] = labelEncoder.fit_transform(df['label'])
print("Shape of Dataset: {} \n".format(df.shape))
wordlist = pd.read_csv("./data/wordlist_genderspecific.txt", names = ["query", "label"])
wordlist['label'] = labelEncoder.fit_transform(wordlist['label'])
#df = pd.concat([df, wordlist], ignore_index = False)
result.write("Shape of Dataset after concatination with wordlist: {} \n".format(df.shape))
train_data_loader = create_data_loader(df, tokenizer, MAX_LEN, TRAIN_BATCH_SIZE)
#df = df.head(40)
#print(df.head(50))
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels = 3) 
model = model.to(device)

optimizer = AdamW(params =  model.parameters(), lr = LEARNING_RATE, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps = 0,
            num_training_steps = total_steps
        )

for epoch in range(EPOCHS):
    result.write(f'Epoch {epoch + 1}/{EPOCHS}')
    result.write("\n")
    result.write('-' * 10)
    result.write("\n")
    train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                optimizer,
                device,
                scheduler,
                len(df)
        )
    result.write(f'Train loss {train_loss} accuracy {train_acc}')
    result.write("\n")

torch.save(model.state_dict(), "BERT_fine_tuned.bin")
result.close()

Shape of Dataset: (3707, 2) 



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Author's code for predicting

In [None]:
# importing libraries
import transformers
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch import nn
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Constant variables 
class_names = ['Female', 'Male' , 'Neutral']
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
TEST_BATCH_SIZE = 16
MAX_LEN = 55

# Dataset
class GenderBiasDataset(Dataset):

    def __init__(self, queries, tokenizer, max_len):
        self.queries = queries
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, index):
        query_text = str(self.queries[index])
         
        encoding = self.tokenizer.encode_plus(
            query_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        return {
                'query': query_text,
                'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long)
        }

# Dataloader
def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = GenderBiasDataset(
    queries = df['query'].to_numpy(),
    tokenizer  =tokenizer,
    max_len = max_len
  )
  return DataLoader(
    ds,
    batch_size = batch_size,
    num_workers = 5 
  )

#Prediction function
def get_predictions(model, data_loader):
  model = model.eval()
  query_texts = []
  predictions = []
  prediction_probs = []
  with torch.no_grad():
    for d in data_loader:
      texts = d["query"]
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs[0], dim=1)
      query_texts.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(outputs[0])
  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  return query_texts, predictions, prediction_probs


#Reading MSMarco dev set queries (these queires do not have label)
df = pd.read_table("./data/msmacro.tsv") # a dataframe containing the queries CHANGED LINE OF CODE
test_data_loader = create_data_loader(df, tokenizer, MAX_LEN, TEST_BATCH_SIZE)

#Loading the fine-tuned model - you can download the model from https://drive.google.com/file/d/1_YTRs4v5DVUGUffnRHS_3Yk4qteJKO6w/view?usp=sharing
print("Loading the Model")
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels = 3)
model.load_state_dict(torch.load("BERT_fine_tuned.bin", map_location = device))
print("Model Loaded Successfully")

print("Prediction started")
y_query_texts, y_pred, y_pred_probs = get_predictions(model, test_data_loader)
prediction = pd.DataFrame(df.values.tolist(), columns = ["qid","query"])
prediction['female_probability'] = y_pred_probs[:, 0]
prediction['male_probability'] = y_pred_probs[:, 1]
prediction['neutral_probability'] = y_pred_probs[:, 2]
prediction['prediction'] = y_pred
prediction.to_csv("predictions.csv", index = False)


In [8]:
import numpy as np
import torch
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
import pandas as pd
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'


df = pd.read_csv("./data/queries_gender_annotated.csv", names = ["index", "query", "label", "other"])
df['label'] = df['label'].astype(str)
df = df.apply(process_row, axis=1)
df.drop(columns=["index", "other"], inplace=True)
df = df[df['label'].isin(['m', 'n', 'f'])]
print(df.shape)
labelEncoder = LabelEncoder()
df['label'] = labelEncoder.fit_transform(df['label'])
#result.write("Shape of Dataset: {} \n".format(df.shape))
wordlist = pd.read_csv("./data/wordlist_genderspecific.txt", names = ["word", "label"])
wordlist['label'] = labelEncoder.fit_transform(wordlist['label'])
print(wordlist.shape)
df = pd.concat([df, wordlist], ignore_index = False)
#result.write("Shape of Dataset after concatination with wordlist: {} \n".format(df.shape))
print(df.shape)
print(df.head(50))

(3707, 2)
(64, 2)
(3771, 3)
                                                query  label word
0                   who was known as the heretic king      1  NaN
1   who plays the main character in night at the m...      2  NaN
2                                   what is surrogate      2  NaN
3                       how popular is the name katie      0  NaN
4          how much sleep in one day does a baby need      2  NaN
5          what type of books does karen hesse write?      0  NaN
6             can you drink coffee before a mammogram      2  NaN
7                   what college did bill gates go to      1  NaN
8       who was jacqueline kennedy's social secretary      0  NaN
9                                    abbot definition      2  NaN
10            concentra how old is a cat in one year.      2  NaN
11                   when was stalins response speech      1  NaN
12                             who is jennifer garner      0  NaN
13              how many weeks can a puppy get s

In [9]:
print(df.memory_usage(deep=True))

Index     30168
query    328609
label     30168
word     122659
dtype: int64


In [None]:
from sklearn.preprocessing import LabelEncoder
def process_row(row):
    if row['label'] not in ['m', 'n', 'f']:
        row["query"] = row["query"] + row["label"]
        row["label"] = row["other"]
    return row
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from torch.utils.data import Dataset, DataLoader
# Minimal Dataset class
class SimpleDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Ensure labels are integers
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

# Load and preprocess a small part of the dataset
df = pd.read_csv("./data/queries_gender_annotated.csv", names = ["index", "query", "label", "other"])
df['label'] = df['label'].astype(str)
df = df.apply(process_row, axis=1)
df.drop(columns=["index", "other"], inplace=True)
df = df[df['label'].isin(['m', 'n', 'f'])]
print(df.shape)
labelEncoder = LabelEncoder()
df['label'] = labelEncoder.fit_transform(df['label'])

# Encode the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encodings = tokenizer(df['query'].tolist(), truncation=True, padding=True)

# Ensure labels are integers
labels = [int(label) for label in df['label'].tolist()]

# Create a minimal dataset and dataloader
dataset = SimpleDataset(encodings, labels)
data_loader = DataLoader(dataset, batch_size=1)

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Run a single batch through the model
for batch in data_loader:
    input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
    attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
    labels = batch['labels'].to('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    print(outputs)
    break  # Only test with the first batch

(3707, 2)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at