# Reproduction of the paper's results

## Set up the virtual environment

Create and activate a Python virtual environment, then install `ipykernel` into it (run these commands in a terminal):

In [None]:
# Run these commands in a terminal, not in a notebook cell.
# Create an isolated virtual environment in ./myenv and activate it for this shell.
python3 -m venv myenv
source myenv/bin/activate
# Install ipykernel inside the environment, then register the environment as a
# Jupyter kernel named "myenv" so that JupyterLab can list it in the kernel picker.
pip install ipykernel
python -m ipykernel install --user --name myenv --display-name "myenv"

Afterwards, restart JupyterLab and select "myenv" as the kernel.

## Install correct libraries

In [5]:
# Bring pip and the packaging tools up to date before touching any library.
%pip install --upgrade pip setuptools wheel
# Remove whatever copies of these packages are currently installed in the kernel's
# environment; --yes answers pip's confirmation prompt automatically.
%pip uninstall --yes transformers
%pip uninstall --yes torch
%pip uninstall --yes pandas
%pip uninstall --yes numpy
%pip uninstall --yes scikit-learn
%pip uninstall --yes cython

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
# The pinned installs of transformers, torch and pandas below are commented out (disabled).
#%pip install transformers==3.0.2
#%pip install torch==1.5.1
#%pip install pandas==1.0.5
%pip install --upgrade cython
# --only-binary :all: restricts pip to prebuilt wheels (no building from source).
# NOTE(review): the recorded output of this cell shows pip finding no distribution for
# numpy==1.19.2 (only 1.23.2 and newer were offered) - presumably the kernel's Python
# is too new for that release; confirm which Python version this notebook must run on.
%pip install numpy==1.19.2 --only-binary :all:
# The pinned scikit-learn install is commented out (disabled) as well.
#%pip install scikit-learn==0.23.1

Collecting cython
  Using cached Cython-3.0.7-py2.py3-none-any.whl.metadata (3.2 kB)
Using cached Cython-3.0.7-py2.py3-none-any.whl (1.2 MB)
Installing collected packages: cython
Successfully installed cython-3.0.7
Note: you may need to restart the kernel to use updated packages.
ERROR: Could not find a version that satisfies the requirement numpy==1.19.2 (from versions: 1.23.2, 1.23.3, 1.23.4, 1.23.5, 1.24.0rc1, 1.24.0rc2, 1.24.0, 1.24.1, 1.24.2, 1.24.3, 1.24.4, 1.25.0rc1, 1.25.0, 1.25.1, 1.25.2, 1.26.0b1, 1.26.0rc1, 1.26.0, 1.26.1, 1.26.2, 1.26.3)
ERROR: No matching distribution found for numpy==1.19.2
Note: you may need to restart the kernel to use updated packages.


## Author's code for predicting

In [None]:
# importing libraries
import transformers
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch import nn
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from torch import cuda
# Run on the GPU when torch can see one, otherwise fall back to the CPU
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Inference settings: class labels, base checkpoint, its tokenizer, batch size
# and the fixed token length every query is padded/truncated to
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 55
TEST_BATCH_SIZE = 16
class_names = ['Female', 'Male' , 'Neutral']
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Dataset
class GenderBiasDataset(Dataset):
    """Map-style dataset that tokenizes unlabeled query strings for inference.

    Args:
        queries: indexable collection of queries; each item is converted with ``str()``.
        tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_len: ``max_length`` handed to the tokenizer; every query is padded or
            truncated to this many tokens.
    """

    def __init__(self, queries, tokenizer, max_len):
        self.queries = queries
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of queries in the dataset."""
        return len(self.queries)

    def __getitem__(self, index):
        """Encode the query at ``index``.

        Returns:
            dict with the raw ``'query'`` string plus ``'input_ids'`` and
            ``'attention_mask'`` as ``torch.long`` tensors.
        """
        query_text = str(self.queries[index])

        encoding = self.tokenizer.encode_plus(
            query_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` is the supported spelling of the deprecated
            # `pad_to_max_length=True`; it pads every query to `max_length`.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        return {
                'query': query_text,
                'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long)
        }

# Dataloader
def create_data_loader(df, tokenizer, max_len, batch_size, num_workers = 5):
  """Wrap the 'query' column of ``df`` in a GenderBiasDataset and return a DataLoader over it.

  Args:
    df: DataFrame with a 'query' column.
    tokenizer: tokenizer forwarded to the dataset.
    max_len: token length forwarded to the dataset.
    batch_size: number of queries per batch.
    num_workers: number of DataLoader worker processes. Defaults to 5, the value that
      used to be hard-coded; pass 0 to load batches in the main process.

  Returns:
    A DataLoader that yields the batches in dataset order (no shuffling is requested).
  """
  ds = GenderBiasDataset(
    queries = df['query'].to_numpy(),
    tokenizer = tokenizer,
    max_len = max_len
  )
  return DataLoader(
    ds,
    batch_size = batch_size,
    num_workers = num_workers
  )

#Prediction function
def get_predictions(model, data_loader):
  """Run the classifier over every batch and collect texts, predicted classes and probabilities.

  Args:
    model: classifier called with ``input_ids`` and ``attention_mask``; its first
      output (``outputs[0]``) is used as the per-class logits.
    data_loader: iterable of batches holding "query", "input_ids" and "attention_mask".

  Returns:
    (query_texts, predictions, prediction_probs): the list of query strings, a CPU
    tensor with one predicted class index per query, and a CPU tensor with one row of
    softmax probabilities per query (each row sums to 1).
  """
  model = model.eval()
  # Send the inputs to wherever the model's weights live so both are always on the
  # same device; a model without parameters falls back to the notebook-level `device`.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else device
  query_texts = []
  pred_batches = []
  prob_batches = []
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(target_device)
      attention_mask = d["attention_mask"].to(target_device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      logits = outputs[0]
      _, preds = torch.max(logits, dim=1)
      query_texts.extend(d["query"])
      pred_batches.append(preds)
      # Normalise the raw logits so the returned values really are probabilities;
      # softmax is monotonic, so the predicted class is unaffected.
      prob_batches.append(torch.softmax(logits, dim=1))
  predictions = torch.cat(pred_batches).cpu()
  prediction_probs = torch.cat(prob_batches).cpu()
  return query_texts, predictions, prediction_probs


#Reading MSMarco dev set queries (these queries do not have labels)
df = pd.read_csv("msmarco.csv") # a dataframe containing the queries
test_data_loader = create_data_loader(df, tokenizer, MAX_LEN, TEST_BATCH_SIZE)

#Loading the fine-tuned model - you can download the model from https://drive.google.com/file/d/1_YTRs4v5DVUGUffnRHS_3Yk4qteJKO6w/view?usp=sharing
print("Loading the Model")
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels = 3)
# NOTE(review): torch.load unpickles the checkpoint, which can execute arbitrary code -
# only load a file obtained from a source you trust.
model.load_state_dict(torch.load("BERT_fine_tuned.bin", map_location = device))
# The batches are moved to `device` during prediction, so the model has to live there
# too; without this a CUDA run fails with a device mismatch.
model = model.to(device)
print("Model Loaded Successfully")

print("Prediction started")
y_query_texts, y_pred, y_pred_probs = get_predictions(model, test_data_loader)
prediction = pd.DataFrame(df.values.tolist(), columns = ["qid","query"])
# Hand pandas NumPy arrays rather than torch tensors so the columns get plain numeric dtypes.
prediction['female_probability'] = y_pred_probs[:, 0].numpy()
prediction['male_probability'] = y_pred_probs[:, 1].numpy()
prediction['neutral_probability'] = y_pred_probs[:, 2].numpy()
prediction['prediction'] = y_pred.numpy()
prediction.to_csv("predictions.csv", index = False)


## Author's code for training

In [None]:
# importing libraries
import numpy as np
import torch
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
import pandas as pd
from torch import cuda
# Train on the GPU when torch can see one, otherwise on the CPU
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Key settings for preprocessing and fine-tuning: base checkpoint and its tokenizer,
# class labels, token length, batch sizes, number of epochs and learning rate
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
class_names = ['Female', 'Male' , 'Neutral']
MAX_LEN = 33
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 8
LEARNING_RATE = 2e-5

#Dataset
class GenderBiasDataset(Dataset):
    """Map-style dataset pairing tokenized query strings with their integer labels.

    Args:
        queries: indexable collection of queries; each item is converted with ``str()``.
        targets: indexable collection of labels, aligned with ``queries`` by position.
        tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_len: ``max_length`` handed to the tokenizer; every query is padded or
            truncated to this many tokens.
    """

    def __init__(self, queries, targets, tokenizer, max_len):
        self.queries = queries
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of queries in the dataset."""
        return len(self.queries)

    def __getitem__(self, index):
        """Encode the query at ``index`` and attach its label.

        Returns:
            dict with the raw ``'query'`` string plus ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'`` as ``torch.long`` tensors.
        """
        query_text = str(self.queries[index])
        target = self.targets[index]

        encoding = self.tokenizer.encode_plus(
            query_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` is the supported spelling of the deprecated
            # `pad_to_max_length=True`; it pads every query to `max_length`.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        return {
                'query': query_text,
                'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
                'targets': torch.tensor(target, dtype=torch.long)
        }
        
#Dataloader
def create_data_loader(df, tokenizer, max_len, batch_size, num_workers = 5):
  """Wrap the 'query' and 'label' columns of ``df`` in a GenderBiasDataset and return a DataLoader.

  Args:
    df: DataFrame with a 'query' column and a 'label' column.
    tokenizer: tokenizer forwarded to the dataset.
    max_len: token length forwarded to the dataset.
    batch_size: number of examples per batch.
    num_workers: number of DataLoader worker processes. Defaults to 5, the value that
      used to be hard-coded; pass 0 to load batches in the main process.

  Returns:
    A DataLoader that yields the batches in dataset order (no shuffling is requested).
  """
  ds = GenderBiasDataset(
    queries = df['query'].to_numpy(),
    targets = df['label'].to_numpy(),
    tokenizer = tokenizer,
    max_len = max_len
  )
  return DataLoader(
    ds,
    batch_size = batch_size,
    num_workers = num_workers
  )

#Training function
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
  """Run one optimisation pass over ``data_loader``.

  For every batch the model is called with labels; ``outputs[0]`` is taken as the loss
  and ``outputs[1]`` as the logits. Gradients are clipped to a norm of 1.0 before the
  optimizer and the scheduler are stepped.

  Returns:
    (accuracy, mean_loss): correct predictions divided by ``n_examples`` (a double
    tensor) and the mean of the per-batch losses.
  """
  model = model.train()
  batch_losses = []
  n_correct = 0
  for batch in data_loader:
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    labels = batch["targets"].to(device)

    outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
    batch_loss = outputs[0]  # loss comes first ...
    logits = outputs[1]      # ... followed by the logits
    predicted = torch.max(logits, dim=1)[1]

    n_correct = n_correct + torch.sum(predicted == labels)
    batch_losses.append(batch_loss.item())

    # backward pass, clip, then advance optimizer and schedule before clearing gradients
    batch_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

#Evaluation function - used when adopting K-fold
def eval_model(model, data_loader, device, n_examples):
  """Score the model on ``data_loader`` without updating any weights.

  The model is switched to eval mode and called with labels under ``torch.no_grad()``;
  ``outputs[0]`` is taken as the loss and ``outputs[1]`` as the logits.

  Returns:
    (accuracy, mean_loss): correct predictions divided by ``n_examples`` (a double
    tensor) and the mean of the per-batch losses.
  """
  model = model.eval()
  batch_losses = []
  n_correct = 0
  with torch.no_grad():
    for batch in data_loader:
      ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      labels = batch["targets"].to(device)

      outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
      logits = outputs[1]
      predicted = torch.max(logits, dim=1)[1]

      n_correct = n_correct + torch.sum(predicted == labels)
      batch_losses.append(outputs[0].item())

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

#Prediction function - used to calculate the accuracy of the model when true labels are available
def get_predictions(model, data_loader):
  """Run the classifier over labelled batches and collect texts, predictions, probabilities and labels.

  Args:
    model: classifier called with ``input_ids``, ``attention_mask`` and ``labels``; its
      second output (``outputs[1]``) is used as the per-class logits.
    data_loader: iterable of batches holding "query", "input_ids", "attention_mask"
      and "targets".

  Returns:
    (query_texts, predictions, prediction_probs, real_values): the list of query
    strings, a CPU tensor with one predicted class index per query, a CPU tensor with
    one row of softmax probabilities per query (each row sums to 1), and a CPU tensor
    with the true labels.
  """
  model = model.eval()
  # Send the inputs to wherever the model's weights live so both are always on the
  # same device; a model without parameters falls back to the notebook-level `device`.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else device
  query_texts = []
  pred_batches = []
  prob_batches = []
  target_batches = []
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(target_device)
      attention_mask = d["attention_mask"].to(target_device)
      targets = d["targets"].to(target_device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels = targets
      )
      logits = outputs[1]
      _, preds = torch.max(logits, dim=1)
      query_texts.extend(d["query"])
      pred_batches.append(preds)
      # Normalise the raw logits so the returned values really are probabilities;
      # softmax is monotonic, so the predicted class is unaffected.
      prob_batches.append(torch.softmax(logits, dim=1))
      target_batches.append(targets)
  predictions = torch.cat(pred_batches).cpu()
  prediction_probs = torch.cat(prob_batches).cpu()
  real_values = torch.cat(target_batches).cpu()
  return query_texts, predictions, prediction_probs, real_values


#Fine-Tuning the BERT on the Dataset
# The results log is opened with a context manager so that it is flushed and closed
# even when data loading or training raises part-way through.
with open("BERT_Tuninig_results.txt", "w") as result:
    df = pd.read_csv("queries_gender_annotated.csv", names = ["query", "label"])
    labelEncoder = LabelEncoder()
    df['label'] = labelEncoder.fit_transform(df['label'])
    result.write("Shape of Dataset: {} \n".format(df.shape))
    wordlist = pd.read_csv("gender_specific_wordlist.csv")
    # NOTE(review): this re-fits the encoder on the wordlist alone, so the integer codes
    # only match the ones assigned to the queries above when both files contain the same
    # set of label values - verify against the two CSV files.
    wordlist['label'] = labelEncoder.fit_transform(wordlist['label'])
    df = pd.concat([df, wordlist], ignore_index = False)
    result.write("Shape of Dataset after concatination with wordlist: {} \n".format(df.shape))

    train_data_loader = create_data_loader(df, tokenizer, MAX_LEN, TRAIN_BATCH_SIZE)

    model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels = 3)
    model = model.to(device)

    optimizer = AdamW(params =  model.parameters(), lr = LEARNING_RATE, correct_bias=False)
    # The scheduler is stepped once per batch, so the linear decay spans batches * epochs steps
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps = 0,
                num_training_steps = total_steps
            )

    for epoch in range(EPOCHS):
        result.write(f'Epoch {epoch + 1}/{EPOCHS}')
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        result.write("\n")
        result.write('-' * 10)
        result.write("\n")
        train_acc, train_loss = train_epoch(
                    model,
                    train_data_loader,
                    optimizer,
                    device,
                    scheduler,
                    len(df)
            )
        result.write(f'Train loss {train_loss} accuracy {train_acc}')
        result.write("\n")

# Persist the fine-tuned weights; this is the file the prediction code loads.
torch.save(model.state_dict(), "BERT_fine_tuned.bin")
