# CV

In [16]:
# Hyperparameters for the cross-validation run.
class_names = ['Female', 'Male', 'Neutral']
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
# NOTE(review): MAX_LEN is reassigned to 55 further down in this cell (under
# "Constant variables"), so 33 never takes effect -- confirm which value is intended.
MAX_LEN = 33
TRAIN_BATCH_SIZE = 1
VALID_BATCH_SIZE = 1
EPOCHS = 8
LEARNING_RATE = 2e-5
# The tokenizer is deliberately not built here: BertTokenizer is only imported
# below, and the tokenizer is created right after those imports.

# importing libraries
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
import pandas as pd
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
import transformers
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import KFold
from torch import nn
import pandas as pd
import numpy as np
import torch
from torch import cuda

device = 'cuda' if cuda.is_available() else 'cpu'

# Shared constants. MAX_LEN set here replaces the value of 33 assigned at the
# top of this cell, so 55 is the length actually used afterwards.
MAX_LEN = 55
TEST_BATCH_SIZE = 16
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
class_names = ['Female', 'Male', 'Neutral']
# Built last so it can reference PRE_TRAINED_MODEL_NAME defined just above.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

class GenderBiasDataset(Dataset):
    """Map-style dataset that tokenizes one query per item.

    Args:
        queries: Indexable collection of query texts; each item is passed
            through ``str()`` before tokenization.
        targets: Indexable collection of labels parallel to ``queries``; each
            item is converted to a ``torch.long`` tensor.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_len: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, queries, targets, tokenizer, max_len):
        self.queries = queries
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of queries."""
        return len(self.queries)

    def __getitem__(self, index):
        """Return the text, encoded tensors and label for item ``index``.

        Returns:
            dict with keys ``'query'`` (str), ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'`` (all ``torch.long`` tensors).
        """
        query_text = str(self.queries[index])
        target = self.targets[index]

        # ``padding='max_length'`` is the supported replacement for the
        # deprecated ``pad_to_max_length=True``. Token type ids are not
        # requested because nothing downstream reads them.
        encoding = self.tokenizer.encode_plus(
            query_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        return {
            'query': query_text,
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long),
        }

# Dataloader
def create_data_loader(df, tokenizer, max_len, batch_size, *, shuffle=False, num_workers=5):
    """Wrap the ``query`` / ``label`` columns of ``df`` in a ``DataLoader``.

    Args:
        df: DataFrame providing a ``'query'`` and a ``'label'`` column.
        tokenizer: Tokenizer handed to ``GenderBiasDataset``.
        max_len: Sequence length handed to ``GenderBiasDataset``.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles every epoch. Defaults to
            ``False``, which keeps the previous fixed-order behaviour.
        num_workers: Number of loader worker processes. Defaults to the
            previously hard-coded value of 5; pass 0 to load in-process.

    Returns:
        A ``DataLoader`` over a ``GenderBiasDataset`` built from ``df``.
    """
    dataset = GenderBiasDataset(
        queries=df['query'].to_numpy(),
        targets=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

#Training function
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch: forward pass with labels, backward pass, gradient-norm
    clipping to 1.0, then one optimizer step and one scheduler step.

    Args:
        model: Callable whose output exposes the loss at index 0 and the
            logits at index 1 when called with ``labels``.
        data_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor equal to
        correct predictions / ``n_examples``; mean_loss is the mean of the
        per-batch losses, or NaN when ``data_loader`` yields no batches.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets,
        )
        loss = outputs[0]  # the first return value is the loss
        _, preds = torch.max(outputs[1], dim=1)  # the second return value is the logits
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    # ``correct_predictions`` is still the plain int 0 when the loader was
    # empty; as_tensor handles both that and the accumulated tensor.
    accuracy = torch.as_tensor(correct_predictions, dtype=torch.float64) / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss


#Evaluation function - used when adopting K-fold
def eval_model(model, data_loader, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Callable whose output exposes the loss at index 0 and the
            logits at index 1 when called with ``labels``.
        data_loader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` tensors.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a float64 tensor equal to
        correct predictions / ``n_examples``; mean_loss is the mean of the
        per-batch losses, or NaN when ``data_loader`` yields no batches.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets,
            )
            loss = outputs[0]
            _, preds = torch.max(outputs[1], dim=1)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    # ``correct_predictions`` is still the plain int 0 when the loader was
    # empty; as_tensor handles both that and the accumulated tensor.
    accuracy = torch.as_tensor(correct_predictions, dtype=torch.float64) / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

#Prediction function - used to calculate the accuracy of the model when true labels are available
def get_predictions(model, data_loader):
    """Run ``model`` over ``data_loader`` and gather per-example outputs.

    Batch tensors are moved to the module-level ``device``. Each batch must
    provide ``"query"``, ``"input_ids"``, ``"attention_mask"`` and
    ``"targets"``.

    Returns:
        ``(query_texts, predictions, prediction_probs, real_values)``:
        a list of the query strings, followed by CPU tensors holding the
        arg-max class index per example, the model's second output per
        example exactly as returned (no softmax is applied here), and the
        true labels.
    """
    model = model.eval()
    query_texts = []
    pred_batches = []
    score_batches = []
    target_batches = []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            scores = outputs[1]
            query_texts += batch["query"]
            pred_batches.append(torch.max(scores, dim=1)[1])
            score_batches.append(scores)
            target_batches.append(labels)
    # Concatenating whole batches along dim 0 gives the same tensors as
    # stacking the individual rows one by one.
    predictions = torch.cat(pred_batches).cpu()
    prediction_probs = torch.cat(score_batches).cpu()
    real_values = torch.cat(target_batches).cpu()
    return query_texts, predictions, prediction_probs, real_values

In [None]:
# K-fold cross-validation driver: builds the dataset, trains a fresh model per
# fold and logs validation metrics to CV_res.txt. The context manager ensures
# the log is flushed and closed even if training raises part-way through.
with open("CV_res.txt", "w") as res:
    df = pd.read_csv("./data/queries_gender_annotated.csv", names = ["index", "query", "label", "other"])
    df['label'] = df['label'].astype(str)
    # process_row is not defined in this part of the file; presumably it is
    # provided by an earlier cell -- verify before running standalone.
    df = df.apply(process_row, axis=1)
    df.drop(columns=["index", "other"], inplace=True)
    df = df[df['label'].isin(['m', 'n', 'f'])]
    labelEncoder = LabelEncoder()
    df['label'] = labelEncoder.fit_transform(df['label'])

    res.write("Shape of Dataset: {} \n".format(df.shape))
    wordlist = pd.read_csv("./data/wordlist_genderspecific.txt", names = ["query", "label"])
    # NOTE(review): this re-fits the encoder on the wordlist labels, so the
    # integer codes only match the ones assigned to df above if both label
    # sets sort into the same order -- confirm against the wordlist file.
    wordlist['label'] = labelEncoder.fit_transform(wordlist['label'])
    df = pd.concat([df, wordlist], ignore_index=False)
    res.write("Shape of Dataset after concatenation: {} \n".format(df.shape))
    num_folds = 5

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    accuracy_folds = []
    for fold, (train_index, val_index) in enumerate(kf.split(df['query'], df['label'])):
        train_data_loader = create_data_loader(df.iloc[train_index], tokenizer, MAX_LEN, TRAIN_BATCH_SIZE)
        val_data_loader = create_data_loader(df.iloc[val_index], tokenizer, MAX_LEN, VALID_BATCH_SIZE)

        # A fresh model, optimizer and schedule per fold so folds stay independent.
        model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=3)
        model = model.to(device)

        optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE, correct_bias=False)
        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        res.write(f'fold {fold + 1}/{num_folds}')
        print(f'fold {fold + 1}/{num_folds}')
        res.write("\n")
        res.write('-' * 10)
        res.write("\n")

        for epoch in range(EPOCHS):
            res.write(f'epoch {epoch + 1}/{EPOCHS}')
            print(f'epoch {epoch + 1}/{EPOCHS}')
            res.write("\n")
            res.write('-' * 10)
            res.write("\n")

            # Accuracy must be normalised by the size of the split actually
            # iterated, not by the size of the whole dataset.
            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                optimizer,
                device,
                scheduler,
                len(train_index)
            )
            print(f'Train loss {train_loss} accuracy {train_acc}')

            # Fold sizes can differ by one, so use the exact validation count
            # rather than len(df) / num_folds.
            val_acc, val_loss = eval_model(
                model,
                val_data_loader,
                device,
                len(val_index),
            )

            res.write(f'Val loss {val_loss} accuracy {val_acc}')
            res.write("\n")

        # Only the validation accuracy of the final epoch counts for the fold.
        accuracy_folds.append(val_acc)

    res.write(f'mean accuracy over all folds: {sum(accuracy_folds)/len(accuracy_folds)}')

                                               query  label
0                  who was known as the heretic king      1
1  who plays the main character in night at the m...      2
2                                  what is surrogate      2
3                      how popular is the name katie      0
4         how much sleep in one day does a baby need      2
5         what type of books does karen hesse write?      0
6            can you drink coffee before a mammogram      2
7                  what college did bill gates go to      1
8      who was jacqueline kennedy's social secretary      0
9                                   abbot definition      2


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

fold 1/5
epoch 1/8
