In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# One checkpoint name shared by the tokenizer and the model. The cell used to
# load the tokenizer from the *uncased* checkpoint and the model from the
# *cased* one; each checkpoint ships its own vocabulary, so the token ids
# produced by one do not correspond to the embedding rows of the other.
# NOTE(review): the uncased variant is kept because the tokenizer already used
# it and the CrowS-Pairs cell evaluates --lm_model="mbertu"; if the cased model
# is the intended one, change this single constant instead.
MODEL_CHECKPOINT = 'bert-base-multilingual-uncased'

# Load the first 4000 rows of the CSV file, reading only the 'sent' column
data = pd.read_csv('wiki_cda.csv', usecols=['sent'], nrows=4000)

# Drop rows without a sentence (the tokenizer only accepts strings) and rebuild
# a plain 0..n-1 index so no leftover index column ends up in the Dataset
data = data.dropna(subset=['sent']).reset_index(drop=True)

# Convert the DataFrame to a Dataset
dataset = Dataset.from_pandas(data)

# Initialize the tokenizer for mBERT
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Load pre-trained mBERT model for Masked Language Modeling, overriding the
# checkpoint's dropout settings with the values below
model = AutoModelForMaskedLM.from_pretrained(
    MODEL_CHECKPOINT,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.1
)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'sent' entries of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a 'sent' key holding the sentences to encode
            (a batch of them when called through ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences,
        requested with truncation and padding to a fixed length of 256.
    """
    sentences = examples['sent']
    encoded = tokenizer(
        sentences,
        max_length=256,
        truncation=True,
        padding='max_length',
    )
    return encoded

# Seed shared by the train/validation split and the Trainer
SEED = 0

# Apply the tokenization function to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Hold out 10% of the examples for validation. Previously the same dataset was
# passed as both train_dataset and eval_dataset, so the reported validation
# loss was computed on training examples.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=SEED)
train_dataset = dataset_splits['train']
eval_dataset = dataset_splits['test']

# Data collator for MLM: masks 15% of the tokens in each batch
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Define training arguments.
# per_device_train_batch_size (2) x gradient_accumulation_steps (64) gives 128
# examples per optimizer step on each device. With max_steps=50 and
# eval/save/logging every 50 steps, each of those happens once, at the end.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    eval_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    max_steps=50,
    weight_decay=0.01,
    save_steps=50,
    seed=SEED,
    gradient_accumulation_steps=64,
    logging_dir='./logs',
    logging_steps=50
)

# Initialize the Trainer with disjoint training and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer side by side so the directory can be
# loaded as a single checkpoint
model.save_pretrained('./fine-tuned-mbert')
tokenizer.save_pretrained('./fine-tuned-mbert')


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
50,7.419,6.922314


('./fine-tuned-mbert/tokenizer_config.json',
 './fine-tuned-mbert/special_tokens_map.json',
 './fine-tuned-mbert/vocab.txt',
 './fine-tuned-mbert/added_tokens.json',
 './fine-tuned-mbert/tokenizer.json')

In [None]:
# CrowS-Pairs evaluation: run metric.py on crows_pairs_anonymized.csv with
# --lm_model="mbertu" and output.csv as the output file.
# NOTE(review): how metric.py resolves "mbertu" (stock checkpoint vs. the
# ./fine-tuned-mbert directory saved above) is not visible here - confirm that
# the fine-tuned model is the one being scored.
!python metric.py --input_file="crows_pairs_anonymized.csv" --lm_model="mbertu" --output_file="output.csv"

Evaluating:
Input: crows_pairs_anonymized.csv
Model: mbertu
  _C._set_default_tensor_type(t)
  df_score = df_score._append(
100% 1508/1508 [09:17<00:00,  2.70it/s]
Total examples: 1508
Metric score: 48.21
Stereotype score: 49.84
Anti-stereotype score: 38.53
Num. neutral: 0 0.0



In [None]:
# SEAT evaluation: launch run_seat_debiased.sh; all test arguments are set
# inside that script. The config it logs below reports
# ckpt_dir './fine-tuned-mbert' together with version 'bert-base-uncased'.
# NOTE(review): confirm that this version setting is compatible with the
# multilingual checkpoint saved by the training cell.
!sh run_seat_debiased.sh

2024-09-16 19:31 | seat_utils | INFO > Run number: run00
2024-09-16 19:31 | seat_utils | INFO > Config: {'tests': 'seat6,seat6b,seat7,seat7b,seat8,seat8b', 'model_name': 'bert', 'output_dir': './out/', 'cache_encs': True, 'data_dir': './tests', 'debiased': True, 'num_samples': 100000, 'use_parametric': False, 'run_name': 'run00', 'use_ckpt': True, 'ckpt_dir': './fine-tuned-mbert', 'version': 'bert-base-uncased', 'deterministic': False, 'enc_save_dir': './tests'}
2024-09-16 19:31 | run_seat | INFO > Seed: 3
2024-09-16 19:31 | run_seat | INFO > Found tests: ['seat6', 'seat6b', 'seat7', 'seat7b', 'seat8', 'seat8b', 'weat6', 'weat6b', 'weat7', 'weat7b', 'weat8', 'weat8b', 'word2sents6', 'word2sents6b', 'word2sents7', 'word2sents7b', 'word2sents8', 'word2sents8b']
2024-09-16 19:31 | run_seat | INFO > Selected tests: ['seat6', 'seat6b', 'seat7', 'seat7b', 'seat8', 'seat8b']
2024-09-16 19:31 | run_seat | INFO > Selected models: ['bert']
2024-09-16 19:31 | run_seat | INFO > Start to run the SE