In [None]:
import shutil

import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Load the labelled sentences from a CSV file; the relative path means the file
# must sit in the notebook's working directory.
df = pd.read_csv("gender_classification_dataset.csv")

In [None]:
# Preview the first rows to confirm which columns the CSV provides.
df.head()

Unnamed: 0,Index,Sentences,Labels,about_Male,Masc_terms
0,1,The adventurous boy climbed the mountain alone...,consistent,1,1
1,2,The dominant man asserted his authority in the...,consistent,1,1
2,3,Males tend to be more competitive than females...,consistent,1,1
3,4,The athletic men took on the challenge of runn...,consistent,1,1
4,5,The boy's autonomy allowed him to make his own...,consistent,1,1


In [None]:
# Keep only the sentence text and the binary `about_Male` target, exposing the
# target under the name `label`.
# The rename happens *before* the column selection so the cell can be re-run
# safely: on a second run `about_Male` is already gone, `rename` ignores the
# missing key, and selecting ["Sentences", "label"] still succeeds. Selecting
# "about_Male" first would raise a KeyError on re-run.
df = df.rename(columns={"about_Male": "label"})[["Sentences", "label"]]

In [None]:
# Convert the pandas frame into a Hugging Face Dataset so it can be split and mapped.
dataset = Dataset.from_pandas(df)

In [None]:
# Three-way split: 70% of the rows go to train, and the held-out 30% is then
# halved into validation and test. The fixed seed keeps both splits repeatable.
dataset = dataset.train_test_split(test_size=0.3, seed=42)
test_valid = dataset["test"].train_test_split(test_size=0.5, seed=42)
# Swap the provisional 30% "test" split for its final half and register the
# other half as "validation".
dataset.update(validation=test_valid["train"], test=test_valid["test"])

In [None]:
# Checkpoint name used for both the tokenizer here and the classifier loaded later in this cell.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(batch, *, max_length=128):
    """Tokenize the ``Sentences`` column of a batch.

    Sequences are truncated to ``max_length`` tokens but NOT padded here.
    Padding every example to a fixed 128 tokens wastes compute on short
    sentences; the Trainer in this notebook is given the tokenizer, so its
    default collator pads each batch to that batch's longest sequence.

    Args:
        batch: Mapping with a ``"Sentences"`` entry holding the texts of the
            batch (what ``Dataset.map(..., batched=True)`` passes in).
        max_length: Maximum number of tokens kept per sentence. Keyword-only
            so it cannot collide with extra positional arguments from ``map``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(batch["Sentences"], truncation=True, max_length=max_length)

# Apply tokenize_fn to every split, passing examples in batches rather than one at a time.
tokenized = dataset.map(tokenize_fn, batched=True)

# Load the pretrained checkpoint with a two-class classification head (num_labels=2).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Map:   0%|          | 0/5870 [00:00<?, ? examples/s]

Map:   0%|          | 0/1258 [00:00<?, ? examples/s]

Map:   0%|          | 0/1258 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the metric implementations once, up front; compute_metrics reuses these objects.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute scalar accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as supplied by the Trainer;
            the class scores lie along the last axis of ``logits``.

    Returns:
        dict: ``{"accuracy": float, "f1": float}``. The values are plain
        numbers so the Trainer logs them as ``eval_accuracy`` / ``eval_f1``
        scalars rather than nested dicts.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Each metric's compute() returns a one-entry dict such as
    # {"accuracy": 0.97}; unwrap it, otherwise the reported value is itself a
    # dict ({"accuracy": {"accuracy": 0.97}}).
    accuracy_result = accuracy.compute(predictions=preds, references=labels)
    f1_result = f1.compute(predictions=preds, references=labels, average="weighted")
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./roberta_gender_results",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical so the best checkpoint can be restored when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Log every 50 steps and disable external experiment trackers.
    logging_steps=50,
    report_to="none",
)

In [None]:
# Wire the model, training configuration, train/validation splits and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`;
    # passing the tokenizer under the new name avoids the FutureWarning and
    # keeps the cell working once the old keyword is removed.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0829,0.111692,{'accuracy': 0.9682034976152624},{'f1': 0.9682008446910646}
2,0.0701,0.110453,{'accuracy': 0.9721780604133545},{'f1': 0.9721791680285352}
3,0.0542,0.121413,{'accuracy': 0.972972972972973},{'f1': 0.9729642233758475}
4,0.041,0.151137,{'accuracy': 0.9737678855325914},{'f1': 0.9737687972107842}
5,0.0409,0.138782,{'accuracy': 0.9753577106518283},{'f1': 0.9753579442517966}


TrainOutput(global_step=1835, training_loss=0.05998239182646333, metrics={'train_runtime': 788.5892, 'train_samples_per_second': 37.218, 'train_steps_per_second': 2.327, 'total_flos': 1930577368704000.0, 'train_loss': 0.05998239182646333, 'epoch': 5.0})

In [None]:
# Score the held-out test split, which was used for neither training nor
# per-epoch validation, and print the resulting metric dict.
metrics = trainer.evaluate(eval_dataset=tokenized["test"])
print(metrics)

{'eval_loss': 0.10153450071811676, 'eval_accuracy': {'accuracy': 0.9745627980922098}, 'eval_f1': {'f1': 0.9745715676511524}, 'eval_runtime': 8.0454, 'eval_samples_per_second': 156.363, 'eval_steps_per_second': 9.819, 'epoch': 5.0}


In [None]:
# Write the model and the tokenizer files into the same directory so they travel together.
trainer.save_model("./roberta_gender_classifier")
tokenizer.save_pretrained("./roberta_gender_classifier")

('./roberta_gender_classifier/tokenizer_config.json',
 './roberta_gender_classifier/special_tokens_map.json',
 './roberta_gender_classifier/vocab.json',
 './roberta_gender_classifier/merges.txt',
 './roberta_gender_classifier/added_tokens.json',
 './roberta_gender_classifier/tokenizer.json')

In [None]:
# Bundle the saved model directory into roberta_gender_classifier.zip and, when
# running inside Google Colab, offer it as a browser download.
# shutil.make_archive replaces the `!zip -r` shell escape, so the cell no
# longer depends on an external `zip` binary. Passing base_dir keeps the
# top-level `roberta_gender_classifier/` folder inside the archive.
archive_path = shutil.make_archive(
    "roberta_gender_classifier", "zip", base_dir="roberta_gender_classifier"
)

try:
    # google.colab exists only inside a Colab runtime, so it is imported here
    # instead of with the notebook's other imports.
    from google.colab import files
except ImportError:
    # Outside Colab there is nothing to download through; report the path.
    print(f"Archive written to {archive_path}")
else:
    files.download(archive_path)

  adding: roberta_gender_classifier/ (stored 0%)
  adding: roberta_gender_classifier/model.safetensors (deflated 13%)
  adding: roberta_gender_classifier/training_args.bin (deflated 53%)
  adding: roberta_gender_classifier/config.json (deflated 50%)
  adding: roberta_gender_classifier/merges.txt (deflated 53%)
  adding: roberta_gender_classifier/vocab.json (deflated 59%)
  adding: roberta_gender_classifier/tokenizer_config.json (deflated 75%)
  adding: roberta_gender_classifier/tokenizer.json (deflated 82%)
  adding: roberta_gender_classifier/special_tokens_map.json (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>