## Importação do dataset

In [1]:
pip install transformers datasets evaluate accelerate --user

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from datasets import load_dataset

# Fetch the "train" and "test" splits of the "opensubtitles_inferred"
# configuration, in that order.
datasetTrain, datasetTest = (
    load_dataset("md_gender_bias", "opensubtitles_inferred", split=split_name)
    for split_name in ("train", "test")
)

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the module from C:\Users\mclar\.cache\huggingface\modules\datasets_modules\datasets\md_gender_bias\8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05 (last modified on Thu May  2 14:28:43 2024) since it couldn't be found locally at md_gender_bias, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\mclar\.cache\huggingface\modules\datasets_modules\datasets\md_gender_bias\8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05 (last modified on Thu May  2 14:28:43 2024) since it couldn't be found locally at md_gender_bias, or remotely on the Hugging Face Hub.


## Pré-processamento

In [3]:
from transformers import AutoTokenizer

# Tokenizer for the same "google-bert/bert-base-uncased" checkpoint that is
# fine-tuned in the training section.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [4]:
def preprocess_function(examples):
    """Tokenize the texts of a batch and attach their ternary labels.

    Args:
        examples: Mapping that provides the keys 'text' and 'ternary_label'
            (a whole batch when called through `Dataset.map(..., batched=True)`).

    Returns:
        dict: 'input_ids' produced by the module-level `tokenizer` (with
        truncation enabled) and 'labels' copied from 'ternary_label'.
    """
    encoded = tokenizer(examples['text'], truncation=True)
    return {
        'input_ids': encoded['input_ids'],
        'labels': examples['ternary_label'],
    }

In [5]:
# Apply the preprocessing to both splits in batched mode (train first, then test).
tokenized_datasetTrain, tokenized_datasetTest = (
    split.map(preprocess_function, batched=True)
    for split in (datasetTrain, datasetTest)
)

In [6]:
# Show the summary (features and row count) of the tokenized training split.
tokenized_datasetTrain

Dataset({
    features: ['text', 'binary_label', 'binary_score', 'ternary_label', 'ternary_score', 'input_ids', 'labels'],
    num_rows: 351036
})

In [7]:
# Show the summary (features and row count) of the tokenized test split.
tokenized_datasetTest

Dataset({
    features: ['text', 'binary_label', 'binary_score', 'ternary_label', 'ternary_score', 'input_ids', 'labels'],
    num_rows: 49108
})

In [8]:
from transformers import DataCollatorWithPadding

# Collator built from the same tokenizer; it is handed to the Trainer as
# `data_collator`. Presumably pads each batch to a common length (per the
# class name) - see the transformers documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

In [9]:
pip install scikit-learn





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import evaluate

# Load the "accuracy" metric through `evaluate`; compute_metrics calls its
# `.compute(predictions=..., references=...)` method.
accuracy = evaluate.load("accuracy")

In [11]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair `(predictions, labels)`; `predictions` holds one row of
            class scores per example.

    Returns:
        The result of `accuracy.compute` for the arg-max class ids against
        `labels`.
    """
    scores, references = eval_pred
    # Pick the highest-scoring class for every row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [13]:
# Class id -> label name, and its inverse. The order presumably mirrors the
# dataset's `ternary_label` class order - verify against the dataset card.
id2label = dict(enumerate(("ABOUT:female", "ABOUT:male", "ABOUT:gender-neutral")))
label2id = {name: idx for idx, name in id2label.items()}

In [14]:
# Load the pre-trained BERT checkpoint with a 3-class sequence-classification
# head and the label maps defined above. The warning printed on load reports
# that the classifier weights are newly initialized, i.e. they still need training.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Fine-tuning configuration. Checkpoints are written under "mdgenderbias_bert";
# evaluation and checkpoint saving are both configured per epoch.
training_args = TrainingArguments(
    output_dir="mdgenderbias_bert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is set, so "best" is whatever the
    # Trainer uses by default (presumably eval loss - confirm). The logged run has
    # the lower eval_loss after epoch 1 but the higher eval_accuracy after epoch 2,
    # and the reload cell further down picks checkpoint-43880 explicitly.
    load_best_model_at_end=True
)

# NOTE(review): the "test" split serves as eval_dataset during training; the
# "validation" split is only loaded later for the final predictions and metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasetTrain,
    eval_dataset=tokenized_datasetTest,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start the training run; the logged run reports train_runtime = 230839.8 s
# (roughly 64 hours).
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  1%|          | 500/43880 [41:08<55:34:32,  4.61s/it]

{'loss': 0.632, 'grad_norm': 7.899115562438965, 'learning_rate': 1.977210574293528e-05, 'epoch': 0.02}


  2%|▏         | 1000/43880 [1:23:59<59:48:15,  5.02s/it]

{'loss': 0.4554, 'grad_norm': 8.819112777709961, 'learning_rate': 1.9544211485870555e-05, 'epoch': 0.05}


  3%|▎         | 1500/43880 [2:06:50<59:14:54,  5.03s/it]

{'loss': 0.4301, 'grad_norm': 13.60871410369873, 'learning_rate': 1.9316317228805835e-05, 'epoch': 0.07}


  5%|▍         | 2000/43880 [2:50:35<68:06:06,  5.85s/it]

{'loss': 0.4172, 'grad_norm': 6.058439254760742, 'learning_rate': 1.9088422971741115e-05, 'epoch': 0.09}


  6%|▌         | 2500/43880 [3:36:31<54:54:21,  4.78s/it]

{'loss': 0.3983, 'grad_norm': 13.576598167419434, 'learning_rate': 1.8860528714676392e-05, 'epoch': 0.11}


  7%|▋         | 3000/43880 [5:58:26<46:31:07,  4.10s/it]     

{'loss': 0.3865, 'grad_norm': 12.149919509887695, 'learning_rate': 1.863263445761167e-05, 'epoch': 0.14}


  8%|▊         | 3500/43880 [6:29:20<45:02:31,  4.02s/it]

{'loss': 0.3726, 'grad_norm': 10.941991806030273, 'learning_rate': 1.8404740200546946e-05, 'epoch': 0.16}


  9%|▉         | 4000/43880 [7:02:02<45:08:45,  4.08s/it]

{'loss': 0.3718, 'grad_norm': 9.097623825073242, 'learning_rate': 1.8176845943482226e-05, 'epoch': 0.18}


 10%|█         | 4500/43880 [7:36:02<52:32:49,  4.80s/it]

{'loss': 0.387, 'grad_norm': 11.869604110717773, 'learning_rate': 1.7948951686417502e-05, 'epoch': 0.21}


 11%|█▏        | 5000/43880 [8:05:25<44:28:03,  4.12s/it]

{'loss': 0.3621, 'grad_norm': 12.18720531463623, 'learning_rate': 1.7721057429352783e-05, 'epoch': 0.23}


 13%|█▎        | 5500/43880 [8:42:53<51:57:14,  4.87s/it]

{'loss': 0.3571, 'grad_norm': 3.3931872844696045, 'learning_rate': 1.749316317228806e-05, 'epoch': 0.25}


 14%|█▎        | 6000/43880 [9:15:08<42:58:00,  4.08s/it]

{'loss': 0.361, 'grad_norm': 3.5927886962890625, 'learning_rate': 1.726526891522334e-05, 'epoch': 0.27}


 15%|█▍        | 6500/43880 [9:46:25<29:45:03,  2.87s/it]

{'loss': 0.3586, 'grad_norm': 14.576594352722168, 'learning_rate': 1.7037374658158616e-05, 'epoch': 0.3}


 16%|█▌        | 7000/43880 [10:12:26<34:00:06,  3.32s/it]

{'loss': 0.3499, 'grad_norm': 10.297513961791992, 'learning_rate': 1.6809480401093893e-05, 'epoch': 0.32}


 17%|█▋        | 7500/43880 [10:41:14<29:59:20,  2.97s/it]

{'loss': 0.3529, 'grad_norm': 8.049631118774414, 'learning_rate': 1.6581586144029173e-05, 'epoch': 0.34}


 18%|█▊        | 8000/43880 [11:04:27<25:21:15,  2.54s/it]

{'loss': 0.3282, 'grad_norm': 13.691970825195312, 'learning_rate': 1.635369188696445e-05, 'epoch': 0.36}


 19%|█▉        | 8500/43880 [12:28:19<48:40:24,  4.95s/it] 

{'loss': 0.3451, 'grad_norm': 11.67617130279541, 'learning_rate': 1.612579762989973e-05, 'epoch': 0.39}


 21%|██        | 9000/43880 [24:15:29<27:50:51,  2.87s/it]      

{'loss': 0.324, 'grad_norm': 4.721872329711914, 'learning_rate': 1.5897903372835007e-05, 'epoch': 0.41}


 22%|██▏       | 9500/43880 [24:38:59<26:52:34,  2.81s/it]

{'loss': 0.3423, 'grad_norm': 13.715059280395508, 'learning_rate': 1.5670009115770283e-05, 'epoch': 0.43}


 23%|██▎       | 10000/43880 [25:09:20<38:09:15,  4.05s/it]

{'loss': 0.3327, 'grad_norm': 11.315714836120605, 'learning_rate': 1.544211485870556e-05, 'epoch': 0.46}


 24%|██▍       | 10500/43880 [25:55:59<69:33:29,  7.50s/it]

{'loss': 0.3285, 'grad_norm': 6.5384979248046875, 'learning_rate': 1.5214220601640838e-05, 'epoch': 0.48}


 25%|██▌       | 11000/43880 [26:44:49<37:16:16,  4.08s/it]

{'loss': 0.3257, 'grad_norm': 11.960065841674805, 'learning_rate': 1.4986326344576119e-05, 'epoch': 0.5}


 26%|██▌       | 11500/43880 [27:32:19<56:15:17,  6.25s/it]

{'loss': 0.3194, 'grad_norm': 9.680868148803711, 'learning_rate': 1.4758432087511397e-05, 'epoch': 0.52}


 27%|██▋       | 12000/43880 [28:18:08<45:36:47,  5.15s/it]

{'loss': 0.3209, 'grad_norm': 16.159332275390625, 'learning_rate': 1.4530537830446674e-05, 'epoch': 0.55}


 28%|██▊       | 12500/43880 [28:55:38<29:40:14,  3.40s/it]

{'loss': 0.3108, 'grad_norm': 13.798442840576172, 'learning_rate': 1.430264357338195e-05, 'epoch': 0.57}


 30%|██▉       | 13000/43880 [29:28:14<34:27:40,  4.02s/it]

{'loss': 0.318, 'grad_norm': 16.143577575683594, 'learning_rate': 1.4074749316317229e-05, 'epoch': 0.59}


 31%|███       | 13500/43880 [30:01:04<34:02:11,  4.03s/it]

{'loss': 0.3252, 'grad_norm': 7.701740741729736, 'learning_rate': 1.3846855059252509e-05, 'epoch': 0.62}


 32%|███▏      | 14000/43880 [30:33:25<32:45:47,  3.95s/it]

{'loss': 0.3231, 'grad_norm': 14.839254379272461, 'learning_rate': 1.3618960802187786e-05, 'epoch': 0.64}


 33%|███▎      | 14500/43880 [31:14:51<45:38:15,  5.59s/it]

{'loss': 0.3176, 'grad_norm': 15.025217056274414, 'learning_rate': 1.3391066545123064e-05, 'epoch': 0.66}


 34%|███▍      | 15000/43880 [32:04:29<49:55:31,  6.22s/it]

{'loss': 0.306, 'grad_norm': 3.0132250785827637, 'learning_rate': 1.3163172288058341e-05, 'epoch': 0.68}


 35%|███▌      | 15500/43880 [32:41:57<36:21:40,  4.61s/it]

{'loss': 0.3148, 'grad_norm': 13.627669334411621, 'learning_rate': 1.2935278030993621e-05, 'epoch': 0.71}


 36%|███▋      | 16000/43880 [33:17:13<43:17:56,  5.59s/it]

{'loss': 0.3117, 'grad_norm': 6.246889591217041, 'learning_rate': 1.2707383773928898e-05, 'epoch': 0.73}


 38%|███▊      | 16500/43880 [33:47:03<19:24:29,  2.55s/it]

{'loss': 0.3151, 'grad_norm': 15.739224433898926, 'learning_rate': 1.2479489516864176e-05, 'epoch': 0.75}


 39%|███▊      | 17000/43880 [34:10:23<18:33:03,  2.48s/it]

{'loss': 0.2997, 'grad_norm': 8.009055137634277, 'learning_rate': 1.2251595259799453e-05, 'epoch': 0.77}


 40%|███▉      | 17500/43880 [34:33:46<19:33:59,  2.67s/it]

{'loss': 0.2967, 'grad_norm': 10.1065034866333, 'learning_rate': 1.2023701002734731e-05, 'epoch': 0.8}


 41%|████      | 18000/43880 [34:57:07<20:50:18,  2.90s/it]

{'loss': 0.3012, 'grad_norm': 11.694458961486816, 'learning_rate': 1.1795806745670012e-05, 'epoch': 0.82}


 42%|████▏     | 18500/43880 [35:29:01<34:57:03,  4.96s/it]

{'loss': 0.302, 'grad_norm': 6.919264316558838, 'learning_rate': 1.1567912488605288e-05, 'epoch': 0.84}


 43%|████▎     | 19000/43880 [36:03:31<21:00:47,  3.04s/it]

{'loss': 0.2939, 'grad_norm': 2.117318868637085, 'learning_rate': 1.1340018231540567e-05, 'epoch': 0.87}


 44%|████▍     | 19500/43880 [36:31:45<22:14:13,  3.28s/it]

{'loss': 0.2852, 'grad_norm': 5.039098739624023, 'learning_rate': 1.1112123974475843e-05, 'epoch': 0.89}


 46%|████▌     | 20000/43880 [37:02:04<25:29:27,  3.84s/it]

{'loss': 0.2962, 'grad_norm': 11.87049674987793, 'learning_rate': 1.0884229717411122e-05, 'epoch': 0.91}


 47%|████▋     | 20500/43880 [37:35:19<24:34:55,  3.79s/it]

{'loss': 0.2929, 'grad_norm': 6.6516947746276855, 'learning_rate': 1.06563354603464e-05, 'epoch': 0.93}


 48%|████▊     | 21000/43880 [38:07:26<26:20:56,  4.15s/it]

{'loss': 0.2869, 'grad_norm': 10.399170875549316, 'learning_rate': 1.0428441203281679e-05, 'epoch': 0.96}


 49%|████▉     | 21500/43880 [38:32:54<18:33:07,  2.98s/it]

{'loss': 0.2821, 'grad_norm': 4.610116481781006, 'learning_rate': 1.0200546946216955e-05, 'epoch': 0.98}


                                                           
 50%|█████     | 21940/43880 [39:26:35<17:31:02,  2.87s/it]

{'eval_loss': 0.2897173762321472, 'eval_accuracy': 0.8871059705139692, 'eval_runtime': 1949.0426, 'eval_samples_per_second': 25.196, 'eval_steps_per_second': 1.575, 'epoch': 1.0}


 50%|█████     | 22000/43880 [39:29:32<19:24:33,  3.19s/it]   

{'loss': 0.2735, 'grad_norm': 1.632042646408081, 'learning_rate': 9.972652689152234e-06, 'epoch': 1.0}


 51%|█████▏    | 22500/43880 [39:53:46<18:03:06,  3.04s/it]

{'loss': 0.1821, 'grad_norm': 1.4639359712600708, 'learning_rate': 9.744758432087512e-06, 'epoch': 1.03}


 52%|█████▏    | 23000/43880 [40:17:47<18:00:31,  3.10s/it]

{'loss': 0.1776, 'grad_norm': 12.217546463012695, 'learning_rate': 9.516864175022789e-06, 'epoch': 1.05}


 54%|█████▎    | 23500/43880 [40:41:39<17:19:05,  3.06s/it]

{'loss': 0.1752, 'grad_norm': 1.9365828037261963, 'learning_rate': 9.28896991795807e-06, 'epoch': 1.07}


 55%|█████▍    | 24000/43880 [41:05:27<15:23:31,  2.79s/it]

{'loss': 0.1847, 'grad_norm': 15.34396743774414, 'learning_rate': 9.061075660893346e-06, 'epoch': 1.09}


 56%|█████▌    | 24500/43880 [41:29:42<15:19:04,  2.85s/it]

{'loss': 0.1921, 'grad_norm': 21.96861457824707, 'learning_rate': 8.833181403828624e-06, 'epoch': 1.12}


 57%|█████▋    | 25000/43880 [41:54:21<16:08:00,  3.08s/it]

{'loss': 0.1804, 'grad_norm': 14.597578048706055, 'learning_rate': 8.605287146763903e-06, 'epoch': 1.14}


 58%|█████▊    | 25500/43880 [42:18:49<15:45:39,  3.09s/it]

{'loss': 0.1777, 'grad_norm': 14.325240135192871, 'learning_rate': 8.377392889699181e-06, 'epoch': 1.16}


 59%|█████▉    | 26000/43880 [42:43:15<14:21:38,  2.89s/it]

{'loss': 0.19, 'grad_norm': 10.89764404296875, 'learning_rate': 8.149498632634458e-06, 'epoch': 1.19}


 60%|██████    | 26500/43880 [43:07:40<15:08:41,  3.14s/it]

{'loss': 0.1639, 'grad_norm': 0.05502019450068474, 'learning_rate': 7.921604375569736e-06, 'epoch': 1.21}


 62%|██████▏   | 27000/43880 [43:32:46<13:10:35,  2.81s/it]

{'loss': 0.1895, 'grad_norm': 3.563906192779541, 'learning_rate': 7.693710118505015e-06, 'epoch': 1.23}


 63%|██████▎   | 27500/43880 [43:57:17<12:32:35,  2.76s/it]

{'loss': 0.1894, 'grad_norm': 13.308648109436035, 'learning_rate': 7.465815861440292e-06, 'epoch': 1.25}


 64%|██████▍   | 28000/43880 [44:22:22<13:44:00,  3.11s/it]

{'loss': 0.1725, 'grad_norm': 16.942378997802734, 'learning_rate': 7.237921604375571e-06, 'epoch': 1.28}


 65%|██████▍   | 28500/43880 [44:47:04<12:46:03,  2.99s/it]

{'loss': 0.1905, 'grad_norm': 27.28179931640625, 'learning_rate': 7.010027347310848e-06, 'epoch': 1.3}


 66%|██████▌   | 29000/43880 [45:11:30<12:14:49,  2.96s/it]

{'loss': 0.1836, 'grad_norm': 35.503807067871094, 'learning_rate': 6.782133090246127e-06, 'epoch': 1.32}


 67%|██████▋   | 29500/43880 [45:36:10<12:53:07,  3.23s/it]

{'loss': 0.1779, 'grad_norm': 7.891546249389648, 'learning_rate': 6.554238833181404e-06, 'epoch': 1.34}


 68%|██████▊   | 30000/43880 [46:00:46<10:55:45,  2.83s/it]

{'loss': 0.1797, 'grad_norm': 13.81067180633545, 'learning_rate': 6.326344576116682e-06, 'epoch': 1.37}


 70%|██████▉   | 30500/43880 [46:25:01<9:25:54,  2.54s/it] 

{'loss': 0.1794, 'grad_norm': 39.09189987182617, 'learning_rate': 6.09845031905196e-06, 'epoch': 1.39}


 71%|███████   | 31000/43880 [46:49:31<11:03:51,  3.09s/it]

{'loss': 0.1862, 'grad_norm': 25.461376190185547, 'learning_rate': 5.870556061987238e-06, 'epoch': 1.41}


 72%|███████▏  | 31500/43880 [47:13:48<11:53:52,  3.46s/it]

{'loss': 0.1801, 'grad_norm': 13.548848152160645, 'learning_rate': 5.6426618049225164e-06, 'epoch': 1.44}


 73%|███████▎  | 32000/43880 [47:38:05<10:59:18,  3.33s/it]

{'loss': 0.1759, 'grad_norm': 41.152015686035156, 'learning_rate': 5.414767547857794e-06, 'epoch': 1.46}


 74%|███████▍  | 32500/43880 [48:02:19<9:31:02,  3.01s/it] 

{'loss': 0.1811, 'grad_norm': 27.286209106445312, 'learning_rate': 5.186873290793072e-06, 'epoch': 1.48}


 75%|███████▌  | 33000/43880 [48:26:46<9:24:15,  3.11s/it] 

{'loss': 0.1678, 'grad_norm': 28.86810874938965, 'learning_rate': 4.958979033728351e-06, 'epoch': 1.5}


 76%|███████▋  | 33500/43880 [48:51:17<7:33:36,  2.62s/it] 

{'loss': 0.1705, 'grad_norm': 30.21375846862793, 'learning_rate': 4.7310847766636284e-06, 'epoch': 1.53}


 77%|███████▋  | 34000/43880 [49:15:35<8:27:49,  3.08s/it] 

{'loss': 0.1637, 'grad_norm': 17.97902488708496, 'learning_rate': 4.503190519598907e-06, 'epoch': 1.55}


 79%|███████▊  | 34500/43880 [49:43:00<8:04:39,  3.10s/it] 

{'loss': 0.1712, 'grad_norm': 11.723309516906738, 'learning_rate': 4.2752962625341845e-06, 'epoch': 1.57}


 80%|███████▉  | 35000/43880 [50:16:28<8:14:25,  3.34s/it] 

{'loss': 0.1601, 'grad_norm': 8.765729904174805, 'learning_rate': 4.047402005469462e-06, 'epoch': 1.6}


 81%|████████  | 35500/43880 [50:45:19<7:04:58,  3.04s/it] 

{'loss': 0.1771, 'grad_norm': 0.08423256874084473, 'learning_rate': 3.8195077484047405e-06, 'epoch': 1.62}


 82%|████████▏ | 36000/43880 [51:20:59<12:10:27,  5.56s/it]

{'loss': 0.1625, 'grad_norm': 31.280996322631836, 'learning_rate': 3.5916134913400185e-06, 'epoch': 1.64}


 83%|████████▎ | 36500/43880 [52:02:44<12:11:03,  5.94s/it]

{'loss': 0.1785, 'grad_norm': 12.730192184448242, 'learning_rate': 3.3637192342752965e-06, 'epoch': 1.66}


 84%|████████▍ | 37000/43880 [52:50:32<11:17:45,  5.91s/it]

{'loss': 0.1728, 'grad_norm': 2.7593235969543457, 'learning_rate': 3.135824977210575e-06, 'epoch': 1.69}


 85%|████████▌ | 37500/43880 [53:28:45<7:04:41,  3.99s/it] 

{'loss': 0.174, 'grad_norm': 4.071615695953369, 'learning_rate': 2.9079307201458525e-06, 'epoch': 1.71}


 87%|████████▋ | 38000/43880 [54:00:00<6:26:59,  3.95s/it]

{'loss': 0.1624, 'grad_norm': 8.750226020812988, 'learning_rate': 2.6800364630811305e-06, 'epoch': 1.73}


 88%|████████▊ | 38500/43880 [55:10:43<7:17:12,  4.88s/it] 

{'loss': 0.1697, 'grad_norm': 2.7544171810150146, 'learning_rate': 2.4521422060164085e-06, 'epoch': 1.75}


 89%|████████▉ | 39000/43880 [55:50:30<7:04:47,  5.22s/it]

{'loss': 0.1557, 'grad_norm': 12.739141464233398, 'learning_rate': 2.2242479489516865e-06, 'epoch': 1.78}


 90%|█████████ | 39500/43880 [57:09:19<11:08:55,  9.16s/it]  

{'loss': 0.1771, 'grad_norm': 11.654959678649902, 'learning_rate': 1.9963536918869645e-06, 'epoch': 1.8}


 91%|█████████ | 40000/43880 [58:48:21<4:43:17,  4.38s/it]   

{'loss': 0.1688, 'grad_norm': 2.2126145362854004, 'learning_rate': 1.7684594348222425e-06, 'epoch': 1.82}


 92%|█████████▏| 40500/43880 [59:25:26<4:20:07,  4.62s/it]

{'loss': 0.1662, 'grad_norm': 5.491201877593994, 'learning_rate': 1.5405651777575205e-06, 'epoch': 1.85}


 93%|█████████▎| 41000/43880 [61:02:39<3:05:57,  3.87s/it]     

{'loss': 0.1521, 'grad_norm': 8.001460075378418, 'learning_rate': 1.3126709206927987e-06, 'epoch': 1.87}


 95%|█████████▍| 41500/43880 [61:32:11<2:15:02,  3.40s/it]

{'loss': 0.1498, 'grad_norm': 35.9430046081543, 'learning_rate': 1.0847766636280768e-06, 'epoch': 1.89}


 96%|█████████▌| 42000/43880 [61:58:48<1:31:24,  2.92s/it]

{'loss': 0.1437, 'grad_norm': 11.076151847839355, 'learning_rate': 8.568824065633547e-07, 'epoch': 1.91}


 97%|█████████▋| 42500/43880 [62:23:33<1:10:00,  3.04s/it]

{'loss': 0.1642, 'grad_norm': 35.19893264770508, 'learning_rate': 6.289881494986328e-07, 'epoch': 1.94}


 98%|█████████▊| 43000/43880 [62:48:49<39:53,  2.72s/it]  

{'loss': 0.1646, 'grad_norm': 4.3579325675964355, 'learning_rate': 4.010938924339107e-07, 'epoch': 1.96}


 99%|█████████▉| 43500/43880 [63:14:01<16:59,  2.68s/it]

{'loss': 0.1688, 'grad_norm': 9.6923246383667, 'learning_rate': 1.7319963536918872e-07, 'epoch': 1.98}


                                                        
100%|██████████| 43880/43880 [64:07:16<00:00,  2.85s/it]

{'eval_loss': 0.3602883219718933, 'eval_accuracy': 0.8996701148489045, 'eval_runtime': 2065.1091, 'eval_samples_per_second': 23.78, 'eval_steps_per_second': 1.487, 'epoch': 2.0}


100%|██████████| 43880/43880 [64:07:19<00:00,  5.26s/it]

{'train_runtime': 230839.8497, 'train_samples_per_second': 3.041, 'train_steps_per_second': 0.19, 'train_loss': 0.2574123617292211, 'epoch': 2.0}





TrainOutput(global_step=43880, training_loss=0.2574123617292211, metrics={'train_runtime': 230839.8497, 'train_samples_per_second': 3.041, 'train_steps_per_second': 0.19, 'train_loss': 0.2574123617292211, 'epoch': 2.0})

In [16]:
pip install sentencepiece





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from transformers import AutoTokenizer, BertTokenizer, BertForSequenceClassification, BertConfig

# Reload the fine-tuned classifier from the last checkpoint written during
# training, so the evaluation cells can run without repeating the training run.
CHECKPOINT_DIR = "mdgenderbias_bert/checkpoint-43880"

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Point `from_pretrained` at the checkpoint directory (the documented form)
# instead of at the individual config.json / model.safetensors files; the
# directory is resolved to those same files.
config = BertConfig.from_pretrained(CHECKPOINT_DIR)

model = BertForSequenceClassification.from_pretrained(
    CHECKPOINT_DIR,
    config=config
)

In [5]:
from datasets import load_dataset

# Load the "validation" split; it is used below for the final predictions and
# metrics. The bare name on the last line displays the dataset summary.
datasetValidation = load_dataset("md_gender_bias", "opensubtitles_inferred", split="validation")
datasetValidation

Using the latest cached version of the module from C:\Users\mclar\.cache\huggingface\modules\datasets_modules\datasets\md_gender_bias\8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05 (last modified on Thu May  2 14:28:43 2024) since it couldn't be found locally at md_gender_bias, or remotely on the Hugging Face Hub.


Dataset({
    features: ['text', 'binary_label', 'binary_score', 'ternary_label', 'ternary_score'],
    num_rows: 41957
})

In [19]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
import torch

def fazer_previsao(texto):
    """Predict the class id of a single text with the module-level model.

    Args:
        texto: The text to classify (a single string).

    Returns:
        int: Index of the highest logit along the class dimension.
    """
    inputs = tokenizer(texto, return_tensors="pt", padding=True, truncation=True)
    # Inference only: disable gradient tracking so no autograd graph is built
    # (and kept alive) for each of the many forward passes.
    with torch.no_grad():
        outputs = model(**inputs)
    # Reduce over the class dimension explicitly; without `dim`, argmax works on
    # the flattened logits, which is only correct when there is exactly one row.
    previsao = torch.argmax(outputs.logits, dim=-1).item()
    return previsao


# Classify every validation text and keep the (text, prediction) pairs.
textos = datasetValidation['text']
previsoes = {
    'text': list(textos),
    'prediction': [fazer_previsao(exemplo) for exemplo in textos],
}

df = pd.DataFrame(previsoes)

# The context manager closes the workbook even if `to_excel` raises, instead of
# relying on a manual `close()` that would be skipped on error.
with pd.ExcelWriter('validation_prediction-bert.xlsx') as datatoexcel:
    df.to_excel(datatoexcel)

In [7]:
# Count the predictions that agree with the gold ternary labels, then print the
# number of validation examples and the accuracy as a percentage.
rotulos = datasetValidation['ternary_label']
cont = sum(1 for pred, label in zip(previsoes['prediction'], rotulos) if pred == label)

print(len(rotulos), (cont/len(rotulos))*100)

41957 89.83244750577973


In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# Gold labels come from the validation split; predictions are read back from
# the spreadsheet written by the prediction cell.
y_true = datasetValidation['ternary_label']
y_pred = pd.read_excel("validation_prediction-bert.xlsx")['prediction']

# Support-weighted averages over the classes.
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

for rotulo, valor in (("Precisão:", precision), ("Revocação:", recall), ("F1-score:", f1)):
    print(rotulo, valor)

Precisão: 0.8984636141616736
Revocação: 0.8983244750577972
F1-score: 0.8983148667006122
