## Importação do modelo e do tokenizador

In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig

# Checkpoint directory that holds both config.json and model.safetensors.
CHECKPOINT_DIR = "mdgenderbias_bert/checkpoint-43880"

# Read the model configuration (config.json) from the checkpoint directory.
config = BertConfig.from_pretrained(CHECKPOINT_DIR)

# from_pretrained is documented to take a model id or a checkpoint directory,
# not the path of the weights file itself, so point it at the directory and
# let it pick up the weights stored there.
model = BertForSequenceClassification.from_pretrained(
    CHECKPOINT_DIR,
    config=config,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoTokenizer

# Load the tokenizer identified by the model id "google-bert/bert-base-uncased".
# NOTE(review): presumably the fine-tuned checkpoint loaded above was trained
# with this same vocabulary — confirm, since the tokenizer is not read from
# the checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

## Importação e preparação dos dados

In [4]:
import pandas as pd
import torch

# Read each spreadsheet and discard its 'Unnamed: 0' column in one step,
# so only the remaining columns are kept for prediction.
data_gpt = pd.read_excel("data/gpt-3.xlsx").drop(columns=["Unnamed: 0"])
data_gemini = pd.read_excel("data/gemini.xlsx").drop(columns=["Unnamed: 0"])

def fazer_previsao(text):
    """Predict the class index of a text with the loaded classifier.

    The text is tokenized with the module-level ``tokenizer`` and fed to the
    module-level ``model``; the index of the highest logit is the prediction.

    Args:
        text: a single string, or a list of strings to classify as one batch.

    Returns:
        An ``int`` class index when ``text`` is a string; a list of ``int``
        class indices (one per input) when ``text`` is a list of strings.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: disable autograd so no computation graph is built
    # (saves memory and time on every call).
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the argmax along the class dimension; without `dim` the logits are
    # flattened, which is only correct for a batch of exactly one text.
    classes = outputs.logits.argmax(dim=-1)
    if isinstance(text, str):
        return classes.item()
    return classes.tolist()

In [5]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import nltk

def dividir_em_frases(texto):
    """Split a text into sentences with NLTK's sentence tokenizer.

    Args:
        texto: the text to split.

    Returns:
        The list of sentences produced by ``nltk.sent_tokenize``.
    """
    # NOTE(review): sent_tokenize relies on NLTK's Punkt tokenizer data; no
    # nltk.download(...) call is visible in this notebook — confirm the data
    # is installed in the environment.
    return nltk.sent_tokenize(texto)

In [7]:
def calculate_label(array):
    """Aggregate per-sentence class predictions into percentages and a label.

    Predictions equal to 0 and 1 are counted as labels 0 and 1; every other
    value is counted as label 2.

    Args:
        array: sized collection (e.g. list) of predicted class indices.

    Returns:
        A tuple ``(percentage, label)`` where ``percentage`` is a dict with
        keys ``"label_0"``, ``"label_1"`` and ``"label_2"`` holding the share
        of each label in percent, and ``label`` is the winning label.
        Label 0 or 1 wins only when it is strictly more frequent than both
        other labels; in every other case (including ties) the label is 2.
        For an empty ``array`` all percentages are 0 and the label is -1.
    """
    total = len(array)
    if total == 0:
        # No predictions (e.g. a text with no sentences): return the
        # "nothing decided" sentinel instead of dividing by zero.
        return {"label_0": 0, "label_1": 0, "label_2": 0}, -1

    # counts[i] is the number of predictions assigned to label i.
    counts = [0, 0, 0]
    for pred in array:
        if pred == 0:
            counts[0] += 1
        elif pred == 1:
            counts[1] += 1
        else:
            counts[2] += 1

    if counts[0] > counts[1] and counts[0] > counts[2]:
        label = 0
    elif counts[1] > counts[0] and counts[1] > counts[2]:
        label = 1
    else:
        # Label 2 is the fallback for both a label-2 majority and any tie.
        label = 2

    percentage = {
        "label_0": (counts[0] / total) * 100,
        "label_1": (counts[1] / total) * 100,
        "label_2": (counts[2] / total) * 100,
    }
    return percentage, label

## Predição

In [8]:
# One empty result list per category. The keys must match the column names
# of data_gpt: a column without a matching key raises KeyError below.
predictions_gpt = {
    categoria: []
    for categoria in (
        "front-end",
        "back-end",
        "full stack",
        "software testing analyst",
        "network architecture",
        "hardware engineering",
        "SEO",
        "cybersecurity",
        "game",
        "mobile",
        "industrial technology",
        "technology field",
    )
}

for coluna in data_gpt.columns:
    # For every text in the column: classify each sentence, then aggregate
    # the sentence-level classes into the (percentage, label) tuple returned
    # by calculate_label.
    resultados_da_coluna = [
        calculate_label(
            [fazer_previsao(frase) for frase in dividir_em_frases(texto)]
        )
        for texto in data_gpt[coluna]
    ]
    predictions_gpt[coluna].extend(resultados_da_coluna)

In [9]:
# One empty result list per category. The keys must match the column names
# of data_gemini: a column without a matching key raises KeyError below.
predictions_gemini = {
    categoria: []
    for categoria in (
        "front-end",
        "back-end",
        "full stack",
        "software testing analyst",
        "network architecture",
        "hardware engineering",
        "SEO",
        "cybersecurity",
        "game",
        "mobile",
        "industrial technology",
        "technology field",
    )
}

for coluna in data_gemini.columns:
    # For every text in the column: classify each sentence, then aggregate
    # the sentence-level classes into the (percentage, label) tuple returned
    # by calculate_label.
    resultados_da_coluna = [
        calculate_label(
            [fazer_previsao(frase) for frase in dividir_em_frases(texto)]
        )
        for texto in data_gemini[coluna]
    ]
    predictions_gemini[coluna].extend(resultados_da_coluna)

In [10]:
# Build a table with one column per category from the GPT predictions.
df = pd.DataFrame(predictions_gpt)
# The context manager saves and closes the workbook even if to_excel raises,
# so the file handle is never left open.
with pd.ExcelWriter('BERT_gpt_prediction.xlsx') as datatoexcel:
    df.to_excel(datatoexcel)

In [11]:
# Build a table with one column per category from the Gemini predictions.
df = pd.DataFrame(predictions_gemini)
# The context manager saves and closes the workbook even if to_excel raises,
# so the file handle is never left open.
with pd.ExcelWriter('BERT_gemini_prediction.xlsx') as datatoexcel:
    df.to_excel(datatoexcel)