In [1]:
from transformers import pipeline
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset

# Hugging Face Hub repositories used by this cell
DATASET_REPO = "moatazhamza194/gb_test"
CLASSIFIER_REPO = "moatazhamza194/gender_classification-deberta"

# Fetch the dataset and materialise its "train" split as a pandas DataFrame
dataset = load_dataset(DATASET_REPO)
data = dataset["train"].to_pandas()

# Text-classification pipeline; model weights and tokenizer come from the same repo
classifier = pipeline(
    task="text-classification",
    model=CLASSIFIER_REPO,
    tokenizer=CLASSIFIER_REPO,
)

# Start every row as "neutral" (the rule-based pass further down reassigns this column)
data["gender"] = "neutral"

# --- Rule-based logic ---
def assign_gender(row):
    """Assign a gender tag to one row from its ``male`` / ``female`` values.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` passed by ``DataFrame.apply``)
        Must provide numeric ``row["male"]`` and ``row["female"]``.
        NOTE(review): presumably these are scores/fractions in [0, 1] — confirm
        against the dataset card.

    Returns
    -------
    str or None
        ``"female"``  – female >= 0.5 and male < 0.5
        ``"male"``    – male >= 0.5 and female < 0.5
        ``"neutral"`` – both values are exactly 0, or no rule matches
        ``None``      – ambiguous (both in (0, 0.5), or both >= 0.5); the caller
                        is expected to resolve these rows with the classifier.
    """
    male_val = row["male"]
    female_val = row["female"]

    # Case 1: only the female value reaches the threshold
    if female_val >= 0.5 and male_val < 0.5:
        return "female"

    # Case 2: only the male value reaches the threshold
    if male_val >= 0.5 and female_val < 0.5:
        return "male"

    # Case 3: no signal at all
    if male_val == 0 and female_val == 0:
        return "neutral"

    # Case 4: both values are weak but non-zero
    both_weak = 0 < female_val < 0.5 and 0 < male_val < 0.5
    # Case 5: both values reach the threshold. This uses >= (not >) so that it
    # mirrors cases 1 and 2; with a strict > a row such as male=0.5, female=0.5
    # matched no branch and was silently labelled "neutral".
    both_strong = female_val >= 0.5 and male_val >= 0.5
    if both_weak or both_strong:
        return None

    # Fallback: e.g. one value is 0 and the other is in (0, 0.5), or a value is NaN
    return "neutral"

# Rule-based pass: rows the rules cannot decide come back as None
data["gender"] = data.apply(assign_gender, axis=1)

# --- Classifier pass for the undecided rows ---
ambiguous_mask = data["gender"].isna()
texts_to_classify = data.loc[ambiguous_mask, "comment"]

batch_size = 64
results = []

# Walk the undecided comments in slices of `batch_size`
for start in tqdm(range(0, len(texts_to_classify), batch_size)):
    chunk = texts_to_classify.iloc[start:start + batch_size].tolist()
    outputs = classifier(chunk, truncation=True, max_length=512, batch_size=batch_size)
    # The integer after the last "_" in each label is the class id:
    # id 1 is mapped to "male", any other id to "female"
    results.extend(
        "male" if int(out["label"].split("_")[-1]) == 1 else "female"
        for out in outputs
    )

# Write predictions back; `results` is in the same order as the masked rows
data.loc[ambiguous_mask, "gender"] = results

# The male/female columns are no longer needed once "gender" is filled in
data = data.drop(columns=["male", "female"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/386 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/20.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/96844 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/886 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Device set to use cuda:0
 59%|█████▉    | 10/17 [00:04<00:02,  3.05it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 17/17 [00:06<00:00,  2.77it/s]


In [2]:
# Peek at five randomly chosen rows of the labelled frame
data.sample(n=5)

Unnamed: 0,comment,label,gender
18278,Sodomites Doom Nations,0.213115,neutral
65909,"Obviously the party-fixers, the money-bags guy...",0.2,neutral
53502,"Very similar to many ADN postings, yes?",0.0,neutral
80960,What about the white cop in San Antonio execut...,0.4,neutral
224,Sounds more like the Antifascists are the real...,0.647887,neutral


In [3]:
# Number of rows in the labelled frame
data.shape[0]

96844

In [4]:
from datasets import Dataset
# Convert the labelled DataFrame back into a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the result, so a
# non-default index (e.g. after filtering or sampling) can never show up as
# an extra "__index_level_0__" column in the pushed dataset.
dataset = Dataset.from_pandas(data, preserve_index=False)

In [5]:
# Interactive Hugging Face Hub login: prompts for an access token in the cell
# output. Run this before the push_to_hub cell below.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `new` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate w

In [6]:
# Upload the labelled dataset to the Hub under this repository id
dataset.push_to_hub(repo_id="moatazhamza194/gb_test_gendered")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/97 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :  44%|####3     | 8.68MB / 19.9MB            

CommitInfo(commit_url='https://huggingface.co/datasets/moatazhamza194/gb_test_gendered/commit/7b5a1435c6452979138bab42de90b53faf3c4e4f', commit_message='Upload dataset', commit_description='', oid='7b5a1435c6452979138bab42de90b53faf3c4e4f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/moatazhamza194/gb_test_gendered', endpoint='https://huggingface.co', repo_type='dataset', repo_id='moatazhamza194/gb_test_gendered'), pr_revision=None, pr_num=None)

In [7]:
# Per-column non-null counts for the rows tagged "male"
data.loc[data["gender"].eq("male")].count()

Unnamed: 0,0
comment,1914
label,1914
gender,1914


In [8]:
# Per-column non-null counts for the rows tagged "female"
data.loc[data["gender"].eq("female")].count()

Unnamed: 0,0
comment,2140
label,2140
gender,2140
