In [None]:
# 1 安装 transformers 并下载预训练模型

!pip install -q transformers

from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# Hub checkpoint to start from, and the local directory it is copied into.
model_name, model_path = "xlm-roberta-base", "/content/xlm-roberta-base"

# Fetch the tokenizer and write it to disk straight away.
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(model_path)

# Fetch the model with a 2-label sequence-classification head, then save it
# next to the tokenizer files.
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
model.save_pretrained(model_path)

print("✅ 模型已下载并保存在本地路径 /content/xlm-roberta-base")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ 模型已下载并保存在本地路径 /content/xlm-roberta-base


In [None]:
# 2 安装 datasets 库

!pip install datasets



In [None]:
# 3 加载数据并编码文本

from datasets import Dataset
import pandas as pd
from transformers import XLMRobertaTokenizer

# Load the training data, stored as JSON Lines, into pandas and wrap it in a Dataset.
df = pd.read_json("hate_speech_multilingual_train.jsonl", lines=True)
dataset = Dataset.from_pandas(df)

# Use the local tokenizer (saved by the download step above).
tokenizer = XLMRobertaTokenizer.from_pretrained("/content/xlm-roberta-base")

# 编码文本
def tokenize(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Padding and truncation are both enabled; the tokenizer output is returned as-is.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# Tokenize every example in batches.
tokenized_dataset = dataset.map(tokenize, batched=True)

# Hold out 20% for evaluation. A fixed seed keeps the train/test membership
# identical across kernel restarts, so results are comparable between runs.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

print("✅ 数据集加载和编码完成！")


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

✅ 数据集加载和编码完成！


In [None]:
# 4 设置训练参数并开始训练

from transformers import TrainingArguments, Trainer
from transformers import XLMRobertaForSequenceClassification

# Load the classifier from the local copy of the checkpoint.
model = XLMRobertaForSequenceClassification.from_pretrained(
    "/content/xlm-roberta-base",
    num_labels=2,
)

# Training configuration.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="./results",
    logging_dir="./logs",
    # schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # logging / checkpointing cadence
    logging_steps=5,
    save_strategy="epoch",
    save_total_limit=1,
    # "none" turns off external reporters such as wandb
    report_to="none",
)

# Wire the model, data and tokenizer into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
)

# 🚀 Run the fine-tuning loop.
trainer.train()


KeyboardInterrupt: 

In [None]:
# 5 Save the in-memory model and tokenizer to a local directory.
# NOTE(review): the training cell above shows a KeyboardInterrupt, so the saved
# weights may come from an unfinished run — confirm before publishing.

model.save_pretrained("/content/my-multilingual-model")
tokenizer.save_pretrained("/content/my-multilingual-model")

print("🎉 模型已保存到 /content/my-multilingual-model")


🎉 模型已保存到 /content/my-multilingual-model


In [None]:
# 6 压缩模型文件夹为 zip

!zip -r my-multilingual-model.zip my-multilingual-model


  adding: my-multilingual-model/ (stored 0%)
  adding: my-multilingual-model/special_tokens_map.json (deflated 85%)
  adding: my-multilingual-model/model.safetensors (deflated 32%)
  adding: my-multilingual-model/config.json (deflated 50%)
  adding: my-multilingual-model/tokenizer_config.json (deflated 76%)
  adding: my-multilingual-model/sentencepiece.bpe.model (deflated 49%)


In [None]:
# 7 登录 Hugging Face

!pip install -q huggingface_hub

from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# 8 创建模型库并上传模型
from huggingface_hub import create_repo, upload_folder

# ⚠️ Replace the namespace below with your own Hugging Face username.
repo_id = "momoali23/multilingual-hate-detector"  # 👈 edit this line

# Create the Hub repository. exist_ok=True makes re-running this cell a no-op
# when the repository already exists; without it create_repo raises an error.
create_repo(repo_id, private=False, exist_ok=True)

# Upload the whole saved model folder in a single commit.
upload_folder(
    repo_id=repo_id,
    folder_path="/content/my-multilingual-model",
    commit_message="Initial model upload"
)


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/momoali23/multilingual-hate-detector/commit/cc27b8b5a17a6a983b561c3c9a018e3d7ac4c1da', commit_message='Initial model upload', commit_description='', oid='cc27b8b5a17a6a983b561c3c9a018e3d7ac4c1da', pr_url=None, repo_url=RepoUrl('https://huggingface.co/momoali23/multilingual-hate-detector', endpoint='https://huggingface.co', repo_type='model', repo_id='momoali23/multilingual-hate-detector'), pr_revision=None, pr_num=None)

In [None]:
!pip install transformers datasets evaluate


from datasets import load_dataset
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Load the "hate" subtask of tweet_eval.
dataset = load_dataset("tweet_eval", "hate")

# Column renaming was considered here to unify the field names, but the two
# candidate renames mapped "text" -> "text" and "label" -> "label" (no-ops),
# so nothing is renamed.

# Load the tokenizer and the model.
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize 数据
def tokenize_fn(example):
    """Tokenize example["text"], truncated and padded to a fixed 128 tokens."""
    tokenizer_options = {
        "max_length": 128,
        "padding": "max_length",
        "truncation": True,
    }
    return tokenizer(example["text"], **tokenizer_options)

# Only the "train" split is used from here on, so tokenize just that split
# instead of every split in the DatasetDict.
tokenized_dataset = dataset["train"].map(tokenize_fn, batched=True)
# Hold out 20% for evaluation; the fixed seed makes the split reproducible.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# 评价函数
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class over a (logits, labels) pair."""
    scores, gold = eval_pred
    return accuracy.compute(
        references=gold,
        predictions=np.argmax(scores, axis=-1),
    )

# Training configuration.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument and matches the later cells of this notebook.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    # evaluation and saving share the "epoch" cadence so the best checkpoint
    # can be reloaded when training ends
    load_best_model_at_end=True,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument.
    # NOTE(review): assumes transformers >= 4.46; the notebook later prints 4.51.3.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Start training.
trainer.train()



  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.437436,0.807778
2,0.533500,0.446954,0.817222
3,0.387700,0.473891,0.822222


TrainOutput(global_step=1350, training_loss=0.4208426468460648, metrics={'train_runtime': 25172.38, 'train_samples_per_second': 0.858, 'train_steps_per_second': 0.054, 'total_flos': 1420799698944000.0, 'train_loss': 0.4208426468460648, 'epoch': 3.0})

In [None]:
# Publish the model weights and then the tokenizer files to the same Hub repo name.
model.push_to_hub("finetuned-xlm-r-tweeteval-hate")
tokenizer.push_to_hub("finetuned-xlm-r-tweeteval-hate")

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/momoali23/finetuned-xlm-r-tweeteval-hate/commit/91a0d03b6c5408eed291b96ee7db098a794fcdf1', commit_message='Upload tokenizer', commit_description='', oid='91a0d03b6c5408eed291b96ee7db098a794fcdf1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/momoali23/finetuned-xlm-r-tweeteval-hate', endpoint='https://huggingface.co', repo_type='model', repo_id='momoali23/finetuned-xlm-r-tweeteval-hate'), pr_revision=None, pr_num=None)

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `fine-tune#1` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authen

In [None]:
#测试
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch

# Load the model that was just uploaded to the Hub, together with its tokenizer.
model = XLMRobertaForSequenceClassification.from_pretrained("momoali23/finetuned-xlm-r-tweeteval-hate")
tokenizer = XLMRobertaTokenizer.from_pretrained("momoali23/finetuned-xlm-r-tweeteval-hate")

def predict(text):
    """Classify one text with the notebook-level model and tokenizer.

    Returns "🚨 Hate Speech" when the arg-max class id is 1, otherwise "✅ Normal".
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        scores = model(**encoded).logits
    label_id = scores.argmax(dim=1).item()
    if label_id == 1:
        return "🚨 Hate Speech"
    return "✅ Normal"

# ✅ Smoke-test the classifier on a few example sentences.
print(predict("I hate all those stupid people!"))
print(predict("Why are immigrants ruining everything?"))
print(predict("Black people are criminals."))
print(predict("I love tacos 🌮"))


✅ Normal
🚨 Hate Speech
🚨 Hate Speech
✅ Normal


In [None]:
# Step 2 of every later fine-tuning session:
# load the previously fine-tuned model and its tokenizer from the Hub.
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

model = XLMRobertaForSequenceClassification.from_pretrained("momoali23/finetuned-xlm-es-v2")
tokenizer = XLMRobertaTokenizer.from_pretrained("momoali23/finetuned-xlm-es-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

In [None]:
# Step 1 of every later fine-tuning session: authenticate with the Hugging Face Hub.
# SECURITY: an access token used to be hard-coded in this cell. Treat that token
# as compromised and revoke it at https://huggingface.co/settings/tokens.
import os
from getpass import getpass

from huggingface_hub import login

# Read the token from the HF_TOKEN environment variable; if it is unset, ask for
# it with a hidden prompt so it never ends up in the notebook file.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [None]:
# Step 1 of every later fine-tuning session: authenticate with the Hugging Face Hub.
# SECURITY: an access token used to be hard-coded in this cell. Treat that token
# as compromised and revoke it at https://huggingface.co/settings/tokens.
import os
from getpass import getpass

from huggingface_hub import login

# Read the token from the HF_TOKEN environment variable; if it is unset, ask for
# it with a hidden prompt so it never ends up in the notebook file.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

# Step 2 of every later fine-tuning session:
# load the previously fine-tuned model and its tokenizer from the Hub.
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

model = XLMRobertaForSequenceClassification.from_pretrained("momoali23/finetuned-xlm-esa-v3")
tokenizer = XLMRobertaTokenizer.from_pretrained("momoali23/finetuned-xlm-esa-v3")


#以后fine-tune每次打开第3步
!pip install -q datasets


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.2 kB[0m [31m20.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip uninstall -y requests
!pip install requests


Found existing installation: requests 2.32.3
Uninstalling requests-2.32.3:
  Successfully uninstalled requests-2.32.3
Collecting requests
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: requests
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0mSuccessfully installed requests-2.32.3


In [None]:
import os
os.kill(os.getpid(), 9)


In [None]:
!pip install -q datasets transformers huggingface_hub


In [None]:
from datasets import load_dataset

dataset = load_dataset("manueltonneau/spanish-hate-speech-superset")

def tokenize_fn(example):
    """Tokenize example["text"] to a fixed length of 128 (truncate + pad)."""
    encoded = tokenizer(
        example["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

tokenized_dataset = dataset.map(tokenize_fn, batched=True)


Map:   0%|          | 0/29855 [00:00<?, ? examples/s]

In [None]:
from datasets import load_dataset

dataset = load_dataset("manueltonneau/spanish-hate-speech-superset")


In [None]:
# 编码
def tokenize_function(example):
    """Encode example["text"], truncating and padding to exactly 128 tokens."""
    texts = example["text"]
    return tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Tokenize every split of the DatasetDict in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# If the dataset has no predefined "test" split, carve 20% out of "train" manually.
if "test" not in tokenized_dataset:
    tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.2)

print(tokenized_dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'source', 'dataset', 'nb_annotators', 'tweet_id', 'post_author_country_location', 'input_ids', 'attention_mask'],
        num_rows: 23884
    })
    test: Dataset({
        features: ['text', 'labels', 'source', 'dataset', 'nb_annotators', 'tweet_id', 'post_author_country_location', 'input_ids', 'attention_mask'],
        num_rows: 5971
    })
})


In [None]:
!pip install -U transformers



In [None]:
!pip install -q transformers datasets evaluate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m81.9/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions: arg-max over the last logits axis, compared to the labels."""
    raw_scores, true_labels = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    result = accuracy.compute(predictions=predicted_ids, references=true_labels)
    return result


In [None]:
import transformers
print(transformers.__version__)


4.51.3


In [None]:
from transformers import TrainingArguments, Trainer


In [None]:
import transformers
print(transformers.__file__)


/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


In [None]:
import os
os.kill(os.getpid(), 9)


In [None]:
!pip install -U transformers datasets evaluate




In [None]:
from transformers import TrainingArguments
print(TrainingArguments.__module__)


transformers.training_args


In [None]:
# 安装依赖
!pip install -q transformers datasets evaluate

# 导入库
from datasets import load_dataset
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# ✅ Load the Spanish hate-speech dataset from the Hugging Face Hub.
dataset = load_dataset("manueltonneau/spanish-hate-speech-superset")

# ✅ Continue fine-tuning from the model uploaded earlier.
model_name = "momoali23/finetuned-xlm-r-tweeteval-hate"  # change to your own Hugging Face repo ID
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# ✅ 编码文本
def tokenize_fn(example):
    """Tokenize one example to 128 tokens and attach its label as a Python int.

    NOTE(review): the notebook output shows the stored "labels" column still
    reported as float64 after mapping this function, so the int() here does not
    by itself change the column dtype.
    """
    encoding = tokenizer(
        example["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    encoding["labels"] = int(example["labels"])
    return encoding


from datasets import Value

# Tokenize example by example (tokenize_fn is not written for batched input).
tokenized_dataset = dataset.map(tokenize_fn)

# ✅ The dataset only provides a "train" split, so hold out 20% of it manually.
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.2)

# The "labels" column is stored as float64, which makes the classification loss
# fail with "expected scalar type Long but found Float". Cast it to int64.
tokenized_dataset = tokenized_dataset.cast_column("labels", Value("int64"))

# ✅ 评价函数
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Accuracy of the highest-scoring class for a (logits, labels) pair."""
    model_logits, expected = eval_pred
    return accuracy.compute(
        predictions=np.argmax(model_logits, axis=-1),
        references=expected,
    )

# ✅ Training configuration (same structure as the previous run).
training_args = TrainingArguments(
    output_dir="./results-spanish",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # evaluate and checkpoint once per epoch, then reload the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # no external experiment tracker
    report_to="none",
)

# ✅ Build the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument.
    # NOTE(review): assumes transformers >= 4.46; the notebook prints 4.51.3.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# ✅ Start training!
trainer.train()


Map:   0%|          | 0/29855 [00:00<?, ? examples/s]

  trainer = Trainer(


RuntimeError: expected scalar type Long but found Float

In [None]:
print(tokenized_dataset["train"].features)


{'text': Value(dtype='string', id=None), 'labels': Value(dtype='float64', id=None), 'source': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None), 'nb_annotators': Value(dtype='int64', id=None), 'tweet_id': Value(dtype='int64', id=None), 'post_author_country_location': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [None]:
from transformers import TrainingArguments
help(TrainingArguments.__init__)


Help on function __init__ in module transformers.training_args:

    Initialize self.  See help(type(self)) for accurate signature.



In [None]:
def tokenize_fn(example):
    """Return the 128-token encoding of example["text"] plus an integer "labels" entry."""
    options = dict(truncation=True, padding="max_length", max_length=128)
    encoded = tokenizer(example["text"], **options)
    # Coerce the label value to a Python int.
    encoded["labels"] = int(example["labels"])
    return encoded


In [None]:
# `dataset` is a DatasetDict, which has no train_test_split() method; tokenize
# its "train" Dataset and split that instead.
tokenized_dataset = dataset["train"].map(tokenize_fn)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)


Map:   0%|          | 0/29855 [00:00<?, ? examples/s]

AttributeError: 'DatasetDict' object has no attribute 'train_test_split'

In [None]:
def tokenize_fn(example):
    """Encode one example's text (fixed length 128) and store its label as an int."""
    result = tokenizer(
        example["text"],
        padding="max_length",
        max_length=128,
        truncation=True,
    )
    result["labels"] = int(example["labels"])
    return result

# The dataset only has a "train" split (indexing "test" raises KeyError), so
# tokenize "train" once and carve the test part out of it. The result is kept
# as a plain dict with "train" and "test" keys, as before.
tokenized_dataset = dict(
    dataset["train"].map(tokenize_fn).train_test_split(test_size=0.2)
)


KeyError: 'test'

In [None]:
# Tokenization + label 转 int
def tokenize_fn(example):
    """Tokenize a single example (128 tokens, padded/truncated) and add an int label."""
    text = example["text"]
    tokenized = tokenizer(text, max_length=128, truncation=True, padding="max_length")
    tokenized["labels"] = int(example["labels"])  # coerce the label to int
    return tokenized

# Process only dataset["train"].
tokenized_dataset = dataset["train"].map(tokenize_fn)

# Manually split into train / test (80% / 20%).
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

# Inspect the resulting structure.
print(tokenized_dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'source', 'dataset', 'nb_annotators', 'tweet_id', 'post_author_country_location', 'input_ids', 'attention_mask'],
        num_rows: 23884
    })
    test: Dataset({
        features: ['text', 'labels', 'source', 'dataset', 'nb_annotators', 'tweet_id', 'post_author_country_location', 'input_ids', 'attention_mask'],
        num_rows: 5971
    })
})


In [None]:
print(tokenized_dataset["train"].features)


{'text': Value(dtype='string', id=None), 'labels': Value(dtype='float64', id=None), 'source': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None), 'nb_annotators': Value(dtype='int64', id=None), 'tweet_id': Value(dtype='int64', id=None), 'post_author_country_location': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [None]:
def convert_labels(example):
    """Replace example["labels"] with its int() value and return the same example."""
    as_int = int(example["labels"])
    example["labels"] = as_int
    return example

dataset_int = dataset["train"].map(convert_labels)


Map:   0%|          | 0/29855 [00:00<?, ? examples/s]

In [None]:
print(dataset_int.features)


{'text': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), 'source': Value(dtype='string', id=None), 'dataset': Value(dtype='string', id=None), 'nb_annotators': Value(dtype='int64', id=None), 'tweet_id': Value(dtype='int64', id=None), 'post_author_country_location': Value(dtype='string', id=None)}


In [None]:
# 1. 加载原始数据
raw_dataset = dataset["train"]

# 2. 修复标签 + tokenize（逐条处理）
def tokenize_fn(example):
    """Per-example tokenization to 128 tokens, with the label coerced to int."""
    call_kwargs = {"truncation": True, "padding": "max_length", "max_length": 128}
    out = tokenizer(example["text"], **call_kwargs)
    out["labels"] = int(example["labels"])  # coerce the label to int
    return out

from datasets import Value

tokenized_dataset = raw_dataset.map(tokenize_fn)  # note: do not add batched=True

# 3. Manually split into training and test sets.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

# 4. Drop the columns that are not needed (recommended).
tokenized_dataset = tokenized_dataset.remove_columns([
    "source", "dataset", "nb_annotators", "tweet_id", "post_author_country_location"
])

# 5. Mapping alone leaves "labels" stored as float (a sample prints as 0.0),
#    so cast the column to int64 explicitly.
tokenized_dataset = tokenized_dataset.cast_column("labels", Value("int64"))


In [None]:
sample = tokenized_dataset["train"][0]
print(type(sample["labels"]), sample["labels"])


<class 'float'> 0.0


In [None]:
from datasets import Features, Value

# ✅ Target types for the columns of interest; labels must end up as int64.
features = Features({
    "text": Value("string"),
    "labels": Value("int64")
})

# ✅ Start from the raw train split.
raw_dataset = dataset["train"]

# ✅ Tokenize and convert the label in one step.
def tokenize_fn(example):
    """Tokenize one example to 128 tokens and attach its label as a Python int."""
    tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    tokens["labels"] = int(example["labels"])
    return tokens

# ✅ Passing this partial schema via map(features=...) raised KeyError: 'source',
# because it does not list every column of the mapped dataset. Map normally and
# cast only the labels column to the declared int64 type instead.
tokenized_dataset = raw_dataset.map(tokenize_fn).cast_column("labels", features["labels"])


Map:   0%|          | 0/29855 [00:00<?, ? examples/s]

KeyError: 'source'

In [None]:
# Keep only the "text" and "labels" columns by dropping the other metadata columns.
raw_dataset = dataset["train"].remove_columns([
    "source", "dataset", "nb_annotators", "tweet_id", "post_author_country_location"
])


In [None]:
def tokenize_fn(example):
    """Encode example["text"] (128 tokens) and set "labels" to int(example["labels"])."""
    model_inputs = tokenizer(
        example["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    model_inputs["labels"] = int(example["labels"])  # float -> int
    return model_inputs


In [None]:
print(raw_dataset.features)


{'text': Value(dtype='string', id=None), 'labels': Value(dtype='float64', id=None)}


In [None]:
from datasets import Features, Value

# Target types for the columns of interest; labels must end up as int64.
features = Features({
    "text": Value("string"),
    "labels": Value("int64")
})

# Tokenize and convert the label in one step.
def tokenize_fn(example):
    """Tokenize one example to 128 tokens and attach its label as a Python int."""
    tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    tokens["labels"] = int(example["labels"])
    return tokens

# Passing this schema via map(features=...) raised KeyError: 'input_ids', because
# it does not describe the columns the tokenizer adds. Map normally and cast only
# the labels column to the declared int64 type instead.
tokenized_dataset = raw_dataset.map(tokenize_fn).cast_column("labels", features["labels"])


Map:   0%|          | 0/29855 [00:00<?, ? examples/s]

KeyError: 'input_ids'

In [None]:
from datasets import Features, Value, Sequence

# Schema describing the tokenizer outputs plus an int64 label column.
# NOTE(review): in the cells visible after this one, the map() call does not
# pass `features=`, so this object appears to be unused — confirm or remove.
features = Features({
    "input_ids": Sequence(Value("int32")),
    "attention_mask": Sequence(Value("int8")),
    "labels": Value("int64")
})


In [None]:
def tokenize_fn(example):
    """Tokenize one example's text to 128 tokens and add its label as an int."""
    settings = dict(max_length=128, padding="max_length", truncation=True)
    item = tokenizer(example["text"], **settings)
    item["labels"] = int(example["labels"])
    return item


In [None]:
tokenized_dataset = raw_dataset.map(tokenize_fn)


Map:   0%|          | 0/29855 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)


In [None]:
print(tokenized_dataset["train"].features)
print(type(tokenized_dataset["train"][0]["labels"]), tokenized_dataset["train"][0]["labels"])


{'text': Value(dtype='string', id=None), 'labels': Value(dtype='float64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
<class 'float'> 1.0


In [None]:
from datasets import Value
tokenized_dataset = tokenized_dataset.cast_column("labels", Value("int64"))


Casting the dataset:   0%|          | 0/23884 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5971 [00:00<?, ? examples/s]

In [None]:
print(tokenized_dataset["train"].features)
print(type(tokenized_dataset["train"][0]["labels"]), tokenized_dataset["train"][0]["labels"])


{'text': Value(dtype='string', id=None), 'labels': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
<class 'int'> 1


In [None]:
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [None]:
from transformers import DataCollatorWithPadding

class DataCollatorFixLabel(DataCollatorWithPadding):
    """DataCollatorWithPadding variant that returns batch["labels"] converted with .long()."""

    def __call__(self, features):
        # Let the parent collator build the padded batch first.
        collated = super().__call__(features)
        # Then force the labels to a LongTensor.
        long_labels = collated["labels"].long()
        collated["labels"] = long_labels
        return collated


In [None]:
# Collator that pads each batch and forces the labels to a LongTensor.
data_collator = DataCollatorFixLabel(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument.
    # NOTE(review): assumes transformers >= 4.46; the notebook prints 4.51.3.
    processing_class=tokenizer,
    data_collator=data_collator,  # ✅ ensures the labels reach the model as long
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
trainer.train()


NameError: name 'trainer' is not defined

In [None]:
from datasets import Value

dataset_int = dataset_int.cast_column("labels", Value("int64"))


NameError: name 'dataset_int' is not defined

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Value
import numpy as np
import evaluate
import torch


ModuleNotFoundError: No module named 'evaluate'

In [None]:
from datasets import load_dataset

dataset = load_dataset("manueltonneau/spanish-hate-speech-superset")
raw_dataset = dataset["train"].remove_columns([
    "source", "dataset", "nb_annotators", "tweet_id", "post_author_country_location"
])


README.md:   0%|          | 0.00/6.86k [00:00<?, ?B/s]

es_hf_102024.csv:   0%|          | 0.00/5.34M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29855 [00:00<?, ? examples/s]

In [None]:
def tokenize_fn(example):
    """Fixed-length (128) tokenization of one example, plus its label as an int."""
    encoding = tokenizer(
        example["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    label_value = int(example["labels"])
    encoding["labels"] = label_value
    return encoding

from datasets import Value

# Tokenize example by example.
tokenized_dataset = raw_dataset.map(tokenize_fn)
# Hold out 20% for evaluation; the fixed seed keeps the split reproducible
# across kernel restarts so accuracy numbers stay comparable.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
# The labels column is stored as float64 after map(); cast it to int64.
tokenized_dataset = tokenized_dataset.cast_column("labels", Value("int64"))
# Expose only the model inputs, as torch tensors.
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/29855 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/23884 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5971 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

class DataCollatorFixLabel(DataCollatorWithPadding):
    """Pads like DataCollatorWithPadding, then converts batch["labels"] with .long()."""

    def __call__(self, features):
        padded_batch = super().__call__(features)
        # Force the labels to a LongTensor.
        padded_batch["labels"] = padded_batch["labels"].long()
        return padded_batch

data_collator = DataCollatorFixLabel(tokenizer=tokenizer)


In [None]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a (logits, labels) pair using the arg-max class."""
    outputs, targets = eval_pred
    chosen = np.argmax(outputs, axis=-1)
    return accuracy.compute(references=targets, predictions=chosen)

# Training configuration for the Spanish fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results-spanish",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # evaluate and checkpoint every epoch; reload the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # no external experiment tracker
    report_to="none",
)

# Trainer for the Spanish run, using the label-fixing collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument.
    # NOTE(review): assumes transformers >= 4.46; the notebook prints 4.51.3.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4091,0.356855,0.837213
2,0.3245,0.348573,0.863674
3,0.2459,0.386557,0.862502


TrainOutput(global_step=4479, training_loss=0.32505726627963916, metrics={'train_runtime': 2092.1799, 'train_samples_per_second': 34.248, 'train_steps_per_second': 2.141, 'total_flos': 4713108334663680.0, 'train_loss': 0.32505726627963916, 'epoch': 3.0})

In [None]:
model.save_pretrained("finetuned-xlm-es-v2")
tokenizer.save_pretrained("finetuned-xlm-es-v2")


('finetuned-xlm-es-v2/tokenizer_config.json',
 'finetuned-xlm-es-v2/special_tokens_map.json',
 'finetuned-xlm-es-v2/sentencepiece.bpe.model',
 'finetuned-xlm-es-v2/added_tokens.json')

In [None]:
model.save_pretrained("finetuned-xlm-es-v2")


In [None]:
from huggingface_hub import create_repo, upload_folder

repo_id = "momoali23/finetuned-xlm-es-v2"  # choose any repository name you like
# exist_ok=True lets this cell be re-run after the repository has been created;
# without it create_repo raises an error on the second run.
create_repo(repo_id, private=False, exist_ok=True)
upload_folder(repo_id=repo_id, folder_path="finetuned-xlm-es-v2", commit_message="Upload Spanish fine-tuned XLM-R model")


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/momoali23/finetuned-xlm-es-v2/commit/49851dfbec553df1409786580c959184d21c3f95', commit_message='Upload Spanish fine-tuned XLM-R model', commit_description='', oid='49851dfbec553df1409786580c959184d21c3f95', pr_url=None, repo_url=RepoUrl('https://huggingface.co/momoali23/finetuned-xlm-es-v2', endpoint='https://huggingface.co', repo_type='model', repo_id='momoali23/finetuned-xlm-es-v2'), pr_revision=None, pr_num=None)

In [None]:
#阿拉伯

In [None]:
from datasets import load_dataset

dataset = load_dataset("manueltonneau/arabic-hate-speech-superset")


README.md:   0%|          | 0.00/7.98k [00:00<?, ?B/s]

ar_hf_112024.csv:   0%|          | 0.00/76.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/449078 [00:00<?, ? examples/s]

In [None]:
# Drop metadata columns from the Arabic train split.
# NOTE(review): unlike the Spanish cell, "tweet_id" is not removed here —
# presumably this dataset has no such column; confirm.
raw_dataset = dataset["train"].remove_columns([
    "source", "dataset", "nb_annotators",  "post_author_country_location"
])


In [None]:
def tokenize_fn(example):
    """Tokenize one example to a fixed 96 tokens and attach its label as an int."""
    options = dict(max_length=96, padding="max_length", truncation=True)
    encoding = tokenizer(example["text"], **options)
    encoding["labels"] = int(example["labels"])
    return encoding

from datasets import Value
tokenized_dataset = raw_dataset.map(tokenize_fn)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
tokenized_dataset = tokenized_dataset.cast_column("labels", Value("int64"))
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/449078 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/359262 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/89816 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

class DataCollatorFixLabel(DataCollatorWithPadding):
    """Padding collator that additionally converts the ``labels`` entry to a long tensor."""

    def __call__(self, features):
        # Let the parent class build the padded batch first.
        padded = super().__call__(features)
        # Replace the labels with their int64 (long) version.
        long_labels = padded["labels"].long()
        padded["labels"] = long_labels
        return padded


data_collator = DataCollatorFixLabel(tokenizer=tokenizer)


In [None]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

# Accuracy metric object from the `evaluate` library; compute_metrics reads it as a global.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the index of
    the largest logit along the last axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = accuracy.compute(predictions=predicted_classes, references=references)
    return result

# Hyperparameters for the Arabic fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results-arabic",  # directory passed to the Trainer for its outputs
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    eval_strategy="epoch",  # evaluation and saving use the same per-epoch schedule
    save_strategy="epoch",
    learning_rate=1e-5,
    weight_decay=0.01,
    # NOTE(review): no metric_for_best_model is given, so "best" presumably falls back
    # to the library default — confirm this is the intended selection criterion.
    load_best_model_at_end=True,
    report_to="none",
    fp16=True  # NOTE(review): mixed precision; presumably needs a GPU runtime — confirm
)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Build the Trainer for the Arabic run. `model` is the object already in memory, so this
# continues training from whatever state the earlier cells left it in.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers the warning shown under
    # this cell); `processing_class=` is the replacement keyword.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1056,0.114698,0.966398
2,0.0894,0.107412,0.969237
3,0.0892,0.105423,0.970183


TrainOutput(global_step=33681, training_loss=0.10044018737579134, metrics={'train_runtime': 6216.5062, 'train_samples_per_second': 173.375, 'train_steps_per_second': 5.418, 'total_flos': 5.317076473354368e+16, 'train_loss': 0.10044018737579134, 'epoch': 3.0})

In [None]:
# Write the model weights and the tokenizer files into one local folder for upload.
model.save_pretrained("finetuned-xlm-esa-v3")
tokenizer.save_pretrained("finetuned-xlm-esa-v3")


('finetuned-xlm-esa-v3/tokenizer_config.json',
 'finetuned-xlm-esa-v3/special_tokens_map.json',
 'finetuned-xlm-esa-v3/sentencepiece.bpe.model',
 'finetuned-xlm-esa-v3/added_tokens.json')

In [None]:
from huggingface_hub import create_repo, upload_folder

# Private Hub repository that receives the Arabic-stage checkpoint.
repo_id = "momoali23/finetuned-xlm-esa-v3"
# exist_ok=True keeps this cell re-runnable: without it create_repo raises once the
# repository already exists.
create_repo(repo_id, private=True, exist_ok=True)
# Push the whole local save folder as a single commit; the returned CommitInfo is the cell output.
upload_folder(repo_id=repo_id, folder_path="finetuned-xlm-esa-v3", commit_message="Fine-tuned on Arabic hate speech data")


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/momoali23/finetuned-xlm-esa-v3/commit/e58e7c57bfeee44645f804a4ad6e3bcab95ef0ee', commit_message='Fine-tuned on Arabic hate speech data', commit_description='', oid='e58e7c57bfeee44645f804a4ad6e3bcab95ef0ee', pr_url=None, repo_url=RepoUrl('https://huggingface.co/momoali23/finetuned-xlm-esa-v3', endpoint='https://huggingface.co', repo_type='model', repo_id='momoali23/finetuned-xlm-esa-v3'), pr_revision=None, pr_num=None)

In [None]:
# NOTE(review): `evaluate` is already imported by the Arabic training-setup cell earlier in
# the notebook, so on a fresh runtime this install has to run before that cell.
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
# Step 1 of every new fine-tuning session: authenticate with the Hugging Face Hub.
# SECURITY: an access token was previously hardcoded in this cell. Treat that token as
# leaked, revoke it at https://huggingface.co/settings/tokens, and never store tokens in
# a notebook.
import os
from getpass import getpass

from huggingface_hub import login

# Prefer the HF_TOKEN environment variable; otherwise prompt for it without echoing.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

# Step 2 of every new fine-tuning session:
# load the model and tokenizer from the Arabic-stage Hub repository.
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

model = XLMRobertaForSequenceClassification.from_pretrained("momoali23/finetuned-xlm-esa-v3")
tokenizer = XLMRobertaTokenizer.from_pretrained("momoali23/finetuned-xlm-esa-v3")


# Step 3 of every new fine-tuning session: install the `datasets` library.
!pip install -q datasets

In [None]:
# Load the model and tokenizer from the Arabic-stage Hub repository.
# NOTE(review): the "step 2" block of the previous cell issues the same two calls.
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

model = XLMRobertaForSequenceClassification.from_pretrained("momoali23/finetuned-xlm-esa-v3")
tokenizer = XLMRobertaTokenizer.from_pretrained("momoali23/finetuned-xlm-esa-v3")

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

ds = load_dataset("manueltonneau/french-hate-speech-superset")  # example French hate-speech dataset

README.md:   0%|          | 0.00/5.82k [00:00<?, ?B/s]

fr_hf.csv:   0%|          | 0.00/2.76M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18071 [00:00<?, ? examples/s]

In [None]:
def clean_columns(dataset):
    """Return ``dataset`` with every column except ``text`` and ``labels`` removed."""
    wanted = ["text", "labels"]
    # Collect the surplus columns in their original order, then drop them in one call.
    surplus = []
    for column in dataset.column_names:
        if column not in wanted:
            surplus.append(column)
    return dataset.remove_columns(surplus)

# Create the split in this cell: it was previously only defined in a later cell, so a
# top-to-bottom run failed here with NameError. The seeded 80/20 split is reproducible,
# so computing it again elsewhere yields the same partition.
dataset_split = ds["train"].train_test_split(test_size=0.2, seed=42)

# Keep only the text and label columns of each split.
train_ds = clean_columns(dataset_split["train"])
test_ds = clean_columns(dataset_split["test"])


In [None]:
# Hold out 20% of the French "train" data for evaluation; the fixed seed makes the split reproducible.
dataset_split = ds["train"].train_test_split(test_size=0.2, seed=42)

In [None]:
def tokenize_fn(example):
    """Tokenize a single example (padded/truncated to 128 tokens) and attach its int label."""
    encoded = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Labels are coerced to a plain Python int before being stored.
    encoded["labels"] = int(example["labels"])
    return encoded

from datasets import Value

# Tokenize each split one example at a time (map is called without batched=True).
train_tokenized = train_ds.map(tokenize_fn)
test_tokenized = test_ds.map(tokenize_fn)

# Store the labels as 64-bit integers.
train_tokenized = train_tokenized.cast_column("labels", Value("int64"))
test_tokenized = test_tokenized.cast_column("labels", Value("int64"))

# Return torch tensors and expose only the three columns listed here.
train_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/14456 [00:00<?, ? examples/s]

Map:   0%|          | 0/3615 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14456 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3615 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

# Hyperparameters for the French fine-tuning run. Unlike the Arabic configuration,
# no weight_decay is passed here.
training_args = TrainingArguments(
    output_dir="./results-french",  # directory passed to the Trainer for its outputs
    learning_rate=1e-5,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",  # evaluation and saving use the same per-epoch schedule
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is given, so "best" presumably falls back
    # to the library default — confirm this is the intended selection criterion.
    load_best_model_at_end=True,
    fp16=True,  # NOTE(review): mixed precision; presumably needs a GPU runtime — confirm
    report_to="none"
)

# Build the Trainer for the French run.
# NOTE(review): `compute_metrics` is not defined in this session's cells; it comes from the
# Arabic training-setup cell, which must have been run in the current kernel.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers the warning shown under
    # this cell); `processing_class=` is the replacement keyword.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.412043,0.822683
2,0.444600,0.411,0.824343
3,0.371300,0.41363,0.824066


TrainOutput(global_step=1356, training_loss=0.3913774574752403, metrics={'train_runtime': 291.6546, 'train_samples_per_second': 148.696, 'train_steps_per_second': 4.649, 'total_flos': 2852650062213120.0, 'train_loss': 0.3913774574752403, 'epoch': 3.0})

In [None]:
# Write the model weights and the tokenizer files into one local folder for upload.
model.save_pretrained("finetuned-xlm-esaf-v4")
tokenizer.save_pretrained("finetuned-xlm-esaf-v4")

('finetuned-xlm-esaf-v4/tokenizer_config.json',
 'finetuned-xlm-esaf-v4/special_tokens_map.json',
 'finetuned-xlm-esaf-v4/sentencepiece.bpe.model',
 'finetuned-xlm-esaf-v4/added_tokens.json')

In [None]:
from huggingface_hub import create_repo, upload_folder

# Private Hub repository that receives the French-stage checkpoint.
repo_id = "momoali23/finetuned-xlm-esaf-v4"
# exist_ok=True keeps this cell re-runnable: without it create_repo raises once the
# repository already exists.
create_repo(repo_id, private=True, exist_ok=True)
# Push the whole local save folder as a single commit; the returned CommitInfo is the cell output.
upload_folder(repo_id=repo_id, folder_path="finetuned-xlm-esaf-v4", commit_message="Fine-tuned on French hate speech data")

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/momoali23/finetuned-xlm-esaf-v4/commit/90a049a12caae3d185b32477036864c6f40d5455', commit_message='Fine-tuned on French hate speech data', commit_description='', oid='90a049a12caae3d185b32477036864c6f40d5455', pr_url=None, repo_url=RepoUrl('https://huggingface.co/momoali23/finetuned-xlm-esaf-v4', endpoint='https://huggingface.co', repo_type='model', repo_id='momoali23/finetuned-xlm-esaf-v4'), pr_revision=None, pr_num=None)

In [None]:
# Load the model and tokenizer from the French-stage Hub repository.
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

model = XLMRobertaForSequenceClassification.from_pretrained("momoali23/finetuned-xlm-esaf-v4")
tokenizer = XLMRobertaTokenizer.from_pretrained("momoali23/finetuned-xlm-esaf-v4")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

In [None]:
# Install the `datasets` library quietly; the evaluation cell below imports it.
!pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Step 1: imports
import pandas as pd
from datasets import Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, f1_score

# Step 2: load the model and tokenizer
model = XLMRobertaForSequenceClassification.from_pretrained("momoali23/finetuned-xlm-esaf-v4")
tokenizer = XLMRobertaTokenizer.from_pretrained("momoali23/finetuned-xlm-esaf-v4")

# Step 3: read the .xlsx file
df = pd.read_excel("/content/Test Dataset Arabic.xlsx")   # change to the path of your file
df = df.rename(columns={"comment_text": "text", "toxic": "label"})

# Keep only the two columns that are needed
df = df[["text", "label"]]

# Make sure the text column holds strings
df["text"] = df["text"].astype(str)

# Step 4: convert to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Step 5: tokenize
def tokenize_function(examples):
    """Tokenize a batch of texts, padded/truncated to exactly 128 tokens."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Step 6: model predictions
# Run inference in mini-batches instead of one forward pass per example. Every row is
# padded to the same length, so a slice of the dataset is directly usable as a batch.
EVAL_BATCH_SIZE = 32

predictions = []
true_labels = []

model.eval()

for start in range(0, len(tokenized_dataset), EVAL_BATCH_SIZE):
    batch = tokenized_dataset[start:start + EVAL_BATCH_SIZE]
    # Send the inputs to whichever device the model lives on (the model itself is not moved).
    inputs = {
        'input_ids': batch['input_ids'].to(model.device),
        'attention_mask': batch['attention_mask'].to(model.device)
    }
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    # Collect plain Python ints for both predictions and labels so the metric
    # functions receive ordinary integer lists rather than 0-d tensors.
    predictions.extend(torch.argmax(logits, dim=1).tolist())
    true_labels.extend(batch['label'].tolist())

# Step 7: compute accuracy and F1
# NOTE(review): `accuracy` rebinds the name that `compute_metrics` (Arabic training-setup
# cell) reads as an `evaluate` metric object; re-run that cell before training again.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ F1 Score: {f1:.4f}")


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

✅ Accuracy: 0.5315
✅ F1 Score: 0.4054


In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the predictions currently held in `predictions`;
# label 0 is reported as "Normal" and label 1 as "Hate Speech".
print(classification_report(true_labels, predictions, target_names=["Normal", "Hate Speech"]))


              precision    recall  f1-score   support

      Normal       0.52      0.99      0.68      1000
 Hate Speech       0.90      0.07      0.13      1000

    accuracy                           0.53      2000
   macro avg       0.71      0.53      0.41      2000
weighted avg       0.71      0.53      0.41      2000



In [None]:
import torch
import torch.nn.functional as F

custom_threshold = 0.3  # probability of class 1 at or above which an example is flagged
# Run inference in mini-batches instead of one forward pass per example. Every row is
# padded to the same length, so a slice of the dataset is directly usable as a batch.
EVAL_BATCH_SIZE = 32

predictions = []
true_labels = []

model.eval()

for start in range(0, len(tokenized_dataset), EVAL_BATCH_SIZE):
    batch = tokenized_dataset[start:start + EVAL_BATCH_SIZE]
    # Send the inputs to whichever device the model lives on (the model itself is not moved).
    inputs = {
        'input_ids': batch['input_ids'].to(model.device),
        'attention_mask': batch['attention_mask'].to(model.device)
    }
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = F.softmax(logits, dim=1)  # convert logits to probabilities

    # Probability of the "Hate Speech" class (index 1) for every example in the batch,
    # as Python floats so the comparison matches a per-example `.item()` check.
    hate_probs = probs[:, 1].tolist()

    # 1 = Hate Speech when the probability reaches the threshold, otherwise 0 = Normal.
    predictions.extend(1 if hate_prob >= custom_threshold else 0 for hate_prob in hate_probs)
    true_labels.extend(batch['label'].tolist())


In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the predictions currently held in `predictions`
# (after the previous cell these are the custom-threshold predictions).
print(classification_report(true_labels, predictions, target_names=["Normal", "Hate Speech"]))


              precision    recall  f1-score   support

      Normal       0.53      0.98      0.69      1000
 Hate Speech       0.88      0.12      0.21      1000

    accuracy                           0.55      2000
   macro avg       0.71      0.55      0.45      2000
weighted avg       0.71      0.55      0.45      2000

