In [61]:
# Shell escape: run the Weights & Biases CLI login so later W&B logging from this kernel is authenticated.
!wandb login

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mmakaronma-rb[0m ([33mmakaronma-rb-makaron-rb[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [62]:
# Set WANDB_PROJECT in the kernel's environment; presumably the W&B project that the
# Trainer's `report_to="wandb"` logging writes to (confirm against the W&B integration docs).
# Keep this comment on its own line: text after the value would become part of the variable.
%env WANDB_PROJECT=spooky

env: WANDB_PROJECT=spooky


In [63]:
from datasets import load_dataset
from datasets.features import ClassLabel

# Read train.csv from the local competition folder as a single Dataset.
ds_ori = load_dataset(
    "./dataset/spooky-author-identification",
    data_files=["train.csv"],
    split="train",
)

# Seeded shuffle, then keep the first 20 rows as a small working sample.
ds = ds_ori.shuffle(seed=42)
ds = ds.select(range(20))
ds_size = ds.num_rows

# Converts between the three author codes and integer class ids.
labels = ClassLabel(names=["EAP", "HPL", "MWS"])

# Last expression of the cell: display the sample's summary.
ds

Dataset({
    features: ['id', 'text', 'author'],
    num_rows: 20
})

In [64]:
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding

# Tokenizer for the same DistilBERT checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


def preprocess_function(data: dict) -> BatchEncoding:
    """Tokenize the ``text`` column and attach integer author labels.

    Used through ``Dataset.map(..., batched=True)``, so ``data`` is a mapping
    from column name to the values of one batch, not a single string.

    Args:
        data: Batch with a ``text`` column (the sentences) and an ``author``
            column (``"EAP"``, ``"HPL"`` or ``"MWS"``).

    Returns:
        The tokenizer output with an extra ``label`` entry holding the class
        ids produced by ``labels.str2int``.
    """
    # Truncate only. Padding is left to `data_collator` (DataCollatorWithPadding),
    # which pads per batch; padding every row to the model maximum here would
    # make every batch full-length regardless of how short its sentences are.
    token = tokenizer(data["text"], truncation=True)
    token["label"] = labels.str2int(data["author"])
    return token



In [65]:
# Class id -> author code, in the order EAP, HPL, MWS.
id2label = dict(enumerate(("EAP", "HPL", "MWS")))
# Author code -> class id: the inverse of the mapping above.
label2id = {author: idx for idx, author in id2label.items()}

In [66]:
from typing import Any

import evaluate
import numpy as np
from datasets import Dataset
from torch import Tensor
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Accuracy metric loaded through `evaluate`; `compute_metrics` below calls its `.compute()`.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred: Any):
    """Turn the Trainer's evaluation output into an accuracy score.

    Args:
        eval_pred: Object that unpacks into ``(scores, references)``: the
            per-class scores for each example and the true class ids. The
            ``Trainer`` supplies it (an ``EvalPrediction`` according to the
            Transformers docs), so it is not a single ``torch.Tensor``.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions; the
        cross-validation loop later reads the value back as ``eval_accuracy``.
    """
    # Named `references` rather than `labels` so the module-level `labels`
    # ClassLabel is not shadowed inside this function.
    scores, references = eval_pred
    # Highest-scoring class per row becomes the predicted class id.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


# Collator built around the tokenizer; `train_model` hands it to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [67]:
# Train and evaluate on ONE train/test split; the cross-validation loop calls this once per fold.
def train_model(
    model: Any, wc: float, name: str, train_ds: Dataset, test_ds: Dataset
) -> Trainer:
    """Fine-tune ``model`` on ``train_ds``, evaluating on ``test_ds`` after each epoch.

    Args:
        model: Model instance handed to the ``Trainer``.
        wc: Weight decay forwarded to ``TrainingArguments(weight_decay=...)``.
        name: Experiment name. Checkpoints go to ``out/<name>`` and the same
            string is used as the W&B run name.
        train_ds: Untokenized training rows; must provide the ``text`` and
            ``author`` columns that ``preprocess_function`` reads.
        test_ds: Untokenized rows used as the evaluation set.

    Returns:
        The ``Trainer`` after ``train()`` has finished, so the caller can call
        ``evaluate()`` on it.
    """
    train_tokenized_ds = train_ds.map(preprocess_function, batched=True)
    test_tokenized_ds = test_ds.map(preprocess_function, batched=True)

    out_dir = "out/" + name

    trainer = Trainer(
        model=model,
        train_dataset=train_tokenized_ds,
        eval_dataset=test_tokenized_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,  # type: ignore
        args=TrainingArguments(
            output_dir=out_dir,
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=wc,
            # Evaluate and checkpoint on the same schedule so that the best
            # checkpoint can be reloaded when training ends.
            eval_strategy="epoch",
            save_strategy="epoch",
            # No `metric_for_best_model` is given, so the Trainer's default
            # criterion decides which checkpoint is "best" (evaluation loss per
            # the TrainingArguments docs), not the accuracy from compute_metrics.
            load_best_model_at_end=True,
            push_to_hub=False,
            report_to="wandb",  # enable logging to W&B
            # Use the caller's experiment name instead of a fixed label that
            # does not describe this model or its hyper-parameters.
            run_name=name,
        ),
    )

    trainer.train()

    return trainer

In [68]:
from sklearn.model_selection import KFold

# Load the full training split for cross-validation (`ds` was previously bound to a
# 20-row sample). `split="train"` returns the Dataset directly, so `ds` never holds
# a DatasetDict first.
ds = load_dataset(
    "./dataset/spooky-author-identification",
    data_files=["train.csv"],
    split="train",
)

N_SPLITS = 5

# Fixed random_state: with shuffle=True the fold assignment would otherwise differ
# on every run, making the reported mean score irreproducible.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Accuracy of each fold, in fold order.
fold_scores = []

for train_index, test_index in kf.split(range(len(ds))):
    # Start every fold from the pretrained checkpoint so that no fold inherits
    # weights fine-tuned on another fold's data.
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert/distilbert-base-uncased",
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )

    train_ds = ds.select(train_index)
    test_ds = ds.select(test_index)
    trainer = train_model(
        model,
        wc=0.01,
        name="spooky-wc001",
        train_ds=train_ds,
        test_ds=test_ds,
    )
    fold_scores.append(trainer.evaluate()["eval_accuracy"])

# Average over the folds actually run instead of a second hard-coded fold count.
total_score = sum(fold_scores)
mean_score = total_score / len(fold_scores)
print(f"mean_score={mean_score}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15663 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6555,0.401769,0.843718
2,0.2733,0.425108,0.858018
3,0.1361,0.490568,0.864913


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15663 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6531,0.429543,0.826098
2,0.2699,0.387439,0.864658
3,0.1436,0.475967,0.864402


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15663 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6577,0.409656,0.839377
2,0.2714,0.385772,0.856742
3,0.1407,0.482822,0.859551


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15663 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6518,0.399723,0.848315
2,0.2909,0.364053,0.868233
3,0.1415,0.449869,0.86951


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15664 [00:00<?, ? examples/s]

Map:   0%|          | 0/3915 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.643,0.41608,0.838825
2,0.2817,0.368078,0.867688
3,0.1575,0.435548,0.874074


mean_score=0.8602077471081733
