In [18]:
!wandb login

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mmakaronma-rb[0m ([33mmakaronma-rb-makaron-rb[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [19]:
%env WANDB_PROJECT=spooky

env: WANDB_PROJECT=spooky


In [20]:
from datasets.features import ClassLabel
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding

labels = ClassLabel(names=["EAP", "HPL", "MWS"])

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


def preprocess_function(data: str) -> BatchEncoding:
    token = tokenizer(data["text"], truncation=True, padding="max_length")
    token["label"] = labels.str2int(data["author"])
    return token



In [21]:
id2label = {0: "EAP", 1: "HPL", 2: "MWS"}
label2id = {"EAP": 0, "HPL": 1, "MWS": 2}

In [22]:
from typing import Any

import evaluate
import numpy as np
from datasets import Dataset
from torch import Tensor
from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred: Tensor):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
# perform cross-validation
def train_model(
    model: Any, epochs: int, wc: float, name: str, train_ds: Dataset, test_ds: Dataset
) -> Trainer:
    train_tokenized_ds = train_ds.map(preprocess_function, batched=True)
    test_tokenized_ds = test_ds.map(preprocess_function, batched=True)
    # tokenized_ds["train"][0]  # type: ignore

    out_dir = "out/" + name

    trainer = Trainer(
        model=model,
        train_dataset=train_tokenized_ds,
        eval_dataset=test_tokenized_ds,
        # processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,  # type: ignore
        args=TrainingArguments(
            output_dir=out_dir,
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=epochs,
            weight_decay=wc,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            push_to_hub=False,
            report_to="wandb",  # enable logging to W&B
            run_name="bert-base-high-lr",  # name of the W&B run
        ),
    )

    trainer.train()
    # trainer.model.save_pretrained(out_dir)

    return trainer

In [24]:
# Train for 10 epochs
from datasets import load_dataset

ds = load_dataset(
    "./dataset/spooky-author-identification",
    data_files=["train.csv"],
)
splited_ds = ds["train"].train_test_split(test_size=0.2)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=3
)

trainer = train_model(
    model,
    epochs=10,
    wc=0.01,
    name="spooky",
    train_ds=splited_ds["train"],
    test_ds=splited_ds["test"],
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15663 [00:00<?, ? examples/s]

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6385,0.452439,0.820991
2,0.2757,0.38934,0.860061
3,0.1359,0.56305,0.858784
4,0.0738,0.655934,0.866956
5,0.0406,0.865757,0.852911
6,0.0271,0.814121,0.878192
7,0.0129,0.852547,0.876404
8,0.0072,0.95073,0.872319
9,0.0062,0.932495,0.874106
10,0.0026,0.950779,0.874362


In [25]:
# K-Fold cross validation
from sklearn.model_selection import KFold

ds = load_dataset(
    "./dataset/spooky-author-identification",
    data_files=["train.csv"],
)
ds = ds["train"]
kf = KFold(n_splits=3, shuffle=True)

total_score = 0

for i, (train_index, test_index) in enumerate(kf.split(range(len(ds)))):
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert/distilbert-base-uncased",
        num_labels=3,
        id2label=id2label,
        label2id=label2id,
    )

    train_ds = ds.select(train_index)
    test_ds = ds.select(test_index)
    trainer = train_model(
        model,
        epochs=3,
        wc=0.01,
        name="spooky-wc001",
        train_ds=train_ds,
        test_ds=test_ds,
    )
    total_score += trainer.evaluate()["eval_accuracy"]

mean_score = total_score / 5
print(f"mean_score={mean_score}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13052 [00:00<?, ? examples/s]

Map:   0%|          | 0/6527 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6314,0.458394,0.819213
2,0.281,0.49893,0.824881
3,0.1946,0.498173,0.851387


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13053 [00:00<?, ? examples/s]

Map:   0%|          | 0/6526 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6379,0.438306,0.83129
2,0.2713,0.423493,0.844009
3,0.1873,0.481185,0.858259


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13053 [00:00<?, ? examples/s]

Map:   0%|          | 0/6526 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6509,0.415833,0.835428
2,0.2823,0.399963,0.851364
3,0.1945,0.446384,0.864082


mean_score=0.5029169717284121
