In [1]:
from datasets import Dataset
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import torch
from transformers import AutoConfig
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding

In [2]:
# Read the processed training CSV and preview its first rows.
train_csv = "../data/processed/train.csv"
train_df = pd.read_csv(train_csv)
train_df.head()

Unnamed: 0,Clothing ID,Age,Positive Feedback Count,Division Name,Department Name,Class Name,text,labels
0,0,25,4,General,Bottoms,Skirts,"title: 3-season skirt! [SEP] review: Adorable,...",1
1,0,39,0,General,Bottoms,Skirts,title: Very cute [SEP] review: Love the asymme...,1
2,0,42,5,General,Bottoms,Skirts,title: Beautiful! fruns small for typical reta...,1
3,0,45,9,General,Bottoms,Skirts,title: none [SEP] review: I was really pleased...,1
4,0,57,1,General,Bottoms,Skirts,"title: Unique, pretty asymmetric skirt [SEP] r...",1


In [3]:
# Read the processed test CSV and preview its first rows.
test_csv = "../data/processed/test.csv"
test_df = pd.read_csv(test_csv)
test_df.head()

Unnamed: 0,Clothing ID,Age,Positive Feedback Count,Division Name,Department Name,Class Name,text
0,0,32,0,General,Bottoms,Skirts,title: So happy i bought this skirt! [SEP] rev...
1,0,34,0,General,Bottoms,Skirts,title: Runs small [SEP] review: Beautiful patt...
2,0,37,0,General,Bottoms,Skirts,title: Love the comfort of thi skirt [SEP] rev...
3,0,39,10,General,Bottoms,Skirts,title: Way too small [SEP] review: This is a b...
4,0,39,0,General,Bottoms,Skirts,title: none [SEP] review: I usually wear a siz...


In [4]:
# Keep only the "text" and "labels" columns, encode "labels" as a class
# column, then hold out 20% of the rows with a seeded, label-stratified split.
labeled_reviews = Dataset.from_pandas(train_df[["text", "labels"]])
labeled_reviews = labeled_reviews.class_encode_column("labels")
dataset = labeled_reviews.train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=42,
    stratify_by_column="labels",
)
print(dataset)

Stringifying the column:   0%|          | 0/10000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 2000
    })
})


In [5]:
# Checkpoint name used to load the tokenizer below.
# (An alternative checkpoint, "microsoft/deberta-v3-large", is not used here.)
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=False,
)

In [6]:
def max_token_count(texts, tokenizer):
    """Return the largest number of token ids produced for any of the texts.

    Args:
        texts: Iterable of strings (for example a pandas Series of texts).
        tokenizer: Callable that accepts a list of strings and returns a
            mapping whose "input_ids" entry holds one id list per input.

    Returns:
        The maximum token count as an int, or 0 when ``texts`` is empty.
    """
    texts = list(texts)
    if not texts:
        return 0
    # One batched tokenizer call instead of one call per row.
    encoded = tokenizer(texts)
    return max(len(ids) for ids in encoded["input_ids"])


# Longest tokenized text in each file; used to choose a padding length.
max_token_train = max_token_count(train_df["text"], tokenizer)
max_token_test = max_token_count(test_df["text"], tokenizer)
max_token_train, max_token_test

(168, 173)

In [7]:
def tokenize(examples, max_length=192):
    """Tokenize a batch of examples to a fixed sequence length.

    Args:
        examples: Batch mapping with a "text" entry, as supplied by
            ``Dataset.map(..., batched=True)``.
        max_length: Length every sequence is padded or truncated to. The
            default of 192 is above the longest tokenized text reported by
            the token-count cell of this notebook (173 tokens).

    Returns:
        The tokenizer output for the batch, padded/truncated to ``max_length``.
    """
    return tokenizer(
        examples["text"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )


# Apply the tokenizer to every split, one batch at a time.
tokenized_datasets = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
# Limit the number of samples used for training and evaluation.
N_TRAIN_SAMPLES = 5000
N_EVAL_SAMPLES = 2000

train_split = tokenized_datasets["train"].shuffle(seed=42)
eval_split = tokenized_datasets["test"].shuffle(seed=42)

# Clamp to the split size so select() never indexes past the end when a
# split holds fewer rows than the requested sample count.
train_dataset = train_split.select(range(min(N_TRAIN_SAMPLES, len(train_split))))
eval_dataset = eval_split.select(range(min(N_EVAL_SAMPLES, len(eval_split))))

def compute_metrics(p):
    """Compute the ROC AUC metric from evaluation predictions.

    Args:
        p: Object that unpacks into ``(predictions, labels)``. A softmax is
            applied to the predictions along dim 1.

    Returns:
        Dict with a single "auc" entry: the ROC AUC of the labels against the
        softmax value in column 1.
    """
    logits, labels = p
    class_probs = torch.tensor(logits).softmax(dim=1).numpy()
    positive_scores = class_probs[:, 1]
    return {"auc": roc_auc_score(labels, positive_scores)}

# Training configuration. Evaluation and checkpointing both run once per
# epoch so that the checkpoint with the best "auc" can be restored when
# training finishes.
training_args = TrainingArguments(
    output_dir="../models",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    do_eval=True,
    eval_strategy="epoch",
    # Save on the same schedule as evaluation; load_best_model_at_end needs
    # the save and eval strategies to line up.
    save_strategy="epoch",
    # Without this, metric_for_best_model / greater_is_better never cause the
    # best checkpoint to be reloaded at the end of training.
    load_best_model_at_end=True,
    metric_for_best_model="auc",
    greater_is_better=True,
    seed=42,
)

# Load the checkpoint's configuration, then build the sequence-classification
# model from the same checkpoint using that configuration.
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
)

# Collect the Trainer inputs: model, arguments, datasets, tokenizer and the
# metric function. No data_collator is passed explicitly
# (DataCollatorWithPadding(tokenizer) was left disabled).
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Run training and show the value returned by train() as the cell output.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 