In [25]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer
from constants import constants

# `_prepare_data`

In [46]:
def _prepare_data(data_dir, tokenizer):
    """Load the sentence-pair CSV, tokenize it and split it into two subsets.

    Args:
        data_dir: Directory that contains the file named by
            ``constants.INPUT_DATA_FILENAME``.
        tokenizer: Callable tokenizer; it is invoked with the two sentence
            columns of a row as a text pair.

    Returns:
        A ``(train, test)`` tuple holding the "train" and "test" parts produced
        by ``train_test_split``.
    """
    # load dataset from csv, naming the columns explicitly
    dataset = load_dataset(
        "csv",
        data_files=os.path.join(data_dir, constants.INPUT_DATA_FILENAME),
        column_names=[constants.LABELS, constants.SENTENCES1, constants.SENTENCES2],
    )["train"]

    # preprocess dataset
    # `map` is called without `batched=True`, so the tokenizer receives one row
    # at a time. `return_tensors="pt"` is intentionally not passed: for a single
    # pair it returns tensors with a leading batch dimension of 1, which would be
    # stored per row as a nested list instead of a flat list of token ids.
    preprocessed_dataset = dataset.map(
        lambda example: tokenizer(
            example[constants.SENTENCES1],
            example[constants.SENTENCES2],
            padding=True,
            max_length=constants.MAX_SEQ_LENGTH,
            truncation=True,
        )
    )

    # split dataset; TRAIN_VAL_SPLIT is the fraction kept for training
    preprocessed_dataset = preprocessed_dataset.train_test_split(test_size=1 - constants.TRAIN_VAL_SPLIT)
    return preprocessed_dataset["train"], preprocessed_dataset["test"]

In [47]:
# Try `_prepare_data` on the local ./data folder with the cased BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
train_dataset, test_dataset = _prepare_data(data_dir='./data', tokenizer=tokenizer)

Using custom data configuration default-415b1850f0cd883d
Reusing dataset csv (/Users/philipp/.cache/huggingface/datasets/csv/default-415b1850f0cd883d/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)
Loading cached processed dataset at /Users/philipp/.cache/huggingface/datasets/csv/default-415b1850f0cd883d/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-d71e9b6cc84e76d9.arrow
Loading cached split indices for dataset at /Users/philipp/.cache/huggingface/datasets/csv/default-415b1850f0cd883d/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-358df60b7c388e41.arrow and /Users/philipp/.cache/huggingface/datasets/csv/default-415b1850f0cd883d/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-d89a738d4f43680d.arrow


In [51]:
import argparse
import copy
import json
import logging
import os
import sys
import tarfile
import time

import boto3
from constants import constants

# new
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

def _prepare_data(data_dir, tokenizer):
    """Read the input CSV, tokenize its sentence pairs and return two splits.

    Args:
        data_dir: Folder holding the file named ``constants.INPUT_DATA_FILENAME``.
        tokenizer: Callable applied to the two sentence columns of each mapped item.

    Returns:
        Tuple ``(train, test)`` taken from the result of ``train_test_split``.
    """
    # Locate the CSV and read it with explicit column names.
    csv_path = os.path.join(data_dir, constants.INPUT_DATA_FILENAME)
    columns = [constants.LABELS, constants.SENTENCES1, constants.SENTENCES2]
    raw = load_dataset("csv", data_files=csv_path, column_names=columns)["train"]

    def _encode_pair(batch):
        # Tokenize sentence 1 and sentence 2 together as a text pair.
        return tokenizer(
            batch[constants.SENTENCES1],
            batch[constants.SENTENCES2],
            padding=True,
            max_length=constants.MAX_SEQ_LENGTH,
            truncation=True,
        )

    encoded = raw.map(_encode_pair)

    # Hold out the complement of the training fraction.
    holdout_fraction = 1 - constants.TRAIN_VAL_SPLIT
    splits = encoded.train_test_split(test_size=holdout_fraction)
    return splits["train"], splits["test"]


def _compute_metrics(pred):
    """Turn a prediction object into accuracy, F1, precision and recall.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted class).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The result is (precision, recall, f1, support); support is not used.
    scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": scores[2],
        "precision": scores[0],
        "recall": scores[1],
    }


def run_with_args(model_dir=None):
    """Fine-tune ``distilbert-base-cased`` on the local sentence-pair data and save it.

    Loads the model and tokenizer, builds the train/eval datasets from
    ``./data`` via ``_prepare_data``, trains for one epoch with ``Trainer``
    and writes the resulting model and tokenizer to disk.

    Args:
        model_dir: Directory to save the trained model and tokenizer to.
            Defaults to ``None``, in which case the training ``output_dir``
            is used. (Previously the function read an undefined ``args``
            object here and raised ``NameError`` after training finished.)
    """
    model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-cased')
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')

    train_dataset, eval_dataset = _prepare_data('./data', tokenizer)

    logging.info(f" loaded train_dataset sizes is: {len(train_dataset)}")
    logging.info(f" loaded eval_dataset sizes is: {len(eval_dataset)}")

    # define training args
    training_args = TrainingArguments(
        output_dir='./data',
        num_train_epochs=1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,  # TODO: is this a good idea?
        evaluation_strategy="epoch",
        logging_dir="./data/logs",
        learning_rate=5e-5,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    # create Trainer instance
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        compute_metrics=_compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # train model
    trainer.train()

    # Save the model and tokenizer locally; fall back to the training output
    # directory when the caller did not pick a destination.
    save_dir = model_dir if model_dir is not None else training_args.output_dir
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)


In [None]:
# Run the fine-tuning routine: load model and tokenizer, prepare the data, train.
run_with_args()

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))





Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 