In [1]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train': 'data/train_subset.csv', 'eval': 'data/eval_subset.csv'})



In [2]:
from utils import transform_labels
dataset = dataset.map(transform_labels)

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

Map:   0%|          | 0/553 [00:00<?, ? examples/s]

In [3]:
# let's train a Distilbert model

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# let's tokenize the data for the model to be able to understand
def tokenize_data(example):
    return tokenizer(example['Sentence'], padding='max_length')    

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
dataset = dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

Map:   0%|          | 0/553 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForSequenceClassification

# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# the default batch size for training arguments
batch_size = 8

# set number of epochs
number_of_epochs = 7
# let set the logging steps
logging_steps = len(dataset['train']) // batch_size # it should log each batch 

steps = (len(dataset['train']) / batch_size) * number_of_epochs
warmup_steps = int(0.2 * steps)

In [7]:
from transformers import TrainingArguments
training_args = TrainingArguments(
                                  num_train_epochs=number_of_epochs, 
                                  load_best_model_at_end=True,
                                  evaluation_strategy='steps', 
                                  save_strategy='steps',
                                  learning_rate=2e-5,
                                  logging_steps=logging_steps,
                                  warmup_steps= warmup_steps,
                                  save_steps=1000,
                                  eval_steps=500,
                                  output_dir="fine-tuned-distilbert-base-uncased"
                                  )



In [8]:
train_dataset = dataset['train'].shuffle(seed=10) 
eval_dataset = dataset['eval'].shuffle(seed=10)

In [9]:
from transformers import Trainer

trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# Launch the learning process: training
# Load model from checkpoint
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Current

Step,Training Loss,Validation Loss
500,0.9086,0.730691
1000,0.5851,0.683399
1500,0.3326,0.734979


TrainOutput(global_step=1939, training_loss=0.49105145618926627, metrics={'train_runtime': 2800.7752, 'train_samples_per_second': 5.523, 'train_steps_per_second': 0.692, 'total_flos': 2049307203041280.0, 'train_loss': 0.49105145618926627, 'epoch': 7.0})

In [11]:
from utils import compute_metrics

trainer_eval = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

In [12]:
trainer_eval.evaluate()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Trainer is attempting to log a value of "{'accuracy': 0.7703435804701627}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.7508408118748212}" of type <class 'dict'> for key "eval/f1score" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.6833987236022949,
 'eval_model_preparation_time': 0.0013,
 'eval_accuracy': {'accuracy': 0.7703435804701627},
 'eval_f1score': {'f1': 0.7508408118748212},
 'eval_runtime': 14.2494,
 'eval_samples_per_second': 38.809,
 'eval_steps_per_second': 4.912}

### Publish model:

In [14]:
trainer.push_to_hub()
trainer_eval.push_to_hub()

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67dddb91-5ffe60b3722a51be26f325c9;ba37fef4-5d62-448e-8c10-909df6027703)

Invalid username or password.