## Load Data

## Install Libraries

In [2]:
!pip install -qq transformers[sentencepiece] datasets

[K     |████████████████████████████████| 2.5MB 34.3MB/s 
[K     |████████████████████████████████| 266kB 47.8MB/s 
[K     |████████████████████████████████| 3.3MB 42.5MB/s 
[K     |████████████████████████████████| 901kB 29.1MB/s 
[K     |████████████████████████████████| 1.1MB 30.3MB/s 
[K     |████████████████████████████████| 122kB 58.3MB/s 
[K     |████████████████████████████████| 245kB 54.0MB/s 
[?25h

## Load Modules

In [3]:
import pandas as pd
import numpy as np

import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification



In [15]:
# Read the CSV through the generic 'csv' loader, skipping the file's first row
# and naming the two columns explicitly.
raw_dataset = load_dataset(
    'csv',
    data_files='spam2.csv',
    column_names=['data', 'labels'],
    skiprows=1,
)

Using custom data configuration default-35cf20703b2e0fbe
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-35cf20703b2e0fbe/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23)


In [16]:
# (rows, columns) for each split of the loaded dataset.
raw_dataset.shape

{'train': (5572, 2)}

In [17]:
# Number of rows in the 'train' split.
len(raw_dataset['train'])

5572

In [18]:
# Column names of each split.
raw_dataset.column_names

{'train': ['data', 'labels']}

## Train/Test Split

In [19]:
# Hold out 20% of the rows for testing. A fixed seed keeps the split identical
# across kernel restarts, so downstream metrics are comparable between runs.
dataset = raw_dataset['train'].train_test_split(test_size=0.2, seed=42)

In [20]:
# (rows, columns) of the resulting train and test splits.
dataset.shape

{'test': (1115, 2), 'train': (4457, 2)}

In [21]:
# Row counts of the train and test splits.
len(dataset['train']), len(dataset['test'])

(4457, 1115)

## Using the Transformers Library

In [22]:
# Name of the pretrained checkpoint; the same name is reused below when the
# classification model is loaded, so tokenizer and model stay matched.
checkpoint = 'distilbert-base-uncased'

# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




## Tokenize the Train/Test Datasets

In [23]:
def tokenize_batch(batch):
    """Tokenize the 'data' column of a batch of rows.

    Texts longer than the model's maximum length are truncated. No padding is
    applied here: padding inside a batched ``map`` would pad every row to the
    longest text in its map batch. The Trainer below is given the tokenizer, so
    it pads each training/eval batch to that batch's own longest sequence.
    """
    return tokenizer(batch['data'], truncation=True)


# One shared function for both splits instead of two copy-pasted lambdas.
dset_train_tok = dataset['train'].map(tokenize_batch, batched=True)
dset_test_tok = dataset['test'].map(tokenize_batch, batched=True)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [24]:
# Columns present after tokenization.
dset_train_tok.column_names

['attention_mask', 'data', 'input_ids', 'labels']

In [25]:
# Drop the raw-text 'data' column from both tokenized splits.
text_columns = ['data']
dset_train_tok = dset_train_tok.remove_columns(text_columns)
dset_test_tok = dset_test_tok.remove_columns(text_columns)

In [26]:
# Confirm the raw-text 'data' column is gone.
dset_train_tok.column_names

['attention_mask', 'input_ids', 'labels']

In [28]:
# Have both splits return the model inputs and labels as torch tensors.
torch_columns = ['input_ids', 'attention_mask', 'labels']
for tokenized_split in (dset_train_tok, dset_test_tok):
    tokenized_split.set_format(type='torch', columns=torch_columns)

## Model

In [29]:
# Load the pretrained checkpoint with a sequence-classification head sized for
# two labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

### Training

In [30]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='test-trainer',  # directory where run output (e.g. checkpoints) is stored
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [31]:
from transformers import Trainer

# No explicit data_collator is passed, so the Trainer falls back to its default
# collation; the tokenizer is handed over alongside the model and datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dset_train_tok,
    eval_dataset=dset_test_tok,
    tokenizer=tokenizer,
)

In [32]:
# Sanity check: run evaluation once before training to confirm the eval loop
# works end to end; the loss reported here is the untrained baseline.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1115
  Batch size = 16


{'eval_loss': 0.621530294418335,
 'eval_runtime': 7.3525,
 'eval_samples_per_second': 151.649,
 'eval_steps_per_second': 9.521}

In [33]:
# Fine-tune the model with the training arguments configured above.
trainer.train()

***** Running training *****
  Num examples = 4457
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1395


Step,Training Loss
500,0.0975
1000,0.0148


Saving model checkpoint to test-trainer/checkpoint-500
Configuration saved in test-trainer/checkpoint-500/config.json
Model weights saved in test-trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1000
Configuration saved in test-trainer/checkpoint-1000/config.json
Model weights saved in test-trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-1000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1395, training_loss=0.04225192634008264, metrics={'train_runtime': 471.95, 'train_samples_per_second': 47.219, 'train_steps_per_second': 2.956, 'total_flos': 1977775336238700.0, 'train_loss': 0.04225192634008264, 'epoch': 5.0})

### Evaluation

In [34]:
# Run the trained model over the held-out test split; the result exposes the
# raw per-class scores (.predictions) and the true labels (.label_ids).
predictions = trainer.predict(dset_test_tok)

***** Running Prediction *****
  Num examples = 1115
  Batch size = 16


In [35]:
# Shapes of the per-class score matrix and of the label array.
print(predictions.predictions.shape, predictions.label_ids.shape)

(1115, 2) (1115,)


In [36]:
# First five rows of raw per-class scores.
predictions.predictions[:5]

array([[ 3.6214945, -4.6284647],
       [ 3.4930468, -4.451161 ],
       [ 3.5119376, -4.502981 ],
       [ 3.613709 , -4.65668  ],
       [ 2.8995633, -3.6801345]], dtype=float32)

In [37]:
predictions.label_ids[:5] # first five true labels; same values as dset_test_tok['labels'][:5]

array([0, 0, 0, 0, 0])

In [38]:
# Predicted class for each row = index of its largest score.
preds = np.argmax(predictions.predictions, axis=-1)

In [40]:
# Last five labels as stored in the tokenized test split.
dset_test_tok['labels'][-5:]

tensor([0, 0, 0, 1, 0])

In [41]:
# Last five label ids returned by predict, for comparison with the dataset's own labels.
predictions.label_ids[-5:]

array([0, 0, 0, 1, 0])

In [43]:
def scorepro(targets, predictions):
    """Compute accuracy, recall, precision and F1 for binary 0/1 labels.

    Args:
        targets: 1-D tensor (or array-like) of true labels, where 1 marks the
            positive class.
        predictions: 1-D tensor (or array-like) of predicted labels with the
            same length as ``targets``.

    Returns:
        Tuple ``(accuracy, recall, precision, f1)``; each value is a percentage
        rounded to two decimals. A metric whose denominator is zero (no
        positive targets, no positive predictions, or precision + recall == 0)
        is reported as 0.0 rather than NaN. Empty input yields all zeros.
    """
    targets = torch.as_tensor(targets)
    predictions = torch.as_tensor(predictions)

    total = targets.numel()
    if total == 0:
        return 0.0, 0.0, 0.0, 0.0

    # Vectorised reductions instead of Python's built-in sum, which iterates
    # over the tensor one element at a time.
    true_positives = torch.sum(targets * predictions).item()   # tp
    possible_positives = torch.sum(targets).item()             # tp + fn
    predicted_positives = torch.sum(predictions).item()        # tp + fp
    correct = torch.sum(targets == predictions).item()         # tp + tn

    acc = correct / total
    recall = true_positives / possible_positives if possible_positives else 0.0
    precision = true_positives / predicted_positives if predicted_positives else 0.0

    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator else 0.0

    return (
        round(acc * 100, 2),
        round(recall * 100, 2),
        round(precision * 100, 2),
        round(f1 * 100, 2),
    )

In [44]:
# Wrap the NumPy label and prediction arrays as tensors and score them;
# scorepro returns accuracy, recall, precision, F1 in that order.
a, r, p, f1 = scorepro(torch.Tensor(predictions.label_ids), torch.Tensor(preds))

In [45]:
# Report the test-set metrics as percentages.
print(f'Accuracy: {a}%, Precision: {p}%, Recall: {r}%, F1: {f1}%')

Accuracy: 99.73%, Precision: 98.68%, Recall: 99.33%, F1: 99.0%
