In [1]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer
from pprint import pprint
from torchinfo import summary
import torch
import numpy as np #required implicitly for training process

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the SST-2 sentiment task from the GLUE benchmark (train / validation / test splits).
raw_datasets = load_dataset("glue", "sst2")
# The amazon_polarity dataset can also be used for sentiment analysis, but it takes a long time to process:
# raw_datasets = load_dataset("amazon_polarity")

In [3]:
# Display the DatasetDict to see which splits were loaded, their features and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
# Look at the training split on its own.
raw_datasets['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [5]:
# List every attribute and method exposed by the training split (the output is long).
dir(raw_datasets['train'])

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguo

In [6]:
# Confirm the concrete class of a single split.
type(raw_datasets['train'])

datasets.arrow_dataset.Dataset

In [7]:
# Inspect the table backing the training split: its column schema followed by a preview of the values.
raw_datasets['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [8]:
# Index 0 of the backing table selects its first column ('sentence'), shown as a pyarrow ChunkedArray.
raw_datasets['train'].data[0]

<pyarrow.lib.ChunkedArray object at 0x00000243381A0770>
[
  [
    "hide new secretions from the parental units ",
    "contains no wit , only labored gags ",
    "that loves its characters and communicates something rather beautiful about human nature ",
    "remains utterly satisfied to remain the same throughout ",
    "on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",
    ...
    "you wish you were at home watching that movie instead of in the theater watching this one ",
    "'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ",
    "underdeveloped ",
    "the jokes are flat ",
    "a heartening tale of small victories "
  ],
  [
    "suspense , intriguing characters and bizarre bank robberies , ",
    "a gritty police thriller with all the dysfunctional family dynamics one could wish for ",
    "with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to li

In [9]:
# Integer indexing on a split returns one example as a dict of feature -> value.
raw_datasets['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [10]:
# Slicing returns a dict of lists instead: one list per feature, covering the selected rows.
raw_datasets['train'][100:105]

{'sentence': ['in memory ',
  'respectable new one ',
  'yet this grating showcase ',
  'hate to tear your eyes away from the images long enough to read the subtitles ',
  'addition to sporting one of the worst titles in recent cinematic history '],
 'label': [1, 1, 0, 1, 0],
 'idx': [100, 101, 102, 103, 104]}

In [11]:
# The dataset's 'label' feature carries the class names (see the ClassLabel entry in the output),
# but the model by default outputs only generic labels.
raw_datasets['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [12]:
# Name of the pretrained checkpoint; the same name is reused further down when the
# classification model is loaded, so tokenizer and model come from the same checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [13]:
# Tokenize the first three training sentences to see what the tokenizer returns
# (one 'input_ids' list and one 'attention_mask' list per sentence).
sample_sentences = raw_datasets['train'][0:3]['sentence']
tokenized_sentences = tokenizer(sample_sentences)
pprint(tokenized_sentences)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [14]:
def tokenize_fn(batch):
    """Tokenize the 'sentence' column of a batch with the notebook's tokenizer.

    `batch` is indexed with the key 'sentence'; the value is handed to the
    module-level `tokenizer` with truncation enabled, and whatever the
    tokenizer returns is passed back unchanged.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [15]:
# Run tokenize_fn over the DatasetDict; with batched=True the function receives
# batches of examples (it reads batch['sentence']) rather than single rows.
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

In [16]:
# Training configuration: checkpoints go to 'my_trainer', evaluation and saving
# both happen once per epoch, and training runs for a single epoch.
training_config = {
    "output_dir": "my_trainer",
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "num_train_epochs": 1,
}
training_args = TrainingArguments(**training_config)

In [17]:
# Take the class names from the dataset's ClassLabel feature and store them in
# the model config. Without this the config only holds generic label names and
# has to be patched by hand after saving to get 'negative' / 'positive' out of
# a pipeline.
label_names = raw_datasets['train'].features['label'].names
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Load the pretrained encoder with a fresh classification head sized to the
# number of classes (2 for SST-2).
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Print torchinfo's layer-by-layer parameter counts for the model.
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [19]:
# Snapshot every parameter before training, to check afterwards whether the
# entire network gets trained or only the head.
#
# The .copy() is required: when the model lives on the CPU, .cpu() is a no-op
# and .numpy() returns an array that shares memory with the live parameter.
# Training updates parameters in place, so without a copy this "before"
# snapshot would silently follow the trained weights and every difference
# computed later would be 0.
params_before = [
    p.detach().cpu().numpy().copy()
    for _, p in model.named_parameters()
]

In [20]:
# Load the GLUE / SST-2 metric used for evaluation (the toy check in the next cell shows it reports accuracy).
# NOTE(review): load_metric is deprecated in recent `datasets` releases in favour of the separate
# `evaluate` package; confirm against the installed version before upgrading.
metric = load_metric("glue", "sst2", trust_remote_code=True)

  metric = load_metric("glue", "sst2", trust_remote_code=True)


In [21]:
# Sanity-check the metric on a toy example: two of the three predictions match, so accuracy is 2/3.
metric.compute(predictions=[1, 0, 1], references=[1, 1, 1])

{'accuracy': 0.6666666666666666}

In [22]:
#the compute_metrics function:
def compute_metrics(logits_and_labels):
    """Turn model logits into class predictions and score them with `metric`.

    `logits_and_labels` unpacks into two items: the logits and the reference
    labels. The predicted class is the argmax over the last axis of the
    logits. Returns whatever `metric.compute` returns for those predictions
    and references.
    """
    scores, gold_labels = logits_and_labels
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=gold_labels)

In [23]:
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model to the chosen device, then hand everything to the Trainer:
# the training and validation splits, the tokenizer, and the metric callback.
model.to(device)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [24]:
# Fine-tune the model. With the arguments above this is a single epoch, with one
# evaluation on the validation split at the end of the epoch (see the eval_* line in the output).
trainer.train()

  0%|          | 0/8419 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  6%|▌         | 504/8419 [00:26<06:43, 19.59it/s]

{'loss': 0.4083, 'learning_rate': 4.7030526190759e-05, 'epoch': 0.06}


 12%|█▏        | 1002/8419 [00:52<06:29, 19.06it/s]

{'loss': 0.3365, 'learning_rate': 4.4061052381518e-05, 'epoch': 0.12}


 18%|█▊        | 1502/8419 [01:18<07:30, 15.35it/s]

{'loss': 0.3388, 'learning_rate': 4.109157857227699e-05, 'epoch': 0.18}


 24%|██▍       | 2004/8419 [01:46<05:32, 19.32it/s]

{'loss': 0.3025, 'learning_rate': 3.812210476303599e-05, 'epoch': 0.24}


 30%|██▉       | 2503/8419 [02:12<05:34, 17.68it/s]

{'loss': 0.2859, 'learning_rate': 3.515263095379499e-05, 'epoch': 0.3}


 36%|███▌      | 3002/8419 [02:40<05:16, 17.13it/s]

{'loss': 0.2881, 'learning_rate': 3.218315714455399e-05, 'epoch': 0.36}


 42%|████▏     | 3503/8419 [03:08<04:33, 17.94it/s]

{'loss': 0.2633, 'learning_rate': 2.9213683335312986e-05, 'epoch': 0.42}


 48%|████▊     | 4002/8419 [03:35<03:58, 18.54it/s]

{'loss': 0.2552, 'learning_rate': 2.6244209526071984e-05, 'epoch': 0.48}


 53%|█████▎    | 4502/8419 [04:06<03:48, 17.13it/s]

{'loss': 0.234, 'learning_rate': 2.3274735716830978e-05, 'epoch': 0.53}


 59%|█████▉    | 5003/8419 [04:38<03:18, 17.25it/s]

{'loss': 0.2423, 'learning_rate': 2.0305261907589976e-05, 'epoch': 0.59}


 65%|██████▌   | 5504/8419 [05:07<02:39, 18.25it/s]

{'loss': 0.2316, 'learning_rate': 1.7335788098348973e-05, 'epoch': 0.65}


 71%|███████▏  | 6002/8419 [05:35<02:12, 18.30it/s]

{'loss': 0.224, 'learning_rate': 1.4366314289107971e-05, 'epoch': 0.71}


 77%|███████▋  | 6502/8419 [06:03<01:47, 17.79it/s]

{'loss': 0.2178, 'learning_rate': 1.1396840479866969e-05, 'epoch': 0.77}


 83%|████████▎ | 7002/8419 [06:31<01:22, 17.21it/s]

{'loss': 0.2167, 'learning_rate': 8.427366670625965e-06, 'epoch': 0.83}


 89%|████████▉ | 7503/8419 [06:59<00:50, 18.02it/s]

{'loss': 0.1983, 'learning_rate': 5.457892861384962e-06, 'epoch': 0.89}


 95%|█████████▌| 8003/8419 [07:27<00:23, 17.75it/s]

{'loss': 0.2156, 'learning_rate': 2.4884190521439603e-06, 'epoch': 0.95}


                                                   
100%|██████████| 8419/8419 [07:52<00:00, 18.00it/s]

{'eval_loss': 0.30288517475128174, 'eval_accuracy': 0.9185779816513762, 'eval_runtime': 1.6991, 'eval_samples_per_second': 513.205, 'eval_steps_per_second': 64.151, 'epoch': 1.0}


100%|██████████| 8419/8419 [07:53<00:00, 17.78it/s]

{'train_runtime': 473.6342, 'train_samples_per_second': 142.196, 'train_steps_per_second': 17.775, 'train_loss': 0.26388837436214313, 'epoch': 1.0}





TrainOutput(global_step=8419, training_loss=0.26388837436214313, metrics={'train_runtime': 473.6342, 'train_samples_per_second': 142.196, 'train_steps_per_second': 17.775, 'train_loss': 0.26388837436214313, 'epoch': 1.0})

In [27]:
# Save the fine-tuned model to the 'model' directory; the next cell reads
# model/config.json from there and loads the pipeline from the same directory.
trainer.save_model('model')

In [7]:
import json

import torch
from transformers import pipeline

# Add the class names to the saved config so the pipeline outputs label names
# instead of generic labels. label2id is written together with id2label so the
# two mappings in config.json agree with each other.
config_path = 'model/config.json'
label_names = ['negative', 'positive']

with open(config_path, encoding='utf-8') as f:
    model_config = json.load(f)

# Integer keys are serialised by json.dump as the strings "0" and "1".
model_config['id2label'] = dict(enumerate(label_names))
model_config['label2id'] = {label: idx for idx, label in enumerate(label_names)}

with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(model_config, f, indent=2)

# Load the saved model as a text-classification pipeline, on the GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
saved_model = pipeline('text-classification', model='model', device=device)

In [8]:
# Quick smoke test of the saved pipeline on a clearly positive sentence.
saved_model("This movie is great!")

[{'label': 'positive', 'score': 0.9988046884536743}]

In [9]:
# And on a clearly negative sentence.
saved_model("This movie is bad")

[{'label': 'negative', 'score': 0.9982507824897766}]

In [None]:
# Check if the whole network was changed: collect every parameter again after
# training, iterating model.named_parameters() just like the earlier snapshot.
params_after = [
    weights.detach().cpu().numpy()
    for _, weights in model.named_parameters()
]

In [None]:
# One number per parameter tensor: the summed absolute difference between its
# values before and after training (0 means that tensor did not change).
for before, after in zip(params_before, params_after):
    total_change = np.abs(before - after).sum()
    print(total_change)