In [1]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer
from pprint import pprint
from torchinfo import summary
import torch
import numpy as np #required implicitly for training process

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_datasets = load_dataset("glue", "sst2")
# cam also use amazon_polarity dataset for sentiment analysis but takes a long time to process
# raw_datasets = load_dataset("amazon_polarity")

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
raw_datasets['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [5]:
dir(raw_datasets['train'])

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguo

In [6]:
type(raw_datasets['train'])

datasets.arrow_dataset.Dataset

In [7]:
raw_datasets['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [8]:
raw_datasets['train'].data[0]

<pyarrow.lib.ChunkedArray object at 0x000001789E0E8220>
[
  [
    "hide new secretions from the parental units ",
    "contains no wit , only labored gags ",
    "that loves its characters and communicates something rather beautiful about human nature ",
    "remains utterly satisfied to remain the same throughout ",
    "on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",
    ...
    "you wish you were at home watching that movie instead of in the theater watching this one ",
    "'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ",
    "underdeveloped ",
    "the jokes are flat ",
    "a heartening tale of small victories "
  ],
  [
    "suspense , intriguing characters and bizarre bank robberies , ",
    "a gritty police thriller with all the dysfunctional family dynamics one could wish for ",
    "with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to li

In [9]:
raw_datasets['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [10]:
raw_datasets['train'][100:105]

{'sentence': ['in memory ',
  'respectable new one ',
  'yet this grating showcase ',
  'hate to tear your eyes away from the images long enough to read the subtitles ',
  'addition to sporting one of the worst titles in recent cinematic history '],
 'label': [1, 1, 0, 1, 0],
 'idx': [100, 101, 102, 103, 104]}

In [11]:
#despite label names in the dataset, model by default outputs only generic labels
raw_datasets['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [12]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [13]:
tokenized_sentences = tokenizer(raw_datasets['train'][0:3]['sentence'])
pprint(tokenized_sentences)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [14]:
def tokenize_fn(batch):
    return tokenizer(batch['sentence'], truncation=True)

In [15]:
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map: 100%|██████████| 1821/1821 [00:00<00:00, 24737.42 examples/s]


In [16]:
training_args = TrainingArguments(
    output_dir='my_trainer',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=1
)

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [19]:
#to check if the entire network gets trained or only head
params_before = []
for name, p in model.named_parameters(): 
    params_before.append(p.detach().cpu().numpy())

In [20]:
metric = load_metric("glue", "sst2", trust_remote_code=True)

  metric = load_metric("glue", "sst2", trust_remote_code=True)


In [21]:
metric.compute(predictions=[1, 0, 1], references=[1, 1, 1])

{'accuracy': 0.6666666666666666}

In [22]:
#the compute_metrics function:
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions = predictions, references=labels)    

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [25]:
trainer.train()

  0%|          | 0/8419 [00:00<?, ?it/s]

  6%|▌         | 502/8419 [00:25<06:23, 20.64it/s]

{'loss': 0.4188, 'learning_rate': 4.7030526190759e-05, 'epoch': 0.06}


 12%|█▏        | 1003/8419 [00:50<06:08, 20.10it/s]

{'loss': 0.3351, 'learning_rate': 4.4061052381518e-05, 'epoch': 0.12}


 18%|█▊        | 1504/8419 [01:16<05:52, 19.62it/s]

{'loss': 0.3495, 'learning_rate': 4.109157857227699e-05, 'epoch': 0.18}


 24%|██▍       | 2003/8419 [01:44<05:45, 18.58it/s]

{'loss': 0.2975, 'learning_rate': 3.812210476303599e-05, 'epoch': 0.24}


 30%|██▉       | 2502/8419 [02:11<05:14, 18.81it/s]

{'loss': 0.306, 'learning_rate': 3.515263095379499e-05, 'epoch': 0.3}


 36%|███▌      | 3003/8419 [02:38<04:45, 18.98it/s]

{'loss': 0.2901, 'learning_rate': 3.218315714455399e-05, 'epoch': 0.36}


 42%|████▏     | 3503/8419 [03:07<04:31, 18.12it/s]

{'loss': 0.2715, 'learning_rate': 2.9213683335312986e-05, 'epoch': 0.42}


 48%|████▊     | 4002/8419 [03:34<03:55, 18.72it/s]

{'loss': 0.2632, 'learning_rate': 2.6244209526071984e-05, 'epoch': 0.48}


 53%|█████▎    | 4504/8419 [04:02<03:27, 18.91it/s]

{'loss': 0.232, 'learning_rate': 2.3274735716830978e-05, 'epoch': 0.53}


 59%|█████▉    | 5001/8419 [04:29<02:57, 19.22it/s]

{'loss': 0.2483, 'learning_rate': 2.0305261907589976e-05, 'epoch': 0.59}


 65%|██████▌   | 5503/8419 [05:00<03:03, 15.93it/s]

{'loss': 0.2153, 'learning_rate': 1.7335788098348973e-05, 'epoch': 0.65}


 71%|███████▏  | 6001/8419 [05:28<03:09, 12.76it/s]

{'loss': 0.2187, 'learning_rate': 1.4366314289107971e-05, 'epoch': 0.71}


 77%|███████▋  | 6504/8419 [05:58<01:39, 19.33it/s]

{'loss': 0.2213, 'learning_rate': 1.1396840479866969e-05, 'epoch': 0.77}


 83%|████████▎ | 7003/8419 [06:25<01:14, 18.93it/s]

{'loss': 0.2252, 'learning_rate': 8.427366670625965e-06, 'epoch': 0.83}


 89%|████████▉ | 7502/8419 [06:52<00:49, 18.47it/s]

{'loss': 0.1965, 'learning_rate': 5.457892861384962e-06, 'epoch': 0.89}


 95%|█████████▌| 8003/8419 [07:20<00:21, 19.07it/s]

{'loss': 0.2151, 'learning_rate': 2.4884190521439603e-06, 'epoch': 0.95}


                                                   
100%|██████████| 8419/8419 [07:46<00:00, 17.72it/s]

{'eval_loss': 0.33744385838508606, 'eval_accuracy': 0.9071100917431193, 'eval_runtime': 1.5613, 'eval_samples_per_second': 558.514, 'eval_steps_per_second': 69.814, 'epoch': 1.0}


100%|██████████| 8419/8419 [07:47<00:00, 18.00it/s]

{'train_runtime': 467.5955, 'train_samples_per_second': 144.033, 'train_steps_per_second': 18.005, 'train_loss': 0.26698761856291153, 'epoch': 1.0}





TrainOutput(global_step=8419, training_loss=0.26698761856291153, metrics={'train_runtime': 467.5955, 'train_samples_per_second': 144.033, 'train_steps_per_second': 18.005, 'train_loss': 0.26698761856291153, 'epoch': 1.0})

In [26]:
trainer.save_model('model')

In [27]:
import torch 
from transformers import pipeline
import json
#adding id2label in the config file to output label names
config_path = 'model/config.json'
with open(config_path) as f:
    j = json.load(f)
    j['id2label'] = {0: 'negative', 1: 'positive'}
with open(config_path, 'w') as f:
    json.dump(j, f, indent=2) 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
saved_model = pipeline('text-classification', model='model', device = device)

In [28]:
saved_model("This movie is great!")

[{'label': 'positive', 'score': 0.9989526271820068}]

In [29]:
saved_model("This movie is bad")

[{'label': 'negative', 'score': 0.9987744688987732}]

In [30]:
#check if whole network was changed:
params_after = []
for name, p in model.named_parameters():
    params_after.append(p.detach().cpu().numpy())

In [31]:
for p1, p2 in zip(params_before, params_after):
    print(np.sum(np.abs(p1 - p2)))

13252.66
89.6709
1.7980859
1.075331
1296.6256
1.6503868
1291.849
0.0032623466
1186.1066
1.0579159
1130.8407
0.85436
1.6720567
0.84280396
4925.7744
5.775226
4513.674
0.7013656
1.5714055
0.6868653
1263.667
1.4004593
1265.7064
0.0029367977
1107.6168
0.83688736
1068.6273
0.7181506
1.5539249
0.69827795
4899.924
5.380926
4464.1177
0.6820636
1.5906854
0.6869984
1272.8352
1.5795727
1277.6963
0.0026745007
1109.5337
0.7855022
1103.0457
0.75032735
1.5836492
0.78737414
4920.2007
5.5639796
4354.1943
0.7123836
1.4875911
0.6933131
1287.8937
1.3822637
1305.9032
0.0032039424
1160.582
0.7359677
1120.4573
0.73621786
1.4135668
0.75952303
4815.534
5.4664145
4138.236
0.71775466
1.3287185
0.772182
1211.8398
1.4855764
1206.5618
0.0019893926
991.55084
0.8008404
1006.32166
0.8247078
1.3721833
0.9095831
4411.2153
5.108971
3482.849
0.697425
1.3233086
0.6415199
1167.1584
1.4287767
1151.6066
0.0012809257
925.4199
0.7206658
903.3681
0.9462185
1.243445
1.0946717
3436.3528
4.4317074
3129.0574
0.9437357
1.2795303
0.715