# Imports

In [43]:
import os
import torch
import datasets
from transformers import GPT2TokenizerFast, GPT2ForSequenceClassification

# Hardware Information

In [7]:
processors = os.cpu_count()
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f'Logical Processors: {processors}\nDevice to perform computation: {device}')

Logical Processors: 32
Device to perform computation: cuda:0


# Model Instantiation

In [44]:
model_name = 'gpt2'
model = GPT2ForSequenceClassification.from_pretrained(model_name)

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
model.config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 50258
}

# Loading Tokenizer

In [49]:
# Load tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(
    model_name, pad_token='<pad>', add_prefix_space=True
)

# Resize input embedding matrix of the model
model.resize_token_embeddings(len(tokenizer))

# TODO: look at performance difference between including and excluding space

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50258, 768)

# Dataset Preparation

In [None]:
# Maximum length of input string into the transformer
context_length = 1024

# Simple concatenation with padding
def concat_tokenize_batched(example):
    return tokenizer(
        f'{example["premise"]} {example["hypothesis"]}', 
        padding = 'max_length', 
        max_length = context_length,
        truncation = True
    )

# Simple concatenation with padding
def concat_tokenize_batched(batch):
    batch['input_ids'] = []
    batch['attention_mask'] = []
    for premise, hypothesis in zip(batch['premise'], batch['hypothesis']):
        tokenized_sample = tokenizer(
            f'{premise} {hypothesis}', 
            padding = 'max_length', 
            max_length = context_length,
            truncation = True
        )
        batch['input_ids'].append(tokenized_sample['input_ids'])
        batch['attention_mask'].append(tokenized_sample['attention_mask'])
    return batch
    
def prompt_tokenize(sample):
    raise NotImplementedError

In [None]:
# Load dataset from online/local cache
mnli = datasets.load_dataset('glue', 'mnli', num_proc=processors)

# Tokenize the inputs (sequential, 5min)
# mnli = mnli.map(concat_tokenize_batched, remove_columns={'premise', 'hypothesis', 'idx'}, batched=False, num_proc=processors)

# Tokenize the inputs (batched, 10sec)
mnli = mnli.map(concat_tokenize_batched, remove_columns={'premise', 'hypothesis', 'idx'}, batched=True, num_proc=processors)

Found cached dataset glue (/home/martmichals/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/5 [00:00<?, ?it/s]

                                                                

#2:   0%|          | 0/12272 [00:00<?, ?ex/s]

#13:   0%|          | 0/12272 [00:00<?, ?ex/s]

#4:   0%|          | 0/12272 [00:00<?, ?ex/s]

#5:   0%|          | 0/12272 [00:00<?, ?ex/s]

#8:   0%|          | 0/12272 [00:00<?, ?ex/s]

#10:   0%|          | 0/12272 [00:00<?, ?ex/s]

#11:   0%|          | 0/12272 [00:00<?, ?ex/s]

#7:   0%|          | 0/12272 [00:00<?, ?ex/s]

#12:   0%|          | 0/12272 [00:00<?, ?ex/s]

#3:   0%|          | 0/12272 [00:00<?, ?ex/s]

#6:   0%|          | 0/12272 [00:00<?, ?ex/s]

#1:   0%|          | 0/12272 [00:00<?, ?ex/s]

#0:   0%|          | 0/12272 [00:00<?, ?ex/s]

#9:   0%|          | 0/12272 [00:00<?, ?ex/s]

#14:   0%|          | 0/12272 [00:00<?, ?ex/s]

#19:   0%|          | 0/12272 [00:00<?, ?ex/s]

#16:   0%|          | 0/12272 [00:00<?, ?ex/s]

#20:   0%|          | 0/12272 [00:00<?, ?ex/s]

#17:   0%|          | 0/12272 [00:00<?, ?ex/s]

#15:   0%|          | 0/12272 [00:00<?, ?ex/s]

#18:   0%|          | 0/12272 [00:00<?, ?ex/s]

#22:   0%|          | 0/12272 [00:00<?, ?ex/s]

#26:   0%|          | 0/12272 [00:00<?, ?ex/s]

#25:   0%|          | 0/12272 [00:00<?, ?ex/s]

#23:   0%|          | 0/12272 [00:00<?, ?ex/s]

#27:   0%|          | 0/12272 [00:00<?, ?ex/s]

#29:   0%|          | 0/12272 [00:00<?, ?ex/s]

#30:   0%|          | 0/12271 [00:00<?, ?ex/s]

#21:   0%|          | 0/12272 [00:00<?, ?ex/s]

#24:   0%|          | 0/12272 [00:00<?, ?ex/s]

#28:   0%|          | 0/12272 [00:00<?, ?ex/s]

#31:   0%|          | 0/12271 [00:00<?, ?ex/s]

Process ForkPoolWorker-1190:
Process ForkPoolWorker-1192:
Process ForkPoolWorker-1191:
Process ForkPoolWorker-1188:
Process ForkPoolWorker-1189:
Process ForkPoolWorker-1186:
Process ForkPoolWorker-1185:
Process ForkPoolWorker-1193:
Process ForkPoolWorker-1187:
Traceback (most recent call last):
  File "/home/martmichals/anaconda3/envs/ava/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/martmichals/anaconda3/envs/ava/lib/python3.9/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/home/martmichals/anaconda3/envs/ava/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/home/martmichals/anaconda3/envs/ava/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/home/martmichals/anaconda3/envs/ava/lib/python3.9

In [76]:
mnli['train'][0]

{'label': 1,
 'input_ids': [26097,
  935,
  8566,
  1341,
  27428,
  468,
  734,
  4096,
  15225,
  532,
  1720,
  290,
  27876,
  13,
  8721,
  290,
  27876,
  389,
  644,
  787,
  8566,
  1341,
  27428,
  670,
  13,
  220,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
 

In [67]:
mnli['train'][0]

{'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'label': 1,
 'idx': 0}