In [1]:
####################################
#
#  ADD THIS TO EVERY COLAB FILE!
#
####################################

#!pip install -q import-ipynb

from google.colab import drive
drive.mount('/content/drive')

#import import_ipynb
import drive.Shareddrives.GPTJ.project.settings as settings

PATH_PROJECT = settings.PATH_PROJECT
PATH_DATA = settings.PATH_DATA

! cd $PATH_PROJECT && pip install -q -r requirements.txt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [2]:
!nvidia-smi

Thu Oct  6 04:14:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel, GPTNeoForCausalLM

In [4]:
# MODEL

#model_name = 'EleutherAI/gpt-neo-1.3B' # CUDA out of memory.

model_name = 'gpt2' # GPU vRAM 3495MiB - 3.5G
#model_name = 'gpt2-medium' # GPU vRAM 7389MiB - 7.3G
#model_name = 'gpt2-large' # GPU vRAM 14797MiB - 14.8G

#model_name = 'gpt2-xl' # CUDA out of memory.


In [5]:
# Set the random seed to a fixed value to get reproducible results 
torch.manual_seed(42)

tokenizer = GPT2Tokenizer.from_pretrained(model_name, 
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>', 
    pad_token='<|pad|>'
)
model = GPT2LMHeadModel.from_pretrained(model_name).cuda()
model.resize_token_embeddings(len(tokenizer))

torch.save(model, os.path.join(PATH_DATA, model_name, 'model'))

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [14]:
descriptions = pd.read_csv(PATH_PROJECT + 'netflix_titles.csv')['description']
max_length = max([len(tokenizer.encode(description)) for description in descriptions])

max_length, descriptions

(42, 0     As her father nears the end of his life, filmm...
 1     After crossing paths at a party, a Cape Town t...
 2     To protect his family from a powerful drug lor...
 3     Feuds, flirtations and toilet talk go down amo...
 4     In a city of coaching centers known to train I...
                             ...                        
 95    Status and strategy collide in this social exp...
 96    Using interviews and archival footage, this do...
 97    A boy's superhero dreams come true when he fin...
 98    The Octonauts expand their exploration beyond ...
 99    Four women — a chef, a single mom, an heiress ...
 Name: description, Length: 100, dtype: object)

In [15]:
class NetflixDataset(Dataset):

    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        for txt in txt_list:
            # Encode the descriptions using the GPT-Neo tokenizer
            encodings_dict = tokenizer('<|startoftext|>' 
                                        + txt +    
                                        '<|endoftext|>',
                                        truncation=True,
                                        max_length=max_length, 
                                        padding='max_length')
            input_ids = torch.tensor(encodings_dict['input_ids'])    
            self.input_ids.append(input_ids)
            mask = torch.tensor(encodings_dict['attention_mask'])
            self.attn_masks.append(mask)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

dataset = NetflixDataset(descriptions, tokenizer, max_length)

print(len(dataset))

train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

100


In [16]:
# Here I will pass the output directory where 
# the model predictions and checkpoints will be stored, 
# batch sizes for the training and validation steps, 
# and warmup_steps to gradually increase the learning rate
training_args = TrainingArguments(
    output_dir=os.path.join(PATH_DATA, model_name, 'partial'),
    num_train_epochs=5,
    logging_steps=5000,
    save_steps=5000,                                   
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=100,
    weight_decay=0.01,  
    logging_dir=os.path.join(PATH_DATA, model_name, 'logs')
)

trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset, 
    # This custom collate function is necessary 
    # to built batches of data
    data_collator=lambda data: {
        'input_ids': torch.stack([f[0] for f in data]),       
        'attention_mask': torch.stack([f[1] for f in data]),
        'labels': torch.stack([f[0] for f in data])
    }
)
# Start training process!
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 90
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 225


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=225, training_loss=1.425220675998264, metrics={'train_runtime': 12.9613, 'train_samples_per_second': 34.719, 'train_steps_per_second': 17.359, 'total_flos': 9645350400000.0, 'train_loss': 1.425220675998264, 'epoch': 5.0})

In [17]:
trainer.save_model(os.path.join(PATH_DATA, model_name, 'model-trained'))

Saving model checkpoint to /content/drive/Shareddrives/GPTJ/data/gpt2/model-trained
Configuration saved in /content/drive/Shareddrives/GPTJ/data/gpt2/model-trained/config.json
Model weights saved in /content/drive/Shareddrives/GPTJ/data/gpt2/model-trained/pytorch_model.bin


In [18]:

# Start every description with a special BOS token
generated = tokenizer('<|startoftext|>', return_tensors='pt').input_ids.cuda()

# Start every description with a special BOS token
#prompt = 'mov ebp var_A dx' # mov edx ebp var_E'
#prompt = 'An apple is'
#generated = tokenizer(prompt, return_tensors='pt').input_ids.cuda()

# Generate 3 movie descriptions
sample_outputs = model.generate(generated, 
    # Use sampling instead of greedy decoding 
    do_sample=True, 
    # Keep only top 50 token with the highest probability
    top_k=50, 
    # Maximum sequence length
    max_length=50,
    #max_new_tokens=100,
    # Keep only the most probable tokens with cumulative probability of 95%
    top_p=0.95, 
    # Changes randomness of generated sequences to 1.9
    temperature=0.01,
    # Number of sequences to generate                 
    num_return_sequences=10
)

# Print generated descriptions
for i, sample_output in enumerate(sample_outputs): 
    print('{}: {}'.format(i, tokenizer.decode(sample_output, skip_special_tokens=True)).replace('\n', ' '))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: A young woman's life is turned upside down when she falls for a man she meets at a party and begins to suspect she's being framed.
1: A young woman's life is turned upside down when her boyfriend is taken into custody and she's forced to hide out in order towing a hefty ransom.
2: A young man's peaceful life is threatened when he's sent to live with his grandmother in a small town that's threatened by a powerful drug lord.
3: A young man's life is turned upside down when his father agrees to escort him on a trip to visit his ex-lover's mother.
4: A young man and a woman fall in love and discover a deep connection that unites them decades later. But a major misunderstanding soon divides them.
5: A young woman’s peaceful life is threatened when her father’s mysterious death sends her grieving into a tailspin of extreme emotion, lust and self-discovery.
6: A young man and a woman with Down syndrome navigate the ups and downs of their lives while trying to make it as a jazz musician.
7:

In [19]:
!nvidia-smi

Thu Oct  6 04:36:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    44W / 250W |   3545MiB / 16280MiB |     39%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces