In [1]:
import torch
import numpy as np
from tqdm import tqdm

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### imdb dataset

In [4]:
from datasets import load_dataset

In [5]:
ds = load_dataset('imdb')
ds = ds.rename_columns({'text': 'review', 'label': 'sentiment'})
ds = ds.filter(lambda x: len(x["review"])>200, batched=False)
ds = ds.shuffle(seed=1)
ds

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 24895
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 24872
    })
    unsupervised: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 49776
    })
})

### default model 

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

#### < test ...

In [7]:
text = 'the largest city in USA is'

In [8]:
input_ids = tokenizer(text, return_tensors = 'pt')['input_ids']
input_ids

tensor([[1169, 4387, 1748,  287, 4916,  318]])

In [9]:
out = model.generate(input_ids, 
                    do_sample=True,
                    num_beams=2,
                    temperature=1.1,
                    top_p=0.9,
                    max_length=100,
                    )
out

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[ 1169,  4387,  1748,   287,  4916,   318,   287,   262,  1294,    11,
           475,   612,   389,   991,   257,  1271,   286,  4736,   326,   423,
           257,  4025,  3265,   621, 19170,    13,   198,   198,  4342,   318,
           257,  1351,   286,   262,  1353,   838,  4736,   287,   262,  1294,
           351,   262,  4387,  3265,   286,   661,    13,   198,   198,    16,
            13,   968,  1971,  2254,   198,   198,    17,    13,  5401,  5652,
           198,   198,    18,    13,  2986,  6033,   198,   198,    19,    13,
          8437,   198,   198,    20,    13,  2986,  9500,   198,   198,    21,
            13,  4842,   198,   198,    22,    13,   968,  1971,  2254,   198,
           198,    23,    13,  2986,  6033,   198,   198,    24,    13,  8437]])

In [10]:
generated_text = list(map(tokenizer.decode, out))
generated_text

['the largest city in USA is in the US, but there are still a number of cities that have a larger population than NYC.\n\nHere is a list of the top 10 cities in the US with the largest population of people.\n\n1. New York City\n\n2. Los Angeles\n\n3. San Francisco\n\n4. Miami\n\n5. San Diego\n\n6. Chicago\n\n7. New York City\n\n8. San Francisco\n\n9. Miami']

#### ...test >

### tokenizer

In [11]:
def preprocess_func(examples):
    return tokenizer(examples['review'], truncation=True, max_length = 256)

tokenized_ds = ds.map(preprocess_func, batched=True)
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 24895
    })
    test: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 24872
    })
    unsupervised: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 49776
    })
})

In [12]:
lengths = np.zeros(len(tokenized_ds['train']['input_ids']))
for i, ids in enumerate(tokenized_ds['train']['input_ids']):
    lengths[i] = len(ids)
print(f'{lengths.max()=}')
print(f'{lengths.min()=}')
print(f'{lengths.mean()=}')

lengths.max()=256.0
lengths.min()=39.0
lengths.mean()=203.6788913436433


In [13]:
tokenized_ds = tokenized_ds.remove_columns(['review', 'sentiment'])

In [14]:
tokenized_ds.set_format('torch')

In [15]:
train_dataset = tokenized_ds['train']['input_ids'][0:10000]

In [16]:
for i, train_ds in enumerate(train_dataset):
    print(train_ds.shape)
    if i == 10: break

torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([204])
torch.Size([143])
torch.Size([256])
torch.Size([127])
torch.Size([151])
torch.Size([256])


In [17]:
tokenizer.pad_token = tokenizer.eos_token

In [18]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

### training

In [19]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./finetuned", # The output directory
    overwrite_output_dir=True, # Overwrite the content of the output dir
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=8,  # batch size for evaluation
    warmup_steps=10, # number of warmup steps for learning rate scheduler
    # gradient_accumulation_steps=16, # to make "virtual" batch size larger
    logging_steps=50
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5), None)
)

In [20]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmaxfil333[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,3.8482
100,3.8
150,3.7757
200,3.7727
250,3.7526
300,3.6955
350,3.7214
400,3.7463
450,3.7205
500,3.6868


TrainOutput(global_step=3750, training_loss=3.647882454427083, metrics={'train_runtime': 1373.2032, 'train_samples_per_second': 21.847, 'train_steps_per_second': 2.731, 'total_flos': 3913529988096000.0, 'train_loss': 3.647882454427083, 'epoch': 3.0})

### evaluation

In [25]:
text = "This movie is"
input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
model.eval()
with torch.no_grad():
    out = model.generate(input_ids, 
                        do_sample=True,
                        num_beams=2,
                        temperature=1.5,
                        top_p=0.9,
                        max_length=200,
                        )

generated_text = list(map(tokenizer.decode, out))[0]

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


This movie is really good. I think that if you watch it on DVD you will see a little bit more of it, especially the ending. The story is well written. The movie is also a good story, but it doesn't really do it justice in the end. The acting is good, and the movie gets more and more realistic. I hope that the people who watched the movie will find it a little bit better, since it is a very funny movie. The movie is good for the budget and its a good way to get your money's worth. The movie is very good for the fact that it is a movie, but I hope that it gets a little bit better in the future. I hope that the people watching the movie will find it a little bit better, since it is a funny movie, but I hope that the people watching the movie will find it a little bit better, because it is a funny movie, and because it is a very funny movie. And I hope


In [26]:
text = "It is definetely"
input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
model.eval()
with torch.no_grad():
    out = model.generate(input_ids, 
                        do_sample=True,
                        num_beams=2,
                        temperature=1.5,
                        top_p=0.9,
                        max_length=200,
                        )

generated_text = list(map(tokenizer.decode, out))[0]

print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It is definetely sad when a movie is made without even a single mention of the actual story. The idea that it would have been better if all the plot elements were explained in a single, concise, and coherent piece of film is simply not true. I am not sure if this is a coincidence or a combination of the two, but it is disappointing that this movie was never given a chance to truly shine in any form.<br /><br />In the end, I can only comment that the movie was a disappointment and a shame. If you are a fan of the genre of video games and want to see a better version of this masterpiece, go for it. If you are a fan of any other genre, you will love this movie. It's an entertaining, enjoyable, and enjoyable experience. <br /><br />I will admit though that I have had a few criticisms about the movie, and I will try to respond to them in the next post. But, if


### pushing to hub

In [27]:
model.push_to_hub('gpt2_imdb_generator')

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/maxfil333/gpt2_imdb_generator/commit/b0de0e31c514bd46533798c1aa334f3e907cf3d0', commit_message='Upload model', commit_description='', oid='b0de0e31c514bd46533798c1aa334f3e907cf3d0', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
tokenizer.push_to_hub('gpt2_imdb_generator')

CommitInfo(commit_url='https://huggingface.co/maxfil333/gpt2_imdb_generator/commit/7d604a42d1d84db56128697d4b056bc0ed0ebcb4', commit_message='Upload tokenizer', commit_description='', oid='7d604a42d1d84db56128697d4b056bc0ed0ebcb4', pr_url=None, pr_revision=None, pr_num=None)