In [1]:
import torch
import numpy as np
from tqdm import tqdm

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

### imdb dataset

In [4]:
from datasets import load_dataset

In [5]:
# Load IMDb, rename the columns, keep only reviews longer than 200 characters,
# and shuffle with a fixed seed (seed=1).
ds = (
    load_dataset('imdb')
    .rename_columns({'text': 'review', 'label': 'sentiment'})
    .filter(lambda example: len(example["review"]) > 200, batched=False)
    .shuffle(seed=1)
)
ds

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 24895
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 24872
    })
    unsupervised: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 49776
    })
})

### default model 

In [6]:
# The tokenizer and the LM-head model are both loaded from the same "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

### tokenization

In [7]:
def preprocess_func(examples):
    """Tokenize the ``review`` field of ``examples``.

    The module-level ``tokenizer`` is called with truncation enabled and a
    maximum length of 256; its output is returned unchanged.
    """
    reviews = examples['review']
    encoded = tokenizer(reviews, max_length=256, truncation=True)
    return encoded

# Apply the tokenization function to the dataset in batches and display the result.
tokenized_ds = ds.map(function=preprocess_func, batched=True)
tokenized_ds

Map:   0%|          | 0/24895 [00:00<?, ? examples/s]

Map:   0%|          | 0/24872 [00:00<?, ? examples/s]

Map:   0%|          | 0/49776 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 24895
    })
    test: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 24872
    })
    unsupervised: Dataset({
        features: ['review', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 49776
    })
})

In [8]:
# Token count of every training example.
# The 'input_ids' column is read once and the lengths are gathered in a single
# pass; dtype=float makes the statistics below print as floats.
lengths = np.array(
    [len(ids) for ids in tokenized_ds['train']['input_ids']],
    dtype=float,
)
print(f'{lengths.max()=}')
print(f'{lengths.min()=}')
print(f'{lengths.mean()=}')

lengths.max()=256.0
lengths.min()=39.0
lengths.mean()=203.6788913436433


In [9]:
# Drop the raw-text and label columns from the tokenized dataset.
tokenized_ds = tokenized_ds.remove_columns(column_names=['review', 'sentiment'])

In [10]:
# Switch the dataset's output format to 'torch'.
tokenized_ds.set_format(type='torch')

In [11]:
# Only the 'input_ids' column of the train split is selected here;
# the 'attention_mask' column is not carried along.
train_dataset = tokenized_ds['train']['input_ids']

In [12]:
# Peek at the shapes of the first 11 training examples (indices 0 through 10).
for i, train_ds in zip(range(11), train_dataset):
    print(train_ds.shape)

torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([204])
torch.Size([143])
torch.Size([256])
torch.Size([127])
torch.Size([151])
torch.Size([256])


In [13]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer has no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [14]:
# Batch collator for language modeling; mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

### training

In [15]:
from transformers import Trainer, TrainingArguments

# Training configuration: a single epoch, batches of 8 per device,
# 10 scheduler warm-up steps, and a log entry every 300 steps.
training_args = TrainingArguments(
    # where outputs go (existing contents may be overwritten)
    output_dir="./finetuned",
    overwrite_output_dir=True,
    # schedule
    num_train_epochs=1,
    warmup_steps=10,
    # batch sizes for training / evaluation
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # logging
    logging_steps=300,
    # gradient_accumulation_steps=16 would make the "virtual" batch size larger (left disabled)
)

# AdamW (lr=1e-5) is supplied explicitly; the scheduler slot of the tuple is left as None.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(torch.optim.AdamW(model.parameters(), lr=1e-5), None),
)

In [16]:
# Run fine-tuning with the trainer built in the previous cell;
# the returned TrainOutput is displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmaxfil333[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
300,3.79
600,3.7057
900,3.6769
1200,3.6724
1500,3.6625
1800,3.6601
2100,3.6602
2400,3.6498
2700,3.6472
3000,3.6612


TrainOutput(global_step=3112, training_loss=3.6778667714724502, metrics={'train_runtime': 6354.3866, 'train_samples_per_second': 3.918, 'train_steps_per_second': 0.49, 'total_flos': 3247419027456000.0, 'train_loss': 3.6778667714724502, 'epoch': 1.0})

### evaluation: sampling reviews from the fine-tuned model

In [38]:
# Decoding settings passed to model.generate: sampling combined with 2 beams,
# temperature 1.5, top-k / top-p filtering, and a length cap of 200.
generation_kwargs = dict(
    do_sample=True,
    num_beams=2,
    temperature=1.5,
    top_p=0.9,
    top_k=30,
    max_length=200,
    pad_token_id=tokenizer.eos_token_id,
)

In [39]:
contexts = ["This movie is", "I have to say", "I'm extremely disappointed"]

# Evaluation mode only needs to be switched on once, not once per prompt.
model.eval()

for text in contexts:
    # Tokenize the prompt (input ids plus attention mask) and place the tensors
    # on the model's own device, so generation works whether or not the model
    # has been moved to the GPU.
    encoded = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**encoded, **generation_kwargs)
    # Only the first returned sequence is shown, so decode just that one.
    generated_text = tokenizer.decode(out[0])
    print(generated_text)
    print('-' * 20)

This movie is one of the best I've ever seen.<br /><br />The acting is top notch.<br /><br />The plot is predictable at first, and the acting is a little too predictable at the end. However, I was able to watch this movie with a bunch of other friends, and it really made a great film.<br /><br />I think this is a great movie to see if you want to have fun watching movies.<br /><br />The acting is good, the acting is good, and the story is good. The acting is great, the acting is good, and the story is good. I would recommend this movie to all movie buffs.<br /><br />I recommend this movie to everyone.<br /><br />If you're into movies like this, I recommend this movie. It is a great movie to see if you like watching movies, but not if you're into movies.<br /><br />I would also
--------------------
I have to say I was very disappointed with this movie. The first time I saw this movie I was completely blown away by how well it made me laugh. I don't mean to compare it to anything else, I

### pushing to hub

In [19]:
# Upload the fine-tuned model weights to the Hub repository 'gpt2_imdb_generator_v2'.
model.push_to_hub('gpt2_imdb_generator_v2')

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/maxfil333/gpt2_imdb_generator_v2/commit/239e5f3504ac877da8bc72f321d7e02a3a81c2cb', commit_message='Upload model', commit_description='', oid='239e5f3504ac877da8bc72f321d7e02a3a81c2cb', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
# Upload the tokenizer to the same Hub repository as the model.
tokenizer.push_to_hub('gpt2_imdb_generator_v2')

CommitInfo(commit_url='https://huggingface.co/maxfil333/gpt2_imdb_generator_v2/commit/47c15193a4855f212bb70e490dcdefd4bc02be80', commit_message='Upload tokenizer', commit_description='', oid='47c15193a4855f212bb70e490dcdefd4bc02be80', pr_url=None, pr_revision=None, pr_num=None)