In [1]:
import os
import time
os.environ['PYTORCH_ENABLE_MPS_FALLBACK']='1'

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


In [4]:
root = '/Users/pals/MICS/MICS_207/Project/datasets'
wikitext2_root = root + '/kaggle-wikitext/wikitext-2/'
train_file = wikitext2_root + 'wiki.train.tokens'
test_file  = wikitext2_root + 'wiki.test.tokens'
valid_file = wikitext2_root + 'wiki.valid.tokens'
unittest_file = wikitext2_root + 'unittest.tokens'

In [5]:
BATCH_SIZE = 4
NUM_EPOCHS = 1
SEQUENCE_LENGTH = 512
SHUFFLE_SIZE = 128
#BLOCK_SIZE = 512

In [6]:
class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=SEQUENCE_LENGTH):
        with open(file_path, 'r', encoding='utf-8') as fd:
            self.tokens = []
            self.attention_masks = [] # Attention masks
            for line in fd:
                sline = line.strip()
                if len(sline) > 0:
                    tokens = tokenizer.encode(sline, truncation=True, max_length=max_length, padding='max_length')
                    self.tokens.append(torch.tensor(tokens, dtype=torch.long))
                    self.attention_masks.append(torch.tensor([1] * len(tokens), dtype=torch.long))

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, i):
        return self.tokens[i], self.attention_masks[i]


In [7]:
def load_dataset(file_path, tokenizer, shuffle=False, max_length=SEQUENCE_LENGTH, batch_size=BATCH_SIZE):
    dataset = TextDataset(file_path, tokenizer, max_length=max_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [8]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
#gpt2_tokenizer.padding_side = 'left'


In [9]:
train_dataloader    = load_dataset(train_file, gpt2_tokenizer, shuffle=True)
test_dataloader     = load_dataset(test_file, gpt2_tokenizer)
valid_dataloader    = load_dataset(valid_file, gpt2_tokenizer)
unittest_dataloader = load_dataset(unittest_file, gpt2_tokenizer)

In [10]:
for ele in unittest_dataloader:
    print(ele)

[tensor([[15496,  2159,    13,  ..., 50256, 50256, 50256],
        [49488,   318,  2901,  ..., 50256, 50256, 50256],
        [34784,  1365,  1110,  ..., 50256, 50256, 50256]]), tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])]


In [11]:
model_name = 'gpt2'
gpt2_lm = GPT2LMHeadModel.from_pretrained(model_name)
gpt2_lm.resize_token_embeddings(len(gpt2_tokenizer)) 

Embedding(50257, 768)

In [12]:
#gpt2_lm.summary()

In [13]:
optimizer = torch.optim.AdamW(gpt2_lm.parameters(), lr=5e-5)

In [14]:
device = None
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
gpt2_lm.to(device)

Using device: mps


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [15]:
def generate(input_text, max_length=256):
    #device = torch.device("cpu")
    gpt2_lm.to(device)
    gpt2_lm.eval()
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt').to(device)
    attention_mask = torch.tensor([1] * len(input_ids[0]), dtype=torch.long).unsqueeze(0).to(device)
   
    with torch.no_grad():
        output = gpt2_lm.generate(input_ids, attention_mask=attention_mask, max_length=max_length, 
                                  pad_token_id=gpt2_tokenizer.eos_token_id, do_sample=True,
                                  num_return_sequences=5,
                                  no_repeat_ngram_size=2,
                                  temperature=0.7, 
                                  top_k=50, top_p=0.95)
    gen_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    return gen_text

In [16]:
print(generate('I went on a trip to see Tajmahal in Agra. My trip was'))

  if eos_token_id is not None and torch.isin(elements=eos_token_id, test_elements=pad_token_id).any():


I went on a trip to see Tajmahal in Agra. My trip was very successful and it was a very rewarding experience. I am very happy to be here.

J.P.: How did you decide to travel to India, and how did your parents decide that you wanted to go to the country?
 (laughs)
...
. (laughter) And I think I had a lot of fun in India. It was amazing. And there are many of them. We are all very lucky and privileged to live in a country where people are very keen to come to this country. So I decided to get a chance to visit India and I did. But I was also very surprised by the people and by what was happening there. Many of the stories I heard in the newspapers about people who came to these places were very interesting, but they were not true. There were lots of people. Some of these people were from India who were living in Pakistan, some from Pakistan. One of my friends told me that there were thousands of Indian immigrants to Delhi, who had been living here for a long time and they had never even 

In [17]:
# Training function
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for inputs, attention_mask in tqdm(dataloader, desc="Training gpt2_lm"):
        optimizer.zero_grad()
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)


In [18]:
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, attention_mask in tqdm(dataloader, desc="Evaluating gpt2_lm"):
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(inputs, attention_mask=attention_mask, labels=inputs)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [19]:
# Training loop
print(f"Using device: {device}")
st = time.time()
epochs = NUM_EPOCHS
train_dl = train_dataloader
valid_dl = valid_dataloader
for epoch in range(epochs):
    train_loss = train(gpt2_lm, train_dl, optimizer, device)
    valid_loss = evaluate(gpt2_lm, valid_dl, device)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss}, Validation Loss: {valid_loss}")

en = time.time()

save_path = './gpt2_finetuned_pt_v1'
gpt2_lm.save_pretrained(save_path)
gpt2_tokenizer.save_pretrained(save_path)


Using device: mps


Training gpt2_lm: 100%|██████████████████████████████████| 5942/5942 [1:21:10<00:00,  1.22it/s]
Evaluating gpt2_lm: 100%|████████████████████████████████████| 616/616 [02:26<00:00,  4.19it/s]


Epoch 1, Train Loss: 0.6345567315467673, Validation Loss: 0.5842087776400149


('./gpt2_finetuned_pt_v1/tokenizer_config.json',
 './gpt2_finetuned_pt_v1/special_tokens_map.json',
 './gpt2_finetuned_pt_v1/vocab.json',
 './gpt2_finetuned_pt_v1/merges.txt',
 './gpt2_finetuned_pt_v1/added_tokens.json')

In [20]:
print(f'Training time {(en-st)/3600} hours')

Training time 1.3937067453066507 hours


In [21]:
print(generate('I went on a trip to see Tajmahal in Agra. My trip was'))

I went on a trip to see Tajmahal in Agra. My trip was an amazing experience and I was in the right place at the wrong time. I saw a lot of things, from the Taj Mahal to the <unk> in Kolkata. There were many things that I didn = t see in a few months, but I remember I had an idea that would work. The idea of a tourist attraction in our city was born out of curiosity and curiosity. It was very important to me that there was a place to visit.


In [None]:
!jupyter kernelspec list

In [None]:
!jupyter kernelspec uninstall gpt2_pytorch


In [None]:
!jupyter kernelspec uninstall gpt2_pytorch


In [24]:
y

NameError: name 'y' is not defined