In [1]:
import torch 
import transformers
from transformers import GPTNeoXForCausalLM, GPTNeoXConfig, GPT2Tokenizer

from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm 
import os

In [2]:
# Define the training params
num_iters = 10
eval_interval = 5  
save_interval = 10 
checkpoint_dir = './checkpoints' 
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
    
model_dir='./configs/150m'
max_len=30#None #none for setting it by model config

# model setup

In [11]:
prev_model = GPTNeoXForCausalLM.from_pretrained("NinedayWang/PolyCoder-2.7B")
tokenizer = transformers.AutoTokenizer.from_pretrained("NinedayWang/PolyCoder-2.7B")

#tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
#config=prev_model.config
#config

In [5]:
#config.save_pretrained('./configs/2.7b')

In [13]:
tokenizer.save_pretrained('./configs/tokenizer')
#tokenizer
tokenizer=GPT2Tokenizer.from_pretrained('./configs/tokenizer')
tokenizer

GPT2Tokenizer(name_or_path='./configs/tokenizer', vocab_size=50257, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [36]:
tuple(model.gpt_neox.embed_in.parameters())[0].size(0)

50304

In [45]:
tokenizer.decode([1])

'<|padding|>'

In [47]:
inputs=torch.IntTensor([tokenizer('hey').input_ids+[1]])
attention_mask = (inputs != -100).type(inputs.type())
model(inputs,attention_mask=attention_mask)

CausalLMOutputWithPast(loss=None, logits=tensor([[[-0.0349, -0.0444, -0.4438,  ..., -0.6171, -0.7699, -1.2379],
         [ 0.1571,  0.3002, -1.2636,  ..., -0.1776, -0.4174, -1.4724],
         [ 0.1678, -0.1682, -0.8703,  ..., -0.2473, -0.4771, -1.4917]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-8.4947e-01, -5.5434e-01,  2.8327e-01,  ..., -3.0628e-01,
            6.6414e-03,  6.7636e-01],
          [-3.4268e-01,  2.2157e-01, -7.5297e-01,  ...,  8.6427e-01,
            6.6669e-01, -2.2103e-01],
          [-1.0553e-01, -7.2902e-01, -1.0978e+00,  ..., -1.9962e-01,
            1.9036e-01, -1.9859e-01]],

         [[-9.8604e-02, -2.0614e-01,  2.3647e-01,  ..., -4.8637e-01,
           -6.1660e-01, -4.9936e-01],
          [ 4.0238e-01, -1.2090e+00,  2.9234e-01,  ..., -1.2248e-02,
            3.0021e-01,  2.3248e-01],
          [-4.8322e-01,  2.7399e-01,  2.7650e-01,  ...,  1.3746e-01,
            7.0240e-02,  1.5610e-01]],

         [[ 8.7333e-01,  1.1480e+00,  5.9

In [7]:
tokenizer=GPT2Tokenizer.from_pretrained('./configs/tokenizer')
config=GPTNeoXConfig.from_pretrained(model_dir)
model=GPTNeoXForCausalLM(config)
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 768)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (embed_out): Linear(in_features=768, out_features=50304, bias=False)
)

In [9]:
tokenizer('heya',padding='max_length',max_length=4)

{'input_ids': [281, 18585, 50257, 50257], 'attention_mask': [1, 1, 0, 0]}

In [8]:
if max_len==None:
    max_len=config.max_position_embeddings

# data

In [9]:
import json
data_file='cpp000000000302.json'

In [10]:
data = []
errors=[]
faultys=[]
with open(data_file,'rb') as f:
    for i,line in enumerate(f):
        try:
          data.append(json.loads(line))
        except Exception as e:
          print(f'errored at {i}')
          errors.append(e)
          faultys.append(line)

# Now 'data' is a list of all the JSON objects in the file
print(f'data: {len(data)} errors: {len(errors)}')

data: 10108 errors: 0


In [11]:
codes=[d['content'] for d in data[0:100] if 'content' in d.keys()]

In [47]:
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer):
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        for text in tqdm(texts):
            encodings = tokenizer(text, truncation=True,
                                  #padding='max_length',
                                  max_length=max_len)
            #encodings=tokenizer(text)
            self.inputs.append(encodings['input_ids'])
            self.targets.append(encodings['input_ids'])

    def __getitem__(self, idx):
        item = {"input_ids": torch.IntTensor(self.inputs[idx]), 
                "labels": torch.IntTensor(self.targets[idx])}
        return item

    def __len__(self):
        return len(self.inputs)


In [48]:
# Tokenize your dataset and create a PyTorch Dataset
# Assuming `codes` is a list of strings
dataset = TextDataset(codes, tokenizer)

# Split dataset into training and test set
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_dataset

  0%|          | 0/100 [00:00<?, ?it/s]

<torch.utils.data.dataset.Subset at 0x7fcda0262320>

In [49]:
train_dataset[1]['input_ids'].shape

torch.Size([30])

In [50]:
tokenizer.pad_token_id

50257

In [51]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True)
    labels = pad_sequence([item['labels'] for item in batch], batch_first=True)
    return {"input_ids": input_ids, "labels": labels}

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)


In [52]:
for batch in train_loader:
    x=batch
    break
    
{k:v.shape for k,v in x.items()}

{'input_ids': torch.Size([16, 30]), 'labels': torch.Size([16, 30])}

# training

In [53]:
from transformers import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts




# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)
model=torch.nn.DataParallel(model, device_ids=['cpu'])

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00016, betas=(0.9, 0.999), eps=1.0e-8)


# Define the learning rate scheduler
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=num_iters, T_mult=1, eta_min=0, last_epoch=-1)

In [54]:
model.train()
# Training Loop
for epoch in range(1, num_iters+1):
    print(f'Epoch {epoch}/{num_iters}')
    
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop with tqdm
    train_loader_tqdm = tqdm(train_loader)
    for batch in train_loader_tqdm:
        # Zero the gradients
        optimizer.zero_grad()

        # Load data and labels
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device).to(torch.long)
        
        # Forward pass
        outputs = model(input_ids, labels=labels)
        
        # Get the loss from the outputs
        loss = outputs.loss
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        # Update the learning rate.
        scheduler.step()

        # Add the loss to the total loss
        total_loss += loss.cpu().detach().item()

        # Update the progress bar
        train_loader_tqdm.set_postfix({'running_loss': total_loss /  (train_loader_tqdm.n + 1)})

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_loader)
    print(f"Average training loss: {avg_train_loss}")

    # Evaluation
    # Evaluation
if epoch % eval_interval == 0:
    model.eval()
    eval_total_loss = 0

    # Adding tqdm to evaluation loop
    test_loader_tqdm = tqdm(test_loader, desc="Evaluating")
    for batch in test_loader_tqdm:
        with torch.no_grad():
            # Load data and labels
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device).to(torch.long)
            
            # Forward pass
            outputs = model(input_ids, labels=labels)
            
            # Get the loss from the outputs
            loss = outputs.loss

            # Add the loss to the total loss
            eval_total_loss += loss.cpu().detach().item()

            # Update the progress bar
            test_loader_tqdm.set_postfix({'eval_loss': eval_total_loss / (test_loader_tqdm.n + 1)})

    avg_eval_loss = eval_total_loss / len(test_loader)
    print(f"Average evaluation loss: {avg_eval_loss}")
    model.train()


    # Save a checkpoint
    if epoch % save_interval == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_train_loss,
        }, f'{checkpoint_dir}/checkpoint_{epoch}.pt')


Epoch 1/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 8.92725165685018
Epoch 2/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 7.505497694015503
Epoch 3/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 6.513497829437256
Epoch 4/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 5.762369712193807
Epoch 5/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 4.894678036371867
Epoch 6/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 4.205408215522766
Epoch 7/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 3.3666664759318032
Epoch 8/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 2.965507745742798
Epoch 9/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 2.5062397718429565
Epoch 10/10


  0%|          | 0/6 [00:00<?, ?it/s]

Average training loss: 2.0898282329241433


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Average evaluation loss: 4.286717891693115


In [55]:
# Load a checkpoint
checkpoint = torch.load(f'{checkpoint_dir}/checkpoint_{epoch}.pt')

# Load the checkpoint into your model and optimizer
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>