In [1]:
!pip install numpy torch tiktoken datasets tqdm

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [2]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import tiktoken
from datasets import load_dataset
import torch.nn as nn
from torch.nn import functional as F
# Notebook-wide configuration. These names are read as globals by the data
# helpers, the model classes and the training loops defined in later cells.
base_dir = ''  # prefix _tokenize() joins onto the *.bin file names; '' resolves to the working directory
batch_size = 48 # uses about 15.4 GB on a P100 (16 GB)
    # batch_size 12 -> about 4 GB
    # batch_size 32 -> about 10 GB
block_size = 256 # context length
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200  # number of random batches averaged per split in estimate_loss()
n_embd = 384  # embedding width
n_head = 6  # attention heads per block (head size is n_embd // n_head)
n_layer = 6  # number of transformer blocks
dropout = 0.2
# presumably the GPT-2 vocabulary (eot id 50256) padded up to a multiple of 64 -- TODO confirm
vocab_size=50304
torch.manual_seed(1337)
enc = tiktoken.get_encoding("gpt2")
# encode() lets the literal "<|endoftext|>" through as the special token; decode() maps ids back to text
encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
decode = lambda l: enc.decode(l)

In [3]:
dataset = load_dataset('wikitext', 'wikitext-103-raw-v1')

# Tokenize the dataset:
# define the encoding function (GPT-2 BPE via tiktoken) and
# write every split to a flat binary file of token ids.
def _tokenize(dataset, sft=False):
    def process(example):
        ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
        ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
        out = {'ids': ids, 'len': len(ids)}
        return out

    tokenized = dataset.map(
        process,
        remove_columns=['text'],
        desc="tokenizing the splits",
    )

    # concatenate ids in each dataset into one large file
    for split, dset in tokenized.items():
        arr_len = np.sum(dset['len'], dtype=np.uint64)
        if sft:
            filename = os.path.join(base_dir, f'{split}_sft.bin')
        else:
            filename = os.path.join(base_dir, f'{split}.bin')


        dtype = np.uint16
        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
        total_batches = 256

        idx = 0
        for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
            # Batch together samples for faster write
            batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
            arr_batch = np.concatenate(batch['ids'])
            # Write into mmap
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        arr.flush()
        
_tokenize(dataset)

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

tokenizing the splits:   0%|          | 0/4358 [00:00<?, ? examples/s]

tokenizing the splits:   0%|          | 0/1801350 [00:00<?, ? examples/s]

tokenizing the splits:   0%|          | 0/3760 [00:00<?, ? examples/s]

writing test.bin: 100%|██████████| 256/256 [00:00<00:00, 330.47it/s]
writing train.bin: 100%|██████████| 256/256 [00:21<00:00, 11.85it/s]
writing validation.bin: 100%|██████████| 256/256 [00:00<00:00, 361.09it/s]


In [4]:
def get_batch(split):
    """Sample a random batch of (input, target) token windows from one split.

    Args:
        split: one of 'train', 'val', 'train_sft' or 'val_sft'.

    Returns:
        A pair (x, y) of int64 tensors of shape (batch_size, block_size) on
        `device`; y is x shifted one token to the right.

    Raises:
        ValueError: if `split` is not one of the known split names.
    """
    split_files = {
        'train': 'train.bin',
        'val': 'validation.bin',
        'train_sft': 'train_sft.bin',
        'val_sft': 'test_sft.bin',
    }
    if split not in split_files:
        # Fail early with a clear message instead of printing and then
        # crashing on an unbound `data` variable further down.
        raise ValueError(f"unknown split {split!r}; expected one of {sorted(split_files)}")

    # np.memmap every batch avoids memory leak
    # The files are looked up under base_dir, the same place _tokenize writes them.
    data = np.memmap(os.path.join(base_dir, split_files[split]), dtype=np.uint16, mode='r')

    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    if device == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

In [5]:
@torch.no_grad()
def estimate_loss(splits):
    """Average the model loss over `eval_iters` random batches for each split.

    The global `model` is switched to eval mode for the measurement and put
    back into train mode before returning.

    Args:
        splits: iterable of split names accepted by get_batch().

    Returns:
        dict mapping each split name to a 0-d tensor holding its mean loss.
    """
    model.eval()
    averaged = {}
    for name in splits:
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(name)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        averaged[name] = per_batch.mean()
    model.train()
    return averaged

In [6]:
class Head(nn.Module):
    """A single causally-masked self-attention head.

    Projects the input to keys, queries and values of width `head_size`,
    applies scaled dot-product attention in which position t only sees
    positions <= t, and returns the weighted sum of the values.
    """

    def __init__(self, head_size):
        super().__init__()
        # The three bias-free projections are created in key, query, value
        # order so that seeded initialisation matches existing checkpoints.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones: entry (t, s) is 1 when t may attend to s.
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map (batch, time, channels) to (batch, time, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B,T,hs)
        queries = self.query(x)   # (B,T,hs)
        values = self.value(x)    # (B,T,hs)

        # attention scores, scaled by 1/sqrt(head_size): (B,T,hs) @ (B,hs,T) -> (B,T,T)
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale
        # hide future positions before normalising
        future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))  # (B,T,T)

        # weighted aggregation of the values: (B,T,T) @ (B,T,hs) -> (B,T,hs)
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side on the same input.

    The head outputs are concatenated along the channel axis, projected back
    to `n_embd` channels and passed through dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        # Positions inside the Sequential (0 and 2 for the linears) are part
        # of the checkpoint key names, so the layout is kept as is.
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """One transformer layer: self-attention, then the feed-forward MLP.

    Both sub-layers see a layer-normalised input and are added back onto the
    residual stream.
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: number of attention heads;
        # each head gets n_embd // n_head channels
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    `n_layer` transformer blocks and a final layer norm, and projected to
    `vocab_size` logits. The sizes come from the notebook-level configuration
    globals (`vocab_size`, `n_embd`, `block_size`, `n_head`, `n_layer`).
    """

    def __init__(self):
        super().__init__()
        # token ids and positions 0..block_size-1 are each looked up in their own embedding table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position ids on the input's own device instead of the
        # notebook-level `device` global, so the model also works when it has
        # been moved somewhere else.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        Sampling runs in eval mode (dropout off) and without autograd; the
        module's previous train/eval mode is restored before returning.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop idx to the last block_size tokens (a no-op for shorter contexts)
                idx_cond = idx[:, -block_size:]
                # get the predictions
                logits, _ = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :] # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1) # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx

In [7]:
def train(iters, eval_interval):
    """Run `iters` optimisation steps on the 'train' split.

    Every `eval_interval` steps, and on the last step, the averaged train and
    validation losses are printed before that step's update is applied.

    Args:
        iters: number of optimiser steps to run.
        eval_interval: number of steps between loss reports.
    """
    for step in range(iters):
        report_now = step % eval_interval == 0 or step == iters - 1
        if report_now:
            # averaged loss on train and val
            losses = estimate_loss(splits=['train', 'val'])
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        inputs, targets = get_batch('train')

        # forward pass, then one optimiser update
        _, loss = model(inputs, targets)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

In [8]:
def generate(max_new, user_prompt='Write a comprehensive blog post of at least 1000 words about the top 10 most eco-friendly cities in the world and their renewable energy initiatives.', chat_template=False):
    """Sample a continuation of a chat-formatted prompt and print it.

    The prompt is an empty system turn, the user turn and an open assistant
    turn, each closed by '<|endoftext|>'.

    Args:
        max_new: number of new tokens to sample.
        user_prompt: text placed in the user turn.
        chat_template: when true, print only the first assistant turn;
            otherwise print the repr of the whole decoded sequence.
    """
    prompt=f"<|system|>\n<|endoftext|>\n<|user|>\n{user_prompt}<|endoftext|>\n<|assistant|>\n"

    prompt_ids = encode(prompt)
    context = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None, ...]
    decoded = decode(model.generate(context, max_new_tokens=max_new)[0].tolist())

    if chat_template:
        def _drop_tag(text, tag):
            # Remove `tag` only as an exact leading prefix. str.strip(tag)
            # would treat the tag as a set of characters and also eat
            # matching letters from the end of the reply.
            return text[len(tag):] if text.startswith(tag) else text

        # messages[0] is the system turn, [1] the user turn, [2] the first assistant turn
        messages = decoded.split('<|endoftext|>\n')
        assistant = _drop_tag(messages[2], "<|assistant|>")

        print(assistant)
    else:
        print("\n\n\n\ndebugging\n", repr(decoded))

In [9]:
def save_model():
    """Write the global model's state dict to 'model_state_dict.pth' in the working directory."""
    weights = model.state_dict()
    torch.save(weights, 'model_state_dict.pth')

In [10]:
model = GPTLanguageModel()
model = model.to(device)

# Resume from the saved weights when a checkpoint is present; on a fresh run
# (no file yet) keep the random initialisation instead of crashing the cell.
checkpoint_path = 'model_state_dict.pth'
if os.path.exists(checkpoint_path):
    # map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
    print(f"checkpoint {checkpoint_path!r} not found; starting from freshly initialised weights")

## or, import to kaggle, load from kaggle models dir
#cp ../input/llm/pytorch/llm/1/model_state_dict.pth model_state_dict.pth

print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

49.42272 M parameters


In [22]:
generate(50)


 Calculator HPV cons All Seconds relegated payoff created we be for so his vividly in the and viewed , matched<|endoftext|> fansag secondthritis . E . qualifier winner embr East messenger also . hadout than but Silver below D. hall vir closed daughter been@ Williams


In [27]:
train(40,10)

step 0: train loss 7.2426, val loss 7.2136
step 10: train loss 7.0737, val loss 7.0456
step 20: train loss 6.9688, val loss 6.9339
step 30: train loss 6.8742, val loss 6.8345
step 39: train loss 6.7833, val loss 6.7515


In [28]:
generate(50)


<|endoftext|> has them agent ,ian was barrels @-@ twoy and ay , War of receive by repetitive to the road. ' hold Important . 
<|endoftext|> = Eng team , 2s case pay as English band 21osaurus that which the Og


In [14]:
train(200, 10)

step 0: train loss 6.7688, val loss 6.7408
step 10: train loss 6.7276, val loss 6.6897
step 20: train loss 6.6497, val loss 6.6190
step 30: train loss 6.5806, val loss 6.5419
step 39: train loss 6.5332, val loss 6.4964
step 40: train loss 6.5284, val loss 6.4970
step 50: train loss 6.4697, val loss 6.4426
step 60: train loss 6.4207, val loss 6.3892
step 70: train loss 6.3867, val loss 6.3462
step 80: train loss 6.3448, val loss 6.3074
step 90: train loss 6.3100, val loss 6.2733
step 100: train loss 6.2688, val loss 6.2361
step 110: train loss 6.2324, val loss 6.2042
step 120: train loss 6.2097, val loss 6.1655
step 130: train loss 6.1757, val loss 6.1365
step 140: train loss 6.1388, val loss 6.1200
step 150: train loss 6.1259, val loss 6.0944
step 160: train loss 6.0965, val loss 6.0610
step 170: train loss 6.0717, val loss 6.0383
step 180: train loss 6.0471, val loss 6.0235
step 190: train loss 6.0228, val loss 5.9971


In [10]:
generate(50)


<|endoftext|><|endoftext|> = = Release cop payoff created in the destroyer harows , in the secondary studies , with a eastern enshrulation of costs ) . He winner was formed from the 1947 had an than coming on her ditch and for viritudes of thereg Williams


In [9]:
train(401, 40)

step 0: train loss 6.0047, val loss 5.9808
step 40: train loss 5.9719, val loss 5.9470
step 80: train loss 5.9024, val loss 5.8678
step 120: train loss 5.8569, val loss 5.8164
step 160: train loss 5.7957, val loss 5.7671
step 200: train loss 5.7344, val loss 5.7188
step 240: train loss 5.6947, val loss 5.6601
step 280: train loss 5.6538, val loss 5.6171
step 320: train loss 5.6059, val loss 5.5790
step 360: train loss 5.5522, val loss 5.5319
step 400: train loss 5.5144, val loss 5.4841


In [12]:
train(1001, 100)

step 0: train loss 5.5173, val loss 5.4824
step 100: train loss 5.4261, val loss 5.3862
step 200: train loss 5.3426, val loss 5.2972
step 300: train loss 5.2418, val loss 5.2157
step 400: train loss 5.1732, val loss 5.1393
step 500: train loss 5.1061, val loss 5.0724
step 600: train loss 5.0446, val loss 5.0092
step 700: train loss 4.9920, val loss 4.9525
step 800: train loss 4.9460, val loss 4.8913
step 900: train loss 4.8946, val loss 4.8504
step 1000: train loss 4.8488, val loss 4.8100


In [14]:
generate(100)


<|endoftext|><|endoftext|> = = Scientific outlets = = 
<|endoftext|><|endoftext|> The association of New Mexico = = 
<|endoftext|><|endoftext|> Spiceニ Chemomeras of Shetta varieties have Ont qualify for most unknown Parkinson 'm Particularly alterations to family members in the Danish virtues of their living holidays , including performing in university Persian and poli , written in captivity on the 1980s . In 1950 of recent years , hundreds is required to meet where they are Queen Canadian American and sports . It appears to have been abundant modelfilm


In [15]:
train(2001, 200)

step 0: train loss 4.8504, val loss 4.8109
step 200: train loss 4.7605, val loss 4.7189
step 400: train loss 4.6924, val loss 4.6513
step 600: train loss 4.6252, val loss 4.5869
step 800: train loss 4.5701, val loss 4.5429
step 1000: train loss 4.5191, val loss 4.4950
step 1200: train loss 4.4668, val loss 4.4604
step 1400: train loss 4.4277, val loss 4.4138
step 1600: train loss 4.3929, val loss 4.3737
step 1800: train loss 4.3553, val loss 4.3509
step 2000: train loss 4.3191, val loss 4.3259


In [18]:
train(5001, 500)

step 0: train loss 4.3213, val loss 4.3188
step 500: train loss 4.2568, val loss 4.2489
step 1000: train loss 4.1952, val loss 4.1928
step 1500: train loss 4.1385, val loss 4.1267
step 2000: train loss 4.1005, val loss 4.0926
step 2500: train loss 4.0464, val loss 4.0406
step 3000: train loss 4.0084, val loss 4.0092
step 3500: train loss 3.9651, val loss 3.9822
step 4000: train loss 3.9328, val loss 3.9411
step 4500: train loss 3.8859, val loss 3.9271
step 5000: train loss 3.8747, val loss 3.8898


In [21]:
train(5001, 500)

step 0: train loss 3.8604, val loss 3.8885
step 500: train loss 3.8475, val loss 3.8731
step 1000: train loss 3.8315, val loss 3.8464
step 1500: train loss 3.8131, val loss 3.8321
step 2000: train loss 3.7791, val loss 3.8089
step 2500: train loss 3.7660, val loss 3.7893
step 3000: train loss 3.7425, val loss 3.7786
step 3500: train loss 3.7329, val loss 3.7655
step 4000: train loss 3.7228, val loss 3.7470
step 4500: train loss 3.7041, val loss 3.7285
step 5000: train loss 3.6993, val loss 3.7184


In [24]:
train(10001, 1000)

step 0: train loss 3.7006, val loss 3.7250
step 1000: train loss 3.6765, val loss 3.7028
step 2000: train loss 3.6556, val loss 3.6777
step 3000: train loss 3.6234, val loss 3.6730
step 4000: train loss 3.5968, val loss 3.6447
step 5000: train loss 3.5926, val loss 3.6306
step 6000: train loss 3.5811, val loss 3.6300
step 7000: train loss 3.5701, val loss 3.6117
step 8000: train loss 3.5433, val loss 3.6027
step 9000: train loss 3.5315, val loss 3.5925
step 10000: train loss 3.5249, val loss 3.5813


In [9]:
generate(100)


<|endoftext|><|endoftext|> = = Release = = 
<|endoftext|><|endoftext|> The song was released for purchase on November 3 , 2008 as the second single from Who Homogenic : Reloaded . The song had previously been featured on a two @-@ track EP entitled Who Homogenic : Sony / AT , surrounded by the Rolling Stone and Little A & R executive man . It served as " guest guest single " , as they actually written a few beats ari , becoming the second and third singles in the UK . In the


In [None]:
train(30001, 3000)

step 0: train loss 3.5193, val loss 3.5781
step 3000: train loss 3.5075, val loss 3.5679
step 6000: train loss 3.4657, val loss 3.5264


In [None]:
train(30001, 3000)

step 0: train loss 3.5193, val loss 3.5781
step 3000: train loss 3.5075, val loss 3.5679
step 6000: train loss 3.4657, val loss 3.5264
step 9000: train loss 3.4574, val loss 3.5214
step 12000: train loss 3.4281, val loss 3.5054
step 15000: train loss 3.4130, val loss 3.4819


In [17]:
train(3001, 3000)

step 0: train loss 3.3350, val loss 3.4293
step 3000: train loss 3.3449, val loss 3.4379


In [17]:
generate(300)

<|system|>
<|endoftext|><|user|>
hello Jan<|endoftext|>
[nergy And Workán: How does you handle a lot of speeds of costs and conditions in these industries. Here are some specific examples by my mentors includes

1. Collaborate: We've stumbled upon a comprehensive framework of everything that you want, and what works well with?" I want to experiment with patience.<|endoftext|>
<|assistant|>
NoKE is responsible for Unleashing Industry
 Including personalized design, sake. We can build the base of the clinics to meet technical challenges and cooperation intervals. The development and development of HIStory as a way of coordinating where you help you to meet your needs and resources can vary among other contexts, nature, and cultural partners. Symposium [DesignATE]. I've not looked to be a beginner group and research my mentors you would started with you to get to the organization to start my project, action application protocols. Side one, co-lining multiple campaigns now” according, and 

In [22]:
train(3001, 3000)

step 0: train loss 3.3268, val loss 3.4324
step 3000: train loss 3.3215, val loss 3.4267


In [12]:
train(3001, 1000)

step 0: train loss 3.3191, val loss 3.4154
step 1000: train loss 3.2787, val loss 3.4207
step 2000: train loss 3.2694, val loss 3.4165
step 3000: train loss 3.2716, val loss 3.4169


In [38]:
generate(300)



 = = Rules and considerations = = 


 The legality of change has not been given some degree. Due to noted flaws in pre-existing laws, some cases do not require material exclusive to her. This classification is not used by the Supreme Court into a series of books. However, as the decision progressed, these documents have were given no special privileges and would thus adopt certain systems. 

 Novello 's " Folding " hypothesis has been extensively supported by its approach purposes and is contested by AI researchers. In addition, it has been suggested that failing to find a clean basis for inheritance conducted by newly established AI would reduce ; failing to obtain normal resources by generation of individuals, especially the liable for the hard work of the labour force and the working group found them the engineers. In contrast, the inclusion of senior AI researchers places the normal finding point of AI as a priority meant that, unlike older AI researchers, the purpose was to indi

In [31]:
train(3001, 1000)

step 0: train loss 3.3229, val loss 3.4246
step 1000: train loss 3.2552, val loss 3.4304
step 2000: train loss 3.2542, val loss 3.4246
step 3000: train loss 3.2616, val loss 3.4284


In [32]:
train(6001, 2000)

step 0: train loss 3.2606, val loss 3.4322
step 2000: train loss 3.2680, val loss 3.4210
step 4000: train loss 3.2939, val loss 3.4042
step 6000: train loss 3.3060, val loss 3.3977


In [33]:
train(6001, 2000)

step 0: train loss 3.2994, val loss 3.3946
step 2000: train loss 3.2939, val loss 3.4050
step 4000: train loss 3.2970, val loss 3.3895
step 6000: train loss 3.2774, val loss 3.3883


In [None]:
# SFT: supervised fine-tuning on the UltraChat 200k chat data

In [11]:
from transformers import AutoTokenizer

# This Hugging Face tokenizer is used below to render conversations into text
# via its chat template (apply_chat_template is called with tokenize=False);
# the token ids themselves are produced by the tiktoken encoder in _tokenize().
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Reuse the end-of-text token for padding; 256 matches block_size.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = 256

# Jinja template: every message becomes '<|role|>\n' + content + eos_token for
# the user, system and assistant roles; when add_generation_prompt is set, an
# open '<|assistant|>' tag is appended after the last message.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
sft_datasets = load_dataset("HuggingFaceH4/ultrachat_200k")
sft_datasets

Downloading readme:   0%|          | 0.00/4.44k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/28304 [00:00<?, ? examples/s]

DatasetDict({
    train_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 207865
    })
    test_sft: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 23110
    })
    train_gen: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 256032
    })
    test_gen: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 28304
    })
})

In [13]:
from datasets import DatasetDict

# Keep only the two SFT splits and rename them to 'train' / 'test', so that
# _tokenize(..., sft=True) writes train_sft.bin and test_sft.bin.
dataset_dict = {"train": sft_datasets["train_sft"],
                "test": sft_datasets["test_sft"]}

sft_datasets_split = DatasetDict(dataset_dict)
sft_datasets_split

DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 207865
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 23110
    })
})

In [14]:
import re
import random

def apply_chat_template(example, tokenizer):
    """Render one conversation to a single string using the tokenizer's chat template.

    An empty system message is put in front when the conversation does not
    already start with one (this also covers an empty conversation).

    Args:
        example: mapping with a 'messages' list of {'role', 'content'} dicts.
        tokenizer: object exposing apply_chat_template(messages, tokenize=False).

    Returns:
        The same `example`, with the rendered string stored under 'text'.
        The original 'messages' list is not modified.
    """
    # Work on a shallow copy so the caller's message list is left untouched.
    messages = list(example["messages"])
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

# Render every conversation to text. All original columns are dropped, so each
# resulting split only has the 'text' column that _tokenize() reads.
column_names = list(sft_datasets_split["train"].features)
sft_datasets_split_chat_template =sft_datasets_split.map(apply_chat_template,
                                fn_kwargs={"tokenizer": tokenizer},
                                remove_columns=column_names,
                                desc="Applying chat template",)

Applying chat template:   0%|          | 0/207865 [00:00<?, ? examples/s]

Applying chat template:   0%|          | 0/23110 [00:00<?, ? examples/s]

In [15]:
_tokenize(sft_datasets_split_chat_template, sft=True)

tokenizing the splits:   0%|          | 0/207865 [00:00<?, ? examples/s]

tokenizing the splits:   0%|          | 0/23110 [00:00<?, ? examples/s]

writing train_sft.bin: 100%|██████████| 256/256 [00:04<00:00, 54.97it/s]
writing test_sft.bin: 100%|██████████| 256/256 [00:01<00:00, 223.28it/s]


In [16]:
@torch.no_grad()
def estimate_loss_sft():
    """Average the model loss on the SFT train and validation splits.

    Delegates to estimate_loss(), which runs the same evaluation loop for any
    list of split names, instead of keeping a second copy of that loop.

    Returns:
        dict with keys 'train_sft' and 'val_sft', each mapped to a 0-d tensor
        holding the mean loss over `eval_iters` batches.
    """
    return estimate_loss(['train_sft', 'val_sft'])

In [17]:
def train_sft(iters, eval_interval):
    """Run `iters` optimisation steps on the 'train_sft' split.

    Every `eval_interval` steps, and on the last step, the averaged SFT train
    and validation losses are printed before that step's update is applied.

    Args:
        iters: number of optimiser steps to run.
        eval_interval: number of steps between loss reports.
    """
    for step in range(iters):
        report_now = step % eval_interval == 0 or step == iters - 1
        if report_now:
            # averaged loss on the SFT train and val splits
            losses = estimate_loss_sft()
            print(f"step {step}: train loss {losses['train_sft']:.4f}, val loss {losses['val_sft']:.4f}")

        inputs, targets = get_batch('train_sft')

        # forward pass, then one optimiser update
        _, loss = model(inputs, targets)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

In [77]:
train_sft(300, 100)

step 100: train loss 3.6887, val loss 3.7085
step 200: train loss 3.5232, val loss 3.5200
step 299: train loss 3.4085, val loss 3.4129


In [37]:
train_sft(300, 100)

step 0: train loss 3.4175, val loss 3.4386
step 100: train loss 3.3510, val loss 3.3742
step 200: train loss 3.3064, val loss 3.3071
step 299: train loss 3.2555, val loss 3.2635


In [39]:
train_sft(6000, 1000)

step 0: train loss 3.2581, val loss 3.2730
step 1000: train loss 3.0120, val loss 3.0260
step 2000: train loss 2.9026, val loss 2.9350
step 3000: train loss 2.8292, val loss 2.8625
step 4000: train loss 2.7787, val loss 2.8059
step 5000: train loss 2.7437, val loss 2.7750
step 5999: train loss 2.7038, val loss 2.7505


In [47]:
train_sft(6000, 1000)

step 0: train loss 2.7078, val loss 2.7455
step 1000: train loss 2.6950, val loss 2.7341
step 2000: train loss 2.6709, val loss 2.7212
step 3000: train loss 2.6247, val loss 2.6900
step 4000: train loss 2.6019, val loss 2.6751
step 5000: train loss 2.5957, val loss 2.6610
step 5999: train loss 2.5628, val loss 2.6468


In [93]:
train_sft(12000, 2000)

step 0: train loss 2.5861, val loss 2.6568
step 2000: train loss 2.5430, val loss 2.6298
step 4000: train loss 2.5724, val loss 2.6216
step 6000: train loss 2.5560, val loss 2.6032
step 8000: train loss 2.5307, val loss 2.5827
step 10000: train loss 2.5076, val loss 2.5574
step 11999: train loss 2.5006, val loss 2.5719


In [210]:
train_sft(6000, 1000)

step 0: train loss 2.4938, val loss 2.5556
step 1000: train loss 2.5002, val loss 2.5623
step 2000: train loss 2.4909, val loss 2.5637
step 3000: train loss 2.4754, val loss 2.5444
step 4000: train loss 2.4698, val loss 2.5463
step 5000: train loss 2.4739, val loss 2.5401
step 5999: train loss 2.4515, val loss 2.5342


In [31]:
train_sft(6000, 1000)

step 0: train loss 2.4679, val loss 2.5351
step 1000: train loss 2.4776, val loss 2.5460
step 2000: train loss 2.4722, val loss 2.5502
step 3000: train loss 2.4588, val loss 2.5342
step 4000: train loss 2.4548, val loss 2.5346
step 5000: train loss 2.4625, val loss 2.5330
step 5999: train loss 2.4387, val loss 2.5248


In [27]:
train_sft(3000, 1000)
save_model() 

step 0: train loss 2.4455, val loss 2.5953
step 1000: train loss 2.4013, val loss 2.5249
step 2000: train loss 2.3865, val loss 2.5351
step 2999: train loss 2.4326, val loss 2.5251


In [25]:
train_sft(3000, 500)
save_model() 

step 0: train loss 2.4549, val loss 2.5201
step 500: train loss 2.4551, val loss 2.5329
step 1000: train loss 2.4476, val loss 2.5122
step 1500: train loss 2.4499, val loss 2.5074
step 2000: train loss 2.4429, val loss 2.5076
step 2500: train loss 2.4372, val loss 2.5180
step 2999: train loss 2.4309, val loss 2.5189


In [27]:
train_sft(3000, 1000)
save_model() 

step 0: train loss 2.4384, val loss 2.5026
step 1000: train loss 2.3664, val loss 2.5247
step 2000: train loss 2.4311, val loss 2.5013
step 2999: train loss 2.3775, val loss 2.5006


In [28]:
train_sft(3000, 1000)
save_model() 

step 0: train loss 2.4056, val loss 2.5406
step 1000: train loss 2.3866, val loss 2.4915
step 2000: train loss 2.3629, val loss 2.5062
step 2999: train loss 2.4023, val loss 2.4992


In [35]:
train_sft(16000, 3000)
save_model() 

step 0: train loss 2.4024, val loss 2.5084
step 3000: train loss 2.3941, val loss 2.4819
step 6000: train loss 2.4046, val loss 2.4790
step 9000: train loss 2.3782, val loss 2.4671
step 12000: train loss 2.3592, val loss 2.4554
step 15000: train loss 2.3769, val loss 2.4657
step 15999: train loss 2.3654, val loss 2.4427


In [22]:
train_sft(5000, 1000)
save_model() 

step 1000: train loss 2.3655, val loss 2.4621
step 2000: train loss 2.3637, val loss 2.4672
step 3000: train loss 2.3622, val loss 2.4544
step 4000: train loss 2.3309, val loss 2.4499
step 4999: train loss 2.3446, val loss 2.4540


In [18]:
generate(1024, user_prompt='Write a comprehensive blog post of at least 1000 words about the top 10 most eco-friendly cities in the world and their renewable energy initiatives.', chat_template=True)


Comprehensively reviewing and promoting the National Renewable, Renewable Energy System (NREL) development programs for federal, state, and local communities can take a competitive approach, take customer satisfaction surveys, and demonstrate the duty of investments in the Flexible, Wind, Public Transit Industry.

In this post, we will explore various sustainable cities and their renewable energy initiatives, provide recommendations for sustainable development, and address the need for fuel, public transport, subsidies, Electroc, and other incentives to encourage and support the program. We will design and implement policies and practices that promote renewable energy for major waterways and address common challenges.

Benefits: Advertising in the Green Business Neighborhood Service include:

1. Cost Savings
2. Environmental Cost
3. Reducing greenhouse gas emissions and energy consumption
4. Energy Use Extensive Maintenance
5. Boosts an efficient utilization of carbon emissions
   D2 