In [1]:
import warnings
from transformers import TextStreamer
from transformers import AutoTokenizer, AutoModelForCausalLM
import datasets

warnings.filterwarnings('ignore')

In [2]:
import torch
def fix_torch_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  

fix_torch_seed()

# Loading a general pretrained model

In [3]:
model_name='upstage/TinySolar-248m-4k'

In [4]:
tiny_general_model=AutoModelForCausalLM.from_pretrained(
                      model_name,
                      device_map='cpu',
                      torch_dtype=torch.bfloat16
                      )                                    

In [5]:
tiny_general_tokenizer=AutoTokenizer.from_pretrained(model_name)

# Generate text samples

In [6]:
prompt="I am an engineer. I love"

In [7]:
inputs=tiny_general_tokenizer(prompt, return_tensors="pt")


In [8]:
streamer=TextStreamer(tiny_general_tokenizer,skip_prompt=True, skip_special_tokens=True)

In [9]:
output=tiny_general_model.generate(**inputs, streamer=streamer, max_new_tokens=128,do_sample=False,temperature=0.0,repetition_penalty=1.1)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


to travel and have a great time, but I'm not sure if I can do it all again.
I've been working on my first book for the last 10 years, and I've always wanted to write about something that has happened in my life. It's been a long journey, but I've finally found my voice. I'm so excited to share this story with you!
I'm a huge fan of the "Sweet Home Alabama" series, and I've read some of their books before. I've also enjoyed reading them as well. I've read


In [10]:
prompt="def find_max(arr):"

In [11]:
inputs=tiny_general_tokenizer(prompt, return_tensors="pt").to(tiny_general_model.device)

streamer=TextStreamer(tiny_general_tokenizer,skip_prompt=True, skip_special_tokens=True)

In [12]:
outputs=tiny_general_model.generate(**inputs, streamer=streamer, max_new_tokens=128,do_sample=False,temperature=0.0,repetition_penalty=1.1)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



   """
   Returns the number of rows in a row.
   """
   if not arr:
       return 1
   else:
       return 0


def get_rows(row):
   """
   Returns the number of rows in a row.
   """
   if len(row) == 2:
       return 1
   else:
       return 0


def get_rows_from_table(table, table_name):
   """
   Returns the number of rows in a table.
   """
   if len


# Generate Python samples with finetuned Python model

In [13]:
model_name='upstage/TinySolar-248m-4k-code-instruct'

In [14]:
tiny_finetuned_model=AutoModelForCausalLM.from_pretrained(
                      model_name,
                      device_map='cpu',
                      torch_dtype=torch.bfloat16
                      )
tiny_finetuned_tokenizer=AutoTokenizer.from_pretrained(model_name)

In [15]:
prompt="def find_max(arr):"

inputs=tiny_finetuned_tokenizer(prompt, return_tensors="pt").to(tiny_finetuned_model.device)

streamer=TextStreamer(tiny_finetuned_tokenizer,skip_prompt=True, skip_special_tokens=True)

outputs=tiny_finetuned_model.generate(**inputs, streamer=streamer, max_new_tokens=128,do_sample=False,temperature=0.0,repetition_penalty=1.1)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



   max_len = arr[0]
   max_len_index = 0
   
   for i in range(1, len(arr)):
       if arr[i] > max_len:
           max_len_index = i
           
   return max_len

# Test the function
arr = [5, 2, 9, 8, 3, 7, 6, 4, 2, 1, 8, 1, 6, 4, 2, 1, 6, 4, 


# Generate Python samples with pretrained Python model


In [16]:
model_name='upstage/TinySolar-248m-4k-py'

In [17]:
tiny_general_model=AutoModelForCausalLM.from_pretrained(
                      model_name,
                      device_map='cpu',
                      torch_dtype=torch.bfloat16
                      )

In [18]:
tiny_general_tokenizer=AutoTokenizer.from_pretrained(model_name)

In [19]:
prompt="def find_max(numbers):"

inputs=tiny_general_tokenizer(prompt, return_tensors="pt").to(tiny_general_model.device)

streamer=TextStreamer(tiny_general_tokenizer,skip_prompt=True, skip_special_tokens=True)

outputs=tiny_general_model.generate(**inputs, streamer=streamer, max_new_tokens=128,do_sample=False,temperature=0.0,repetition_penalty=1.1)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



   """Find the maximum number of numbers in a list."""
   max = 0
   for num in numbers:
       if num > max:
           max = num
   return max


def get_min_max(numbers, min_value=1):
   """Get the minimum value of a list."""
   min_value = min_value or 1
   for num in numbers:
       if num < min_value:
           min_value = num
   return min_value



In [20]:
import datasets
pretraining_dataset = datasets.load_dataset(
    "upstage/Pretraining_Dataset",
    split="train"
)

In [21]:
print(pretraining_dataset)

Dataset({
    features: ['text', 'meta'],
    num_rows: 60000
})


In [22]:
pretraining_dataset=pretraining_dataset.select_columns(['text'])

In [23]:
print(pretraining_dataset[1]['text'][:500])

Canada Pension Plan
Find sources: "Canada Pension Plan" – news · newspapers · books · scholar · JSTOR (March 2013) (Learn how and when to remove this template message)
The Canada Pension Plan (CPP; French: Régime de pensions du Canada) is a contributory, earnings-related social insurance program. It forms one of the two major components of Canada's public retirement income system, the other component being Old Age Security (OAS). Other parts of Canada's retirement system are private pensions, ei


# Instruction based dataset generated by chat gpt-4o

In [24]:
instruction_dataset=datasets.load_dataset(
    'c-s-ale/alpaca-gpt4-data',
    split='train'
)
print(instruction_dataset)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})


In [25]:
i=0

print('Instruction: '+instruction_dataset[i]['instruction']
     +'\nInput: '+instruction_dataset[i]['input']
      +'\nOutput: '+instruction_dataset[i]['output']
     )

Instruction: Give three tips for staying healthy.
Input: 
Output: 1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.


In [26]:
import os
import requests

#Path to directory to store python scripts

code_dir="/home/nitish/Documents/GenAI/code"

In [27]:
urls = [
    "https://raw.githubusercontent.com/TheAlgorithms/Python/master/searches/double_linear_search_recursion.py",
    "https://raw.githubusercontent.com/KosingZhu/tensorflow/master/tensorflow/python/tools/module_util.py",
    "https://raw.githubusercontent.com/EricRemmerswaal/tensorflow/master/tensorflow/python/distribute/distribute_coordinator_context.py",
    "https://raw.githubusercontent.com/computationalartist/tensorflow/master/tensorflow/python/ops/numpy_ops/integration_test/benchmarks/numpy_mlp.py",
    "https://raw.githubusercontent.com/Van-an/tensorflow/master/tensorflow/python/distribute/coordinator/values.py",
    "https://raw.githubusercontent.com/nkgwer/tensorflow/master/tensorflow/lite/tools/visualize.py",
    "https://raw.githubusercontent.com/gitblazer/youtube-dl/master/youtube_dl/version.py",
    "https://raw.githubusercontent.com/Joshua-Barawa/My-Photos/master/venv/lib/python3.8/site-packages/django/contrib/messages/__init__.py",
    "https://raw.githubusercontent.com/PaliC/pytorch/master/test/fx/test_subgraph_rewriter.py"
]

In [28]:
for url in urls:
    print(f'Working on url: {url}')
    response=requests.get(url)
    file_name=os.path.basename(url)
    file_path=os.path.join(code_dir,file_name)

    with open(file_path,'wb') as file:
        file.write(response.content)

Working on url: https://raw.githubusercontent.com/TheAlgorithms/Python/master/searches/double_linear_search_recursion.py
Working on url: https://raw.githubusercontent.com/KosingZhu/tensorflow/master/tensorflow/python/tools/module_util.py
Working on url: https://raw.githubusercontent.com/EricRemmerswaal/tensorflow/master/tensorflow/python/distribute/distribute_coordinator_context.py
Working on url: https://raw.githubusercontent.com/computationalartist/tensorflow/master/tensorflow/python/ops/numpy_ops/integration_test/benchmarks/numpy_mlp.py
Working on url: https://raw.githubusercontent.com/Van-an/tensorflow/master/tensorflow/python/distribute/coordinator/values.py
Working on url: https://raw.githubusercontent.com/nkgwer/tensorflow/master/tensorflow/lite/tools/visualize.py
Working on url: https://raw.githubusercontent.com/gitblazer/youtube-dl/master/youtube_dl/version.py
Working on url: https://raw.githubusercontent.com/Joshua-Barawa/My-Photos/master/venv/lib/python3.8/site-packages/djan

In [29]:
files=os.listdir(code_dir)
for file in files:
    print(file)

double_linear_search_recursion.py
values.py
module_util.py
visualize.py
__pycache__
test_subgraph_rewriter.py
distribute_coordinator_context.py
version.py
__init__.py
numpy_mlp.py


In [30]:
code_dataset = []

for item in os.listdir(code_dir):
    path = os.path.join(code_dir, item)
    if os.path.isfile(path):
        with open(path, 'r') as file:
            code_dataset.append({'text': file.read()})

In [31]:
code_dataset=datasets.Dataset.from_list(code_dataset)


In [32]:
dataset=datasets.concatenate_datasets(
    [pretraining_dataset,code_dataset]
)
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 60009
})


# Data Cleaning

In [33]:
dataset.num_rows

60009

In [34]:
import heapq

def paragraph_length_filter(x):
    """Returns False iff a page has too few lines or lines are too short"""
    lines=x['text'].split('\n')
    if (
        len(lines)<3 or min(heapq.nlargest(3,[len(line) for line in lines]))<3
    ):
        return False
    return True

In [35]:
dataset=dataset.filter(
    paragraph_length_filter,
    load_from_cache_file=False
)

Filter:   0%|          | 0/60009 [00:00<?, ? examples/s]

In [36]:
dataset.num_rows

52357

In [37]:
def find_duplicates(paragraphs):
    """
    use this function to find the number of repetitions in the paragraphs.
    """
    unique_x=set()
    duplicate_chars=0
    duplicate_elements=0

    for element in paragraphs:
        if element in unique_x:
            duplicate_chars+=len(element)
            duplicate_elements+=1
        else:
            unique_x.add(element)
    return duplicate_elements,duplicate_chars


In [38]:
import re

def paragraph_repetition_filter(x):
    """
    Returns False iff a page has too many repetitions
    """
    text=x['text']

    paragraphs=re.compile(r'\n{2,}').split(text.strip())
    paragraphs_duplicates,char_duplicates=find_duplicates(paragraphs)

    if paragraphs_duplicates/len(paragraphs)>0.2:
        return False
    if char_duplicates/len(text)>0.2:
        return False
    return True


In [39]:
dataset=dataset.filter(
    paragraph_repetition_filter,
    load_from_cache_file=False
)

Filter:   0%|          | 0/52357 [00:00<?, ? examples/s]

In [40]:
dataset.num_rows

52289

In [41]:
def deduplications(ds):
    def dedup_func(x):
        """Use this function to remove duplicate entries"""
        if x['text'] in unique_text:
            return False
        else:
            unique_text.add(x['text'])
            return True
    unique_text=set()

    ds=ds.filter(dedup_func,load_from_cache_file=False,num_proc=1)
    return ds

dataset=deduplications(dataset)

Filter:   0%|          | 0/52289 [00:00<?, ? examples/s]

In [42]:
dataset.num_rows

43566

In [43]:
import urllib
from fasttext import load_model
from fasttext.FastText import _FastText

def english_language_filter(ds):
    #load language detection model

    current_dir = os.getcwd()
    model_path = os.path.join(current_dir, "lid.176.bin")
    

    model = load_model(model_path)

    def is_english(x):
        #Predict language of the text and probability
        language,score=model.predict(x['text'].replace('\n',''))

        language=language[0].split("__")[2]

        return score>0.4 and language=='en'

    ds=ds.filter(is_english,load_from_cache_file=False,num_proc=1)

    return ds

dataset=english_language_filter(dataset)
        




Filter:   0%|          | 0/43566 [00:00<?, ? examples/s]

In [44]:
dataset.num_rows

40454

In [45]:
file_path='./preprocessed_dataset.parquet'
dataset.to_parquet(file_path)

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

196961832

# Packing Data for Pretraining

# 1.Tokenizing and Creating input_ids

In [46]:
import datasets

In [47]:
dataset=datasets.load_dataset(
    'parquet',
    data_files="./preprocessed_dataset.parquet",
    split='train'
)

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 40454
})


In [48]:
dataset=dataset.shard(num_shards=10,index=0)
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 4046
})


In [49]:
from transformers import AutoTokenizer
model_name='upstage/SOLAR-10.7B-v1.0'

tokenizer=AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False
)

In [50]:
tokenizer.tokenize("I'm a Nitish")

['▁I', "'", 'm', '▁a', '▁N', 'it', 'ish']

In [51]:
def tokenization(example):
    #Tokenize
    tokens=tokenizer.tokenize(example['text'])

    #convert tokens to ids
    token_ids=tokenizer.convert_tokens_to_ids(tokens)

    #Add <bos>,<eos> token to the front and back of token_ids
    #bos: begin of sequence, eos: end of sequence
    token_ids=[
        tokenizer.bos_token_id]\
        +token_ids \
        +[tokenizer.eos_token_id
    ]
    example['input_ids']=token_ids

    #we will be using this column to count the total number of tokens
    #in this final dataset

    example['num_tokens']=len(token_ids)

    return example



    

In [52]:
dataset=dataset.map(tokenization,load_from_cache_file=False)

print(dataset)

Map:   0%|          | 0/4046 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'num_tokens'],
    num_rows: 4046
})


In [53]:
sample=dataset[41]

print('Text',sample['text'][:30])
print('\ninput_ids',sample['input_ids'][:30])
print('\nnum_tokens',sample['num_tokens'])

Text Adele Hall nee: Grant
Adele (G

input_ids [1, 330, 450, 291, 6756, 435, 28706, 28747, 14301, 13, 28741, 450, 291, 325, 28777, 6804, 28731, 6756, 4568, 1753, 356, 10983, 28725, 4624, 28705, 28750, 28783, 28725, 28705, 28750]

num_tokens 579


In [54]:
import numpy as np

np.sum(dataset['num_tokens'])

5303633

In [55]:
input_ids=np.concatenate(dataset['input_ids'])
print(len(input_ids))

5303633


In [56]:
#A longer maximum sequence length will enable better performance for long text
max_seq_length=32
#solar and llama-2 uses 4096

In [57]:
total_length=len(input_ids)-len(input_ids)%max_seq_length

In [58]:
print(total_length)

5303616


In [59]:
input_ids=input_ids[:total_length]
print(input_ids.shape)

(5303616,)


In [60]:
input_ids_reshaped=input_ids.reshape(-1,max_seq_length).astype(np.int32)
input_ids_reshaped.shape

(165738, 32)

In [61]:
type(input_ids_reshaped)

numpy.ndarray

In [62]:
#Transforming out input_ids into a list
input_ids_list=input_ids_reshaped.tolist()

#Transforming our list into dictionary
packaged_pretrain_dataset=datasets.Dataset.from_dict(
    {'input_ids':input_ids_list}
)

print(packaged_pretrain_dataset)
print(type(packaged_pretrain_dataset))

Dataset({
    features: ['input_ids'],
    num_rows: 165738
})
<class 'datasets.arrow_dataset.Dataset'>


In [63]:
packaged_pretrain_dataset.to_parquet('./packages_pretrain_dataset.parquet')

Creating parquet from Arrow format:   0%|          | 0/166 [00:00<?, ?ba/s]

21877416

# Preparing our model for training

# 1. Model configuration 

In [64]:
from transformers import LlamaConfig

config=LlamaConfig()
print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.37.2",
  "use_cache": true,
  "vocab_size": 32000
}



In [65]:
config.num_hidden_layers=12   #reduced from 32 to 12
config.hidden_size=1024       #reduced 1/4 from 4096 to 1024
config.intermediate_size=4096 #reduced 1/3 from 11008 to 4096
config.num_key_value_heads=8  #reduced 1/4 from 32 to 8 
config.torch_dtype='bfloat16' # for half-precision training
config.use_cache=False        #`True` is incompatible w/ gradient
print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 12,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.37.2",
  "use_cache": false,
  "vocab_size": 32000
}



# 2 Weight initialization

# 2.1 Random weight initialization

In [66]:
from transformers import LlamaForCausalLM
model=LlamaForCausalLM(config)
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [67]:
def print_nparams(model):
    """Calculate the total number of model parameters"""
    nparams=sum(p.numel() for p in model.parameters())
    print(f'The total number of parameters is: {nparams}')

print_nparams(model)  #248013824 =>248M

The total number of parameters is: 248013824


In [68]:
layer_name="model.layers.0.self_attn.q_proj.weight"

for name,param in model.named_parameters():
    if name==layer_name:
        print(f"First 30 weights of layer'{layer_name}':")
        print(param.data.view(-1)[:30])
        break

First 30 weights of layer'model.layers.0.self_attn.q_proj.weight':
tensor([ 1.5794e-02, -2.2748e-02,  2.0156e-02, -2.6072e-02, -8.3267e-05,
         8.7432e-03, -9.0255e-04, -4.2442e-02,  1.5337e-02,  1.4482e-02,
         1.3526e-02,  1.9171e-03, -2.3141e-02, -4.2336e-03,  6.9818e-04,
         8.9955e-03, -2.0524e-02, -1.3378e-02,  2.3255e-02,  9.5167e-04,
         2.1053e-02,  1.2794e-02, -7.6783e-03, -3.7832e-03, -8.9180e-03,
         7.4018e-04, -2.5204e-02, -1.7069e-02,  1.3481e-03,  4.7622e-02])


In [69]:
from transformers import LlamaTokenizer

model_dir='upstage/SOLAR-10.7B-v1.0'

tokenizer=LlamaTokenizer.from_pretrained(model_dir)

from transformers import TextStreamer

prompt='I am an engineer. I love'

inputs=tokenizer(prompt,return_tensors='pt').to(model.device)

streamer=TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)

outputs = model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_length=128 + len(prompt),  # Adjust the max length here
    do_sample=False
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


possessed possessed possessed possessed possessed possessedcontinuecontinuecontinuecontinuecontinueDownloadџcontinueDownloadcontinueDownloadcontinueertsxE Point remoterts remoterts remoterts갑continuecontinuecontinue wide wide atr wide atr wide wide wide wide wide wide wide wide wide wide wide wideursor otra FC otraopesopesopesopesopesopesopesopesopesopesopes wideopes wideopes wideopes wideopes wideopes wideopes wideopesimpse Library wideopesasterasterasterasterasterasterasterasterasterasterasterasterasterasterasterasterasterasterasteraster primarily primarily primarily primarily primarily primarily primarilyasterasterasterasterasterasterasterasterasterasterasteraster primarilyitä primarilyitä primarilyitä primarilyitä primarilyitä primarilyitä primarilyitä primarilyitä primarilyasterpriseasterpriseasterpriseaster


In [70]:
#Note: we're running large models in a limited environment
import gc
del model
del streamer
del outputs
gc.collect()

18

# 3. Reuse general pretrained model weights

In [71]:
from transformers import AutoModelForCausalLM

model_name='upstage/TinySolar-248m-4k'

model=AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='cpu',
    torch_dtype=torch.bfloat16
)

In [72]:
del model
gc.collect()

66

# 4. Downscaling from a general pretrained model 

In [73]:
from transformers import AutoTokenizer,AutoConfig

model_name='upstage/TinySolar-248m-4k'

model=AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='cpu',
    torch_dtype=torch.bfloat16
)

tokenizer=AutoTokenizer.from_pretrained(model_name)

In [74]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [75]:
layers=model.model.layers

model.model.layers=layers[:5]+layers[-5:]

config=AutoConfig.from_pretrained(
    model_name,
    num_hidden_layers=len(model.model.layers)
)

model.config=config

print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-9): 10 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linea

In [76]:
print_nparams(model) #217M

The total number of parameters is: 217601024


In [77]:
import gc
del model
gc.collect()

99

# 5. Depth Upscaling from a general pretrained model

In [78]:
config=LlamaConfig(
    num_hidden_layers=16,
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=32,
    num_key_value_heads=8,
    torch_dtype='bfloat16',
    use_cache=False
)

print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.37.2",
  "use_cache": false,
  "vocab_size": 32000
}



In [81]:
model = LlamaForCausalLM(config)
model = model.to(dtype=torch.bfloat16)  # convert to bfloat16
print_nparams(model)  #308M

The total number of parameters is: 308839424


In [79]:
model_name='upstage/TinySolar-248m-4k'

pretrained_model=AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='cpu',
    torch_dtype=torch.bfloat16
)

tokenizer=AutoTokenizer.from_pretrained(model_name)

print_nparams(pretrained_model)  #248M

The total number of parameters is: 248013824


In [82]:
from copy import deepcopy

model.model.layers=deepcopy(pretrained_model.model.layers[:-4])\
                    +deepcopy(pretrained_model.model.layers[4:])

model.model.embed_tokens=deepcopy(pretrained_model.model.embed_tokens)

model.lm_head=deepcopy(pretrained_model.lm_head)

print(model.config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.37.2",
  "use_cache": false,
  "vocab_size": 32000
}



In [83]:
print_nparams(model)  #308M

The total number of parameters is: 308839424


In [84]:
prompt="I am an engineer. I love"

inputs=tokenizer(prompt, return_tensors='pt').to(model.device)

streamer=TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)

outputs=model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


to work with people who are not afraid to look at the world and are not afraid to look at the world with a little bit of a twist.
I am a very humble person and I am very fortunate to have a great team of people who work hard to make a difference.
I am very fortunate to have a great team of people who work hard to make a difference.
I am very fortunate to have a great team of people who work hard to make a difference.
I am very fortunate to have a great team of people who work hard to make a difference.
I am very fortunate to have a great team


In [85]:
model.save_pretrained('./TinySolar-308-4k-init')

# Training in Action

In [86]:
pretrained_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

# Load Dataset

In [108]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self,args,split='train'):
        self.args=args
        self.dataset=datasets.load_dataset('parquet',data_files=args.dataset_name,split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self,idx):
        #convert the list to a longtensor for pythorch

        input_ids=torch.LongTensor(self.dataset[idx]['input_ids'])
        labels=torch.LongTensor(self.dataset[idx]['input_ids'])

        #Return the sample as a dicdtionary
        return {'input_ids':input_ids,'labels':labels}


# Configure Training Arguments

In [109]:
from dataclasses import dataclass, field
import transformers

@dataclass
class CustomArguments(transformers.TrainingArguments):
    dataset_name: str = field(                           # Dataset configuration
        default="./packages_pretrain_dataset.parquet")
    num_proc: int = field(default=1)                     # Number of subprocesses for data preprocessing
    max_seq_length: int = field(default=32)              # Maximum sequence length

    # Core training configurations
    seed: int = field(default=0)                         # Random seed for initialization, ensuring reproducibility
    optim: str = field(default="adamw_torch")            # Optimizer, here it's AdamW implemented in PyTorch
    max_steps: int = field(default=30)                   # Number of maximum training steps
    per_device_train_batch_size: int = field(default=2)  # Batch size per device during training

    # Other training configurations
    learning_rate: float = field(default=5e-5)           # Initial learning rate for the optimizer
    weight_decay: float = field(default=0)               # Weight decay
    warmup_steps: int = field(default=10)                # Number of steps for the learning rate warmup phase
    lr_scheduler_type: str = field(default="linear")     # Type of learning rate scheduler
    gradient_checkpointing: bool = field(default=True)   # Enable gradient checkpointing to save memory
    dataloader_num_workers: int = field(default=2)       # Number of subprocesses for data loading
    bf16: bool = field(default=True)                     # Use bfloat16 precision for training on supported hardware
    gradient_accumulation_steps: int = field(default=1)  # Number of steps to accumulate gradients before updating model weights
    
    # Logging configuration
    logging_steps: int = field(default=3)                # Frequency of logging training information
    report_to: str = field(default="none")               # Destination for logging (e.g., WandB, TensorBoard)

    # Saving configuration
    # save_strategy: str = field(default="steps")          # Can be replaced with "epoch"
    # save_steps: int = field(default=3)                   # Frequency of saving training checkpoint
    # save_total_limit: int = field(default=2)             # The total number of checkpoints to be saved

In [110]:
no_cuda=True,
use_cpu=True
fp16=False,
bf16=False

In [111]:
parser = transformers.HfArgumentParser(CustomArguments)
args, = parser.parse_args_into_dataclasses(
    args=["--output_dir", "output", "--no_cuda", "True", "--use_cpu", "True"]
)

In [112]:
train_dataset=CustomDataset(args=args)

In [113]:
print('Input shape: ',train_dataset[0]['input_ids'].shape)

Input shape:  torch.Size([32])


In [115]:
from transformers import Trainer,TrainingArguments,TrainerCallback

#Define a custom callback to log the loss values

class LossLoggingCallback(TrainerCallback):
    def on_log(self,args,state,control,logs=None,**kwargs):
        if logs is not None:
            self.logs.append(logs)

    def __init__(self):
        self.logs=[]

loss_logging_callback=LossLoggingCallback()

In [117]:
trainer=Trainer(
    model=pretrained_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=None,
    callbacks=[loss_logging_callback]
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
3,4.1524
6,3.7226
9,4.0917
12,3.6982
15,4.1968
18,4.2846
21,4.0762
24,4.0058
27,4.0247
30,3.9445


TrainOutput(global_step=30, training_loss=4.019758351643881, metrics={'train_runtime': 2128.2922, 'train_samples_per_second': 0.028, 'train_steps_per_second': 0.014, 'total_flos': 2479631892480.0, 'train_loss': 4.019758351643881, 'epoch': 0.0})

# Model Evaluations

In [119]:
!pip install -U git+https://github.com/EleutherAI/lm-evaluation-harness

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/EleutherAI/lm-evaluation-harness
  Cloning https://github.com/EleutherAI/lm-evaluation-harness to /tmp/pip-req-build-38x3eabl
  Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness /tmp/pip-req-build-38x3eabl
  Resolved https://github.com/EleutherAI/lm-evaluation-harness to commit 42dc244867889a19ae80847254a481f446f6e4b7
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting evaluate (from lm_eval==0.4.3)
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting jsonlines (from lm_eval==0.4.3)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting numexpr (from lm_eval==0.4.3)
  Downloading numexpr-2.10.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting peft>=0.2.0 (from lm_eval==0.4.3)
  Downloading peft-

In [121]:
!lm_eval --model hf \
    --model_args pretrained=./upstage/TinySolar-248m-4k \
    --tasks truthfulqa_mc2 \
    --device cpu \
    --limit 5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-07-24:22:21:44,740 INFO     [__main__.py:272] Verbosity set to INFO
2024-07-24:22:21:46,256 INFO     [__init__.py:491] `group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. `tag` will be used to allow to call a collection of tasks just like `group`. `group` will be removed in order to not cause confusion with the new ConfigurableGroup which will be the offical way to create groups with addition of group-wide configuations.
2024-07-24:22:21:48,245 INFO     [__init__.py:512] The tag mmlu is already registered as a group, this tag will not be registered. This may affect tasks you want to call.
2024-07-24:22:21:48,247 INFO     [__init__.py:512] The tag mmlu is already registered as a group, this tag will not be registered. This may affect tasks you want to call.
2024-07-24:22:21:48,249 INFO     [__init__.py:512] The tag mmlu is already registered as a group, this tag will not be registered. This may affect tasks you want to call.
2024-

In [None]:
import os

def h6_open_llm_leaderboard(model_name):
  task_and_shot = [
      ('arc_challenge', 25),
      ('hellaswag', 10),
      ('mmlu', 5),
      ('truthfulqa_mc2', 0),
      ('winogrande', 5),
      ('gsm8k', 5)
  ]

  for task, fewshot in task_and_shot:
    eval_cmd = f"""
    lm_eval --model hf \
        --model_args pretrained={model_name} \
        --tasks {task} \
        --device cpu \
        --num_fewshot {fewshot}
    """
    os.system(eval_cmd)

h6_open_llm_leaderboard(model_name="YOUR_MODEL")