# **Connect the runtime to the T4 GPU before running the Model's cells**

In [None]:
#model dependencies
!pip install --upgrade torch
!pip install --upgrade transformers==4.41.3 #pre-trained models
!pip install --upgrade datasets
!pip install --upgrade huggingface_hub
!pip install --upgrade accelerate
!pip install --upgrade bitsandbytes #for quantization
!pip install --upgrade peft #for low rank adapter (LoRA)
!pip install --upgrade trl
!pip install --upgrade sentence-transformers #for text embedding

[31mERROR: Ignored the following yanked versions: 4.14.0, 4.25.0, 4.46.0[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement transformers==4.41.3 (from versions: 0.1, 2.0.0, 2.1.0, 2.1.1, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.8.0, 2.9.0, 2.9.1, 2.10.0, 2.11.0, 3.0.0, 3.0.1, 3.0.2, 3.1.0, 3.2.0, 3.3.0, 3.3.1, 3.4.0, 3.5.0, 3.5.1, 4.0.0rc1, 4.0.0, 4.0.1, 4.1.0, 4.1.1, 4.2.0, 4.2.1, 4.2.2, 4.3.0rc1, 4.3.0, 4.3.1, 4.3.2, 4.3.3, 4.4.0, 4.4.1, 4.4.2, 4.5.0, 4.5.1, 4.6.0, 4.6.1, 4.7.0, 4.8.0, 4.8.1, 4.8.2, 4.9.0, 4.9.1, 4.9.2, 4.10.0, 4.10.1, 4.10.2, 4.10.3, 4.11.0, 4.11.1, 4.11.2, 4.11.3, 4.12.0, 4.12.1, 4.12.2, 4.12.3, 4.12.4, 4.12.5, 4.13.0, 4.14.1, 4.15.0, 4.16.0, 4.16.1, 4.16.2, 4.17.0, 4.18.0, 4.19.0, 4.19.1, 4.19.2, 4.19.3, 4.19.4, 4.20.0, 4.20.1, 4.21.0, 4.21.1, 4.21.2, 4.21.3, 4.22.0, 4.22.1, 4.22.2, 4.23.0, 4.23.1, 4.24.0, 4.25.1, 4.26.0, 4.26.1, 4.27.0, 4.27.1, 4.27.2, 4.27.3, 4.27.4, 4.28.0, 4.28.1, 4.29.0, 4.29.1, 4.

# **Logging in**

In [None]:
#getting hf tokens to perform operations with their API
#SECURITY: access tokens used to be hard-coded in this cell. Any token that was ever saved in the
#notebook must be revoked at https://huggingface.co/settings/tokens and replaced by a new one.
#The tokens are now read from Colab secrets (key icon in the left sidebar); create two secrets named
#HF_READ_TOKEN and HF_WRITE_TOKEN and grant this notebook access to them.
from google.colab import userdata
from huggingface_hub import login

readToken = userdata.get('HF_READ_TOKEN')
writeToken = userdata.get('HF_WRITE_TOKEN')

#authenticate this session with the read token
login(readToken)

# **Loader**

In [None]:
from datasets import get_dataset_split_names

def splitLoader(Repos, Split="train"): #loads specific splits (train split by default)... will be used in model training and testing
  """Load one split from every dataset repo in `Repos`.

  Args:
    Repos: iterable of dataset repo ids that `load_dataset` understands.
    Split: name of the split to load from every repo (default "train").

  Returns:
    (Names, dfs): two parallel lists. `Names[k]` is "<repo>_<split>" and `dfs[k]` is the
    object returned by `load_dataset` for that repo/split.

  If `Split` cannot be loaded for a repo, the error is printed, the splits that do exist are
  listed, and the user is asked interactively for replacement split names. Each valid answer
  is loaded; an empty or unknown answer ends the prompting for that repo.
  """
  Names = []
  dfs = []
  for dataset in Repos:
    try:
      loaded = load_dataset(dataset, split=Split)
    except Exception as e:
      print(f"Exception received: {e}")
      availableSplits = get_dataset_split_names(dataset)
      #show the valid choices before asking, otherwise the user has to guess
      print(f"Available splits for {dataset}: {availableSplits}")
      inp = input("Enter a different split: ").strip()
      c = 0
      #c caps the number of accepted answers at the number of splits that exist
      while (inp != "") and (inp in availableSplits) and (c < len(availableSplits)):
        dfs.append(load_dataset(dataset, split=inp))
        Names.append(dataset + "_" + inp)
        inp = input("Enter another split for the same dataset [do not enter anything otherwise]: ").strip()
        c += 1
    else:
      #only record the dataset once loading succeeded, so Names and dfs always stay aligned
      dfs.append(loaded)
      Names.append(dataset + "_" + Split)
  return Names, dfs


# **All imports necessary**

In [None]:


from random import *

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from peft import (
    LoraConfig, PeftModel, TaskType, get_peft_model, prepare_model_for_kbit_training
    )
from sentence_transformers import SentenceTransformer
from transformers import (
     AutoTokenizer, BitsAndBytesConfig, pipeline
)
from transformers import AutoModelForCausalLM
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

In [None]:
#notebook-wide constants

#token registered as the tokenizer's padding token; padding gives every sequence in a batch
#the same length regardless of how long the original strings are
PAD_TOKEN = "<|pad|>"

#model path on hf (base, non-instruct alternative: "meta-llama/Llama-3.2-3B")
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

#trained model's name
NEW_MODEL = "Llama-3.2-3B-Humorous-STEM-Tutor"


In [None]:

#report whether CUDA is usable and which accelerator the runtime is attached to
print(torch.cuda.is_available())
if torch.cuda.is_available():
  device = torch.cuda.get_device_name(0)
else:
  #get_device_name(0) raises when no CUDA device exists; keep `device` defined and say what to do
  device = "cpu"
  print("WARNING: no CUDA GPU detected - connect the runtime to the T4 GPU before running the model cells")
print(device)

True
Tesla T4


# **Model Configs (Quantization and LoRA)**

In [None]:
#configure the model to store its parameters in 4-bit representation using normalized float 4 (nf4)
#and when it actively uses the parameters in training/inference it extends them to 16-bit floats
#float16 is used instead of bfloat16 because it's natively supported on Tesla T4 GPUs; bfloat16 is
#preferable on TPUs and higher GPUs like the A100, but it is not supported on the T4
#NOTE: `torch_dtype` is not a BitsAndBytesConfig option (it was silently ignored as an unused kwarg);
#the compute precision is set by bnb_4bit_compute_dtype and the load dtype by from_pretrained below
quantizationConfig = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
)

#helpful for SFT strategy (kept disabled):
#loraConfig = LoraConfig(r=8, target_modules="all-linear", bias="none",
#                        task_type="CAUSAL_LM")

#use original meta tokenizer for the Llama 3.2 3B model with a fast compiled (C or Rust) based tokenizer if available
Tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

#adding pad token to the tokenizer
#(the model's embedding matrix has to be resized afterwards so the new token id has a row)
Tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
Tokenizer.padding_side = "right" #refines model output and controls the issue of generated text repetition

LlamaModel = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, quantization_config = quantizationConfig,
    #attn_implementation = "flash_attention_2",
    #attn_implementation = "sdpa", #different attention implementations for the transformer
    device_map = "auto", #defaults to the GPU if available
    torch_dtype = torch.float16
)

#text-generation pipeline wrapping the already-loaded quantized model and its tokenizer
gen = pipeline(
    "text-generation",
    model=LlamaModel,
    tokenizer = Tokenizer,
    torch_dtype=torch.float16,
    device_map="auto"
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:

#grow the embedding matrix to the tokenizer's current vocabulary size (rounded up to a multiple of 8)
LlamaModel.resize_token_embeddings(len(Tokenizer), pad_to_multiple_of=8)

print(LlamaModel.config)
print("-----------------------------------------")

#smoke test: print the query followed by the raw pipeline output
query = 'Explain dot products in vector mathematics in a comprehensive way.'
print(query, gen(query, min_length=256, max_length = 1024), sep="\n")


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
 

Sentence Transformer for computing the text embeddings used by the retriever

In [None]:
#sentence-embedding model shared by the retriever: retSnippet uses it to encode the prompt and the stored instructions
embedder = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# **Snippet constructor, Retriever, and Generator**

In [None]:
#marker placed between the instruction and the response inside every stored snippet
separator = "[SEP]"
#knowledge base of "instruction[SEP]response" snippets that the retriever searches
embeddingBase = []

#by default won't update the embedding base which is safer for more testing and experimentation with the function
def extendBase(dataset, updateEmbeddingBase=False, instColumnName = "instruction",
                respColumnName = "response", splits=("train", "test")):
  """Build "instruction[SEP]response" snippets from the requested splits of `dataset`.

  Args:
    dataset: mapping from split name to an iterable of rows; rows are indexed by column name.
    updateEmbeddingBase: when True the snippets are appended to the global `embeddingBase`
      and nothing is returned; when False (default) the snippets are returned instead.
    instColumnName: column holding the instruction text.
    respColumnName: column holding the response text.
    splits: split names to read; names missing from `dataset` are skipped.

  Returns:
    The list of snippets when `updateEmbeddingBase` is False, otherwise None.

  Rows whose instruction or response is None are skipped, because a snippet needs both halves.
  """
  snippetBase = []
  for split in splits:
    if split not in dataset.keys():
      continue
    for row in dataset[split]:
      instruction = row[instColumnName]
      response = row[respColumnName]
      if instruction is None or response is None:
        continue #incomplete row: nothing useful to retrieve
      snippetBase.append(instruction.strip() + separator + response.strip())
  if updateEmbeddingBase:
    embeddingBase.extend(snippetBase)
    return None
  return snippetBase

def clearEmbeddingBase():
  """Empty the shared embedding base in place; the list object itself is kept."""
  del embeddingBase[:]

#embeddings of the instruction halves, remembered between calls so the whole base is not
#re-encoded for every query; "key" identifies the base contents the embeddings were built from
_instructionEmbeddingCache = {"key": None, "embeddings": None}

def retSnippet(prompt):
  """Return the stored response whose instruction is most similar to `prompt`.

  embeddingBase is the whole knowledge base that the RAG model will retrieve from;
  it is best to handle updates to embeddingBase by using the extendBase function.

  Args:
    prompt: the user query to match against the stored instructions.

  Returns:
    The response half of the best-matching snippet, or "" when the base is empty
    (or the matching snippet has no separator).
  """
  if not embeddingBase:
    return ""

  #re-encode the instructions only when the base (or the separator) changed since the last call
  baseKey = (separator, hash(tuple(embeddingBase)))
  if _instructionEmbeddingCache["key"] != baseKey:
    instructionBase = [snippet.partition(separator)[0] for snippet in embeddingBase]
    _instructionEmbeddingCache["embeddings"] = embedder.encode(instructionBase)
    _instructionEmbeddingCache["key"] = baseKey

  embeddedPrompt = embedder.encode([prompt])
  similarities = embedder.similarity(_instructionEmbeddingCache["embeddings"],
                                     embeddedPrompt) #cosine similarity used
  bestIndex = similarities.argmax().item()
  #partition keeps everything after the FIRST separator, so a response that itself
  #contains the separator text is returned whole instead of being cut short
  return embeddingBase[bestIndex].partition(separator)[2]


In [None]:
#rebuild the retrieval base from scratch out of the scraped instruction/response dataset
ds = load_dataset("RwanAshraf/All_hf_Scrapping")
clearEmbeddingBase()
extendBase(dataset=ds, updateEmbeddingBase=True, respColumnName="response")

#print(b[0], f"\nlen:{len(embeddingBase)}")
#print(embeddingBase)

README.md:   0%|          | 0.00/326 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/179060 [00:00<?, ? examples/s]

In [None]:
#sanity check: number of snippets currently held in the embedding base
print(len(embeddingBase))
#encodes every full "instruction[SEP]response" snippet
#NOTE(review): retSnippet does not read embeddings_text_snippets (it encodes the instruction halves itself) -
#confirm this result is used elsewhere, otherwise this encoding pass is wasted work
embeddings_text_snippets = embedder.encode(embeddingBase)
#smoke test of the retriever; being the last expression, its result is shown as the cell output
retSnippet("Find the last three digits of the product of the positive roots of $\\sqrt{1995}x^{\\log_{1995}x}=x^2.")


179060


'Okay, so I need to find the last three digits of the product of the positive roots of the equation √(1995) * x^(log_{1995} x) = x². Hmm, let\'s see. First, let me try to understand the equation and figure out how to solve it. \n\nThe equation is given as √(1995) * x^(log_{1995} x) = x². That looks a bit complicated with the logarithm in the exponent. Maybe I can take logarithms on both sides to simplify it? Or perhaps rewrite the equation using properties of exponents and logarithms. \n\nLet me recall that log_b a = (ln a)/(ln b), so log_{1995} x is equal to (ln x)/(ln 1995). Also, x raised to log_{1995} x can be rewritten using exponentials. Wait, maybe I can express x^(log_{1995} x) as e^{(ln x) * (log_{1995} x)}. But that might not be helpful. Let me think differently.\n\nAlternatively, since the equation has x in the exponent and also x squared, maybe substituting t = log_{1995} x would help. Let me try that. Let’s set t = log_{1995} x, which means that x = 1995^t. Then substituti

In [None]:
minTokens = 200
maxTokens = 500 #kept defined for other cells; no longer passed to the pipeline (see promptModel)
maxNewTokens = 128 #upper bound on the number of tokens generated per answer
def promptModel(prompt): #RAG-powered
  """Answer `prompt` with the generation pipeline, grounded on the retrieved snippet.

  The snippet returned by retSnippet(prompt) is appended to the system message under a
  "Context:" heading, the chat is sent to `gen`, and the query, the context and the
  answer are printed. Nothing is returned.

  Args:
    prompt: the user question.
  """
  retrievedText = retSnippet(prompt)

  #sentences are joined with explicit spaces so they do not run into each other
  systemPrompt = (
      "You are a knowledgeable AI assistant about math, physics, computing, and engineering. "
      "You will answer questions with a touch of geeky humour, joke about the technical context of the prompt. "
      "Provide one Answer ONLY the following prompt based on the context provided below. "
      "Do not generate or answer any other questions. "
      "Do not make up or infer any information that is not directly stated in the context. "
      "Provide a concise answer."
  )
  if retrievedText:
    #the context gets its own clearly delimited section instead of being glued to the instructions
    systemPrompt += f"\n\nContext:\n{retrievedText}"

  #prepare the messages for the text generation pipeline
  messages = [
      {"role": "system", "content": systemPrompt},
      {"role": "user", "content": prompt}
  ]
  #max_new_tokens and max_length must not be passed together: transformers warns and lets
  #max_new_tokens win, so only max_new_tokens is sent (same effective limit as before)
  response = gen(messages, max_new_tokens=maxNewTokens, min_length=minTokens)[-1]["generated_text"][-1]["content"]
  print(f"Query: \n\t{prompt}")
  if(retrievedText):
    print(f"Context: \n\t{retrievedText}")
  else:
    print("Context: \n\tmodel could not find similar prompt in the RAG base")
  print(f"Answer: \n\t{response}")

In [None]:
#end-to-end smoke test of the RAG pipeline with a short query
p = "Explain vector mathematics"
promptModel(p)

Both `max_new_tokens` (=128) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Query: 
	Explain vector mathematics
Context: 
	Here is Python code for a `Vector3D` class, which accomplishes what you are asking:

```python
import math

class Vector3D:
    def __init__(self, x=0, y=0, z=0):
        self.x = x
        self.y = y
        self.z = z

    def __add__(self, other):
        return Vector3D(self.x + other.x, self.y + other.y, self.z + other.z)

    def __sub__(self, other):
        return Vector3D(self.x - other.x, self.y - other.y, self.z - other.z)

    def dot(self, other):
        return self.x * other.x + self.y * other.y + self.z * other.z

    def cross(self, other):
        return Vector3D(self.y * other.z - self.z * other.y,
                        self.z * other.x - self.x * other.z,
                        self.x * other.y - self.y * other.x)

    def magnitude(self):
        return math.sqrt(self.x ** 2 + self.y ** 2 + self.z ** 2)

    def __str__(self):
        return f'({self.x}, {self.y}, {self.z})'
```

Example usage:

```python
v1 = Vecto

# **END OF RAG**

Checking important sequence tokens in the tokenizer

In [None]:
#the tokenizer object created in the model-config cell is named `Tokenizer`; the lowercase
#`tokenizer` used here before is never defined in this notebook and fails on a fresh kernel
print(Tokenizer.bos_token, Tokenizer.bos_token_id, Tokenizer.eos_token, Tokenizer.eos_token_id, Tokenizer.pad_token, Tokenizer.pad_token_id)
#the registered pad token must resolve to the id the tokenizer reports as its pad id
assert Tokenizer.convert_tokens_to_ids(PAD_TOKEN) == Tokenizer.pad_token_id

print(Tokenizer.chat_template)

<|begin_of_text|> 128000 <|eot_id|> 128009 <|pad|> 128256
{{- bos_token }}
{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
    {%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
    {%- if strftime_now is defined %}
        {%- set date_string = strftime_now("%d %b %Y") %}
    {%- else %}
        {%- set date_string = "26 Jul 2024" %}
    {%- endif %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{#- System message #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if tools is not none %}
    {{- "Environment: ipython\n" }}
{%- endi

In [None]:
dataset = load_dataset("O047/prepSIFT_Code1") #testing the training pipeline and improvement on preprocessed magicoder-evol-instruct

README.md:   0%|          | 0.00/326 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/136M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/111183 [00:00<?, ? examples/s]

# **Converting to a pandas dataframe for training**

In [None]:
#enables more powerful preprocessing

def convertDatasetToDataframe(dataset, split="train"):
  """Copy one split of `dataset` into a pandas DataFrame.

  Args:
    dataset: mapping from split name to a split object exposing `.features` (a mapping
      keyed by column name) and iteration over rows indexed by column name.
    split: name of the split to convert (default "train").

  Returns:
    A DataFrame with one column per feature, in feature order. An empty split yields an
    empty DataFrame that still carries those columns.

  Raises:
    Exception: if the split declares no features.
  """
  datasetColumns = list(dataset[split].features.keys())
  if not datasetColumns:
    raise Exception("dataset must have at least 1 feature.")

  rows = [{column : row[column] for column in datasetColumns} for row in dataset[split]]
  #passing columns keeps the schema (and column order) even when the split has no rows
  return pd.DataFrame(rows, columns=datasetColumns)

#materialise the default ("train") split of the loaded dataset as a DataFrame
df = convertDatasetToDataframe(dataset)

df.head()

Unnamed: 0,instruction,response
0,Please amend the subsequent Python script so t...,```python\n# Establish an integer list\narr = ...
1,"i've got this python code from an ocr tool, bu...",There are several issues with your code like f...
2,Create a recursive function in Java that predi...,You can achieve this using a combination of re...
3,Develop a program that uses natural language p...,This task requires writing of a significant vo...
4,I am trying to create a load testing script fo...,The correct approach would depend on the preci...


In [None]:
df.isnull().sum().sum().item() #total number of null cells across all columns; 0 means the frame is clean

0

# **RAG preprocessor (dataset augmentation)**

In [None]:
#these formatters turn dataset rows into chat-style (system / user / assistant) examples for the model
#there is one formatter that integrates a context and one for datasets without contexts; both are
#tied to the exact dataset layout, so they are expected to change with experimentation
#any prompt forwarded to the model should ideally be formatted here first too (used in both training and testing)


def format_example_withCTX(row: dict):
  """Render one row as a chat-template string whose user turn embeds the row's context.

  Args:
    row: mapping with the keys 'instruction', 'context' and 'response'.

  Returns:
    Whatever Tokenizer.apply_chat_template returns for the system/user/assistant chat
    with tokenize=False.
  """
  #the prompt is assembled line by line instead of dedenting an indented f-string:
  #dedent runs after interpolation, so a multi-line instruction or context (whose later
  #lines start at column 0) would cancel the dedent and leave the template lines indented
  prompt = (
      f"\n{row['instruction']}\n"
      "\nInformation:\n"
      "\n```\n"
      f"{row['context']}\n"
      "```\n"
  )
  chat = [
      {
          "role": "system",
          "content": "You are a retrieval-based assistant. Only use the provided information to answer the question. Do not add any extra details or assumptions. If the information is unclear or incomplete, say so explicitly.",
      },
      {"role": "user", "content": prompt},
      {"role": "assistant", "content": row["response"]}
  ]

  #the tokenizer object defined by this notebook is `Tokenizer` (a lowercase `tokenizer` is never created)
  return Tokenizer.apply_chat_template(chat, tokenize=False)

def format_example(row: dict, tokenize: bool = True):
  """Render one row (without context) through the tokenizer's chat template.

  Args:
    row: mapping with the keys 'instruction' and 'response'.
    tokenize: forwarded to apply_chat_template. The default (True) keeps the previous
      behaviour; pass False to get the same kind of output as format_example_withCTX.

  Returns:
    Whatever Tokenizer.apply_chat_template returns for the system/user/assistant chat.
  """
  #NOTE(review): format_example_withCTX uses tokenize=False while this formatter defaults to
  #tokenize=True - confirm which form the training code expects for the "text" column
  #NOTE(review): the system message below contains the typo "informtation"; it is left unchanged
  #here because editing it changes the training prompt
  prompt = row["instruction"]
  chat = [
      {
          "role": "system",
          "content": "You are a retrieval-based assistant. Use only the informtation to answer the question.",
      },
      {"role": "user", "content": prompt},
      {"role": "assistant", "content": row["response"]}
  ]

  #the tokenizer object defined by this notebook is `Tokenizer` (a lowercase `tokenizer` is never created)
  return Tokenizer.apply_chat_template(chat, tokenize=tokenize)

In [None]:

#adds a "text" column holding format_example's output for every row
#NOTE(review): format_example calls apply_chat_template with tokenize=True, so this column holds token ids
#rather than strings (see the displayed head below) - confirm that is what the trainer expects
df["text"] = df.apply(format_example, axis=1)
df.head()

Unnamed: 0,instruction,response,text
0,Please amend the subsequent Python script so t...,```python\n# Establish an integer list\narr = ...,"[128000, 128006, 9125, 128007, 271, 38766, 130..."
1,"i've got this python code from an ocr tool, bu...",There are several issues with your code like f...,"[128000, 128006, 9125, 128007, 271, 38766, 130..."
2,Create a recursive function in Java that predi...,You can achieve this using a combination of re...,"[128000, 128006, 9125, 128007, 271, 38766, 130..."
3,Develop a program that uses natural language p...,This task requires writing of a significant vo...,"[128000, 128006, 9125, 128007, 271, 38766, 130..."
4,I am trying to create a load testing script fo...,The correct approach would depend on the preci...,"[128000, 128006, 9125, 128007, 271, 38766, 130..."


In [None]:

#baseline: a second, un-quantized fp16 copy of the checkpoint loaded directly by the pipeline API
model_id = MODEL_NAME #reuse the configured checkpoint id instead of repeating the literal

pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)
pipe.model = torch.compile(pipe.model)

#this pipeline's own tokenizer gets the eos token as its padding token
pipe.tokenizer.pad_token_id = pipe.tokenizer.eos_token_id

pipe("Explain Newton's law of universal gravitation", min_length = 256, max_length=1024)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': "Explain Newton's law of universal gravitation\nNewton's law of universal gravitation states that every point mass attracts every other point mass by a force acting along the line intersecting both points. The force is proportional to the product of the two masses and inversely proportional to the square of the distance between them.\nMathematically, the law can be expressed as:\nF = G \\* (m1 \\* m2) / r^2\nWhere:\nF = gravitational force between the two masses\nG = gravitational constant (6.67408e-11 N\\*m^2/kg^2)\nm1 = mass of the first object\nm2 = mass of the second object\nr = distance between the centers of the two masses\n\nThis law explains the gravitational force between two objects, which is responsible for the motion of planets, moons, and stars in our solar system, as well as the behavior of galaxies and galaxy clusters. It also explains the falling of objects towards the ground, as the Earth's mass attracts the object's mass, causing it to accelerate t

In [None]:
#some pipe tests:
#the pipeline returns a list of result dicts; [0] selects the first one
pipe("Hello")[0]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'generated_text': 'Hello everyone, welcome back to my channel. Today, I want to talk about a very important topic:'}

In [None]:
#sampled generation test with the baseline pipeline
prompt = "explain matrices like I am 5"
response = pipe(
    prompt,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    eos_token_id=128001,
)[0]["generated_text"]
#the pipeline echoes the prompt first; drop it and surrounding whitespace to keep only the continuation
response = response[len(prompt):].strip()
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Imagine you have a bunch of toys, and each toy has a special name. Let's say you have a teddy bear named Mr. Whiskers, a doll named Princess Sparkles, and a car named Zoom.

Now imagine that each
