In [None]:
import pandas as pd
import numpy as np
import torch
import os
import torch.nn.functional as F
import random
from spacy.lang.en import English
from tqdm.auto import tqdm

### Device Setup

### Read Information from CSV

In [None]:
df = pd.read_csv('smus_page.csv')
df.head()
df.shape

(99, 2)

### Assigning Properties to each page

In [None]:
pages_and_text = []
for index, page in enumerate(df["Page Content"].tolist()):
    pages_and_text.append({"page_number": index,
                           "page_char_count": len(page),
                           "page_word_count": len(page.split(" ")),
                           "page_sentence_count": len(page.split(".")),
                           "page_token_count": len(page)/4, # 1 token ~ 4 characters
                           "text": page})
random.sample(pages_and_text, 1)

[{'page_number': 13,
  'page_char_count': 1010,
  'page_word_count': 154,
  'page_sentence_count': 7,
  'page_token_count': 252.5,
  'text': '   Breadcrumb  Contact Advancement From stewarding donors and alumni relations, to ensuring the behind-the-scenes work is done to manage your investments in SMUS, our team is deeply committed to ensuring every student has every opportunity to succeed at SMUS and in life. We look forward to hearing from you. The Advancement Team Meet the Advancement team to learn about the work they do on behalf of the SMUS community and our students. Shara Campsall Director of Advancement250-370-6197 (office), 250-216-6460 (cell) Joanna Verano Annual Fund Manager250-370-6106 Denise Rees Advancement Associate - Alumni Relations250-370-7508 Tim Vuksic Advancement Database and Campaign Administrator250-370-6175 St. Michaels University School is an independent day and boarding school of 1,000 students from Junior Kindergarten to Grade 12 in Victoria, BC, Canada. Main

In [None]:
df = pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,0,1496,241,14,374.0,Explore Cookie Settings When you visit any...
1,1,3488,567,30,872.0,Breadcrumb Start Here Thank you for choosin...
2,2,1169,194,10,292.25,Breadcrumb Admissions Publications If you ...
3,3,2809,462,20,702.25,Breadcrumb Middle School The Middle School ...
4,4,2167,345,15,541.75,Schaffter Hall for music (left) is home to ...


In [None]:
df.describe().round()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,99.0,99.0,99.0,99.0,99.0
mean,49.0,3013.0,508.0,24.0,753.0
std,29.0,2864.0,487.0,27.0,716.0
min,0.0,25.0,4.0,1.0,6.0
25%,24.0,1180.0,186.0,10.0,295.0
50%,49.0,2470.0,438.0,17.0,618.0
75%,74.0,3594.0,628.0,29.0,898.0
max,98.0,18617.0,3169.0,180.0,4654.0


### Splitting pages into sentences

- using spacy library

In [None]:
nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This another sentence. I like Elephants")

In [None]:
for item in tqdm(pages_and_text):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sent) for sent in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])


  0%|          | 0/99 [00:00<?, ?it/s]

In [None]:
random.sample(pages_and_text, 1)

[{'page_number': 33,
  'page_char_count': 2356,
  'page_word_count': 395,
  'page_sentence_count': 20,
  'page_token_count': 589.0,
  'text': "   Breadcrumb Alumni Our global alumni community has almost 10,000 members located in 70 countries worldwide. From world-class athletes, to notable inventors and celebrated artists, you can find a SMUS graduate in every walk of life. Welcome Back! Thank you for stopping by the alumni section of the SMUS website. Whether you are a graduate of University School (1906), St. Michael’s School (1910) or the amalgamated organization which became St. Michaels University School in 1971, these pages provide many ways to connect and reconnect with your fellow alumni and the school community. On these pages, you will find alumni news, details of upcoming events as well as networking opportunities. If you have any questions about how you can engage with the school, please don’t hesitate tocontact Alumni Relationsat SMUS. Alumni News and Events SMUS Connect Y

In [None]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,99.0,99.0,99.0,99.0,99.0,99.0
mean,49.0,3013.41,507.79,24.12,753.35,21.33
std,28.72,2864.27,486.92,26.55,716.07,23.28
min,0.0,25.0,4.0,1.0,6.25,1.0
25%,24.5,1179.5,186.0,10.0,294.88,7.5
50%,49.0,2470.0,438.0,17.0,617.5,14.0
75%,73.5,3594.0,628.0,29.0,898.5,26.0
max,98.0,18617.0,3169.0,180.0,4654.25,143.0


### Chunking


#### How to do?
- experiment how much sentence used for one chunk of the information
- it depends on each type of data

#### Purpose
- Our text is easier to filter
- Our text Chunk can fit into our embedding model (limit size depends on the model)
- Our context passed in LLMs will be more specific



In [None]:
num_sentence_chuck_size = 10

def split_list(input_list: list, slice_size: int = num_sentence_chuck_size) -> list[list[str]]:
    return [input_list[i : i + slice_size] for i in range(0, len(input_list), slice_size)]


In [None]:
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chuck_size)
    item["num_chunk"] = len(item["sentence_chunks"])

  0%|          | 0/99 [00:00<?, ?it/s]

In [None]:
random.sample(pages_and_text, 1)

[{'page_number': 27,
  'page_char_count': 5746,
  'page_word_count': 933,
  'page_sentence_count': 41,
  'page_token_count': 1436.5,
  'text': "   Breadcrumb A Lifetime of Leadership By\nGreg Gilks\n-\nJune 19, 2023 Tags: Share: This year, the Canadian Secondary Schools Rowing Association (CSSRA) recognized Susanne Walker Curry for her years of service to high school rowing by presenting her with a Lifetime Service Award. Susanne was honoured with the esteemed accolade during the 2023 CSSRA Championships held in St. Catharines, Ontario.Award recipients must have served high school rowing for at least 25 years. Susanne surpassed that mark through her 11 years at Brentwood College School and 17 years atSt. Michaels University School. Nevertheless, the essential prerequisite for Susanne was the need to have contributed to high school rowing by creating innovative, influential, and motivational initiatives. As Head of the SMUS rowing program, Susanne is known for her ability to develop wel

In [None]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,99.0,99.0,99.0,99.0,99.0,99.0
mean,49.0,3013.41,507.79,24.12,753.35,21.33
std,28.72,2864.27,486.92,26.55,716.07,23.28
min,0.0,25.0,4.0,1.0,6.25,1.0
25%,24.5,1179.5,186.0,10.0,294.88,7.5
50%,49.0,2470.0,438.0,17.0,617.5,14.0
75%,73.5,3594.0,628.0,29.0,898.5,26.0
max,98.0,18617.0,3169.0,180.0,4654.25,143.0


In [None]:
import re

pages_and_chunks = []

for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences
        joined_sentence_chunk = "".join(sentence_chunk).replace(" ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk)/4

        pages_and_chunks.append(chunk_dict)


  0%|          | 0/99 [00:00<?, ?it/s]

In [None]:
len(pages_and_chunks)

0

In [None]:
sameple = random.sample(pages_and_chunks, 1)
sameple[0]["sentence_chunk"]

"Breadcrumb  Rugby Camp The all-boys Rugby Camp is for athletes who are ready to take their game to the next level!Our rugby camp has been designed to develop technical skills and rugby IQ. Our athletes will take part in daily controlled scrums, engage in video analysis sessions, and learn proper fitness training. The training sessions, both on and off the field, will pinpoint areas for improvement in your game and fitness. Each athlete will receive personalized feedback from our group of experience coaches. Join us for a week of rugby, and make lifelong friends along the way!What's Included On-Field Focus There are also daily themes around off-field development, such as growth, awareness, innovation, and nurture, to further enhance and round out the skills learned on the field. Meet Scott Manning, Head Rugby Coach Over the past two decades Rugby has been woven into the life of Scott Manning, the Head Coach the University of Victoria men’s rugby team. While Scott has represented Canada

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,259.0,259.0,259.0,259.0
mean,50.19,1147.81,190.71,286.95
std,27.05,451.48,77.84,112.87
min,0.0,24.0,3.0,6.0
25%,27.5,898.5,149.5,224.62
50%,49.0,1175.0,194.0,293.75
75%,72.5,1439.0,234.5,359.75
max,98.0,2857.0,506.0,714.25


### Embedding

- convert text to number that can be understanable by the computer

   #### Search for the model

   - MTEB Leaderboard on huggingface

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2",
                                      device="cuda")

sentences = ["The Sentence Transformer library provides an wasy way to create embeddings.",
             "Sentences can be embedded one by one or in a list",
             "I like horses"]

embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(sentence)
    print(embedding)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


The Sentence Transformer library provides an wasy way to create embeddings.
[-1.53364372e-02  3.33118662e-02 -1.25479177e-02  5.33722974e-02
 -1.75136216e-02 -4.48477594e-03  1.22677935e-02 -4.26853038e-02
  2.69533116e-02 -3.18118930e-02  1.87260173e-02  3.54916230e-02
 -3.72126438e-02 -2.11609658e-02  3.34009565e-02 -2.71013491e-02
  5.45411669e-02  1.54202022e-02 -2.61604823e-02 -3.00866924e-03
  2.56527029e-02  2.48738285e-02  2.35219859e-02  3.97322960e-02
 -1.60349756e-02 -2.88301520e-02 -9.60131362e-03 -3.82793918e-02
  4.49242704e-02 -1.63893905e-02 -1.33881876e-02 -4.96864179e-03
  4.76707444e-02 -3.02254013e-03  1.15981470e-06  2.12925971e-02
 -1.58309676e-02 -2.74923556e-02  1.93826505e-03  1.84243210e-02
  4.54363413e-02 -3.32866088e-02  9.40077845e-03  3.00573949e-02
 -4.61623222e-02 -9.16058291e-03  4.53574136e-02  2.07901001e-02
  7.93580487e-02  3.99667509e-02 -1.74309723e-02 -4.36245985e-02
  7.84611516e-03 -7.98133388e-03 -2.96688471e-02  4.62609679e-02
 -2.29533706e-

In [None]:
embeddings[0].shape

(768,)

In [None]:
%%time

embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2",
                                      device="cuda")
# Embed each chunk one by one

for item in tqdm(pages_and_chunks):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])



  0%|          | 0/259 [00:00<?, ?it/s]

CPU times: user 6.8 s, sys: 373 ms, total: 7.18 s
Wall time: 10 s


In [None]:
sample = random.sample(pages_and_chunks, 1)
sample[0]["embedding"]


array([ 2.01097429e-02,  6.36952966e-02,  1.04703978e-02, -9.91522241e-03,
        2.63529699e-02, -2.74645239e-02, -2.20427127e-03, -2.84896288e-02,
       -2.38120016e-02, -1.71676278e-02,  8.76023322e-02, -4.11731238e-03,
        2.96306238e-02,  3.32866833e-02, -2.30753813e-02, -1.94692295e-02,
        5.68944123e-03, -2.97529418e-02, -1.00075513e-01,  1.86980348e-02,
       -6.14522286e-02,  3.08230612e-02, -1.68346229e-03,  2.29487172e-03,
        1.97232999e-02, -2.79476792e-02,  3.48706655e-02,  4.44663689e-02,
        5.54133058e-02, -5.23478873e-02, -7.28687504e-04,  1.80187095e-02,
       -7.68829230e-03, -3.47028524e-02,  3.34142806e-06, -1.43009108e-02,
        1.44045688e-02, -2.06898209e-02, -8.63935053e-02,  3.49654593e-02,
       -4.74326387e-02,  7.17559978e-02, -1.41848754e-02, -2.71349354e-03,
       -5.54504246e-02, -8.70015007e-03,  3.28037073e-03, -1.29999325e-01,
       -1.92982890e-02, -3.03924102e-02, -1.15700308e-02,  2.14751484e-03,
       -1.03538863e-01, -

In [None]:
# Save embeddings to file

text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks)
text_chunks_and_embeddings_df.to_csv("text_chunks_and_embeddings_df.csv", index=False)

If your embedding database is really large (more than 100000 embeddings), you might need a vector database

### RAG Search

In [None]:
text_chunks_and_embeddings_df_load = pd.read_csv("text_chunks_and_embeddings_df.csv")
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,Explore Cookie Settings When you visit any web...,1488,233,372.0,[-1.47433244e-02 -8.54152255e-03 8.76629446e-...
1,1,Breadcrumb Start Here Thank you for choosing t...,1459,243,364.75,[ 2.52863625e-03 -1.32396845e-02 -1.79529879e-...
2,1,All applications at SMUS are completed online....,1071,164,267.75,[-1.49623724e-02 -8.15583467e-02 -1.13896967e-...
3,1,Request a Meeting You can schedule a visit to ...,949,153,237.25,[-2.03965530e-02 -1.69680640e-02 1.02613913e-...
4,2,Breadcrumb Admissions Publications If you are...,1162,187,290.5,[-5.64908609e-03 -8.05202033e-03 6.52202684e-...


In [None]:
# create numpy array from string in the excel

text_chunks_and_embeddings_df_load["embedding"] = text_chunks_and_embeddings_df_load["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

In [None]:
pages_and_chunks = text_chunks_and_embeddings_df_load.to_dict(orient="records")

In [None]:
embeddings = torch.tensor(np.array(text_chunks_and_embeddings_df_load["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([259, 768])

In [None]:
text_chunks_and_embeddings_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,Explore Cookie Settings When you visit any web...,1488,233,372.0,"[-0.014743324, -0.008541523, 0.008766294, -0.0..."
1,1,Breadcrumb Start Here Thank you for choosing t...,1459,243,364.75,"[0.0025286363, -0.0132396845, -0.017952988, -0..."
2,1,All applications at SMUS are completed online....,1071,164,267.75,"[-0.014962372, -0.08155835, -0.0011389697, -0...."
3,1,Request a Meeting You can schedule a visit to ...,949,153,237.25,"[-0.020396553, -0.016968064, 0.010261391, -0.0..."
4,2,Breadcrumb Admissions Publications If you are...,1162,187,290.5,"[-0.005649086, -0.00805202, 0.006522027, -0.01..."


In [None]:
embeddings[0]

tensor([-1.4743e-02, -8.5415e-03,  8.7663e-03, -6.7721e-03,  3.5212e-02,
        -3.4428e-02,  1.8326e-02, -9.2706e-02, -2.1836e-02, -1.7262e-02,
         5.8780e-02, -5.5275e-02,  1.6331e-02,  4.3085e-02,  1.4266e-02,
        -4.6976e-03,  1.2160e-02, -2.5105e-02, -3.9222e-02, -1.8223e-04,
        -5.6515e-02,  3.0148e-02, -2.3729e-02, -6.2771e-03,  1.5246e-02,
        -5.8004e-02,  3.4947e-02,  3.6927e-02,  5.6707e-02, -2.4298e-02,
        -1.2276e-02,  3.9633e-02, -9.0226e-03,  2.0421e-02,  2.6676e-06,
        -2.0374e-02,  6.3007e-03, -4.5921e-03, -6.5341e-02,  1.3661e-02,
        -7.6420e-02,  5.7642e-02, -1.9695e-02,  2.8077e-02, -3.9989e-02,
         7.1893e-03,  2.4605e-02, -1.2449e-01,  4.4408e-03, -6.1733e-03,
        -1.9946e-02, -4.0943e-02, -9.0203e-02,  3.1184e-03, -1.3094e-02,
         2.5796e-02,  2.1730e-02, -2.6932e-02,  2.3366e-02,  4.1744e-02,
        -4.9328e-03, -3.8559e-03, -4.4390e-02, -7.3335e-02,  6.4618e-02,
         2.3109e-02, -2.5401e-02,  3.0434e-02,  6.8

In [None]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")



### Finding the closest embeddings to the query

![Local Image](https://github.com/monzzzz/SMUS-AI-Club/blob/main/AI-02-SMUS-Assistance/images/dot-product-visualize.png?raw=1)

In [None]:
query = "Who is the head of the St. Michael University School?"

query_embedding = embedding_model.encode(query)

# get simliarity score

from time import perf_counter as timer

start_time = timer()
# dot product of every embedding with the query embedding and rank them
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

# take the top 5 results
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product


torch.return_types.topk(
values=tensor([0.6863, 0.6664, 0.6563, 0.6467, 0.6408]),
indices=tensor([108, 153, 135, 214,  33]))

In [None]:
# Define helper function to print wrapped text
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

### Show the related information to the piece

In [None]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print("\n")

Query: 'Who is the head of the St. Michael University School?'

Results:
Score: 0.6863
Text:
Tags: Mark Turner Mark Turner is Head of School at St. Michaels University
School. You might also be interested in Head of School Head of School NEWS -
August 29, 2024 Start of Year Welcome - August 29, 2024 Head of School Head of
School NEWS - May 2, 2024 Welcoming New Head of School, Dr. Jeff Aitken Head of
School Head of School NEWS - April 4, 2024 Embracing Spring: Cultivating
Excellence in Student Pursuits St. Michaels University School is an independent
day and boarding school of 1,000 students from Junior Kindergarten to Grade 12
in Victoria, BC, Canada. Main Reception 3400 Richmond Road Victoria, BC, Canada,
V8P 4P5 © St. Michaels University SchoolWebsite Feedback We are a proud member
of: Event Details


Score: 0.6664
Text:
Breadcrumb Governance and Community SMUS is thankful to have a dedicated Board
of Governors as well as several organizations within our community who are
working on

We can also link the model back to their original url to search it

### LLM (Required GPU)

In [None]:
# check memory in the GPU

import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes/2**30)
print(gpu_memory_gb)

# flash attention 2. See Github

15


In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.0


accept terms and conditions on huggingface before using a model (Huggingface CLI)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available

In [None]:
print(f"Total VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024 ** 3:.2f} GB")

Total VRAM: 14.75 GB


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
!pip install bitsandbytes --upgrade



In [None]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Enables double quantization to further reduce memory
    bnb_4bit_quant_type="nf4",  # Use the optimized NF4 quantization (or "fp4" for normal 4-bit)
    bnb_4bit_compute_dtype=torch.float16  # Compute using float16 to optimize speed and memory
)

In [None]:
torch.cuda.get_device_capability(0)
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash"
else:
    attn_implementation = "eager"

model_id = "google/gemma-7b-it"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

llm_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,
    quantization_config=quantization_config,  # Enable 8-bit quantization
    low_cpu_mem_usage=True,  # Further reduce memory usage
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear4bit(in_features=24576, out_features=3072, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((3072,), eps=1e-06)
        (post_attention_layernorm):

In [None]:
def get_model_num_params(model: torch.nn.Module):
    return sum([p.numel() for p in model.parameters()])

get_model_num_params(llm_model)


4662144000

In [None]:

def get_model_mem_size(model: torch.nn.Module):
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffer = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    model_mem_bytes = mem_params + mem_buffer
    model_mem_mb= model_mem_bytes / 1024**2
    model_mem_gb = model_mem_bytes / 1024**3
    return {"model_mem_bytes": model_mem_bytes, "model_mem_mb": model_mem_mb, "model_mem_gb": model_mem_gb}

get_model_mem_size


In [None]:
# WE got the size our model. This means we need minimum of 16 GB GPU to run this model

In [None]:
input_text = "Who is the head of St. Michael University School?"
print(f"Input text", input_text)

# Create a prompt template
dialogue_template = [
    {"role": "user",
     "content": input_text},
]

prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False,
                                       add_generation_prompt=True)

print(prompt)


Input text What is the head of St. Michael University School?
<bos><start_of_turn>user
What is the head of St. Michael University School?<end_of_turn>
<start_of_turn>model



In [None]:
%%time

# Tokenizer

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

CPU times: user 1.64 ms, sys: 666 µs, total: 2.31 ms
Wall time: 19.2 ms


In [None]:
# generate the output

outputs = llm_model.generate(**input_ids, max_new_tokens=256)

In [None]:
outputs

tensor([[     2,      2,    106,   1645,    108,   1841,    603,    573,   2206,
            576,    997, 235265,   7939,   2895,   4249, 235336,    107,    108,
            106,   2516,    108, 235285,    749,    780,    791,   3684,    577,
           1879, 235290,   1602,   2113, 235269,   5852,    590,   2952,   3448,
            573,   2872,   1105,    573,   2206,    576,    997, 235265,   7939,
           2895,   4249, 235265,      1]], device='cuda:0')

In [None]:
outputs_decoded = tokenizer.decode(outputs[0])

In [None]:
outputs_decoded

'<bos><bos><start_of_turn>user\nWhat is the head of St. Michael University School?<end_of_turn>\n<start_of_turn>model\nI do not have access to real-time information, therefore I cannot answer the question about the head of St. Michael University School.<eos>'

In [2]:
!git add .

fatal: not a git repository (or any of the parent directories): .git
