In [2]:
import pandas as pd
import numpy as np
import torch
import os
import torch.nn.functional as F
from langchain_openai.chat_models import ChatOpenAI 
import random  
from spacy.lang.en import English
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Device Setup

In [3]:
device = "cpu"

### Read Information from CSV

In [4]:
df = pd.read_csv('smus_page.csv')
df.head()
df.shape

(99, 2)

### Assigning Properties to each page

In [5]:
pages_and_text = []
for index, page in enumerate(df["Page Content"].tolist()):
    pages_and_text.append({"page number": index,
                           "page_char_count": len(page),
                           "page_word_count": len(page.split(" ")),
                           "page_sentence_count": len(page.split(".")),
                           "page_token_count": len(page)/4, # 1 token ~ 4 characters
                           "text": page})
random.sample(pages_and_text, 1)

[{'page number': 84,
  'page_char_count': 3691,
  'page_word_count': 625,
  'page_sentence_count': 23,
  'page_token_count': 922.75,
  'text': '   Breadcrumb VIDEO: Ontario University Tour By\nKyle Slavin\n-\nNovember 18, 2022 Tags: Share: As our Grade 12 students look ahead to June and what path they will take after graduation, our University Counselling department is helping ensure students have as much information and as many experiences to make confident choices about their future. That\'s why 29 students travelled to southern Ontario this past weekend to visit 11 universities and get a feel for the campuses, learn about the programs, see the facilities and connect with SMUS alumni who study there.(You can watch a video recap on the trip at the bottom of this story.) "We\'re all really glad this tour is back up and running again after COVID because we\'ve seen year after year the value of a trip like this for our students," says Ruth McGhee, Director of University Counselling. "Not

In [6]:
df = pd.DataFrame(pages_and_text)
df.head()

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,0,1496,241,14,374.0,Explore Cookie Settings When you visit any...
1,1,3488,567,30,872.0,Breadcrumb Start Here Thank you for choosin...
2,2,1169,194,10,292.25,Breadcrumb Admissions Publications If you ...
3,3,2809,462,20,702.25,Breadcrumb Middle School The Middle School ...
4,4,2167,345,15,541.75,Schaffter Hall for music (left) is home to ...


In [7]:
df.describe().round()

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,99.0,99.0,99.0,99.0,99.0
mean,49.0,3013.0,508.0,24.0,753.0
std,29.0,2864.0,487.0,27.0,716.0
min,0.0,25.0,4.0,1.0,6.0
25%,24.0,1180.0,186.0,10.0,295.0
50%,49.0,2470.0,438.0,17.0,618.0
75%,74.0,3594.0,628.0,29.0,898.0
max,98.0,18617.0,3169.0,180.0,4654.0


### Splitting pages into sentences

- using spacy library

In [8]:
nlp = English()

nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This another sentence. I like Elephants")

In [9]:
for item in tqdm(pages_and_text):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sent) for sent in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])
    

100%|██████████| 99/99 [00:00<00:00, 274.92it/s]


In [10]:
random.sample(pages_and_text, 1)

[{'page number': 27,
  'page_char_count': 5746,
  'page_word_count': 933,
  'page_sentence_count': 41,
  'page_token_count': 1436.5,
  'text': "   Breadcrumb A Lifetime of Leadership By\nGreg Gilks\n-\nJune 19, 2023 Tags: Share: This year, the Canadian Secondary Schools Rowing Association (CSSRA) recognized Susanne Walker Curry for her years of service to high school rowing by presenting her with a Lifetime Service Award. Susanne was honoured with the esteemed accolade during the 2023 CSSRA Championships held in St. Catharines, Ontario.Award recipients must have served high school rowing for at least 25 years. Susanne surpassed that mark through her 11 years at Brentwood College School and 17 years atSt. Michaels University School. Nevertheless, the essential prerequisite for Susanne was the need to have contributed to high school rowing by creating innovative, influential, and motivational initiatives. As Head of the SMUS rowing program, Susanne is known for her ability to develop wel

In [11]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,99.0,99.0,99.0,99.0,99.0,99.0
mean,49.0,3013.41,507.79,24.12,753.35,21.33
std,28.72,2864.27,486.92,26.55,716.07,23.28
min,0.0,25.0,4.0,1.0,6.25,1.0
25%,24.5,1179.5,186.0,10.0,294.88,7.5
50%,49.0,2470.0,438.0,17.0,617.5,14.0
75%,73.5,3594.0,628.0,29.0,898.5,26.0
max,98.0,18617.0,3169.0,180.0,4654.25,143.0


### Chunking


#### How to do?
- experiment how much sentence used for one chunk of the information
- it depends on each type of data

#### Purpose
- Our text is easier to filter
- Our text Chunk can fit into our embedding model (limit size depends on the model)
- Our context passed in LLMs will be more specific



In [12]:
num_sentence_chuck_size = 10

def split_list(input_list: list, slice_size: int = num_sentence_chuck_size) -> list[list[str]]:
    return [input_list[i : i + slice_size] for i in range(0, len(input_list), slice_size)]


In [13]:
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chuck_size)
    item["num_chunk"] = len(item["sentence_chunks"])

100%|██████████| 99/99 [00:00<00:00, 224573.33it/s]


In [14]:
random.sample(pages_and_text, 1)

[{'page number': 83,
  'page_char_count': 3578,
  'page_word_count': 596,
  'page_sentence_count': 34,
  'page_token_count': 894.5,
  'text': '   Breadcrumb 2019 Retirees: Janice Iverson By\nGillie Easdon\n-\nJune 30, 2019 Tags: Share: We are honoured to recognize members of the SMUS community as they retire and take on new adventures. Read the 2019 Retiree series to learn more about their outstanding contributions to the school. In this story, we recognize Janice Iverson, former Senior School receptionist. Janice Iverson Anyone who entered School House at the Senior School campus can likely attest to the warm welcome they received from Janice Iverson. In 1999, Janice came on board at St. Michaels University School for part-time reception. During the first years, she also explored a few other areas including admissions and university counselling before settling into her role at reception. Sincere and warm, Janice set the tone and managed the needs of each person who entered the school.

In [15]:
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunk
count,99.0,99.0,99.0,99.0,99.0,99.0,99.0
mean,49.0,3013.41,507.79,24.12,753.35,21.33,2.62
std,28.72,2864.27,486.92,26.55,716.07,23.28,2.37
min,0.0,25.0,4.0,1.0,6.25,1.0,1.0
25%,24.5,1179.5,186.0,10.0,294.88,7.5,1.0
50%,49.0,2470.0,438.0,17.0,617.5,14.0,2.0
75%,73.5,3594.0,628.0,29.0,898.5,26.0,3.0
max,98.0,18617.0,3169.0,180.0,4654.25,143.0,15.0


In [16]:
import re

pages_and_chunks = []

for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences
        joined_sentence_chunk = "".join(sentence_chunk).replace(" ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk)/4

        pages_and_chunks.append(chunk_dict)


  0%|          | 0/99 [00:00<?, ?it/s]


KeyError: 'page_number'

In [None]:
len(pages_and_chunks)

259

In [None]:
sameple = random.sample(pages_and_chunks, 1)
sameple[0]["sentence_chunk"]

'Susanne says there is no way this could happen without the generosity and kindness of our donors and supporters. This year the rowing program hopes to purchase a “heavy four”, with the cost of this boat being $30,000. As well, a new trailer is needed at a cost of $40,000. We hope to raise this $70,000 in our annual appeal in support of the SMUS Rowing program. The deadline to give is June 30. You can donate online atwww.smus.ca/givingor contact me atshara.campsall@smus.caor\xa0250-370-6197. “Thank you,” from our rowers!We asked four students currently competing at the CSSRA Regatta (national competition) to explain what the rowing program means to them. Hannah Look (Victoria; Grade 9) says the rowing program gives her a “sense of home” at the school and has instilled a confidence that she carries into life outside of rowing. The busy training schedule has taught her to manage her time wisely.'

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,259.0,259.0,259.0,259.0
mean,50.19,1158.53,198.47,289.63
std,27.05,451.73,83.6,112.93
min,0.0,152.0,22.0,38.0
25%,27.5,901.5,150.0,225.38
50%,49.0,1180.0,201.0,295.0
75%,72.5,1451.0,252.5,362.75
max,98.0,2857.0,506.0,714.25


### Embedding

- convert text to number that can be understanable by the computer

   #### Search for the model

   - MTEB Leaderboard on huggingface

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path= "all-mpnet-base-v2",
                                      device="cpu")

sentences = ["The Sentence Transformer library provides an wasy way to create embeddings.",
             "Sentences can be embedded one by one or in a list",
             "I like horses"]

embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

for sentence, embedding in embeddings_dict.items():
    print(sentence)
    print(embedding)





The Sentence Transformer library provides an wasy way to create embeddings.
[-1.53364204e-02  3.33119258e-02 -1.25479260e-02  5.33723645e-02
 -1.75136682e-02 -4.48479317e-03  1.22677190e-02 -4.26852703e-02
  2.69533657e-02 -3.18119265e-02  1.87259670e-02  3.54916975e-02
 -3.72126848e-02 -2.11610347e-02  3.34009379e-02 -2.71013509e-02
  5.45411706e-02  1.54201956e-02 -2.61605289e-02 -3.00867110e-03
  2.56527402e-02  2.48738341e-02  2.35220846e-02  3.97322252e-02
 -1.60350464e-02 -2.88301669e-02 -9.60128661e-03 -3.82793844e-02
  4.49242778e-02 -1.63893923e-02 -1.33881047e-02 -4.96859755e-03
  4.76707667e-02 -3.02251009e-03  1.15981516e-06  2.12925449e-02
 -1.58308856e-02 -2.74923947e-02  1.93825702e-03  1.84242986e-02
  4.54363376e-02 -3.32864821e-02  9.40076634e-03  3.00573558e-02
 -4.61622812e-02 -9.16054752e-03  4.53573912e-02  2.07901243e-02
  7.93581232e-02  3.99667434e-02 -1.74309313e-02 -4.36245799e-02
  7.84619618e-03 -7.98129011e-03 -2.96687819e-02  4.62609567e-02
 -2.29533743e-

In [None]:
embeddings[0].shape

(768,)

In [None]:
%%time

embedding_model.to("cpu")

# Embed each chunk one by one

for item in tqdm(pages_and_chunks):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 259/259 [01:30<00:00,  2.87it/s]

CPU times: user 8min 57s, sys: 41.3 s, total: 9min 39s
Wall time: 1min 30s





In [None]:
sample = random.sample(pages_and_chunks, 1)
sample[0]["embedding"]


array([ 4.33917865e-02, -1.51862632e-02, -1.09376712e-03,  4.61332649e-02,
        6.94343001e-02, -1.18158190e-02, -5.67911863e-02, -2.91468594e-02,
       -8.30745101e-02,  9.52781457e-03,  7.20625510e-03, -1.33986091e-02,
        4.42636684e-02, -9.17292163e-02, -1.29950894e-02, -3.93224619e-02,
       -1.76010095e-02,  1.83008574e-02, -6.25984520e-02, -7.56102148e-03,
       -5.20483516e-02,  7.79342744e-03, -1.20072148e-03, -7.27459183e-03,
        2.40627266e-02, -8.84974189e-03,  2.96161231e-02,  5.59710823e-02,
       -1.96063425e-02, -2.54095122e-02,  2.71397159e-02,  3.54757160e-02,
       -5.79457637e-03,  6.47468492e-03,  2.01984517e-06, -1.30684441e-02,
        2.61300150e-02, -5.28658405e-02, -8.39629769e-03, -1.30333221e-02,
       -2.49877367e-02,  3.16087827e-02, -4.64923047e-02, -2.89334357e-02,
       -3.70982327e-02,  4.27614786e-02, -1.97627340e-02, -4.72929738e-02,
        2.97264848e-03,  3.94299440e-02,  9.96607076e-03, -6.58278316e-02,
       -2.75939796e-02, -

In [None]:
# Save embeddings to file

text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks)
text_chunks_and_embeddings_df.to_csv("text_chunks_and_embeddings_df.csv", index=False)

If your embedding database is really large (more than 100000 embeddings), you might need a vector database

### RAG Search

In [None]:
text_chunks_and_embeddings_df_load = pd.read_csv("text_chunks_and_embeddings_df.csv")
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,Explore Cookie Settings When you visit any web...,1488,233,372.0,[-1.47433793e-02 -8.54157656e-03 8.76627397e-...
1,1,Breadcrumb Start Here Thank you for choosing t...,1459,243,364.75,[ 2.52854894e-03 -1.32397227e-02 -1.79530345e-...
2,1,All applications at SMUS are completed online....,1071,164,267.75,[-1.49624804e-02 -8.15582946e-02 -1.13902811e-...
3,1,Request a Meeting You can schedule a visit to ...,949,153,237.25,[-2.03966144e-02 -1.69681329e-02 1.02613587e-...
4,2,Breadcrumb Admissions Publications If you are...,1162,187,290.5,[-5.64916804e-03 -8.05208087e-03 6.52203569e-...


In [None]:
# create numpy array from string in the excel

text_chunks_and_embeddings_df_load["embedding"] = text_chunks_and_embeddings_df_load["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

In [None]:
pages_and_chunks = text_chunks_and_embeddings_df_load.to_dict(orient="records")

In [None]:
embeddings = torch.tensor(np.array(text_chunks_and_embeddings_df_load["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([259, 768])

In [None]:
text_chunks_and_embeddings_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,Explore Cookie Settings When you visit any web...,1488,233,372.0,"[-0.0147433793, -0.00854157656, 0.00876627397,..."
1,1,Breadcrumb Start Here Thank you for choosing t...,1459,243,364.75,"[0.00252854894, -0.0132397227, -0.0179530345, ..."
2,1,All applications at SMUS are completed online....,1071,164,267.75,"[-0.0149624804, -0.0815582946, -0.00113902811,..."
3,1,Request a Meeting You can schedule a visit to ...,949,153,237.25,"[-0.0203966144, -0.0169681329, 0.0102613587, -..."
4,2,Breadcrumb Admissions Publications If you are...,1162,187,290.5,"[-0.00564916804, -0.00805208087, 0.00652203569..."


In [None]:
embeddings[0]

tensor([-1.4743e-02, -8.5416e-03,  8.7663e-03, -6.7720e-03,  3.5212e-02,
        -3.4428e-02,  1.8327e-02, -9.2706e-02, -2.1836e-02, -1.7262e-02,
         5.8780e-02, -5.5275e-02,  1.6331e-02,  4.3085e-02,  1.4266e-02,
        -4.6976e-03,  1.2160e-02, -2.5105e-02, -3.9222e-02, -1.8225e-04,
        -5.6515e-02,  3.0148e-02, -2.3729e-02, -6.2771e-03,  1.5246e-02,
        -5.8004e-02,  3.4947e-02,  3.6927e-02,  5.6707e-02, -2.4298e-02,
        -1.2276e-02,  3.9633e-02, -9.0226e-03,  2.0421e-02,  2.6676e-06,
        -2.0374e-02,  6.3007e-03, -4.5921e-03, -6.5341e-02,  1.3661e-02,
        -7.6420e-02,  5.7642e-02, -1.9695e-02,  2.8077e-02, -3.9989e-02,
         7.1892e-03,  2.4605e-02, -1.2449e-01,  4.4408e-03, -6.1733e-03,
        -1.9946e-02, -4.0943e-02, -9.0203e-02,  3.1184e-03, -1.3094e-02,
         2.5796e-02,  2.1730e-02, -2.6932e-02,  2.3366e-02,  4.1744e-02,
        -4.9328e-03, -3.8559e-03, -4.4390e-02, -7.3335e-02,  6.4618e-02,
         2.3109e-02, -2.5400e-02,  3.0434e-02,  6.8

In [None]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device=device)



### Finding the closest embeddings to the query

![Local Image](../images/dot-product-visualize.png)

In [None]:
query = "Who is the head of the St. Michael University School?"

query_embedding = embedding_model.encode(query)

# get simliarity score

from time import perf_counter as timer

start_time = timer()
# dot product of every embedding with the query embedding and rank them
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

# take the top 5 results
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product


torch.return_types.topk(
values=tensor([0.6863, 0.6664, 0.6563, 0.6467, 0.6408]),
indices=tensor([108, 153, 135, 214,  33]))

In [None]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

### Show the related information to the piece

In [None]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print("\n")

Query: 'Who is the head of the St. Michael University School?'

Results:
Score: 0.6863
Text:
Tags: Mark Turner Mark Turner is Head of School at St. Michaels University
School. You might also be interested in Head of School Head of School NEWS -
August 29, 2024 Start of Year Welcome - August 29, 2024 Head of School Head of
School NEWS - May 2, 2024 Welcoming New Head of School, Dr. Jeff Aitken Head of
School Head of School NEWS - April 4, 2024 Embracing Spring: Cultivating
Excellence in Student Pursuits St. Michaels University School is an independent
day and boarding school of 1,000 students from Junior Kindergarten to Grade 12
in Victoria, BC, Canada. Main Reception 3400 Richmond Road Victoria, BC, Canada,
V8P 4P5 © St. Michaels University SchoolWebsite Feedback We are a proud member
of: Event Details


Score: 0.6664
Text:
Breadcrumb Governance and Community SMUS is thankful to have a dedicated Board
of Governors as well as several organizations within our community who are
working on

We can also link the model back to their original url to search it

### LLM