In [26]:
import pandas as pd
from rag import get_embedding
import numpy as np
import torch
import os
import torch.nn.functional as F
from langchain_openai.chat_models import ChatOpenAI 
import random  
from spacy.lang.en import English
from tqdm.auto import tqdm

### Read Information from CSV

In [8]:
# Load the scraped pages; later cells read the text from the "Page Content" column.
df = pd.read_csv('smus_page.csv')

# Only the last expression of a cell is rendered, so the shape is printed
# explicitly and the preview is left as the cell's final expression.
print(df.shape)
df.head()



torch.Size([1, 279, 768])

### Assigning Properties to each page

In [13]:
# Build one record per page: its position in the CSV, a few rough size
# statistics, and the raw page text itself.
# NOTE(review): split(" ") also counts the empty strings produced by runs of
# spaces, so page_word_count is an upper bound — confirm whether split() is wanted.
pages_and_text = [
    {
        "page number": page_idx,
        "page_char_count": len(page_text),
        "page_word_count": len(page_text.split(" ")),
        "page_sentence_count": len(page_text.split(".")),
        "page_token_count": len(page_text) / 4,  # rule of thumb: 1 token ~ 4 characters
        "text": page_text,
    }
    for page_idx, page_text in enumerate(df["Page Content"].tolist())
]

# Spot-check a single randomly chosen record.
random.sample(pages_and_text, k=1)

[{'page number': 80,
  'page_char_count': 6501,
  'page_word_count': 1008,
  'page_sentence_count': 56,
  'page_token_count': 1625.25,
  'text': '   Breadcrumb Privacy Policy We at St. Michaels University School want you, our parents, students and staff, to be aware of how and why we handle your personal information. We work hard to respect and maintain your privacy. However, the very nature of our business is such that the collection, use and disclosure of personal information is fundamental to the services we provide. SMUS has adopted these privacy principles, which apply to the collection, use and disclosure of personal information. Personal information, for the purposes of these privacy principles, means information that identifies an individual. For example: an individual’s name, birth date, address, age, health and financial information is personal information which SMUS may collect, use and in certain circumstances, where necessary, disclose, in the course of carrying on busines

In [14]:
# Turn the per-page records into a DataFrame for inspection.
# NOTE: this rebinds `df`, replacing the raw CSV frame whose "Page Content"
# column the record-building cell reads; reload the CSV before re-running it.
df = pd.DataFrame(data=pages_and_text)
df.head(5)

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,0,1496,241,14,374.0,Explore Cookie Settings When you visit any...
1,1,3488,567,30,872.0,Breadcrumb Start Here Thank you for choosin...
2,2,1169,194,10,292.25,Breadcrumb Admissions Publications If you ...
3,3,2809,462,20,702.25,Breadcrumb Middle School The Middle School ...
4,4,2167,345,15,541.75,Schaffter Hall for music (left) is home to ...


In [15]:
# Summary statistics of the numeric columns, rounded to whole numbers.
df.describe().round(decimals=0)

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,99.0,99.0,99.0,99.0,99.0
mean,49.0,3041.0,528.0,24.0,760.0
std,29.0,2867.0,497.0,27.0,717.0
min,0.0,162.0,23.0,1.0,40.0
25%,24.0,1180.0,196.0,10.0,295.0
50%,49.0,2517.0,467.0,17.0,629.0
75%,74.0,3641.0,665.0,29.0,910.0
max,98.0,18664.0,3216.0,180.0,4666.0


### Splitting pages into sentences

- Using the spaCy library

In [21]:
# Start from a blank English pipeline.
nlp = English()

# Add spaCy's "sentencizer" component so processed docs expose `.sents`.
nlp.add_pipe("sentencizer")

# Sanity check on a tiny example.
doc = nlp("This is a sentence. This another sentence. I like Elephants")

# An assignment displays nothing, so end the cell with the sentences themselves.
list(doc.sents)

[This is a sentence., This another sentence., I like Elephants]

In [30]:
# Split every page into sentences with the spaCy pipeline and store them as
# plain strings, together with the resulting sentence count.
for item in tqdm(pages_and_text):
    item["sentences"] = [str(sent) for sent in nlp(item["text"]).sents]
    item["page_sentence_count_spacy"] = len(item["sentences"])
    

100%|██████████| 99/99 [00:00<00:00, 554.90it/s]


In [31]:
# Spot-check one random page now that the sentence fields have been added.
random.sample(pages_and_text, k=1)

[{'page number': 75,
  'page_char_count': 2930,
  'page_word_count': 513,
  'page_sentence_count': 24,
  'page_token_count': 732.5,
  'text': '   Breadcrumb  Food in Boarding Boarders at SMUS have their meals in Graves Hall, one of the most spectacular dining halls in all of Canada. You don’t need us to tell you that good food is important. And when you’re busy with school and your extracurricular activities, the last thing you will want to think about is “What’s for dinner?” Don’t worry – we have that covered with a nutritious, balanced diet. As a boarding student, you will be served three balanced meals a day: breakfast, lunch and dinner, as well as snacks throughout the day. On Sundays, boarders get a special brunch (a late breakfast/early lunch). All meals are served in Graves Hall in the Sun Centre, which is just steps from your boarding house. "I like that if you want to be healthy, the dining hall has a lot of options. You can get so many fruits and yogurts and salads, and there

In [34]:
# Rebuild the stats frame so it includes the spaCy sentence counts,
# then summarise the numeric columns to two decimal places.
df = pd.DataFrame(data=pages_and_text)
df.describe().round(decimals=2)

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,99.0,99.0,99.0,99.0,99.0,99.0
mean,49.0,3041.46,528.09,24.3,760.37,21.33
std,28.72,2867.09,496.72,26.54,716.77,23.28
min,0.0,162.0,23.0,1.0,40.5,1.0
25%,24.5,1179.5,196.5,10.0,294.88,7.5
50%,49.0,2517.0,467.0,17.0,629.25,14.0
75%,73.5,3641.0,665.0,29.0,910.25,26.0
max,98.0,18664.0,3216.0,180.0,4666.0,143.0


### Chunking


#### How to do it?
- Experiment with how many sentences go into each chunk of information
- The right number depends on the type of data

#### Purpose
- Our text is easier to filter
- Our text chunks can fit into our embedding model (the size limit depends on the model)
- The context we pass to the LLM will be more specific



In [37]:
# Number of sentences grouped into one chunk; tune per dataset.
num_sentence_chuck_size = 10

def split_list(input_list: list, slice_size: int = num_sentence_chuck_size) -> list[list[str]]:
    """Split ``input_list`` into consecutive sub-lists of at most ``slice_size`` items.

    The order of the items is preserved and the final sub-list holds whatever
    is left over, so it may be shorter than ``slice_size``. An empty input
    yields an empty list.

    Args:
        input_list: The list to split (here: the sentences of one page).
        slice_size: Maximum number of items per sub-list; must be >= 1.
            The default is the value ``num_sentence_chuck_size`` had when this
            function was defined.

    Returns:
        A list of sub-lists covering ``input_list`` from start to end.

    Raises:
        ValueError: If ``slice_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item;
    # a zero step fails inside range() with an unrelated-looking message.
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i : i + slice_size] for i in range(0, len(input_list), slice_size)]


In [36]:
# Group each page's sentences into fixed-size chunks and record how many
# chunks the page produced.
for item in tqdm(pages_and_text):
    item.update(
        sentence_chunks=split_list(
            input_list=item["sentences"],
            slice_size=num_sentence_chuck_size,
        )
    )
    item["num_chunk"] = len(item["sentence_chunks"])

100%|██████████| 99/99 [00:00<00:00, 68082.65it/s]


In [38]:
# Spot-check one random page now that the chunk fields have been added.
random.sample(pages_and_text, k=1)

[{'page number': 93,
  'page_char_count': 1811,
  'page_word_count': 336,
  'page_sentence_count': 10,
  'page_token_count': 452.75,
  'text': '   Breadcrumb VIDEO: Primary Christmas Concert By\n                  Kyle Slavin\n                 -\n            December 16, 2022 Tags: Share: After weeks of practice and rehearsals, our youngest Junior School students excitedly took to the stage this week to perform in a beautiful and fun Primary Christmas Concert. The show featured our Kindergarten, Grade 1 and Grade 2 students, as they performed songs from "A Charlie Brown Christmas," songs inspired by "\'Twas the Night Before Christmas," and well-known songs adapted to have a Christmas twist. Thank you to Junior School music teacher Christopher Smith for all of his work preparing the students for the concert, as well as French teacher Stephanie Geehan \'95 for teaching the students "Vive le Vent" – a French version of "Jingle Bells." And a huge thank you to Junior School teacher Lindsey A

In [39]:
# Rebuild the stats frame so it includes the per-page chunk counts,
# then summarise the numeric columns to two decimal places.
df = pd.DataFrame(data=pages_and_text)
df.describe().round(decimals=2)

Unnamed: 0,page number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunk
count,99.0,99.0,99.0,99.0,99.0,99.0,99.0
mean,49.0,3041.46,528.09,24.3,760.37,21.33,2.62
std,28.72,2867.09,496.72,26.54,716.77,23.28,2.37
min,0.0,162.0,23.0,1.0,40.5,1.0,1.0
25%,24.5,1179.5,196.5,10.0,294.88,7.5,1.0
50%,49.0,2517.0,467.0,17.0,629.25,14.0,2.0
75%,73.5,3641.0,665.0,29.0,910.25,26.0,3.0
max,98.0,18664.0,3216.0,180.0,4666.0,143.0,15.0
