In [3]:
from unstructured.partition.pdf import partition_pdf
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from nltk.tokenize.punkt import PunktSentenceTokenizer
import json
import pandas as pd
from collections import Counter
import time
import random
import ast
import os
import re
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
load_dotenv()

HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")

In [None]:
catalog_filename = "data/nova_catalog_24_25.pdf"
ocr_start = time.time()
catalog_elements_ocr = partition_pdf(filename=catalog_filename, strategy="ocr_only")
ocr_end = time.time()
ocr_time = (ocr_end - ocr_start) / 60
print(f"OCR Finished in {ocr_time}m")
#catalog_elements_hires = partition_pdf(filename=catalog_filename, strategy="hi_res")
#(len(catalog_elements_ocr), len(catalog_elements_hires))
len(catalog_elements_ocr)

In [None]:
Counter(type(element) for element in catalog_elements_ocr)

In [None]:
#Counter(type(element) for element in catalog_elements_hires)

In [None]:
display(*[element.to_dict() for element in catalog_elements_ocr[95:100]])

In [None]:
#[(type(element), element.text) for element in catalog_elements_hires[50:100]]

In [None]:
catalog_element_dict_list = [element.to_dict() for element in catalog_elements_ocr]
catalog_element_dict_list

In [None]:
catalog_element_df = pd.DataFrame(catalog_element_dict_list)
print(len(catalog_element_df))
catalog_element_df.head()

In [None]:
catalog_element_df.to_csv("catalog_data.csv", index=False)

In [None]:
year = "2024-2025"
pattern = f'^\\d+ \\| {year} NOVA Catalog \\|$'
rows_to_drop = catalog_element_df[catalog_element_df['text'].str.match(pattern))]
print(len(rows_to_drop))

In [None]:
joined_text = ""
for index, row in catalog_element_df.iterrows():
    joined_text += f"{row["text"]} " 

joined_text[:1000]

In [5]:
tokenizer = PunktSentenceTokenizer()

csv_path = "catalog_data.csv"
catalog_df = pd.read_csv(csv_path)

document_text_string = ""
for index, row in catalog_df.iterrows():
    document_text_string += f"{row["text"]} "

sentence_list = tokenizer.tokenize(document_text_string)

print(len(sentence_list))
sentence_list[120:130]

11541


['Professional Readiness The ability to work well with others and display situationally and culturally appropriate demeanor and behavior.',
 'Degree graduates will demonstrate skills important for successful transition into the workplace and pursuit of further education.',
 'Quantitative Literacy The ability to perform accurate calculations, interpret quantitative information, apply and analyze relevant numerical data, and use results to support conclusions.',
 'Degree graduates will calculate, interpret, and use numerical and quantitative information in a variety of settings.',
 'Scientific Literacy The ability to apply the scientific method and related concepts and principles to make informed decisions and engage with issues related to the natural, physical, and social world.',
 'Degree graduates will recognize and know how to use the scientific method, and to evaluate empirical information.',
 'Written Communication the ability to develop, convey, and exchange ideas in writing, as a

In [6]:
print(len(catalog_df))
catalog_df.head()

7779


Unnamed: 0,type,element_id,text,metadata
0,Title,d580873c31da0f85af2c4f0e955d2960,CATALOG,"{'coordinates': {'points': ((704.0, 1544.0), (..."
1,UncategorizedText,50ddb73ff8f2553cf0d3442632bc2049,2024-2025,"{'coordinates': {'points': ((914.0, 1723.0), (..."
2,Title,91b79077e60608525259f3b16688dda9,Northern Virginia Community College,"{'coordinates': {'points': ((1223.0, 1996.0), ..."
3,Title,ec3ffb76c607be7ff92809db98900474,Welcome to,"{'coordinates': {'points': ((501.0, 119.0), (5..."
4,UncategorizedText,d08cb9c277419ff66197193c34a8d4ba,NOVA!,"{'coordinates': {'points': ((500.0, 181.0), (5..."


In [7]:
last_entry = catalog_df.iloc[len(catalog_df) - 1]
last_metadata_string = last_entry["metadata"]

print(type(last_entry))
print(last_entry)
print(type(last_entry["metadata"]))

last_metadata_dict = ast.literal_eval(last_metadata_string)
print(type(last_metadata_dict))
print(last_metadata_dict)

last_page = last_metadata_dict["page_number"]
last_page

<class 'pandas.core.series.Series'>
type                                                      Title
element_id                     974bfb713bd654303417ec2af8508caa
text          Clinical Data Coding Medical Laboratory Assist...
metadata      {'coordinates': {'points': ((1257.0, 1780.0), ...
Name: 7778, dtype: object
<class 'str'>
<class 'dict'>
{'coordinates': {'points': ((1257.0, 1780.0), (1257.0, 1847.0), (1502.0, 1847.0), (1502.0, 1780.0)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'filetype': 'application/pdf', 'languages': ['eng'], 'last_modified': '2024-08-22T11:49:26', 'page_number': 197, 'file_directory': 'data', 'filename': 'nova_catalog_24_25.pdf'}


197

In [8]:
pages_dict = {}
for page in range(1, last_page + 1):
    pages_dict[page] = ""

for index, entry in catalog_df.iterrows():
    entry_metadata_string = entry["metadata"]
    entry_metadata = ast.literal_eval(entry_metadata_string)
    entry_page = entry_metadata["page_number"]
    
    pages_dict[entry_page] += f"{entry["text"]} "

In [9]:
print(len(pages_dict))
print(pages_dict[1])
print(sentence_list[0])
print(pages_dict[2])

197
CATALOG 2024-2025 Northern Virginia Community College 
CATALOG 2024-2025 Northern Virginia Community College Welcome to NOVA!
Welcome to NOVA! Thank you for choosing NOVA! More than 70,000 students this year are discovering that NOVA has exactly what they’re looking for to meet their educational needs. The high quality of NOVA’s offerings is well known: + Our classes feature up-to-date technology and course content to prepare you for today’s competitive job market; + Wealso offer courses that will transfer to colleges all over Virginia and the nation. In fact, NOVA has guaranteed admission agreements with more than 40 colleges and universities for students who meet specific requirements. NOVA’s excellence is delivered by a devoted faculty who loves teaching and really cares about the success of every student, and a staff who works hard to make sure you have what you need to accomplish your goals. At NOVA, you’ll find students with diverse backgrounds and views who will enrich your 

In [10]:
# Combine all pages into a single string
document_content = " ".join(pages_dict.values())

# Save the document content to a text file
with open("full_document_content.txt", "w", encoding="utf-8") as f:
    f.write(document_content)

In [None]:
def split_text_into_chunks_with_overlap(pages_dict, sentences, char_limit, overlap_limit):
    """
    Splits sentences into their actual chunks (determined by max characters that preserves sentences) and
    assigns the page number where each chunk started.
    
    Returns a list of dataframes with two key value pairs each corresponding to a chunk. Each list dataframe
    has chunk text and chunk starting page number.
    """
    chunks_list = []
    current_chunk = ""
    current_page_num = 1
    first_sentence = ""
    
    for sentence in sentences:
        if current_chunk == "":
            chunk_first_sentence = sentence
        # Check if adding this sentence would exceed the character limit
        if len(current_chunk) + len(sentence) <= char_limit:
            # If it fits, add the sentence to the current chunk
            current_chunk += f"{sentence} "
        else:
            # If it doesn't fit, add the current chunk to the chunk dict
            chunk_dict = {"text": current_chunk}
            
            # Add chunk page number to chunk dict
            current_page_text = pages_dict[current_page_num]
            if chunk_first_sentence in current_page_text:
                chunk_dict["page"] = current_page_num
            elif (current_page_num < last_page):
                following_page_num = current_page_num + 1
                combined_pages_text = f"{current_page_text} "
                while following_page_num <= last_page:
                    following_page_text = pages_dict[following_page_num]
                    combined_pages_text += f"{following_page_text} "
                    if (chunk_first_sentence in combined_pages_text):
                        chunk_dict["page"] = current_page_num
                        current_page_num = following_page_num - 1
                        chunks_list.append(chunk_dict)
                        break
                    else:
                        following_page_num += 1
                        
            
            # Check if the last sentence of the current chunk is smaller than the overlap limit
            if len(sentence) <= overlap_limit:
                # If it is, start the new chunk with this sentence as an overlap
                current_chunk = f"{sentence} "
                chunk_first_sentence = sentence
            else:
                # Otherwise, start the new chunk empty
                current_chunk = ""
    
    # Add the last chunk if it's not empty
    if current_chunk:
        chunk_dict = {
            "text": current_chunk,
            "page": current_page_num
        }
        chunks_list.append(chunk_dict)
    
    return chunks_list


char_limit = 1000
overlap_limit = 200

chunk_list = split_text_into_chunks_with_overlap(pages_dict, sentence_list, char_limit, overlap_limit)

print(len(chunk_list))
chunk_list[:5]

In [8]:
def split_text_into_chunks_with_overlap(sentences, char_limit, overlap_limit):
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        # Check if adding this sentence would exceed the character limit
        if len(current_chunk) + len(sentence) <= char_limit:
            # If it fits, add the sentence to the current chunk
            current_chunk += f"{sentence} "
        else:
            # If it doesn't fit, add the current chunk to the list of chunks
            chunks.append(current_chunk)
            
            # Check if the last sentence of the current chunk is smaller than the overlap limit
            if len(sentence) <= overlap_limit:
                # If it is, start the new chunk with this sentence as an overlap
                current_chunk = f"{sentence} "
            else:
                # Otherwise, start the new chunk empty
                current_chunk = ""
    
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

char_limit = 1000
overlap_limit = 200

chunk_list = split_text_into_chunks_with_overlap(sentence_list, char_limit, overlap_limit)

print(len(chunk_list))
chunk_list[90:100]


1036


["Military Benefits Active-Duty Tuition Assistance The College participates in the Armed Forces Tuition Assistance (TA) program. Tuition Assistance is a benefit paid to eligible members of the Army, Navy, Marines, Air Force, and Coast Guard. Congress has given each service the ability to pay up to 100 percent for the tuition expenses of its members. If TA does not cover fees required by the College, the service member is responsible for paying the out-of-pocket fees. Each service has its own criteria for eligibility, obligated service, application processes, and restrictions. This money is usually paid directly to the institution by the individual services. For more information on using Tuition Assistance, students should contact their branch of service education office or NOVA's Military Services Office at militaryservices@nvcc.edu. Reserves and National Guard Tuition Assistance Members of the Selective Reserves are eligible for Tuition Assistance (TA). ",
 "However, each of the Armed

## Recursive Split (by paragraph)

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,  # Adjust based on LLM's token limit
    chunk_overlap=50,  # Some overlap helps maintain context
    separators=["\n\n", "\n", ".", " ", ""]
)
chunks = text_splitter.split_text(joined_text)
print(len(chunks))
print(chunks[2])
#chunks = [f"{chunk}" for chunk in chunks]
embeddings_df = pd.DataFrame(chunks, columns=["text"])
print(embeddings_df.shape)
embeddings_df.head()

In [9]:
embeddings_df = pd.DataFrame(chunk_list, columns=["text"])
print(embeddings_df.shape)
embeddings_df.head()

(1036, 1)


Unnamed: 0,text
0,CATALOG 2024-2025 Northern Virginia Community ...
1,"As if that weren’t enough, NOVA offers the mos..."
2,"For other information about the College, pleas..."
3,Student Affairs Academic Policies and Informat...
4,Auditing a Course .. Tuition Military Benefits...


In [10]:
model = SentenceTransformer("BAAI/bge-base-en-v1.5", use_auth_token=HF_API_TOKEN)

text_column = "text"
embeddings_column = "embedding"
embeddings_df[embeddings_column] = embeddings_df[text_column].apply(lambda x: model.encode(str(x)))




In [12]:
print(len(embeddings_df[embeddings_column]))
print(embeddings_df.shape)
print(type(embeddings_df["embedding"][7]))
embeddings_df.head()

1036
(1036, 2)
<class 'numpy.ndarray'>


Unnamed: 0,text,embedding
0,CATALOG 2024-2025 Northern Virginia Community ...,"[-0.00161347, -0.037952054, 0.04580193, -0.052..."
1,"As if that weren’t enough, NOVA offers the mos...","[-0.002499899, -0.0006054201, 0.009863339, -0...."
2,"For other information about the College, pleas...","[0.013434716, -0.013141983, -0.012691294, -0.0..."
3,Student Affairs Academic Policies and Informat...,"[-0.010771758, 0.004085607, 0.0019284533, -0.0..."
4,Auditing a Course .. Tuition Military Benefits...,"[0.016198536, 0.0070315483, 0.021127488, -0.01..."


In [13]:
print(type(embeddings_df["embedding"][0]))

<class 'numpy.ndarray'>


In [17]:

# List will hold all tuples that will be upserted into the pinecone vector db
upsert_list = []
# Iterates through rows of the dataframes and formats the tuples that are added to upsert_list
for idx, entry in embeddings_df.iterrows():
    # Formats and adds dicts into upsert_list
    entry_dict = {
        "id": f"{idx + 1}",
        "values": list(entry["embedding"]),
        "metadata": {"text": entry["text"]}
    }
    upsert_list.append(entry_dict)
        
print(len(upsert_list))
print(type(upsert_list[0]))
print(upsert_list[0])

1036
<class 'dict'>
{'id': '1', 'values': [-0.00161347, -0.037952054, 0.04580193, -0.05225228, 0.039485287, -0.006678248, 0.00736231, 0.06261571, -0.039659493, -0.026740775, -0.036298018, 0.008714986, -0.022118917, 0.038413297, 0.022951512, 0.033640634, 0.0529506, 0.027183581, 0.02435537, -0.005742648, -0.020246264, 0.0065986603, 0.0226201, 0.046977174, -0.013240958, -0.00217751, 0.017805906, -0.0073615215, -0.03742422, -0.0050477595, 0.05877147, -0.0010648514, -0.04215343, 0.010222742, 0.017675094, -0.038400218, 0.07843923, 0.003349304, 0.034764722, -0.01787049, -0.023844741, -0.07618803, -0.02680266, -0.020013593, -0.0280217, -0.0035964646, -0.012996028, -0.031305477, -0.011551301, -0.019599352, -0.019417431, -0.008988467, 0.0008413923, 0.01612431, 0.00088619476, 0.005707287, 0.040240698, -0.054153133, -0.0040453873, -0.07980543, 0.026279237, 0.06681166, -0.0525882, 0.015116036, -0.035686918, -0.008262288, 0.044522613, 0.06121282, -0.027066367, -0.058665067, -0.028438268, 0.015274704

In [18]:
# Connect to pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
host = "https://basic-embeddings-m8sj7l5.svc.aped-4627-b74a.pinecone.io"

index = pc.Index("basic-embeddings", host=host)
index.describe_index_stats()

#pc.create_index(
#    name="basic-embeddings",
#    dimension=768, 
#    metric="cosine", 
#    spec=ServerlessSpec(
#        cloud="aws",
#        region="us-east-1",
#        
#    ),
#    deletion_protection="disabled"
#)

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [19]:
# Seperates upsert list into chunks of at most 100
lower_cut = 0
upper_cut = 100
# If this condition is true, there are tupels in upsert_list that have yet to be upserted into pinecone
while lower_cut <= len(upsert_list):
    # Upserts chunck of upser_list and prints upsert response
    index.upsert(
        vectors=upsert_list[lower_cut:upper_cut]
        )

    # Sets cuts to the index of next 100 tuples to be upserted
    lower_cut += 100
    upper_cut += 100
    # Sleeps a random time to prevent flooding pinecone with requests
    sleep_time = random.randrange(1, 5)
    time.sleep(sleep_time)

In [21]:
new_csv_path = "text_embeddings.csv"
embeddings_df.to_csv(new_csv_path, index=False)