# Simple rag model from research papers.

# Importing documents

In [1]:
import os 
import requests
import fitz

# Location of the PDF file to read
pdf_path = "samplepdf.pdf"


# read pdf

In [2]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

def text_formatter(text: str) -> str:
    """Return ``text`` with newlines replaced by spaces and outer whitespace trimmed."""
    # The right clean-up can differ per document, so experiment; any extra
    # formatting steps belong here.
    without_newlines = text.replace("\n", " ")
    return without_newlines.strip()

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str, page_offset: int = 0) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Only the text of each page is extracted; images and figures are ignored.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.
        page_offset (int): Value subtracted from the zero-based page index before
            it is stored as "page_number". Use it when the document's own page
            numbering does not start at the first PDF page. The default of 0
            keeps the plain zero-based index.

    Returns:
        list[dict]: One dictionary per page with the keys "page_number",
        "page_char_count", "page_word_count", "page_sentence_count_raw",
        "page_token_count" and "text" (the formatted page text).
    """
    pages_and_texts = []
    # The context manager closes the document (and its file handle) even if
    # reading a page raises.
    with fitz.open(pdf_path) as doc:
        # enumerate() has no length, so give tqdm the page count explicitly;
        # otherwise the progress bar cannot show a total.
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = text_formatter(page.get_text())  # plain text of the page, cleaned up
            pages_and_texts.append({"page_number": page_number - page_offset,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    # Rough estimate only: counts the pieces between ". " separators
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "page_token_count": len(text) / 4,
                                    "text": text})
    return pages_and_texts

# Extract the text and per-page statistics from the PDF at pdf_path
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
# Preview the first two page records
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 4340,
  'page_word_count': 588,
  'page_sentence_count_raw': 31,
  'page_token_count': 1085.0,
  'text': 'Citation: Liu, M.; Wang, J.; Lin, T.; Ma, Q.; Fang, Z.; Wu, Y. An Empirical Study of the Code Generation of Safety-Critical Software Using LLMs. Appl. Sci. 2024, 14, 1046. https:// doi.org/10.3390/app14031046 Academic Editor: Juan Pavón Received: 28 December 2023 Revised: 16 January 2024 Accepted: 18 January 2024 Published: 26 January 2024 Copyright: © 2024 by the authors. Licensee MDPI, Basel, Switzerland. This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution (CC BY) license (https:// creativecommons.org/licenses/by/ 4.0/). applied   sciences Article An Empirical Study of the Code Generation of Safety-Critical Software Using LLMs Mingxing Liu 1,2 , Junfeng Wang 1,*, Tao Lin 1,*, Quan Ma 2, Zhiyang Fang 1 and Yanqun Wu 1,2 1 College of Computer Science, Sichuan University, Cheng

In [3]:
import random
random.sample(pages_and_texts, k=3)

[{'page_number': 21,
  'page_char_count': 3905,
  'page_word_count': 562,
  'page_sentence_count_raw': 27,
  'page_token_count': 976.25,
  'text': 'Appl. Sci. 2024, 14, 1046 22 of 41 As shown in Table 5, compared to other existing prompt methods discussed in Section 2.3, the proposed Prompt-FDC in this paper presents the most comprehensive prompt framework. It covers the requirements of safety-critical software and introduces a method to progressively refine and generalize domain requirements, enabling a more thorough understanding of the domain requirements and generating higher-quality code. Table 5. Comparison between different prompts of code generation. Prompt Prompt-Simple Prompt-Specific Prompt-FDC Reference Luo et al., 2023 [27] Koziolek et al., 2023 [36] Chen et al., 2021 [8] Bubeck et al., 2023 [7] methodology proposed in this paper Features Function description, code language constraints Function description, code language constraints Function description, function example D

In [4]:
import pandas as pd

# Put the per-page records into a DataFrame (one row per page) and preview the first rows
df = pd.DataFrame(pages_and_texts)
df.head()


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,4340,588,31,1085.0,"Citation: Liu, M.; Wang, J.; Lin, T.; Ma, Q.; ..."
1,1,4190,586,30,1047.5,"Appl. Sci. 2024, 14, 1046 2 of 41 has great po..."
2,2,4262,623,32,1065.5,"Appl. Sci. 2024, 14, 1046 3 of 41 scale dialog..."
3,3,3131,494,28,782.75,"Appl. Sci. 2024, 14, 1046 4 of 41 crucial aspe..."
4,4,3964,559,29,991.0,"Appl. Sci. 2024, 14, 1046 5 of 41 Table 1. Con..."


In [5]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,41.0,41.0,41.0,41.0,41.0
mean,20.0,3308.51,581.02,46.56,827.13
std,11.98,972.51,225.53,30.11,243.13
min,0.0,1514.0,177.0,8.0,378.5
25%,10.0,2590.0,429.0,28.0,647.5
50%,20.0,3425.0,559.0,33.0,856.25
75%,30.0,3905.0,715.0,74.0,976.25
max,40.0,5826.0,1206.0,121.0,1456.5


# pages into sentences

In [6]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

# Create the English pipeline object used below for sentence splitting
nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Create a document instance as an example
doc = nlp("This is a sentence. This another sentence.")
# Sanity check: the example text should be split into exactly two sentences
assert len(list(doc.sents)) == 2

# Access the sentences of the document
list(doc.sents)

[This is a sentence., This another sentence.]

In [7]:
for item in tqdm(pages_and_texts):
    # Split the page text into sentences and keep each one as a plain string
    page_sentences = [str(sentence) for sentence in nlp(item["text"]).sents]
    item["sentences"] = page_sentences

    # Record how many sentences were found on this page
    item["page_sentence_count_spacy"] = len(page_sentences)

  0%|          | 0/41 [00:00<?, ?it/s]

In [8]:
# Inspect an example
random.sample(pages_and_texts, k=1)

[{'page_number': 34,
  'page_char_count': 1921,
  'page_word_count': 333,
  'page_sentence_count_raw': 32,
  'page_token_count': 480.25,
  'text': 'Appl. Sci. 2024, 14, 1046 35 of 41 Appl. Sci. 2024, 14, x FOR PEER REVIEW  37 of 45    Listing A7. The test code for robustness testing.  85.     assert(RT1 == true);  86.     assert(RT2 == true);  87. }  88.   89. int main() {  90.     boundaryValueTesting();  91.     exceptionInputTesting();  92.     randomInputTesting();  93.     forcedErrorTesting();  94.     securityTesting();  95.   96.     printf(“All tests passed successfully!\\n”);  97.   98.     return 0;  99. }          Appendix D. The Standardized XML for Generalized Requirements  (1) The standardized XML template for generalized requirements is shown in Listing  A8.      Appendix D. The Standardized XML for Generalized Requirements (1) The standardized XML template for generalized requirements is shown in Listing A8. Listing A8. The standardized XML template for generalized req

In [9]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

# Create a function that splits a list into consecutive sublists of the desired size
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into consecutive sublists of size slice_size.

    The last sublist holds whatever is left over, so it may be shorter.
    For example, a list of 17 sentences with slice_size=10 is split into
    two lists of 10 and 7 sentences.

    Parameters:
        input_list (list): The items to split (sentences, in this notebook).
        slice_size (int): Maximum number of items per sublist; must be >= 1.

    Returns:
        list[list[str]]: The sublists in their original order. An empty
        input_list gives an empty list.

    Raises:
        ValueError: If slice_size is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item,
    # and a zero step fails with an unrelated-looking range() error, so reject both up front.
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[start:start + slice_size]
            for start in range(0, len(input_list), slice_size)]

# Group every page's sentences into chunks of at most num_sentence_chunk_size sentences
for item in tqdm(pages_and_texts):
    page_chunks = split_list(input_list=item["sentences"],
                             slice_size=num_sentence_chunk_size)
    item["sentence_chunks"] = page_chunks
    # Keep the chunk count next to the chunks themselves
    item["num_chunks"] = len(page_chunks)

  0%|          | 0/41 [00:00<?, ?it/s]

In [10]:
 import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/41 [00:00<?, ?it/s]

208

In [11]:
# View a random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': 25,
  'sentence_chunk': 'void update_quality_bits() { 40.   float RNI_signals[] = {RNI1, RNI2, RNI3, RNI4}; 41.   bool *quality_bits[] = {&RNI1_Q, &RNI2_Q, &RNI3_Q, &RNI4_Q}; 42.  43.   for(int i = 0; i < 4; i++) { 44.     if (RNI_signals[i] < 10 || RNI_signals[i] > 10000) { 45.       *quality_bits[i] = false; 46.     } 47.   } 48. } 49.',
  'chunk_char_count': 314,
  'chunk_word_count': 74,
  'chunk_token_count': 78.5}]

In [12]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 9,
  'sentence_chunk': '2. Input signal types: RNI_INR_03: ON, OFF, Resume, Set, QuickDecel, QuickAccel are digital signals, boolean type. RNI_INR_04: Speed, Accel, Brake are analog signals, floating-point type.3. Valid range of input signals: None.4. Output signal names and destinations: CC_OUTR_01: Cruise speed signal (CruiseSpeed), output to the instrument panel. CC_OUTR_02: Throttle command signal (ThrottleCmd), output to the throttle. CC_OUTR_03: Vehicle cruise state signal (CruiseState), output to the instrument panel.5.',
  'chunk_char_count': 508,
  'chunk_word_count': 66,
  'chunk_token_count': 127.0}]

In [13]:
# Get stats about our chunks
# Note: this rebinds df, which now holds one row per chunk rather than one row per page
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,208.0,208.0,208.0,208.0
mean,22.73,633.8,96.98,158.45
std,11.18,412.46,54.46,103.12
min,0.0,32.0,11.0,8.0
25%,13.75,302.75,56.0,75.69
50%,25.0,509.0,79.0,127.25
75%,31.0,899.75,128.25,224.94
max,40.0,2133.0,241.0,533.25


In [14]:
# Show random chunks with at most min_token_length tokens (the chunks that get filtered out later)
min_token_length = 30
short_chunks = df[df["chunk_token_count"] <= min_token_length]
# sample() raises ValueError when asked for more rows than exist, so never ask for more than are available
for _, row in short_chunks.sample(min(2, len(short_chunks))).iterrows():
    print(f'Chunk token count: {row["chunk_token_count"]} | Text: {row["sentence_chunk"]}')

Chunk token count: 22.0 | Text: { 78.   CruiseState new_state = current_state; //Initialize new_state with current_state
Chunk token count: 26.75 | Text: processNeutronFluxSignals(inputs, quality_bits, reset, &RT1, &RT2); 83. 84.  // Verify the expected outputs


In [15]:
# Keep only the chunks with more than min_token_length tokens and turn the rows back into a list of dicts
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
# Preview the first two remaining chunks
pages_and_chunks_over_min_token_len[:2]

[{'page_number': 0,
  'sentence_chunk': 'Citation: Liu, M.; Wang, J.; Lin, T.; Ma, Q.; Fang, Z.; Wu, Y. An Empirical Study of the Code Generation of Safety-Critical Software Using LLMs. Appl. Sci.2024, 14, 1046.https:// doi.org/10.3390/app14031046 Academic Editor: Juan Pavón Received: 28 December 2023 Revised: 16 January 2024 Accepted: 18 January 2024 Published: 26 January 2024 Copyright: © 2024 by the authors. Licensee MDPI, Basel, Switzerland. This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution (CC BY) license (https:// creativecommons.org/licenses/by/ 4.0/).applied  sciences Article An Empirical Study of the Code Generation of Safety-Critical Software Using LLMs Mingxing Liu 1,2 , Junfeng Wang 1,*, Tao Lin 1,*, Quan Ma 2, Zhiyang Fang 1 and Yanqun Wu 1,2 1 College of Computer Science, Sichuan University, Chengdu 610065, China; lmxstar@163.com (M. L.); fangzhiyang@scu.edu.cn (Z. F.); m15720123548@163.com (Y. W.) 2 Scie

## embedding text chunk

In [18]:
# Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer

# Load the embedding model onto the CPU (note: a GPU will often be *much* faster than a CPU)
embedding_model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2",
    device="cpu",
)

# Example sentences to turn into numbers
sentences = [
    "The Sentences Transformers library provides an easy and open-source way to create embeddings.",
    "Sentences can be embedded one by one or as a list of strings.",
    "Embeddings are one of the most powerful concepts in machine learning!",
    "Learn to use embeddings well and you'll be well on your way to being an AI engineer.",
]

# Encode the whole list in one call, then pair each sentence with its embedding
embeddings = embedding_model.encode(sentences)
embeddings_dict = {sentence: embedding for sentence, embedding in zip(sentences, embeddings)}

# Print every sentence followed by its embedding
for sentence, embedding in embeddings_dict.items():
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print()

Sentence: The Sentences Transformers library provides an easy and open-source way to create embeddings.
Embedding: [-2.07981747e-02  3.03164367e-02 -2.01218352e-02  6.86484128e-02
 -2.55256090e-02 -8.47691204e-03 -2.07164470e-04 -6.32377565e-02
  2.81606279e-02 -3.33353654e-02  3.02634425e-02  5.30720577e-02
 -5.03526069e-02  2.62287576e-02  3.33314203e-02 -4.51578647e-02
  3.63044217e-02 -1.37120148e-03 -1.20171569e-02  1.14946961e-02
  5.04510291e-02  4.70856428e-02  2.11913381e-02  5.14607430e-02
 -2.03746092e-02 -3.58889550e-02 -6.67811022e-04 -2.94392705e-02
  4.95859310e-02 -1.05639799e-02 -1.52013991e-02 -1.31760282e-03
  4.48196791e-02  1.56022850e-02  8.60379657e-07 -1.21388724e-03
 -2.37979013e-02 -9.09378694e-04  7.34485826e-03 -2.53930874e-03
  5.23370206e-02 -4.68043983e-02  1.66214611e-02  4.71579321e-02
 -4.15599830e-02  9.01906402e-04  3.60278375e-02  3.42215151e-02
  9.68227088e-02  5.94828539e-02 -1.64984874e-02 -3.51249725e-02
  5.92516595e-03 -7.07945146e-04 -2.4103

In [19]:
# Embed a single string (rather than a list) and inspect the result
single_sentence = "Yo! How cool are embeddings?"
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
# The shape shows the dimensionality of the embedding
print(f"Embedding size: {single_embedding.shape}")

Sentence: Yo! How cool are embeddings?
Embedding:
[-1.97447557e-02 -4.51087812e-03 -4.98482119e-03  6.55444711e-02
 -9.87676159e-03  2.72835623e-02  3.66426148e-02 -3.30224284e-03
  8.50076973e-03  8.24948400e-03 -2.28497181e-02  4.02429961e-02
 -5.75200096e-02  6.33692592e-02  4.43207547e-02 -4.49507385e-02
  1.25284195e-02 -2.52012350e-02 -3.55292223e-02  1.29559385e-02
  8.67022946e-03 -1.92917287e-02  3.55628273e-03  1.89505890e-02
 -1.47128049e-02 -9.39845853e-03  7.64168892e-03  9.62188747e-03
 -5.98928845e-03 -3.90169173e-02 -5.47824614e-02 -5.67457359e-03
  1.11645376e-02  4.08067219e-02  1.76319088e-06  9.15296562e-03
 -8.77261534e-03  2.39382591e-02 -2.32784078e-02  8.04999545e-02
  3.19176763e-02  5.12596220e-03 -1.47708524e-02 -1.62524320e-02
 -6.03212640e-02 -4.35689725e-02  4.51211594e-02 -1.79053564e-02
  2.63367072e-02 -3.47867236e-02 -8.89172871e-03 -5.47675341e-02
 -1.24372784e-02 -2.38606706e-02  8.33496451e-02  5.71242943e-02
  1.13329012e-02 -1.49594918e-02  9.2037

In [20]:
%%time

import torch  # already installed as a dependency of sentence-transformers

# Send the model to the GPU when one is available (for reference, on my local machine I'm using a NVIDIA RTX 4090);
# otherwise stay on the CPU so this cell still runs instead of failing on "cuda"
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model.to(device)

# Create embeddings one by one on the selected device
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/205 [00:00<?, ?it/s]

CPU times: total: 29.8 s
Wall time: 5.57 s


In [21]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [25]:
%%time

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=16, # you can use different batch sizes here for speed/performance; NOTE(review): this comment previously said 32 worked well while the code uses 16 - confirm which value is intended
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

# Display the resulting tensor of embeddings
text_chunk_embeddings

CPU times: total: 7.66 s
Wall time: 1.49 s


tensor([[-4.4371e-02, -3.0402e-02, -2.0879e-02,  ...,  1.0817e-02,
          4.3250e-02, -4.3674e-02],
        [-1.0618e-02,  2.2201e-02, -2.1247e-02,  ...,  1.3896e-04,
          5.9289e-05, -4.2658e-02],
        [-2.1196e-02,  6.1030e-02, -1.6898e-02,  ..., -1.1108e-03,
          4.4690e-04, -2.1227e-02],
        ...,
        [ 6.2582e-03, -1.9347e-03, -2.2213e-02,  ...,  3.9650e-02,
         -9.7993e-02, -5.3523e-02],
        [-3.5604e-02, -5.0317e-02, -2.9499e-02,  ..., -3.7472e-02,
          4.2730e-02, -2.6750e-02],
        [-4.3141e-02, -2.2908e-02, -4.1129e-02,  ...,  1.5031e-02,
          5.8275e-02, -3.7679e-02]], device='cuda:0')

### save embedding to file

In [26]:
# Save embeddings to file
# NOTE(review): each record carries an "embedding" array, which CSV can only hold as text;
# the reloaded preview shows that column as strings like "[-4.43712063e-02 ...", so the values
# presumably have to be parsed back into arrays before use - confirm the downstream handling.
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
# index=False keeps the DataFrame index out of the file
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [27]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,"Citation: Liu, M.; Wang, J.; Lin, T.; Ma, Q.; ...",1688,238,422.0,[-4.43712063e-02 -3.04015651e-02 -2.08785105e-...
1,0,"However, there is currently a lack of systemat...",1503,201,375.75,[-1.06178354e-02 2.22011991e-02 -2.12470293e-...
2,0,Introduction The software development and veri...,1146,148,286.5,[-2.11959686e-02 6.10298701e-02 -1.68979596e-...
3,1,"Appl. Sci.2024, 14, 1046 2 of 41 has great pot...",1189,171,297.25,[-4.05684896e-02 -7.83899147e-03 -2.48091761e-...
4,1,LLMs can be significant for safety-critical de...,1360,183,340.0,[ 2.15886440e-03 1.43969720e-02 -1.86931174e-...


## Chunking and embedding questions