# 1. Read File and Text Processing

## Import Dependencies

In [3]:
# Google Colab installs
import os

if "COLAB_GPU" in os.environ:
    !pip install -U torch
    !pip install --upgrade --force-reinstall PyMuPDF # for reading PDFs with Python
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference
    !pip install tqdm

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.14
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting flash-attn
  Downloading flash_attn-2.7.0.post2.tar.gz (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m42.9 MB/s[0m eta

In [6]:
import pandas as pd
import fitz
from spacy.lang.en import English
from sentence_transformers import SentenceTransformer
import requests

## Read the PDF File

In [8]:
pdf_path = 'RAG for LLM.pdf'
url = 'https://arxiv.org/pdf/2312.10997'

if not os.path.exists(pdf_path):
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Open a file in binary write mode and save the content to it
        with open(pdf_path, "wb") as file:
            file.write(response.content)
        print(f"The file has been downloaded and saved as {pdf_path}")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
else:
    print(f"File {pdf_path} exists.")


The file has been downloaded and saved as RAG for LLM.pdf


In [9]:
doc = fitz.open(pdf_path)

pdf_file = []
for page_number, page in enumerate(doc):  # iterate the document pages
    text = page.get_text()  # get plain text encoded as UTF-8
    text = text.replace("\n", " ").strip()
    pdf_file.append({"page_number": page_number,
                            "page_char_count": len(text),
                            "page_word_count": len(text.split(" ")),
                            "page_sentence_count_raw": len(text.split(". ")),
                            "page_token_count": len(text) / 4,
                            "text": text})
len(pdf_file)

21

In [10]:
df = pd.DataFrame(pdf_file)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,5451,739,28,1362.75,1 Retrieval-Augmented Generation for Large Lan...
1,1,3082,450,32,770.5,2 Fig. 1. Technology tree of RAG research. The...
2,2,3950,575,41,987.5,3 Fig. 2. A representative instance of the RAG...
3,3,3574,492,30,893.5,4 Fig. 3. Comparison between the three paradig...
4,4,6362,887,42,1590.5,5 aligns the text more closely with data distr...


## Text Processing

### Sentencizer

In [12]:
nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")
# Test
list(nlp("First sentences. Second sentence.").sents)

[First sentences., Second sentence.]

In [13]:
df['sentences_list'] = df['text'].apply(lambda text: list(nlp(text).sents))
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences_list
0,0,5451,739,28,1362.75,1 Retrieval-Augmented Generation for Large Lan...,"[(1, Retrieval, -, Augmented, Generation, for,..."
1,1,3082,450,32,770.5,2 Fig. 1. Technology tree of RAG research. The...,"[(2, Fig, .), (1, .), (Technology, tree, of, R..."
2,2,3950,575,41,987.5,3 Fig. 2. A representative instance of the RAG...,"[(3, Fig, .), (2, .), (A, representative, inst..."
3,3,3574,492,30,893.5,4 Fig. 3. Comparison between the three paradig...,"[(4, Fig, .), (3, .), (Comparison, between, th..."
4,4,6362,887,42,1590.5,5 aligns the text more closely with data distr...,"[(5, aligns, the, text, more, closely, with, d..."


### Check Token Limitation

Since I plan to use `all-mpnet-base-v2` model which has a capacity of 384 tokens to embed the text.

So, the text has to be splitted into chunks to make sure that they are not exceed the model's capacity.

In [14]:
# Check the token count in each page
for page in range(len(df['sentences_list'])):
    token = 0
    for text in df['sentences_list'][page]:
        token += len(text)/4
    print(f"Page {page} has {token} tokens")


Page 0 has 227.5 tokens
Page 1 has 136.5 tokens
Page 2 has 168.0 tokens
Page 3 has 151.75 tokens
Page 4 has 274.25 tokens
Page 5 has 211.0 tokens
Page 6 has 185.5 tokens
Page 7 has 272.0 tokens
Page 8 has 283.0 tokens
Page 9 has 269.25 tokens
Page 10 has 179.5 tokens
Page 11 has 261.5 tokens
Page 12 has 231.75 tokens
Page 13 has 189.25 tokens
Page 14 has 157.5 tokens
Page 15 has 154.25 tokens
Page 16 has 502.25 tokens
Page 17 has 515.25 tokens
Page 18 has 501.5 tokens
Page 19 has 511.5 tokens
Page 20 has 30.75 tokens


As we can see that the only pages that exceed 384 tokens are from Page 16 onward.

However, those pages are about the references which don't have much important information, so I decided to drop them.

In [21]:
df.drop([16, 17, 18, 19, 20], inplace=True)

### Split each sentence into new DataFrame

In [33]:
splitted_sentences = []
for page in range(len(df)):
    for sentence in df.iloc[page]['sentences_list']:
        sentences_dict = {}
        sentences_dict['page'] = page
        sentences_dict['sentences'] = sentence
        sentences_dict['token_count'] = len(sentence) / 4
        splitted_sentences.append(sentences_dict)

sentences_df = pd.DataFrame(splitted_sentences)
sentences_df

Unnamed: 0,page,sentences,token_count
0,0,"(1, Retrieval, -, Augmented, Generation, for, ...",26.75
1,0,"(Retrieval, -, Augmented, Generation, (, RAG, ...",5.00
2,0,"(This, enhances, the, accuracy, and, credibili...",7.50
3,0,"(RAG, synergistically, merges, LLMs, ’, intrin...",4.50
4,0,"(This, comprehensive, review, paper, offers, a...",7.25
...,...,...,...
494,15,"(The, analysis, outlines, three, developmental...",7.25
495,15,"(RAG, ’s, technical, integration, with, other,...",6.00
496,15,"(Despite, the, progress, in, RAG, technology, ...",5.75
497,15,"(RAG, ’s, application, scope, is, expanding, i...",7.25


## Embedding

In [43]:
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")


sentences_df['embedding'] = sentences_df['sentences'].apply(lambda sentences: embedding_model.encode(sentences.text))
sentences_df.head()

Unnamed: 0,page,sentences,token_count,embedding
0,0,"(1, Retrieval, -, Augmented, Generation, for, ...",26.75,"[0.033680752, 0.06904238, -0.03743913, 0.05732..."
1,0,"(Retrieval, -, Augmented, Generation, (, RAG, ...",5.0,"[0.043078717, 0.05558917, -0.021187901, -0.018..."
2,0,"(This, enhances, the, accuracy, and, credibili...",7.5,"[-0.011808507, -0.03139684, -0.040654678, -0.0..."
3,0,"(RAG, synergistically, merges, LLMs, ’, intrin...",4.5,"[-0.059846584, 0.028356424, -0.031254996, 0.01..."
4,0,"(This, comprehensive, review, paper, offers, a...",7.25,"[0.009409682, 0.003933026, 0.006201101, -0.064..."


# 2. RAG Implementation

In [50]:
import torch
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"

# Convert embeddings to torch tensor
embeddings = torch.tensor(np.array(sentences_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([499, 768])