In [9]:
# Extract the words of a single PDF.
# The words end up joined into one space-separated string on disk.
from pdfminer.high_level import extract_text
import re

pdf_path = r"C:\Users\makaa\Documents\Comp Sci\RAG PROJECT\CUDA_C_Programming_Guide.pdf"

# extract_text returns the text of the PDF as a single string.
text = extract_text(pdf_path)

# Flatten line breaks into spaces and trim leading/trailing whitespace.
clean_text = " ".join(text.split("\n")).strip()

# A "word" is a maximal run of \w characters (letters, digits, underscore);
# punctuation and whitespace between runs are discarded.
words = [match.group(0) for match in re.finditer(r'\b\w+\b', clean_text)]

# Report how many words were found.
print("Total words extracted:", len(words))

# Persist the words, separated by single spaces, for the following cells.
with open("extracted_words.txt", mode="w", encoding="utf-8") as file:
    file.write(" ".join(words))


Total words extracted: 178124


In [10]:
# Prepare text for chunking.
#
# Reads the space-separated word stream from "extracted_words.txt",
# tokenizes it with NLTK's word_tokenize, and groups the tokens into
# chunks of `chunk_size` tokens each (the final chunk may be shorter).
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd  # not used in this cell; the CSV-export cell below relies on it

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load the
# "punkt" package while newer ones load "punkt_tab", so make sure both are
# available. Only download what is missing, so re-running the cell does not
# hit the network again.
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource)

# Read the cleaned text file (if not already in memory)
with open("extracted_words.txt", "r", encoding="utf-8") as file:
    clean_text = file.read()

# Tokenize the text.
# NOTE(review): the input is already a whitespace-separated word list, so
# this mostly re-splits on spaces; the token count can still differ slightly
# from the word count reported by the extraction cell - confirm that is intended.
tokens = word_tokenize(clean_text)

# Set chunk size (number of tokens per chunk)
chunk_size = 300

# Create chunks: group every `chunk_size` tokens into a single string
chunks = [" ".join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]

print(f"Generated {len(chunks)} chunks.")

# The recorded output of this cell shows a "First chunk:" preview, so print
# one to keep code and output in sync. Guarded so an empty input file does
# not raise IndexError.
# NOTE(review): the 1000-character cut-off is inferred from the recorded
# output - adjust if a different preview length was intended.
if chunks:
    print("First chunk:")
    print(chunks[0][:1000])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\makaa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Generated 595 chunks.
First chunk:
CUDA C Programming Guide Release 12 8 NVIDIA Corporation Feb 14 2025 Contents 1 The Benefits of Using GPUs 2 CUDA A General Purpose Parallel Computing Platform and Programming Model 3 A Scalable Programming Model 4 Document Structure 5 Programming Model 5 1 5 2 5 2 1 5 3 5 4 5 5 5 5 1 5 6 Kernels Thread Hierarchy Thread Block Clusters Memory Hierarchy Heterogeneous Programming Asynchronous SIMT Programming Model Asynchronous Operations Compute Capability 6 Programming Interface CUDA Runtime Compilation with NVCC 6 1 1 1 6 1 1 2 6 1 6 1 1 6 1 2 6 1 3 6 1 4 6 1 5 6 1 6 6 2 6 2 1 6 2 2 6 2 3 6 2 3 1 6 2 3 2 6 2 3 3 6 2 3 4 6 2 3 5 6 2 3 6 6 2 3 7 6 2 3 8 6 2 4 6 2 5 6 2 6 6 2 6 1 Compilation Workflow Binary Compatibility PTX Compatibility Application Compatibility C Compatibility 64 Bit Compatibility Offline Compilation Just in Time Compilation Initialization Device Memory Device Memory L2 Access Management L2 Cache Set Aside for Persisting Accesses L2 P

In [12]:
# Save the chunks as the knowledge-base CSV.
# Columns: "id" (1-based position of the chunk), "text" (the chunk itself),
# and "source" (a label marking every row as coming from the CUDA documentation).
data = {
    "id": [position for position, _ in enumerate(chunks, start=1)],
    "text": chunks,
    # Same label on every row; change this if more sources are added later.
    "source": ["CUDA Documentation" for _ in chunks],
}

# Build the table from the three equal-length columns.
df = pd.DataFrame(data)

# Write it out without the DataFrame index column.
df.to_csv("knowledge_base.csv", index=False)
print("CSV file 'knowledge_base.csv' created successfully.")


CSV file 'knowledge_base.csv' created successfully.
