**Data Loading & Preprocessing:**

This section loads all of the LaTeX documents in our `datasets/` directory. It then preprocesses them by stripping LaTeX markup and splitting each document into chunks of a fixed number of words (`CHUNK_SIZE`).

In [53]:
import re
import glob
import os

# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# that IDs derived from a document's position are stable across runs.
tex_files = sorted(glob.glob('datasets/*.tex'))  # all .tex files from dataset directory
documents = []  # cleaned plain text, one entry per loaded file
CHUNK_SIZE = 50  # number of words per chunk

def clean_tex_file(text):
    """Reduce raw LaTeX source to plain alphanumeric text.

    Steps, in order: drop full-line ``%`` comments, drop LaTeX commands
    together with one optional ``[...]`` and one ``{...}`` argument, drop
    inline ``$...$`` math, drop lengths such as ``12pt`` or ``2.5 cm``,
    delete every character that is not an ASCII letter, digit or
    whitespace, and finally collapse whitespace.

    Args:
        text: Raw contents of a ``.tex`` file.

    Returns:
        The cleaned text, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Remove comments (lines starting with %)
    text = re.sub(r'(?m)^%.*\n?', '', text)
    # Remove LaTeX commands (basic removal; the content of the first braced
    # argument is removed along with the command)
    text = re.sub(r'\\[a-zA-Z]+(\[[^\]]*\])?(\{[^\}]*\})?', '', text)
    # Remove inline math environments
    text = re.sub(r'\$[^\$]*\$', '', text)
    # Remove lengths like "12pt" or "2.5 cm"; accept both "." and "," as the
    # decimal separator so that "2.5cm" does not leave a stray "2" behind.
    text = re.sub(r'\b\d+([.,]\d+)?\s*(cm|pt|em)\b', '', text, flags=re.IGNORECASE)
    # Delete all remaining symbols: keep only letters, numbers, and whitespace.
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Normalize whitespace last: deleting symbols can leave runs of spaces
    # (e.g. "a - b" -> "a  b") or trailing blanks that must be collapsed too.
    return re.sub(r'\s+', ' ', text).strip()

# Read each LaTeX source file and keep only its cleaned plain-text form.
for tex_path in tex_files:
    with open(tex_path, 'r', encoding='utf-8') as handle:
        latex_source = handle.read()
    documents.append(clean_tex_file(latex_source))

def split_into_chunks(text, chunk_size=None):
    """Split *text* into consecutive chunks of at most ``chunk_size`` words.

    Args:
        text: Whitespace-separated text to split.
        chunk_size: Maximum number of words per chunk. When omitted, the
            module-level ``CHUNK_SIZE`` is used.

    Returns:
        A list of space-joined chunks; the last one may be shorter. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If the effective chunk size is not positive.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_size <= 0:
        # A negative step would silently produce no chunks at all.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Chunk every cleaned document; the outer list stays aligned with `documents`.
document_chunks = list(map(split_into_chunks, documents))

print("Number of docs loaded:", len(documents))

Number of docs loaded: 27


**Data Annotation:**

We use Doccano, an open-source annotation tool, to label and annotate our data. This helps us create high-quality training data for our RAG model.

In [54]:
# Export the document chunks to a CSV file

import csv

# Build one record per chunk, numbering documents and their chunks from 1.
rows = [
    {'doc_id': doc_number, 'chunk_id': chunk_number, 'text': chunk_text}
    for doc_number, doc_chunks in enumerate(document_chunks, start=1)
    for chunk_number, chunk_text in enumerate(doc_chunks, start=1)
]

# Persist the records as CSV, header row first.
csv_file = 'document_chunks.csv'
with open(csv_file, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['doc_id', 'chunk_id', 'text'])
    writer.writeheader()
    for record in rows:
        writer.writerow(record)

print(f"CSV file '{csv_file}' created with {len(rows)} rows.")

CSV file 'document_chunks.csv' created with 220 rows.


**Create embeddings**

We create numerical embeddings for each of the chunks using a sentence transformer (all-MiniLM-L6-v2).

In [55]:
from sentence_transformers import SentenceTransformer
import csv
import json

# Load a pre-trained embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Read the existing CSV file of document chunks
input_csv = 'document_chunks.csv'
with open(input_csv, 'r', encoding='utf-8') as infile:
    rows = list(csv.DictReader(infile))

# Encode all chunks in a single call so the model can batch them, instead of
# paying the per-call overhead once for every row.
chunk_embeddings = model.encode([row['text'] for row in rows])

for row, embedding in zip(rows, chunk_embeddings):
    # Convert the numpy vector to a list and then to a JSON string so that it
    # fits into a single CSV cell.
    row['embedding'] = json.dumps(embedding.tolist())

# Write the results to a new CSV file including the embeddings column
output_csv = 'document_chunks_with_embeddings.csv'
with open(output_csv, 'w', encoding='utf-8', newline='') as outfile:
    fieldnames = ['doc_id', 'chunk_id', 'text', 'embedding']
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print(f"CSV file '{output_csv}' created with embeddings for {len(rows)} rows.")

CSV file 'document_chunks_with_embeddings.csv' created with embeddings for 220 rows.


In [57]:
import faiss
import numpy as np
import json
import csv
from sentence_transformers import SentenceTransformer

# Load the Sentence Transformer model used to embed queries
model = SentenceTransformer('all-MiniLM-L6-v2')

# Read every stored chunk together with its JSON-encoded embedding
with open('document_chunks_with_embeddings.csv', 'r', encoding='utf-8') as infile:
    records = list(csv.DictReader(infile))

texts = [record['text'] for record in records]
embeddings = np.vstack([
    np.array(json.loads(record['embedding']), dtype=np.float32)
    for record in records
])

# Build a flat FAISS index (L2 distance) sized to the embedding dimension
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print("Total embeddings indexed:", index.ntotal)

# Function to query the index
def search_query(query, k=10):
    """Return the stored text chunks that best match *query*.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level FAISS ``index``; hits are mapped back through ``texts``.

    Args:
        query: Query string to embed and search for.
        k: Maximum number of chunks to return.

    Returns:
        A list of at most ``k`` chunks, in the order returned by the index
        search. It is shorter than ``k`` when the index holds fewer vectors.
    """
    query_embedding = model.encode(query, convert_to_numpy=True).astype(np.float32)
    # index.search expects a 2-D batch of queries, hence the added axis.
    _, indices = index.search(np.expand_dims(query_embedding, axis=0), k)
    # FAISS pads missing results with -1 when fewer than k vectors exist;
    # unfiltered, texts[-1] would silently return the last chunk instead.
    return [texts[idx] for idx in indices[0] if idx >= 0]

# Smoke-test the retriever with a sample contract question
query = "What are the payment terms for the contract?"
retrieved_chunks = search_query(query)

print("Retrieved Chunks:")
for hit in retrieved_chunks:
    # The extra "\n" argument leaves a blank line between chunks.
    print(hit, "\n")

Total embeddings indexed: 220
Retrieved Chunks:
0 This Consulting Agreement the Agreement is made and entered into as of by and between The Consultant agrees to provide the following services to the Client This Agreement shall commence on and continue until unless terminated earlier as outlined in Section 7 The Client agrees to pay the Consultant 

Agreement If additional payment is due this shall be payable within thirty days of the Clients written notification to stop work In the event of termination the Client shall also pay any expenses incurred by Consultant The Client shall assume responsibility for all collection of legal fees necessitated by default 

Purchase Price is due upon signing this Contract The remaining balance shall be paid upon delivery of the Products Payment shall be made via check wire transfer or credit card The Products shall be delivered to Buyer at on or before Title and risk of loss shall pass to Buyer 

handling fees A deposit of 30 is due upon signing this