In [6]:
!pip install spacy pymupdf transformers faiss-cpu chromadb langchain hnswlib spacy transformers

Collecting hnswlib
  Using cached hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml): started
  Building wheel for hnswlib (pyproject.toml): finished with status 'error'
Failed to build hnswlib


  error: subprocess-exited-with-error
  
  × Building wheel for hnswlib (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [5 lines of output]
      running bdist_wheel
      running build
      running build_ext
      building 'hnswlib' extension
      error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for hnswlib
ERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (hnswlib)


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import re
import spacy
import pymupdf  # PyMuPDF for PDF handling
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import faiss

In [8]:
# Example usage
pdf_path = r'C:\Users\kumar\Desktop\Sem_5\DL\mental_health_rag\Diagnostic and statistical manual of mental disorders _ DSM-5 ( PDFDrive.com ).pdf'
start_page = 51
end_page = 874
window_size = 5

In [15]:
# Load English tokenizer, POS tagger, etc.
nlp = spacy.load("en_core_web_sm",disable=["parser", "ner"])
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

In [16]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [17]:
# Extract text from PDF
def extract_text_from_pdf(pdf_path, start_page, end_page):
    doc = pymupdf.open(pdf_path)
    text = ""
    for page_num in range(start_page - 1, end_page):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

In [19]:
# Function to clean and tokenize text efficiently
def clean_and_tokenize_large_text(text):
    # Process the entire text using spaCy to create a Doc object
    doc = nlp(text)
    all_sentences = []

    # Extract sentences from the Doc object
    sentences = [sent.text for sent in doc.sents]

    # Optionally, you could further filter or clean the sentences here
    for sentence in sentences:
        if sentence.strip():  # Ensure the sentence is not empty
            all_sentences.append(sentence.strip())

    return all_sentences


In [23]:
# Function to split large text into smaller parts based on a maximum size
def split_large_text(text, max_chunk_size=2000000):
    chunks = []
    current_length = 0
    start_index = 0

    while start_index < len(text):
        end_index = min(start_index + max_chunk_size, len(text))
        chunk = text[start_index:end_index]
        chunks.append(chunk)
        start_index = end_index  # Move to the next chunk

    return chunks

# Function to process the large text in chunks
def process_large_text(text, max_chunk_size=2000000):
    # Split the large text into manageable chunks
    text_chunks = split_large_text(text, max_chunk_size)
    all_sentences = []

    # Process each chunk separately
    for chunk in text_chunks:
        sentences = clean_and_tokenize_large_text(chunk)
        all_sentences.extend(sentences)  # Aggregate results from all chunks

    return all_sentences

In [24]:
raw_text = extract_text_from_pdf(pdf_path, start_page, end_page)

# Step 2: Clean the text
clean_text_data = clean_text(raw_text)

# Step 3: Tokenize and create sentence groups
sentences = process_large_text(clean_text_data)

ValueError: [E088] Text of length 2000000 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

In [22]:
sentences[1]

NameError: name 'sentences' is not defined

In [48]:
# Create sentence groups
def create_sentence_groups(sentences, window_size=5):
    sentence_groups = []
    for i in range(len(sentences)):
        group = sentences[max(0, i - window_size): min(len(sentences), i + window_size + 1)]
        sentence_groups.append(" ".join(group))
    return sentence_groups

In [49]:
sentence_groups = create_sentence_groups(sentences, window_size)

In [52]:
len(sentence_groups[5])

2142

In [56]:
# Function to generate embeddings using BioBERT
def generate_embeddings(sentence_groups):
    embeddings = []
    for group in sentence_groups:
        inputs = biobert_tokenizer(group, truncation=True, padding=True, return_tensors="pt", max_length=512)
        with torch.no_grad():
            outputs = biobert_model(**inputs)
        # Use mean pooling to get a single embedding for each group
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

In [57]:
# Function to build a FAISS index for HNSW
def build_faiss_hnsw_index(embeddings, ef_search=50, ef_construction=200, M=16):
    # Convert embeddings to NumPy array
    embeddings_array = np.array(embeddings).astype(np.float32)

    # Define the number of dimensions from the embeddings
    d = embeddings_array.shape[1]  # number of dimensions

    # Create a FAISS HNSW index
    index = faiss.IndexHNSWFlat(d, M)
    index.hnsw.efConstruction = ef_construction  # efConstruction controls the index construction
    index.hnsw.efSearch = ef_search  # efSearch controls the search depth

    # Add the embeddings to the FAISS index
    index.add(embeddings_array)

    return index

In [58]:
# Function to search for similar embeddings using FAISS HNSW
def find_similar_groups(index, query_embedding, k=5):
    query_embedding = np.array([query_embedding]).astype(np.float32)
    distances, indices = index.search(query_embedding, k)
    return distances, indices

In [60]:
# Function to create chunks based on theme similarity
def create_chunks_based_on_similarity(sentence_groups, embeddings, index, threshold=0.4):
    chunks = []
    current_chunk = []
    current_chunk_vector = None

    for i, embedding in enumerate(embeddings):
        if i == 0:
            # Initialize first chunk with the first group and its embedding
            current_chunk.append(sentence_groups[i])
            current_chunk_vector = embedding
        else:
            # Compare the current embedding with the previous chunk's embedding
            distances, indices = find_similar_groups(index, current_chunk_vector, k=1)
            similarity_score = distances[0][0]

            if similarity_score < threshold:  # If similar, add to the current chunk
                current_chunk.append(sentence_groups[i])
                # Update the chunk's vector (mean of all group embeddings)
                current_chunk_vector = np.mean([current_chunk_vector, embedding], axis=0)
            else:
                # If not similar, save the current chunk and start a new one
                chunks.append({
                    'chunk_text': ' '.join(current_chunk),
                    'chunk_vector': current_chunk_vector.tolist()
                })
                current_chunk = [sentence_groups[i]]
                current_chunk_vector = embedding

    # Add the last chunk
    if current_chunk:
        chunks.append({
            'chunk_text': ' '.join(current_chunk),
            'chunk_vector': current_chunk_vector.tolist()
        })

    return chunks

In [64]:
# Main function to process the text, generate embeddings, create chunks, and save to JSON
def process_large_text_with_faiss(sentence_groups,threshold=0.8, k=5):

    # Step 2: Generate embeddings for the sentence groups
    embeddings = generate_embeddings(sentence_groups)

    # Step 3: Build the FAISS HNSW index
    index = build_faiss_hnsw_index(embeddings)

    # Step 4: Create chunks based on theme similarity using HNSW similarity search
    chunks = create_chunks_based_on_similarity(sentence_groups, embeddings, index, threshold)

    return chunks

In [None]:
chunks = process_large_text_with_faiss(sentence_groups)

In [None]:
# Function to save chunks and their embeddings to JSON
import json
def save_chunks_to_json(chunks, file_path='/content/drive/MyDrive/Dataset/chunks_and_vectors.json'):
    with open(file_path, 'w') as f:
        json.dump(chunks, f)

In [None]:
# Step 5: Save chunks and their vectors to a JSON file
save_chunks_to_json(chunks, file_path='chunks.json')
