In [None]:
# Use the %pip magic (not !pip) so packages are installed into the environment of the running kernel.
%pip install PyPDF2 pdfplumber transformers torch scikit-learn sentence-transformers


In [None]:
# %pip targets the running kernel's environment. `langchain` is listed explicitly
# because the import cell below imports from the `langchain` namespace.
%pip install chromadb sentence-transformers langchain langchain-community

In [None]:
import os
import chromadb
import requests
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Import the required library
import pdfplumber

# Function to extract text from the specified page range in the PDF
def extract_text_from_pdf(file_path, start_page, end_page):
    """Extract and concatenate the text of a range of PDF pages.

    Args:
        file_path: Path of the PDF file, opened with ``pdfplumber.open``.
        start_page: Zero-based index of the first page to read (inclusive).
            Negative values are treated as 0.
        end_page: Zero-based index at which to stop (exclusive). Values past
            the end of the document are clamped to the number of pages.

    Returns:
        The text of the selected pages joined with single spaces. Pages that
        yield no text contribute nothing to the result.
    """
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        # Clamp the range so an over-long request cannot raise IndexError and a
        # negative start cannot wrap around to pages at the end of the document.
        first = max(start_page, 0)
        last = min(end_page, len(pdf.pages))
        for page_index in range(first, last):
            page_text = pdf.pages[page_index].extract_text()
            # Skip None/empty results: joining a None would raise TypeError.
            if page_text:
                page_texts.append(page_text)
    return ' '.join(page_texts)

# Example usage
# start_page / end_page are handed to range() and used to index pdf.pages, so
# they are zero-based list indices and end_page itself is not read.
file_path = "/kaggle/input/dsm5-mental/DSM5.pdf"  # Path to the PDF under /kaggle/input
start_page = 76  # Start from the page where main content begins
end_page = 837   # End at the page where main content ends

# Extract text
extracted_text = extract_text_from_pdf(file_path, start_page, end_page)

# Display a sample of the extracted text
print(extracted_text[:1000])  # Print first 1000 characters of extracted text to check output


In [None]:
import re

# Function to preprocess text by cleaning and splitting into sentences
def preprocess_text(text):
    """Normalize whitespace and split text into sentences.

    Args:
        text: Raw text, for example text extracted from a PDF.

    Returns:
        A list of non-empty sentence strings. The text is split at spaces that
        follow '.', '!' or '?'; the punctuation stays with its sentence.
        Empty or whitespace-only input yields an empty list.
    """
    # Collapse every run of whitespace (spaces, tabs, newlines) into one space
    # and trim the ends so no sentence starts or ends with stray whitespace.
    normalized = re.sub(r'\s+', ' ', text).strip()
    # The lookbehind keeps the terminating punctuation attached to the sentence.
    candidates = re.split(r'(?<=[.!?]) +', normalized)
    # Drop empty fragments so later stages never receive blank sentences.
    return [sentence for sentence in candidates if sentence]

# Example usage: split the extracted document text into sentences
sentences = preprocess_text(extracted_text)

# Preview the first five sentences, numbered from 1, as a quick sanity check
for number, preview in enumerate(sentences[:5], start=1):
    print(f"Sentence {number}: {preview}")


In [None]:
# Show the first 1000 characters of the extracted text. A slice is safe when the
# text is shorter than 1000 characters, whereas indexing every position
# individually would raise IndexError.
print(extracted_text[:1000], end="")

In [None]:
# Number of sentences returned by preprocess_text
len(sentences)

In [None]:
from sentence_transformers import SentenceTransformer

# Load pre-trained SBERT model.
# get_sentence_embeddings_sbert reads this module-level `model`, so this cell
# must run before any sentences are embedded.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
# Function to get embeddings for a list of sentences using SBERT
def get_sentence_embeddings_sbert(sentences):
    """Encode sentences with the module-level SBERT ``model``.

    Args:
        sentences: The sentences to embed, passed straight to ``model.encode``.

    Returns:
        Whatever ``model.encode(sentences, convert_to_tensor=True)`` returns.
    """
    return model.encode(sentences, convert_to_tensor=True)

# Example usage: embed every sentence with the SBERT model loaded above
sentence_embeddings = get_sentence_embeddings_sbert(sentences)

In [None]:
# Sanity check: show the embeddings of the first three sentences, numbered from 1
for position, vector in enumerate(sentence_embeddings[:3], start=1):
    print(f"Embedding for Sentence {position}: {vector}\n")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch

# Function to group similar sentences based on cosine similarity
def _embeddings_to_matrix(embeddings):
    """Return the embeddings as a 2-D floating-point NumPy array, one row per sentence."""
    if hasattr(embeddings, 'cpu'):
        # Tensor input: move it to host memory before converting.
        embeddings = embeddings.cpu().numpy()
    elif not isinstance(embeddings, np.ndarray):
        # Sequence of per-sentence vectors (tensors, arrays or plain lists).
        rows = [row.cpu().numpy() if hasattr(row, 'cpu') else row for row in embeddings]
        if not rows:
            return np.empty((0, 0))
        embeddings = np.vstack(rows)

    matrix = np.asarray(embeddings)
    if matrix.size == 0:
        return np.empty((0, 0))
    if matrix.ndim == 1:
        # A single vector is treated as one sentence.
        matrix = matrix.reshape(1, -1)
    if not np.issubdtype(matrix.dtype, np.floating):
        matrix = matrix.astype(float)
    return matrix


def group_similar_sentences(embeddings, threshold=0.6):
    """Split a sequence of sentence embeddings into runs of similar sentences.

    Sentences are visited in order. A sentence joins the chunk currently being
    built when its cosine similarity to at least one sentence already in that
    chunk is >= ``threshold``; otherwise the chunk is closed and the sentence
    starts a new one. Every chunk is therefore a run of consecutive indices.

    Args:
        embeddings: A 2-D tensor or NumPy array with one row per sentence, or a
            sequence of per-sentence vectors (tensors, arrays or plain lists).
        threshold: Minimum cosine similarity required to join the open chunk.

    Returns:
        A list of chunks, each a list of sentence indices. Empty input gives [].
    """
    matrix = _embeddings_to_matrix(embeddings)
    n_sentences = matrix.shape[0]
    if n_sentences == 0:
        return []

    # L2-normalise the rows once so cosine similarity becomes a plain dot product.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave all-zero vectors at zero instead of dividing by 0
    unit = matrix / norms

    chunks = []
    chunk_start = 0
    for i in range(1, n_sentences):
        # The open chunk is always the contiguous range [chunk_start, i), so only
        # that slice is compared. This avoids materialising the full N x N
        # similarity matrix, which grows quadratically with the sentence count.
        if np.max(unit[chunk_start:i] @ unit[i]) >= threshold:
            continue
        chunks.append(list(range(chunk_start, i)))
        chunk_start = i

    chunks.append(list(range(chunk_start, n_sentences)))
    return chunks

In [None]:
# Function to compute the representative vector for each chunk
def compute_chunk_vectors(sentence_embeddings, chunks):
    """Average the sentence embeddings of each chunk into a single vector.

    Args:
        sentence_embeddings: Indexable collection of per-sentence embeddings.
            Items exposing ``.cpu()`` are converted with ``.cpu().numpy()``;
            all other items are used as they are.
        chunks: Iterable of chunks, each an iterable of sentence indices.

    Returns:
        A list holding one mean vector (``np.mean`` along axis 0) per chunk.
    """
    def as_array(item):
        # Tensor-like items are moved to the CPU and converted; others pass through.
        return item.cpu().numpy() if hasattr(item, 'cpu') else item

    return [
        np.mean([as_array(sentence_embeddings[idx]) for idx in members], axis=0)
        for members in chunks
    ]


In [None]:
# Example usage: group sentence indices into chunks (default threshold 0.6).
# Nothing is displayed here; `chunks` is a list of lists of sentence indices.
chunks = group_similar_sentences(sentence_embeddings)

In [None]:
# One vector per chunk: the mean of the embeddings of the sentences in that chunk
chunk_vectors = compute_chunk_vectors(sentence_embeddings, chunks)  # Compute representative vectors

In [None]:
# Function to display chunks and their representative vectors
def print_chunks_and_vectors(sentences, sentence_embeddings, chunks, chunk_vectors, num_chunks_to_display=3):
    """Print the leading chunks together with their sentences and vectors.

    Args:
        sentences: Sentence strings, indexed by the indices stored in ``chunks``.
        sentence_embeddings: Per-sentence embeddings, indexed the same way.
        chunks: List of chunks, each an iterable of sentence indices.
        chunk_vectors: Representative vector of each chunk, aligned with ``chunks``.
        num_chunks_to_display: How many chunks from the start of ``chunks`` to print.
    """
    for position, members in enumerate(chunks[:num_chunks_to_display]):
        label = position + 1  # chunks are numbered from 1 in the output
        print(f"Chunk {label}:")
        for member in members:
            print(f" - Sentence: {sentences[member]}")
            print(f"   Vector: {sentence_embeddings[member]}")
        representative = chunk_vectors[position]
        print(f"\nRepresentative vector for chunk {label}: {representative}\n")

In [None]:
# Display the first three chunks: each sentence with its embedding, followed by
# the chunk's representative vector
print_chunks_and_vectors(sentences, sentence_embeddings, chunks, chunk_vectors, num_chunks_to_display=3)

In [None]:
# Number of chunks produced by group_similar_sentences
len(chunks)

In [None]:
import json

# Function to save chunks, text, and vectors to a JSON file
def save_chunks_to_json(chunks, sentences, chunk_vectors, output_file):
    """Write every chunk's sentences and vector to a JSON file.

    The file holds one object keyed ``Chunk_<index>`` per chunk, each with a
    ``"text"`` list of sentences and a ``"vector"`` list of numbers.

    Args:
        chunks: Iterable of chunks, each an iterable of indices into ``sentences``.
        sentences: Sentence strings referenced by the chunk indices.
        chunk_vectors: Per-chunk vectors aligned with ``chunks``; each must
            provide ``.flatten().tolist()``.
        output_file: Path of the JSON file to (over)write.
    """
    payload = {
        f"Chunk_{position}": {
            "text": [sentences[i] for i in members],
            # Flatten to 1-D so the vector serialises as a flat list of numbers.
            "vector": chunk_vectors[position].flatten().tolist(),
        }
        for position, members in enumerate(chunks)
    }

    with open(output_file, 'w') as handle:
        json.dump(payload, handle, indent=4)

In [None]:
# Example usage: write all chunks and their vectors to chunk_vectors.json.
# The path is relative, so the file lands in the current working directory.
output_file = "chunk_vectors.json"
save_chunks_to_json(chunks, sentences, chunk_vectors, output_file)

In [None]:
import json

# Function to display the last chunk and its vector from a JSON file
def display_final_chunk_from_json(json_file):
    """Print the last entry of a chunk JSON file.

    Shows the entry's key, each of its sentences, and the first ten values of
    its vector.

    Args:
        json_file: Path of a JSON file whose top-level object maps chunk keys
            to objects with ``"text"`` and ``"vector"`` entries.
    """
    with open(json_file, 'r') as handle:
        stored = json.load(handle)

    # The final item of the loaded mapping is the last entry in the file.
    final_key, final_entry = list(stored.items())[-1]

    print(f"Chunk: {final_key}")
    print("Text:")
    for line in final_entry["text"]:
        print(f" - {line}")

    print("\nVector (first 10 elements for brevity):")
    print(final_entry["vector"][:10])

# Example usage
# Read back the file that save_chunks_to_json just wrote. Reusing `output_file`
# keeps the read path identical to the write path whatever the working
# directory is, instead of assuming it is /kaggle/working.
json_file = output_file
display_final_chunk_from_json(json_file)


In [None]:
# Function to load chunks and vectors from a JSON file
import json

def load_chunks_and_vectors_from_json(file_path):
    """Load chunk texts and vectors from a chunk JSON file.

    Args:
        file_path: Path of a JSON file whose top-level object maps chunk keys
            to objects with ``"text"`` and ``"vector"`` entries.

    Returns:
        A ``(texts, vectors)`` tuple of two lists in file order: the ``"text"``
        value of every entry and the ``"vector"`` value of every entry.
    """
    with open(file_path, 'r') as handle:
        stored = json.load(handle)

    # Only the values matter; the chunk keys are discarded.
    entries = tuple(stored.values())
    texts = [entry["text"] for entry in entries]
    embeddings = [entry["vector"] for entry in entries]
    return texts, embeddings

# Example usage
# Load from the same path save_chunks_to_json wrote to, rather than assuming the
# working directory is /kaggle/working.
# NOTE: from here on `chunks` holds lists of sentence strings, no longer lists
# of sentence indices.
file_path = output_file
chunks, vectors = load_chunks_and_vectors_from_json(file_path)

# Output the first chunk and vector to verify
print(f"First chunk: {chunks[0]}")
print(f"First vector (first 10 elements): {vectors[0][:10]}")


In [None]:
# Reload the saved chunk texts and vectors from the path they were written to,
# rather than from a hard-coded /kaggle/working location.
file_path = output_file
chunks, vectors = load_chunks_and_vectors_from_json(file_path)

In [None]:
# Number of chunk texts loaded from the JSON file
len(chunks)

In [None]:
# Number of vectors loaded from the JSON file; should equal len(chunks)
len(vectors)

In [None]:
# Inspect the chunk at index 9; raises IndexError if fewer than ten chunks were loaded
chunks[9]

In [None]:
# Inspect the vector at index 9; raises IndexError if fewer than ten vectors were loaded
vectors[9]

In [None]:
from chromadb import Client

# Connect to ChromaDB with default settings
# NOTE(review): a default Client() is presumably in-memory and not persisted
# across kernel restarts — confirm against the installed chromadb version.
client = Client()  # Initializes with default settings

# Define collection name
collection_name = "mental_health"  # The collection name in ChromaDB

# Create or get the collection
# `collection` is a module-level name read by save_chunks_and_vectors_to_chromadb
# and by the retrieval cell.
collection = client.get_or_create_collection(collection_name)

# Function to save chunks and vectors to ChromaDB
def save_chunks_and_vectors_to_chromadb(chunks, chunk_vectors, batch_size=1000):
    """Store chunk texts and their vectors in the module-level ``collection``.

    Args:
        chunks: One entry per chunk, either a string or a list of sentence
            strings (the shape loaded from the chunk JSON file). Lists are
            joined with single spaces so every stored document is one string.
        chunk_vectors: One embedding (sequence of numbers) per chunk.
        batch_size: Maximum number of records sent per ``collection.add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    documents = []
    embeddings = []
    # zip stops at the shorter of the two inputs.
    for chunk, vector in zip(chunks, chunk_vectors):
        documents.append(chunk if isinstance(chunk, str) else " ".join(chunk))
        embeddings.append(vector)
    # IDs are the positional index of each chunk, as strings.
    ids = [str(idx) for idx in range(len(documents))]

    # Send records in batches instead of issuing one add() call per chunk.
    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        collection.add(
            documents=documents[start:stop],
            embeddings=embeddings[start:stop],
            ids=ids[start:stop],
        )

# Index the chunk texts loaded from JSON together with their vectors
save_chunks_and_vectors_to_chromadb(chunks, vectors)

In [None]:
from google.colab import userdata

# Read the key into a variable rather than leaving `userdata.get(...)` as the
# cell's last expression, which would render the secret in the notebook output.
groq_api_key = userdata.get('groqApi')
print("Groq API key loaded:", bool(groq_api_key))

In [None]:
# ----- Retrieval and Generation Process -----
query = 'How to get rid off trauma?'

# Embed the query with the SAME model that produced the stored chunk vectors
# (SentenceTransformer 'paraphrase-MiniLM-L6-v2'). Vectors from a different
# model live in a different embedding space, so similarity search against the
# stored vectors would be invalid.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")

# Embed the query
query_vector = embedding_model.embed_query(query)

# Perform similarity search
docs_chroma = collection.query(
    query_embeddings=[query_vector],
    n_results=5,
)

# "documents" holds one list of hits per query embedding. A single query was
# sent, so index [0] gives all n_results documents for it. Documents stored as
# lists of sentences are joined into one string.
retrieved_docs = [
    doc if isinstance(doc, str) else " ".join(doc)
    for doc in docs_chroma["documents"][0]
]

# Join the retrieved documents into a single context string
context_text = "\n\n".join(retrieved_docs)

# Set up the prompt template
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
Answer the question based on the above context: {question}.
Provide a detailed answer.
Don’t justify your answers.
Don’t give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
"""

# Format the prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query)

print("Generated prompt:")
print(prompt)

In [None]:
# ----- Groq API-Based Inference -----
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {userdata.get('groqApi')}",
    "Content-Type": "application/json"
}
# Prepare the payload for the API request
# NOTE(review): confirm that this model id is still offered by the Groq API.
payload = {
    "model": "mixtral-8x7b-32768",  # Specify the model to use
    "messages": [
        {
            "role": "user",
            "content": prompt,
        }
    ],
    "max_tokens": 150,
    "temperature": 0.7,
}

# Send the request to the Groq API. Without a timeout the call can block the
# kernel indefinitely if the server never answers.
response = requests.post(GROQ_API_URL, headers=headers, json=payload, timeout=60)

# Check if the request was successful
if response.status_code == 200:
    response_data = response.json()
    # Fall back to a placeholder entry when "choices" is missing or empty, so
    # indexing [0] cannot raise IndexError.
    choices = response_data.get("choices") or [{}]
    response_text = choices[0].get("message", {}).get("content", "No output generated")
    print("Generated Response:\n", response_text)
else:
    print("Failed to generate response. Error:", response.status_code, response.text)