In [None]:
! pip install langchain
!pip install -qU langchain-google-genai

! pip install chromadb

!pip install -qU langchain_community pypdf

! pip install -U langchain-cohere

!pip install --upgrade chromadb

!pip install -U huggingface-hub sentence-transformers chromadb cohere transformers


In [2]:
!pip install --upgrade sympy


Collecting sympy
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.13.3-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.1
    Uninstalling sympy-1.13.1:
      Successfully uninstalled sympy-1.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1+cu121 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.13.3 which is incompatible.[0m[31m
[0mSuccessfully installed sympy-1.13.3


In [3]:
import getpass
import os
os.environ["COHERE_API_KEY"] = "XXXXXX"


from huggingface_hub import login
login("XXXXXYYYYYYYY")

## almost working

In [17]:
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.chains import create_retrieval_chain
from langchain.prompts.chat import ChatPromptTemplate
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from langchain_cohere import ChatCohere
from transformers import AutoModel, AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
import shutil
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

from langchain.schema import HumanMessage
import time

class LegalBERTEmbeddings:
    def __init__(self):
        self.model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")
        print("LegalBERT model loaded successfully.")

    def embed(self, texts):
        """Return embeddings for a list of texts."""
        if isinstance(texts, str):
            texts = [texts]
        return self.model.encode(texts, batch_size=32).tolist()





class DocumentProcessor:
    def __init__(self, collection_name, persist_directory, embeddings_model):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.embeddings_model = embeddings_model
        self.vector_store = None
        self.documents = []
        self.llm = ChatCohere()
        print(f"DocumentProcessor initialized with collection_name: {collection_name}")

    def load_files(self, file_path):
        documents = []

        # Load and split documents
        if file_path.lower().endswith('.pdf'):
            loader = PyPDFLoader(file_path)
            documents.extend(loader.load())
        elif file_path.lower().endswith('.txt'):
            loader = TextLoader(file_path)
            documents.extend(loader.load())


        # Split documents into chunks
        splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n"],
            chunk_size=250,
            chunk_overlap=100,
            length_function=len
        )
        splitted_docs_with_titles = []
        splitted_docs_without_titles = []

        for doc in documents:
            chunks = splitter.split_text(doc.page_content)
            for chunk in chunks:
                title = self._generate_questions(chunk)

                # Save chunk with title
                chunk_with_title = f"Title: {title}\n\n{chunk}"

                # Remove "Title:" and "Summary:" keywords
                cleaned_chunk_with_title = chunk_with_title.replace("Title:", "").replace("Summary:", "").strip()

                splitted_docs_with_titles.append(Document(page_content=cleaned_chunk_with_title))

                # Save chunk without title
                # splitted_docs_without_titles.append(Document(page_content=chunk, metadata=doc.metadata))

        # print(f"Total chunks created with titles: {len(splitted_docs_with_titles)}")
        # print(f"Total chunks created without titles: {len(splitted_docs_without_titles)}")

        # Combine both versions into the same vector store
        # all_docs = splitted_docs_with_titles + splitted_docs_without_titles
        all_docs = splitted_docs_with_titles

        # # Showing what is going to be saved in the database
        # for idx, doc in enumerate(all_docs):
        #     print(f"\n--- Chunk {idx + 1} ---")
        #     print(f"Content: {doc.page_content[:200]}...")
        #     # print(f"Metadata: {doc.metadata}")

        # Save the documents to class attribute
        self.documents = all_docs

        # Initialize vector store
        self.vector_store = Chroma.from_documents(
            documents=self.documents,
            embedding=HuggingFaceEmbeddings(model_name="nlpaueb/legal-bert-base-uncased"),
            collection_name=self.collection_name,
            persist_directory=self.persist_directory
        )
        print("Vector store created and persisted successfully.")


    def _generate_questions(self, chunk):



        try:
            # Few-shot examples
            examples = (
                "Example 1:\n"
                "Text:\n"
                "12 October 2023\n"
                "LEASE AGREEMENT UNDER COMMON LAW\n"
                "between\n"
                "Huppeldepup NV as Lessor\n"
                "and\n"
                "Calimero BV as Lessee\n\n"
                "Summary: the names of the lessor and lessee.\n\n"

                "Example 2:\n"
                "Text:\n"
                "2 Object of lease\n"
                "The Lessor leases to the Lessee, who accepts, the real estate located at Stationsstraat 12, 2590 Berlaar Kontich, "
                "with an equipped office building with a total usable floor area of 500m2 spread over three levels, as well as eleven parking spaces (outdoor) "
                "with seven electric charging stations, (the Property), under the common law lease system and under the conditions and terms described in this Agreement.\n\n"
                "Summary: The address of the property and its facilities.\n\n"

                "Example 3:\n"
                "Text:\n"
                "AND:\n"
                "2. Calimero BV, a private limited company under Belgian law with registered office at Stationsstraat 12, 2590 Berlaar, "
                "and registered in the Legal Entities Register under number 0222.222.222 (RPR Antwerp, Mechelen division), "
                "hereinafter referred to as the Lessee.\n\n"
                "Summary: Name of the Lessee (tenant) and their information and address.\n\n"
            )

            # Define the prompt with few-shot examples
            prompt = (
                "You are an expert in analyzing legal texts and generating concise, contextually appropriate summary titles for semantic search. "
                "Your task is to identify key information from legal contracts, such as tenant and landlord roles and names, goods or property price and address, "
                "and other relevant aspects. "
                "Use precise legal terminology, avoid specific names, entities, or dates, and ensure the title is short, general, and accurately reflects the main content.\n\n"
                f"{examples}"
                f"Text:\n{chunk}\n\n"
                "Summary:"
            )

            # Prepare the message format for the generate method
            messages = [[HumanMessage(content=prompt)]]

            # print(f"\n--- Processing Chunk ---\n{chunk[:200]}...")  # Display the first 200 characters of the chunk for readability

            # Call the Cohere LLM's generate method with a delay
            response = self.llm.generate(messages=messages)

            # Introduce a 6-second delay to prevent rate limiting
            time.sleep(6)

            # Extract the generated title from the LLMResult
            if response.generations and len(response.generations) > 0:
                title = response.generations[0][0].text.strip()
                # print(f"--- Generated Title ---\n{title}\n")  # Display the generated title
                return title
            else:
                raise ValueError("No generations returned by the LLM.")

        except Exception as e:
            print(f"Error generating title: {e}")
            return f"No title generated due to an error: {str(e)}"





    def retrieve_with_cosine_similarity(self, query):

        query_embedding = np.array(self.embeddings_model.embed(query))

        all_embeddings = [
            np.array(self.embeddings_model.embed(doc.page_content))
            for doc in self.documents
        ]
        all_embeddings = np.vstack(all_embeddings)

        # Calculate cosine similarities
        similarities = cosine_similarity(query_embedding, all_embeddings)[0]

        sorted_indices = np.argsort(similarities)[::-1]
        sorted_docs = [self.documents[i] for i in sorted_indices]

        # Print chunks with similarity scores
        # for i, doc in enumerate(sorted_docs[:5]):
        #     print(f"Chunk {i+1}: {doc.page_content}...")
        #     print(f"Similarity Score: {similarities[sorted_indices[i]]}")
        #     print("---" * 50)

        return sorted_docs[:5]


class QAChain:
    def __init__(self, document_processor):
        self.document_processor = document_processor
        self.llm = ChatCohere()
        self.chain = self._initialize_chain()


    def _initialize_chain(self):
        print("Initializing retrieval chain and LLM...")
        system_prompt = (
            "You are a legal specialist, Use the information provided below to answer the question precisely. "
            "Follow these rules: "
            "1. Provide the exact answer as it appears in the provided content. Never ever rephrase, interpret, or expand on the answer. "
            "2. If additional information relevant to the answer is available, provide it after the exact answer. Never ever rephrase, interpret, or expand on the answer."
            "3. If you cannot find the answer in the provided content, respond with: 'I could not find the answer.' "
            "4. If the provided content or question is in another European language, answer in that language. "
            "5. If you cannot recognize the language or respond in it, state: 'Please use English or another European language.' "
            "Context: {context}"
        )

        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "{input}"),
        ])
        question_answer_chain = create_stuff_documents_chain(self.llm, prompt)

        return create_retrieval_chain(
            retriever=self.document_processor.vector_store.as_retriever(),
            combine_docs_chain=question_answer_chain
        )

    def run_queries(self, queries):
        responses = []
        for query in queries:
            # print(f"Running query: '{query}'")
            try:
                retrieved_docs = self.document_processor.retrieve_with_cosine_similarity(query)
                context = " ".join(doc.page_content for doc in retrieved_docs[:3])
                # print(f"Context passed to LLM for query '{query}':\n{context}\n{'-'*50}")
                # print(context)

                response = self.chain.invoke({"input": query, "context": context})
                time.sleep(6)
                # print(f"LLM response: {response}")

                responses.append({
                    "query": query,
                    "answer": response.get("answer", "I could not find the answer.")
                })
            except Exception as e:
                # print(f"Error during query: {e}")
                responses.append({
                    "query": query,
                    "answer": "Error during query."
                })
        return responses




# Function to Run Queries Independently
def run_independent_queries(queries):
    """
    Function to execute queries without restarting the document processing steps.
    """
    print("\n" + "=" * 70)
    print("Executing Queries and Displaying Results")
    print("=" * 70 + "\n")

    results = qa_chain.run_queries(queries)

    for i, result in enumerate(results, start=1):
        print(f"\nQuery {i}/{len(results)}")
        print("-" * 70)
        print(f"📋 Query: {result['query']}")
        print(f"✅ Answer: {result['answer']}")
        print("-" * 70)

    print("\n" + "=" * 70)
    print("All Queries Processed")
    print("=" * 70 + "\n")


In [7]:
start_time = time.time()

# Main Script
file_path = "/content/document-one.txt"
persist_dir = "/content/collections_one"

# Step 1: Initialize Embedding Model
embeddings_model = LegalBERTEmbeddings()

# Step 2: Load Files and Create Vector Store
doc_processor = DocumentProcessor(
    collection_name="legal_docs_one",
    persist_directory=persist_dir,
    embeddings_model=embeddings_model
)

# Run this only once to preprocess documents and store embeddings
doc_processor.load_files(file_path)

# Step 3: Define Query System
qa_chain = QAChain(document_processor=doc_processor)


end_time = time.time()

# Calculate the execution time
execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")



LegalBERT model loaded successfully.
DocumentProcessor initialized with collection_name: legal_docs_one


  embedding=HuggingFaceEmbeddings(model_name="nlpaueb/legal-bert-base-uncased"),


Vector store created and persisted successfully.
Initializing retrieval chain and LLM...
Total Execution Time: 287.71 seconds


In [8]:
start_time = time.time()
queries = ["What is the location of the rental property?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results


Query 1/1
----------------------------------------------------------------------
📋 Query: What is the location of the rental property?
✅ Answer: Stationsstraat 12, 2590 Berlaar Kontich
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 21.29 seconds


In [9]:
start_time = time.time()
queries = ["Who is the tenant?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results


Query 1/1
----------------------------------------------------------------------
📋 Query: Who is the tenant?
✅ Answer: Calimero BV
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 20.68 seconds


In [10]:
start_time = time.time()
queries = ["Who is the landlord?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results


Query 1/1
----------------------------------------------------------------------
📋 Query: Who is the landlord?
✅ Answer: Huppeldepup NV
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 20.83 seconds


In [11]:
start_time = time.time()
queries = ["What is the rental price?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results


Query 1/1
----------------------------------------------------------------------
📋 Query: What is the rental price?
✅ Answer: I could not find the answer.
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 20.75 seconds


In [21]:
start_time = time.time()

# Main Script
file_path = "/content/document-two.txt"
persist_dir = "/content/collections_twoo"

# Step 1: Initialize Embedding Model
embeddings_model = LegalBERTEmbeddings()

# Step 2: Load Files and Create Vector Store
doc_processor = DocumentProcessor(
    collection_name="legal_docs_two",
    persist_directory=persist_dir,
    embeddings_model=embeddings_model
)

# Run this only once to preprocess documents and store embeddings
doc_processor.load_files(file_path)

# Step 3: Define Query System
qa_chain = QAChain(document_processor=doc_processor)


end_time = time.time()

# Calculate the execution time
execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")



LegalBERT model loaded successfully.
DocumentProcessor initialized with collection_name: legal_docs_two




Vector store created and persisted successfully.
Initializing retrieval chain and LLM...
Total Execution Time: 448.33 seconds


In [22]:
start_time = time.time()
queries = ["What is the location of the rental property?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results


Query 1/1
----------------------------------------------------------------------
📋 Query: What is the location of the rental property?
✅ Answer: I could not find the answer.
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 27.61 seconds


In [23]:
start_time = time.time()
queries = ["Who is the tenant?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results


Query 1/1
----------------------------------------------------------------------
📋 Query: Who is the tenant?
✅ Answer: The emphyteutic lessee.
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 29.72 seconds


In [24]:
start_time = time.time()
queries = ["Who is the landlord?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results


Query 1/1
----------------------------------------------------------------------
📋 Query: Who is the landlord?
✅ Answer: I could not find the answer.
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 29.38 seconds


In [25]:
start_time = time.time()
queries = ["What is the rental price?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results


Query 1/1
----------------------------------------------------------------------
📋 Query: What is the rental price?
✅ Answer: I could not find the answer.
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 31.73 seconds


In [26]:
start_time = time.time()
queries = ["What costs should the tenant carry?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results


Query 1/1
----------------------------------------------------------------------
📋 Query: What costs should the tenant carry?
✅ Answer: All taxes, charges and levies, of whatever nature, that currently or in the future apply to the land or building (existing or to be erected), are entirely and exclusively borne by the emphyteutic lessee from the entry into force of this agreement, and this for its entire duration.
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 34.19 seconds


In [27]:
start_time = time.time()

# Main Script
file_path = "/content/document-three.txt"
persist_dir = "/content/collections_three"

# Step 1: Initialize Embedding Model
embeddings_model = LegalBERTEmbeddings()

# Step 2: Load Files and Create Vector Store
doc_processor = DocumentProcessor(
    collection_name="legal_docs_three",
    persist_directory=persist_dir,
    embeddings_model=embeddings_model
)

# Run this only once to preprocess documents and store embeddings
doc_processor.load_files(file_path)

# Step 3: Define Query System
qa_chain = QAChain(document_processor=doc_processor)




end_time = time.time()

# Calculate the execution time
execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")



LegalBERT model loaded successfully.
DocumentProcessor initialized with collection_name: legal_docs_three




Vector store created and persisted successfully.
Initializing retrieval chain and LLM...
Total Execution Time: 32.30 seconds


In [28]:
start_time = time.time()
queries = ["Which court should be used in case of disputes?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results






Query 1/1
----------------------------------------------------------------------
📋 Query: Which court should be used in case of disputes?
✅ Answer: The courts of Ghent shall have exclusive jurisdiction to settle any dispute arising out of or in connection with this Agreement (including a dispute relating to non-contractual obligations arising out of or in connection with this Agreement) which the Parties are unable to settle amicably.
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 12.76 seconds


In [29]:
start_time = time.time()
queries = ["Which jurisdiction is relevant for this contract?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results






Query 1/1
----------------------------------------------------------------------
📋 Query: Which jurisdiction is relevant for this contract?
✅ Answer: This Agreement shall be governed by and construed in accordance with the laws of Belgium.
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 11.82 seconds


In [30]:
start_time = time.time()
queries = ["What are the IP transfer provisions?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results






Query 1/1
----------------------------------------------------------------------
📋 Query: What are the IP transfer provisions?
✅ Answer: I could not find the answer.
----------------------------------------------------------------------

All Queries Processed

Total Execution Time: 10.87 seconds


In [31]:
start_time = time.time()
queries = ["Under what circumstances can the contract be terminated?"]

run_independent_queries(queries)

end_time = time.time()


execution_time = end_time - start_time
print(f"Total Execution Time: {execution_time:.2f} seconds")


Executing Queries and Displaying Results






Query 1/1
----------------------------------------------------------------------
📋 Query: Under what circumstances can the contract be terminated?
✅ Answer: 1. Term and terminatio
This Agreement is entered into on the Date of the Agreement for an indefinite term.
The Company and the Manager may at any time terminate the Agreement with a six (6) months' prior written notice.
Without prejudice to Clause 5.2, the Company can terminate this Agreement with immediate effect by notifying the Manager thereof via registered letter, in the event any of the following has occurred:
- - - 1. fraud (bedrog/dol), willful misconduct (eigen opzet/fait intentionel) or gross negligence (grove nalatigheid/faute lourde) by the Manager or the Permanent Representative in relation to the...performance of the Service 2. a material or persistent breach by the Manager or the Permanent Representative of any of the...obligations under this Agreement, if such breach is not capable of being remedied or has not been