In [3]:
uv pip install crewai docling

[2mUsing Python 3.12.6 environment at: /Users/lorenzejay/.pyenv/versions/3.12.6[0m
[2mAudited [1m3 packages[0m [2min 82ms[0m[0m
Note: you may need to restart the kernel to use updated packages.


# Create a knowledge source with Custom Chunking Strategies

In [11]:
# from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
from crewai.knowledge.source.excel_knowledge_source import ExcelKnowledgeSource
from docling.document_converter import DocumentConverter
from typing import Any, List, Dict
from pathlib import Path

class CustomKnowledgeSource(BaseFileKnowledgeSource):
    """File-based knowledge source that converts files with Docling and chunks
    them with Docling's ``HybridChunker``.

    Every stored chunk is the ``str()`` of a ``{"text": ..., "metadata": ...}``
    dict, so the page numbers and section headings travel with the chunk text.
    """

    def load_content(self) -> Dict[Path, Any]:
        """Convert every file in ``self.safe_file_paths`` with Docling.

        Returns:
            Mapping of each file path to the ``.document`` attribute of its
            Docling conversion result.
        """
        doc_converter = DocumentConverter()
        return {
            file_path: doc_converter.convert(file_path).document
            for file_path in self.safe_file_paths
        }

    def add(self) -> None:
        """Validate the content, chunk it, and hand the chunks to storage.

        The chunks are kept on ``self.chunks`` before ``_save_documents()``
        is called.
        """
        self.validate_content()
        # ``self.content`` is normally the {path: document} mapping built by
        # ``load_content``; only the documents themselves are chunked.
        if isinstance(self.content, dict):
            content = self.content.values()
        else:
            content = self.content

        self.chunks = self._chunk_content(content)
        self._save_documents()

    def _collect_page_numbers(self, chunk: Any) -> List[int]:
        """Return the sorted, de-duplicated page numbers referenced by a chunk.

        Walks ``chunk.meta.doc_items`` and each item's ``prov`` entries,
        tolerating missing attributes, and keeps every non-``None`` ``page_no``.
        """
        page_numbers = set()
        for doc_item in getattr(chunk.meta, "doc_items", None) or []:
            for prov_item in getattr(doc_item, "prov", None) or []:
                page_no = getattr(prov_item, "page_no", None)
                if page_no is not None:
                    page_numbers.add(page_no)
        # Sorted so the stored metadata string is deterministic across runs.
        return sorted(page_numbers)

    def _chunk_content(self, content: Any) -> List[str]:
        """Chunk each document with Docling's ``HybridChunker`` and attach metadata.

        For every chunk the serialized (context-enriched) text is paired with
        metadata: source filename, page numbers, headings, chunk length,
        chunk index, total chunk count for the document, and a processing
        timestamp.

        Args:
            content: Iterable of Docling documents (each must expose ``name``
                and be accepted by ``HybridChunker.chunk``).

        Returns:
            One string per chunk, each the ``str()`` of a
            ``{"text": ..., "metadata": ...}`` dict.
        """
        from datetime import datetime
        from docling.chunking import HybridChunker

        # The chunker does not depend on the document, so build it once
        # instead of once per contract.
        chunker = HybridChunker()

        contracts_for_db = []
        for contract in content:
            chunks = list(chunker.chunk(contract))
            total_chunks = len(chunks)

            for i, chunk in enumerate(chunks):
                enriched_text = chunker.serialize(chunk=chunk)

                # Key order is part of the stored string; keep it stable.
                metadata = {
                    "filename": f"{contract.name}.pdf",
                    "page_numbers": self._collect_page_numbers(chunk),
                    # Always a list, even when the chunk has no headings.
                    "headings": getattr(chunk.meta, "headings", None) or [],
                    "chunk_length": len(enriched_text),
                    "chunk_index": i,
                    "total_chunks": total_chunks,
                    "processed_date": datetime.now().isoformat(),
                }

                # Preview the second chunk (index 1) of each document so the
                # notebook output shows what is being stored.
                if i == 1:
                    print('enriched_text', enriched_text)
                    print('metadata', metadata)

                contracts_for_db.append(str({
                    "text": enriched_text,
                    "metadata": metadata
                }))
        return contracts_for_db
        





# CustomKnowledgeSource(file_paths=['/contracts/AimmuneTherapeuticsInc_20200205_8-K_EX-10.3_11967170_EX-10.3_Development Agreement.pdf']).add()

In [12]:
from crewai import Agent, Crew, Process, Task

# Inputs a reader is most likely to change, gathered in one place.
CONTRACT_FILE = "/contracts/AimmuneTherapeuticsInc_20200205_8-K_EX-10.3_11967170_EX-10.3_Development Agreement.pdf"
QUESTION = "How does Regulatory Approval work for AimmuneTherapeuticsInc?"

# Knowledge source built from the custom Docling-backed class defined above.
knowledge_source = CustomKnowledgeSource(file_paths=[CONTRACT_FILE])

# Single agent that has the contract attached as a knowledge source.
agent = Agent(
    role="Contract Analyst",
    goal="Analyze the contract and provide a summary of the contract and answer the question",
    backstory="You are a contract analyst with a deep understanding of the contract and the contract law.",
    knowledge_sources=[knowledge_source],
    verbose=True,
)

# ``{question}`` is filled in from the ``inputs`` passed to ``kickoff``.
task = Task(
    description="Analyze the contract and provide a summary of the contract. And answer the question: {question}, be sure to cite the section heading and page number of the answer.",
    expected_output="A summary of the contract.",
    agent=agent,
)

crew = Crew(
    agents=[agent],
    tasks=[task],
    process=Process.sequential,
    verbose=True,
)

result = crew.kickoff(inputs={"question": QUESTION})
print('result', result)





Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors


enriched_text Exhibit 10.3
TABLE OF CONTENTS
ARTICLE 1 Definitions, Page = 1. ARTICLE 2 Licenses, Page = 13. ARTICLE 3 Development, Page = 16. ARTICLE 4 Regulatory, Page = 17. ARTICLE 5 Commercialization, Page = 19. ARTICLE 6 Supply, Page = 20. ARTICLE 7 Payments, Page = 21. ARTICLE 8 Payment; Records; Audits, Page = 24. ARTICLE 9 Intellectual Property Matters, Page = 26. ARTICLE 10 Representations, Warranties and Covenants; Compliance, Page = 31. ARTICLE 11 Indemnification, Page = 34. ARTICLE 12 Confidentiality, Page = 36. ARTICLE 13 Term and Termination, Page = 40. ARTICLE 14 Effects of Expiration Or Termination, Page = 40. ARTICLE 15 Miscellaneous, Page = 43. Schedule 1.10 Antibody, Page = 50. Schedule 1.79 Xencor General Patents, Page = 51. Schedule 1.81 Xencor Product Specific Patents, Page = 52. Schedule 2.7 Xencor Know-How, Regulatory Materials, and Regulatory Data, Page = 53. Schedule 6.1 Initial Product Supply, Page = 54. Schedule 10.2.6 Exceptions, Page = 55. Schedule 12.2 In

Overriding of current TracerProvider is not allowed


agent_knowledge_snippets [{'id': 'ebfbf3069d3f14368eea54f2acef33b6d80e97bfa4cd5b2c9ea1dea311e405d5', 'metadata': None, 'context': "{'text': '4.1 Regulatory Filings and Regulatory Approvals.\\n4.1.1 General Responsibilities; Ownership of Regulatory Approvals. Aimmune shall be responsible for the preparation of all Regulatory Materials necessary or desirable for obtaining and maintaining the Regulatory Approvals for the Product and Aimmune shall submit such Regulatory Materials, as applicable, to the applicable Governmental Authorities. For clarity, to the extent allowed by Applicable Law, all Regulatory Approvals for the Product shall be held and owned by Aimmune in its name.\\n4.1.2 Pricing Approvals. To the extent that a given country or regulatory jurisdiction requires Pricing Approval for sale of the Product, Aimmune shall (to the extent permitted by Applicable Laws) be solely responsible for (and shall use Commercially Reasonable Efforts toward) obtaining and maintaining Pricing Ap