In [140]:
from dotenv import load_dotenv
from langchain_huggingface  import HuggingFaceEndpoint
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Load API credentials from a local .env file into the process environment.
load_dotenv()

# Hugging Face endpoint handle. The chain assembled below runs on `model`, not
# on `llm`; the instance is still created here so the name stays defined.
llm = HuggingFaceEndpoint(
    repo_id="tiiuae/falcon-7b-instruct",
    model_kwargs={"return_dict": True},
)

# OpenAI chat model that performs the reformatting.
model = ChatOpenAI(model="gpt-4o-mini")

# Prompt text for the reformatting step. The trailing backslash joins the first
# two instruction lines into a single line of the final prompt.
FORMAT_TEMPLATE = """
    Please take the following text and format it into sections with clear headings and paragraphs in French. \
    Extract all the text dont cut nothing
    {text}
    """

prompt = PromptTemplate(template=FORMAT_TEMPLATE, input_variables=["text"])

# Pipeline: fill the prompt -> call the chat model -> reduce the reply to a string.
to_text = StrOutputParser()
chain = prompt | model | to_text

# `flat_text` is not defined in this cell; it must already exist in the kernel.
result = chain.invoke({"text": flat_text})
print(result)


# Document de Publication au Moniteur Belge

## Informations Générales

- **Mod** : DoC 18.01
- **Date de dépôt** : 20 DEC. 2023
- **Numéro d'entreprise** : 24000006
- **Forme légale** : SOCIETE ANONYME
- **Nom de l'entreprise** : BIOCODEX BENELUX
- **Numéro d'entreprise** : 0401 936 623
- **Adresse complète du siège** : BOULEVARD DE L'HUMANITE 292, 1190 FOREST

## Objet de l'Acte

### Reconduction des Mandats d'Administrateurs

L'administrateur délégué - commissaire

Du procès-verbal de l'Assemblée Générale extraordinaire d.d. 7/11/2023, il apparaît que l'assemblée confirme à l'unanimité la décision suivante : 

1. La reconduction des mandats d'administrateurs des Messieurs François Hublot et Jean-Marie Lefevre ainsi que de la société par actions simplifiée française BIOCODEX (dont le représentant permanent est Monsieur Nicolas Coudurier), mandats commençant le premier janvier 2023 pour une durée de trois ans, se terminant après l'assemblée générale ordinaire qui aura lieu en 2026.

#

In [145]:
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_huggingface import ChatHuggingFace
from langchain_core.output_parsers import PydanticOutputParser
from typing import List

# Hugging Face endpoint for Mixtral, plus a chat wrapper around it.
# NOTE(review): neither `llm` nor `chat` is used by the extraction chain built
# further down in this cell (that chain runs on `model`). They are kept so the
# names remain defined for any other cell that may read them — confirm whether
# they can be dropped.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    model_kwargs={"return_dict": True}
)
chat = ChatHuggingFace(llm=llm, verbose=True)

# OpenAI chat model that actually drives the structured-extraction chain.
# temperature=0 requests the least random decoding the API offers, so re-running
# the notebook on the same document yields (near-)identical extracted fields
# instead of a freshly sampled paraphrase each time.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# Output schema: one record of business details extracted from a document.
# Documented with comments rather than a class docstring so that the model
# definition itself is left exactly as written.
# NOTE(review): presumably the Field descriptions reach the LLM through
# parser.get_format_instructions() — confirm before rewording them.
class Document(BaseModel):
    # Company name; the system prompt asks for the full name as it appears in the document.
    company_name: str = Field(description="Company Name")
    # Registration or business number identifying the company.
    company_identifier: str = Field(description="Company Identifier")
    # Purpose or intent of the document; the system prompt asks for it in English.
    document_purpose: str = Field(description="Document Purpose")
    # Details about the purpose (roles, positions, effective dates), held as one string.
    key_terms: str = Field(description="Key terms related with the document")

# Top-level container handed to PydanticOutputParser: a list of Document records.
# NOTE(review): "infomration" looks like a typo for "information". The field
# name shows up in the parsed result (Info(infomration=[...])), so renaming it
# changes the attribute callers read — rename deliberately, not as a drive-by.
class Info(BaseModel):
    infomration: List[Document]

# The parser is built around `Info`; it also supplies the schema instructions
# that are injected into the system prompt below.
parser = PydanticOutputParser(pydantic_object=Info)
format_instructions = parser.get_format_instructions()

# System message: role, the four fields to extract, then the schema instructions.
SYSTEM_MESSAGE = (
    "You are an AI assistant specialized in extracting key business details from documents. "
    "Your goal is to identify and translate relevant information into English if necessary, then format the output "
    "according to the specified JSON schema. Ensure that each extracted field is accurate, complete, and follows the schema precisely.\n\n"
    "Please extract the following fields:\n\n"
    "1. **Company Name**: The full name of the company as it appears in the document.\n"
    "2. **Company Identifier**: A unique identifier for the company, such as a registration or business number.\n"
    "3. **Document Purpose**: The purpose or intent of the document (e.g., 'Appointment of Directors', 'Annual Report'). "
    "Translate this to English if it’s in another language.\n"
    "4. **Key Terms about the Document Purpose**: Extract detailed information relevant to the document’s purpose, such as roles, positions, and effective dates. "
    "For instance, if the document covers the appointment of directors, include terms like the position title and effective date. Translate these terms to English if needed.\n\n"
    "{format_instructions}"
)

# Human message: the raw document text, supplied at invoke time as `query`.
HUMAN_MESSAGE = "{query}"

prompt = ChatPromptTemplate.from_messages(
    [("system", SYSTEM_MESSAGE), ("human", HUMAN_MESSAGE)]
).partial(format_instructions=format_instructions)

# Pipeline: fill the prompt -> call the chat model -> parse the reply with `parser`.
chain = prompt | model | parser

# `flat_text` is not defined in this cell; it must already exist in the kernel.
# Left as the last expression so the notebook displays the parsed result.
chain.invoke({"query": flat_text})

Info(infomration=[Document(company_name='BIOCODEX BENELUX', company_identifier='0401 936 623', document_purpose="Renewal of Directors' Mandates", key_terms="Renewal of mandates for Messieurs François Hublot and Jean-Marie Lefevre, and the simplified joint-stock company BIOCODEX, effective January 1, 2023, for a duration of three years, ending after the ordinary general assembly in 2026. Appointment of François Hublot as delegated administrator starting January 1, 2023, for three years, ending with the general assembly in 2026. The auditor's mandate renewed for the next three years is SRL CDP Petit & Co represented by Benjamin Gorlier.")])

In [146]:
# Second extraction pass: send `result` (the reformatted text produced by the
# formatting chain) through the prompt/model/parser pipeline. The chain is
# rebuilt here because `chain` is also the name used for the formatting chain;
# `prompt`, `model` and `parser` must be the ones from the extraction cell.
chain = prompt | model | parser
reformatted_input = {"query": result}
chain.invoke(reformatted_input)

Info(infomration=[Document(company_name='BIOCODEX BENELUX', company_identifier='0401 936 623', document_purpose="Renewal of Directors' Mandates", key_terms="The mandates of directors Mr. François Hublot and Mr. Jean-Marie Lefevre, as well as the simplified joint-stock company BIOCODEX (represented by Mr. Nicolas Coudurier), are renewed starting January 1, 2023, for a duration of three years, ending after the ordinary general assembly in 2026. Mr. François Hublot's mandate is renamed as executive director starting January 1, 2023, for a term of three years, concluding at the general assembly in 2026.")])