Utilisation de pydantic pour forcer le LLM à mieux respecter un format qui sera transformé grâce à des mécanismes de LangChain.

In [1]:
import os
os.environ["OPENAI_API_KEY"] = "voc-8162499801266773377505669655d3c05508.40840521"
os.environ["OPENAI_API_BASE"] = "https://openai.vocareum.com/v1"

In [2]:
#from langchain_openai import OpenAI
from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
import re

In [3]:
num_ads = 3
advert_separator = "======="

In [4]:
model_name="gpt-3.5-turbo-instruct" #gpt-4
temperature = 0.0
llm = OpenAI(
    model_name=model_name, temperature=temperature, max_tokens=3000
)

In [5]:
class PropertyAdvertClass(BaseModel):
    neighborhood: str = Field(
        description = "location in USA including the name the neighborhood"
    )
    style: str = Field(
        description = "style of construction"
    )
    rooms: int = Field(
        description = "number of rooms"
    )
    bedrooms: int = Field(
        description = "number of bedrooms"
    )
    bathrooms: int = Field(
        description = "number of bathrooms"
    )
    floors: int = Field(
        description = "number of floors"
    )
    house_size: str = Field(
        description = "surface area in square feet"
    )
    price: str = Field(
        description = "price in dollars"
    )
    property_description : str = Field(
        description = "a detailed description of the property"
    )
    neighborhood_description : str = Field(
        description = "the neighborhood description"
    )

In [6]:
parser = PydanticOutputParser(pydantic_object=PropertyAdvertClass)
print(parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"neighborhood": {"description": "location in USA including the name the neighborhood", "title": "Neighborhood", "type": "string"}, "style": {"description": "style of construction", "title": "Style", "type": "string"}, "rooms": {"description": "number of rooms", "title": "Rooms", "type": "integer"}, "bedrooms": {"description": "number of bedrooms", "title": "Bedrooms", "type": "integer"}, "bathrooms": {"description": "number of bathrooms", "title": "Bathrooms", "type": "integer"}, "floors": {"description": "number of floors", "t

In [7]:
prompt = PromptTemplate(
    template="{question}\n{format_instructions}",
    input_variables=["question", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions},
)

In [8]:
question = f"""
    generate {num_ads} property advertisements for middle-class buyers.
    be creative in your descriptions but consistent and realistic.
"""
query = prompt.format(question=question)
print(query)


    generate 3 property advertisements for middle-class buyers.
    be creative in your descriptions but consistent and realistic.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"neighborhood": {"description": "location in USA including the name the neighborhood", "title": "Neighborhood", "type": "string"}, "style": {"description": "style of construction", "title": "Style", "type": "string"}, "rooms": {"description": "number of rooms", "title": "Rooms", "type": "integer"}, "bedrooms": {"description": "number of bedrooms", "title": "Bedrooms", "type": "integer"}, "bathroom

In [9]:
generated_adverts = llm.invoke(query)

In [10]:
print(generated_adverts)


{
    "neighborhood": "Brooklyn Heights, New York",
    "style": "Victorian Townhouse",
    "rooms": 8,
    "bedrooms": 4,
    "bathrooms": 2,
    "floors": 3,
    "house_size": "2,500 sqft",
    "price": "$1,200,000",
    "property_description": "Welcome to your dream home in the charming and historic neighborhood of Brooklyn Heights. This stunning Victorian townhouse boasts 4 spacious bedrooms, 2 luxurious bathrooms, and 3 levels of living space. The elegant and timeless design features high ceilings, original hardwood floors, and intricate moldings. Enjoy cooking in the gourmet kitchen with top-of-the-line appliances and entertain guests in the formal dining room. Relax in the beautifully landscaped backyard oasis or take a stroll to the nearby parks and waterfront. Don't miss out on the opportunity to own a piece of Brooklyn's rich history.",
    "neighborhood_description": "Brooklyn Heights is known for its picturesque tree-lined streets, historic brownstones, and stunning views 

In [11]:
pattern = r"```json(.*?)```"
matches = re.findall(pattern, generated_adverts, re.DOTALL)
print(len(matches))
for match in matches:
    print(match.strip())

0


In [12]:
result = parser.parse(generated_adverts)

In [13]:
def parse_advert(str_advert: str) -> dict:
    advert = {}
    features = [
        "Index",
        "Neighborhood",
        "Style",
        "Rooms",
        "Bedrooms",
        "Bathrooms",
        "House Size",
        "Price",
        "Real Estate Description",
        "Neighborhood Description"
        ]
    regex_features = [r"^.*Advert #(\d+):.*$"]
    regex_features += ([re.compile(f"^{str}:\s*(.*)$") for str in features[1:]])
    
    split = str_advert.split("\n")
    for row in split:
        for feature, regex_feature in zip(features, regex_features):
            result = re.match(regex_feature, row)
            if result:
                advert[feature] = result.group(1)
                break

    #advert["Advert text"] = "\n".join(split[1:])
    return advert
    

In [14]:
generated_adverts_list = [ads.strip() for ads in generated_adverts.split(advert_separator)]
print(generated_adverts_list[0])
parse_advert(generated_adverts_list[0].strip())

{
    "neighborhood": "Brooklyn Heights, New York",
    "style": "Victorian Townhouse",
    "rooms": 8,
    "bedrooms": 4,
    "bathrooms": 2,
    "floors": 3,
    "house_size": "2,500 sqft",
    "price": "$1,200,000",
    "property_description": "Welcome to your dream home in the charming and historic neighborhood of Brooklyn Heights. This stunning Victorian townhouse boasts 4 spacious bedrooms, 2 luxurious bathrooms, and 3 levels of living space. The elegant and timeless design features high ceilings, original hardwood floors, and intricate moldings. Enjoy cooking in the gourmet kitchen with top-of-the-line appliances and entertain guests in the formal dining room. Relax in the beautifully landscaped backyard oasis or take a stroll to the nearby parks and waterfront. Don't miss out on the opportunity to own a piece of Brooklyn's rich history.",
    "neighborhood_description": "Brooklyn Heights is known for its picturesque tree-lined streets, historic brownstones, and stunning views o

{}

In [15]:
print(len(generated_adverts_list))

1


In [16]:
import json

In [17]:
generated_adverts_json = [parse_advert(advert.strip()) for advert in generated_adverts_list if advert]
for adv in generated_adverts_json:
    print(json.dumps(adv, indent=2))

{}


In [20]:
filename = "generated_adverts_b.jsonl"
with open(filename, "w") as save_file:
    for generated_advert_json in generated_adverts_json:
        json.dump(generated_advert_json, save_file)
        save_file.write('\n')
save_file.close()

In [19]:
with open(filename, "r") as file:
    for line in file:
        data_entry = json.loads(line)
        # Process each data_entry as a Python dict
        print(data_entry)

{}
