# Project Home Match

## Modules

Load the modules

In [1]:
import json
from uuid import uuid4
from time import sleep
import random

#from langchain_openai import OpenAI
from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from tqdm.notebook import tqdm

## OpenAI model

In [2]:
from dotenv import load_dotenv
if not load_dotenv(): # keys are loaded from .env file
    print("Warning: .env file not found. Make sure to set environment variables manually.")

Instanciate the OpenAI model

In [3]:
model_name="gpt-3.5-turbo-instruct"
#model_name="gpt-4o-mini"
temperature = 0.01
llm = OpenAI(
    model_name=model_name, temperature=temperature, max_tokens=3500
)

## Generating Real Estate Listings

Done as JSON format structured data and saved in a file

In [4]:
class PropertyAdvertClass(BaseModel):
    location: str = Field(
        description = "location in USA including the name the neighborhood"
    )
    style: str = Field(
        description = "style of construction"
    )
    rooms: int = Field(
        description = "number of rooms"
    )
    bedrooms: int = Field(
        description = "number of bedrooms"
    )
    bathrooms: int = Field(
        description = "number of bathrooms"
    )
    floors: int = Field(
        description = "number of floors"
    )
    house_size: int = Field(
        description = "surface area in square feet"
    )
    price: int = Field(
        description = "price in dollars"
    )
    property_description : str = Field(
        description = "a detailed description including its surface area in square feet, the number of rooms, bedrooms and bathrooms, the number of floors, if there are a garage and a garden, the style of construction and its price in dollars"
    )
    neighborhood_description : str = Field(
        description = "the neighborhood description"
    )

class ListOfAdvertsClass(BaseModel):
    adverts_list: list[PropertyAdvertClass]

In [5]:
parser = PydanticOutputParser(pydantic_object=ListOfAdvertsClass)

gen_prompt = PromptTemplate(
    template="{question}.{context}\n{format_instructions}",
    input_variables=["question", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions},
)

We draw up a list of 20 entries, accessing the model 4 times to avoid the maximum token limit.

In [6]:
requests_data = [
    {
        "num_ads": 5,
        "context": "the Northeast of the USA",
    },
    {
        "num_ads": 5,
        "context": "the Midwest of the USA",
    },
    {
        "num_ads": 5,
        "context": "the South of the USA",
    },
    {
        "num_ads": 5,
        "context": "the West of the USA",
    },
]

all_generated_adverts = []
for request in tqdm(requests_data):
    num_ads = request["num_ads"]
    context = request["context"]

    adverts_query = f"""
        generate {num_ads} real estate advertisements for middle-class buyers, each respecting the output schema, and all gathered in a unique array. be creative in your descriptions but consistent and realistic.
    """
    context_query = f"the following is a list of properties for sale in {context}."

    prompt = gen_prompt.format(question=adverts_query, context=context_query)
    #print(prompt)
    generated_adverts = llm.invoke(prompt)
    #print(generated_adverts)
    all_generated_adverts.append(parser.parse(generated_adverts))

    sleep(2)

  0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
for generated_adverts in all_generated_adverts:
    print(generated_adverts.adverts_list[0], end="\nn")

location='Brooklyn, New York' style='Brownstone' rooms=6 bedrooms=3 bathrooms=2 floors=3 house_size=2000 price=800000 property_description="This beautiful Brownstone in the heart of Brooklyn offers 3 spacious bedrooms, 2 full bathrooms, and a total of 6 rooms. With 3 floors, this property boasts a total of 2000 square feet. The classic Brownstone style adds charm and character to this home. Don't miss out on this amazing opportunity for only $800,000!" neighborhood_description='Located in the trendy neighborhood of Brooklyn, this property is surrounded by local shops, restaurants, and parks. With easy access to public transportation, you can easily explore all that New York City has to offer.'
nlocation='Chicago, IL - Lincoln Park' style='Victorian' rooms=8 bedrooms=4 bathrooms=3 floors=3 house_size=2500 price=750000 property_description='This beautiful Victorian home in the heart of Lincoln Park boasts 8 rooms, including 4 bedrooms and 3 bathrooms. With 3 floors, there is plenty of sp

## Save listings in a file

Save the listings in a file.

In [8]:
filename = "listings.jsonl"
with open(filename, "w") as save_file:
    for generated_adverts in all_generated_adverts:
        for advert in generated_adverts.adverts_list:
            json.dump(advert.model_dump(mode="json"), save_file)
            save_file.write('\n')
    save_file.close()
!cp {filename} listings.txt

## Storing Listings in a Vector Database

Launch the vector database and store the listings in it.

In [9]:
vector_store_directory = "./chroma_langchain_db"
!rm -rf {vector_store_directory}

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = Chroma(
    collection_name="real_estate",
    embedding_function=embeddings,
    persist_directory=vector_store_directory,  # Where to save data locally, remove if not necessary
)

In [10]:
documents = []
for generated_adverts, geo_context in zip(all_generated_adverts, requests_data):
    for i, advert in enumerate(generated_adverts.adverts_list, start=1):
        metadata = {}
        metadata["source"] = model_name
        metadata["id"] = i
        
        metadata["region"] = geo_context["context"]
        metadata["location"] = advert.location
        metadata["style"] = advert.style
        metadata["rooms"] = advert.rooms
        metadata["bedrooms"] = advert.bedrooms
        metadata["bathrooms"] = advert.bathrooms
        metadata["floors"] = advert.floors
        metadata["house_size"] = advert.house_size
        metadata["price"] = advert.price

        page_content = advert.property_description + advert.neighborhood_description
        documents.append(
            Document(page_content=page_content, metadata=metadata)
            )
    
uuids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, ids=uuids)
print(f"{len(documents)} documents added to the vector store")

20 documents added to the vector store


Check the database to see if the listings are stored correctly.

In [11]:
# Check if the vector store is already populated
print(len(vector_store.get(where={"source": f"{model_name}"})["ids"]), "documents in the vector store")

20 documents in the vector store


## Handling the user query

Change the query to your needs in `user_search_query` variable. Or use the `HomeMatch_clint.py` script to run the code using a user interface.

In [12]:
user_search_request = "I'm looking for a modern-style house with a sea view for a large family. The house should be located in a quiet neighborhood in San Francisco, California. It should have at least 2 bathrooms and a large garden. The total area of the house should be at least 2500 square feet, and the price should not exceed $500,000."

An alternative way to search for ads based on the user's query would be to extract elements from specific descriptions and build a query based mainly on objective, factual, quantitative or nominal elements. For this, defining the structure of data receiving the user query analysis result.

In [13]:
class PropertySearchRequestClass(BaseModel):
    location: str = Field(
        description = "location in USA including the name the neighborhood"
    )
    style: str = Field(
        description = "style of construction"
    )
    rooms: int = Field(
        description = "number of rooms"
    )
    bedrooms: int = Field(
        description = "number of bedrooms"
    )
    bathrooms: int = Field(
        description = "number of bathrooms"
    )
    floors: int = Field(
        description = "number of floors"
    )
    house_size: int = Field(
        description = "surface area in square feet"
    )
    price: int = Field(
        description = "price in dollars"
    )

In [14]:
parser = PydanticOutputParser(pydantic_object=PropertySearchRequestClass)

analyzing_prompt_generator = PromptTemplate(
    template="{query}\n{format_instructions}",
    input_variables=["query", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions},
)

def get_analyze_query(user_search_request):
    return f"""
find all available information as defined in the following output schema from this request :"{user_search_request}", in order to generate a corresponding filled json data structure. set -1 for any missing information.
"""

Analyze the user query using the OpenAI model and store the result in a variable.

In [15]:
prompt = analyzing_prompt_generator.format(query=get_analyze_query(user_search_request))
analyzed_request = llm.invoke(prompt)
print("Analyzed request: ", analyzed_request)
parsed_response = parser.parse(analyzed_request)
print("Parsed response: ", parsed_response)

Analyzed request:  
{"location": "San Francisco, California", "style": "modern", "rooms": -1, "bedrooms": -1, "bathrooms": 2, "floors": -1, "house_size": 2500, "price": 500000}
Parsed response:  location='San Francisco, California' style='modern' rooms=-1 bedrooms=-1 bathrooms=2 floors=-1 house_size=2500 price=500000


Create a summary of the data detected in the user query.

In [16]:
summary = []
for k, v in vars(parsed_response).items():
    if v != -1:
        if k == "location":
            summary.append(f"located in {v}")
        elif k == "style":
            summary.append(f"{v}-style")
        elif k == "rooms":
            summary.append(f"{v} room(s)")
        elif k == "bedrooms":
            summary.append(f"{v} bedroom(s)")
        elif k == "bathrooms":
            summary.append(f"{v} bathroom(s)")
        elif k == "floors":
            summary.append(f"{v} floors")
        elif k == "house_size":
            summary.append(f"{v} square feet size")
        elif k == "price":
            summary.append(f"priced ${v}")

summary = ", ".join(summary)
print("Your are looking a house like this :", summary)

Your are looking a house like this : located in San Francisco, California, modern-style, 2 bathroom(s), 2500 square feet size, priced $500000


## Searching Based on Preferences

But finally we're going to perform a search based on the user's entire search query. So, search in the vector database the closest advertisements based on the semantic closeness.

In [20]:
results = vector_store.similarity_search(
    user_search_request,
    k=3,
    filter={"source": model_name},
)
for res in results:
    print(f"> {res.page_content} [{res.metadata}]", end="\n\n")

> This stunning contemporary home in Seattle boasts 4 bedrooms, 3 bathrooms, and 3 floors. With a spacious 3000 square feet, this home is perfect for a large family or those who love to entertain. The sleek and modern design is sure to impress.Located in the bustling neighborhood of Capitol Hill, this home is surrounded by trendy restaurants, bars, and shops. Enjoy the vibrant city life and easy access to public transportation. [{'id': 3, 'style': 'Contemporary', 'floors': 3, 'house_size': 3000, 'source': 'gpt-3.5-turbo-instruct', 'region': 'the West of the USA', 'bedrooms': 4, 'price': 800000, 'rooms': 7, 'bathrooms': 3, 'location': 'Seattle, Washington'}]

> This beautiful modern home in the heart of Los Angeles features 3 bedrooms, 2 bathrooms, and 2 floors. With a spacious 2000 square feet, this home is perfect for a growing family. The sleek and stylish design is sure to impress. Don't miss out on this opportunity to own a piece of LA.Located in the trendy neighborhood of Silver L

## Personalizing Listing Descriptions

Augment the listing descriptions with regard to the user preferences.

In [26]:
def get_response_augmentation_request(advertisement, search_request):
    start_with_list = ["Welcome to", "This wonderful", "This fantastic", "This awesome", "This incredible"]
    start_with = random.choice(start_with_list)
    return f"""create a new description from this real estate advertisement {advertisement} in order to show that it matches the following search request: {search_request}, with respect to the information included in the advertisement. Add arguments in the new description to make it more attractive. it must be in a natural language format, and must not be a json object. The advertisement should be at least 100 words long, in only one paragraph. All the text is in English. Start with '{start_with}!'"""

In [27]:
for result in results:
    metadata = result.metadata
    metadata.pop('source', None)
    metadata.pop('id', None)
    advertisement = f"{result.page_content} [{metadata}]"
    augmented_advert = llm.invoke(get_response_augmentation_request(advertisement, user_search_request))
    print("> augmented :", augmented_advert[2:], end="\n\n")
    sleep(2)

> augmented : This incredible contemporary home in Seattle is the perfect match for your search request! With 4 bedrooms, 3 bathrooms, and 3 floors, this spacious 3000 square foot home is ideal for a large family or those who love to entertain. The sleek and modern design will surely impress, and the stunning sea view from the large garden will take your breath away. Located in the bustling neighborhood of Capitol Hill, you'll have easy access to trendy restaurants, bars, and shops, while still enjoying a quiet and peaceful atmosphere. And the best part? This dream home is within your budget, priced at only $800,000. Don't miss out on the opportunity to live in this desirable location on the West Coast of the USA. Come see it for yourself in Seattle, Washington!

> augmented : This wonderful modern home in the heart of Los Angeles is the perfect fit for a large family looking for a stylish and spacious living space. Boasting 3 bedrooms, 2 bathrooms, and 2 floors, this home offers a tot