In [None]:
import os
import os
import json
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np

os.environ["OPENAI_API_KEY"] = ""
os.environ["OPENAI_API_BASE"] = "https://openai.vocareum.com/v1"

def normalize_embeddings(embeddings):
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

  from .autonotebook import tqdm as notebook_tqdm


## Step 2: Generating Real Estate Listings

Generate real estate listings using a Large Language Model. Generate at least 10 listings. This can involve creating prompts for the LLM to produce descriptions of various properties. An example of a listing might be:


**Neighborhood**: Green Oaks
Price: $800,000
Bedrooms: 3
Bathrooms: 2
House Size: 2,000 sqft

**Description**: Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without compromising on style in this Green Oaks gem.

**Neighborhood Description**: Green Oaks is a close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze.

In [11]:
# Initialize the LLM
llm = ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo")

# Create a prompt template asking the LLM to produce 10 listings in a structured JSON format
# We’ll request JSON output so it’s easier to parse programmatically.
prompt_template = PromptTemplate(
    input_variables=[],
    template="""
You are a real estate listing generator. Produce exactly 10 distinct real estate listings as JSON, 
each containing the following fields:
- neighborhood (string)
- price (integer, no $ sign)
- bedrooms (integer)
- bathrooms (integer)
- house_size (string, e.g. "2000 sqft")
- description (string, at least 800 characters, with rich and vivid details about the property)
- neighborhood_description (string, at least 600 characters, with detailed information about the neighborhood)

Try to vary neighborhoods, styles, and price ranges, and ensure the data is realistic but fictional.
Only output valid JSON (no extra text outside JSON). Your JSON should be a list of objects.

Example format:
[
  {{
    "neighborhood": "Green Oaks",
    "price": 800000,
    "bedrooms": 3,
    "bathrooms": 2,
    "house_size": "2000 sqft",
    "description": "This eco-friendly oasis boasts solar panels, hardwood floors, and an open kitchen/dining layout. The spacious living room features panoramic windows with breathtaking views of the surrounding forest.",
    "neighborhood_description": "Green Oaks is a family-friendly neighborhood with parks, excellent schools, and a thriving community atmosphere. The nearby forest reserve offers hiking trails, picnic areas, and natural beauty year-round."
  }},
  ...
]

Now, produce 10 such listings:
"""
)

chain = LLMChain(llm=llm, prompt=prompt_template)
response = chain.run({}) # don't need any input variables so simply pass {} otherwise it will throw an error

# Parse the JSON output
try:
    listings = json.loads(response)
except json.JSONDecodeError as e:
    # Print the specific error message and the part of the response that caused the issue
    print("JSON Decode Error:", str(e))
    print("Response that caused the error:")
    print(response[:1000])  # Limit the output length for readability (adjust as needed)
    raise ValueError("The LLM did not produce valid JSON.") from e

# Convert listings to a pandas DataFrame
df = pd.DataFrame(listings)

# Save the DataFrame to a CSV file
df.to_csv("real_estate_listings.csv", index=False)

print("Generated listings saved to real_estate_listings.csv:")
print(df.head())


Generated listings saved to real_estate_listings.csv:
         neighborhood    price  bedrooms  bathrooms house_size  \
0        Sunset Hills   650000         4          3  2800 sqft   
1  Oceanfront Estates  1500000         5          4  4000 sqft   
2         Maple Grove   400000         3          2  1800 sqft   
3    Lakeview Terrace   800000         4          3  2500 sqft   
4    Hillside Heights   550000         3          2  2000 sqft   

                                         description  \
0  Welcome to this charming colonial-style home i...   
1  Live the ultimate beachfront lifestyle in this...   
2  Nestled in the tranquil Maple Grove neighborho...   
3  This modern ranch-style home in Lakeview Terra...   
4  Perched on a hilltop in Hillside Heights, this...   

                            neighborhood_description  
0  Sunset Hills is known for its tree-lined stree...  
1  Oceanfront Estates is a prestigious waterfront...  
2  Maple Grove is a quiet and friendly communit

## Step 3: Storing Listings in a Vector Database

Vector Database Setup: Initialize and configure ChromaDB or a similar vector database to store real estate listings.
Generating and Storing Embeddings: Convert the LLM-generated listings into suitable embeddings that capture the semantic content of each listing, and store these embeddings in the vector database.

In [2]:

# Step 1: Load the previously saved real_estate_listings.csv
listings_file = "real_estate_listings.csv"
df = pd.read_csv(listings_file)

# Ensure numeric fields are integers or floats
df['price'] = df['price'].astype(int)
df['bedrooms'] = df['bedrooms'].astype(int)
df['bathrooms'] = df['bathrooms'].astype(int)

# Ensure the file is loaded correctly
print("Loaded Listings:\n", df.head())

# Step 2: Initialize ChromaDB with disk-based persistence
persist_directory = "./chroma_db"  # Directory to store the database
client = chromadb.PersistentClient(path=persist_directory)

# Step 3: Create or load a collection for the real estate listings
collection_name = "real_estate_listings"
collection = client.get_or_create_collection(collection_name)

# Step 4: Generate embeddings for the listings
# Use SentenceTransformer for embedding generation
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')  # Pre-trained embedding model

# Combine key fields to create a textual representation of each listing
df['text_representation'] = (
    "Neighborhood: " + df['neighborhood'] + "\n" +
    "Price: $" + df['price'].astype(str) + "\n" +
    "Bedrooms: " + df['bedrooms'].astype(str) + "\n" +
    "Bathrooms: " + df['bathrooms'].astype(str) + "\n" +
    "House Size: " + df['house_size'] + "\n" +
    "Description: " + df['description'] + "\n" +
    "Neighborhood Description: " + df['neighborhood_description']
)

# Generate embeddings for each listing
embeddings = model.encode(df['text_representation'].tolist(), convert_to_numpy=True)
normalized_embeddings = normalize_embeddings(embeddings)

# Step 5: Store the listings in ChromaDB
for idx, row in df.iterrows():
    collection.add(
        ids=[f"listing_{idx}"],  # Unique ID for each listing
        documents=[row['text_representation']],  # The full text of the listing
        metadatas=[row.to_dict()],  # Store the entire row as metadata
        embeddings=[normalized_embeddings[idx]]  # The generated embedding
    )

print(f"Successfully stored {len(df)} listings in ChromaDB!")

Loaded Listings:
          neighborhood    price  bedrooms  bathrooms house_size  \
0        Sunset Hills   650000         4          3  2800 sqft   
1  Oceanfront Estates  1500000         5          4  4000 sqft   
2         Maple Grove   400000         3          2  1800 sqft   
3    Lakeview Terrace   800000         4          3  2500 sqft   
4    Hillside Heights   550000         3          2  2000 sqft   

                                         description  \
0  Welcome to this charming colonial-style home i...   
1  Live the ultimate beachfront lifestyle in this...   
2  Nestled in the tranquil Maple Grove neighborho...   
3  This modern ranch-style home in Lakeview Terra...   
4  Perched on a hilltop in Hillside Heights, this...   

                            neighborhood_description  
0  Sunset Hills is known for its tree-lined stree...  
1  Oceanfront Estates is a prestigious waterfront...  
2  Maple Grove is a quiet and friendly community ...  
3  Lakeview Terrace is a sce

Here is an **example** of how to query choma-db with metadata filtering.

In [3]:
where_filters = {
    "$and": [
        {"bedrooms": {"$gte": 3}},    # At least 3 bedrooms
        {"bathrooms": {"$gte": 2}},  # At least 2 bathrooms
        {"price": {"$lte": 500000}}  # Price less than or equal to $500,000
    ]
}

# Step 6: Query ChromaDB (example query)
query_embedding = model.encode(["Affordable 3-bedroom house with a spacious backyard"], convert_to_numpy=True)
results = collection.query(
    query_embeddings=normalize_embeddings(query_embedding),
    n_results=3,  # Number of closest matches to return
    where=where_filters
)

print("Query Results:")
for doc, score in zip(results["documents"], results["distances"]):
    print(f"Score: {score}\nDocument:\n{doc}\n")

Query Results:
Score: [0.7958226009909201]
Document:
['Neighborhood: Maple Grove\nPrice: $400000\nBedrooms: 3\nBathrooms: 2\nHouse Size: 1800 sqft\nDescription: Nestled in the tranquil Maple Grove neighborhood, this charming bungalow features a wrap-around porch, a bright and airy living room, and a renovated kitchen with stainless steel appliances. The backyard oasis includes a deck for outdoor dining and a lush garden for relaxation.\nNeighborhood Description: Maple Grove is a quiet and friendly community with tree-lined streets, local parks, and close proximity to schools and shopping centers. Residents enjoy a sense of community spirit and a peaceful atmosphere in this sought-after neighborhood.']



## Step 4: Building the User Preference Interface

Collect buyer preferences, such as the number of bedrooms, bathrooms, location, and other specific requirements from a set of questions or telling the buyer to enter their preferences in natural language. You can hard-code the buyer preferences in questions and answers, or collect them interactively however you'd like, example:


**questions** = [   
                "How big do you want your house to be?" 
                "What are 3 most important things for you in choosing this property?", 
                "Which amenities would you like?", 
                "Which transportation options are important to you?",
                "How urban do you want your neighborhood to be?",   
            ]


**answers** = [
    "A comfortable three-bedroom house with a spacious kitchen and a cozy living room.",
    "A quiet neighborhood, good local schools, and convenient shopping options.",
    "A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.",
    "Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
    "A balance between suburban tranquility and access to urban amenities like restaurants and theaters."


**Buyer Preference Parsing:** Implement logic to interpret and structure these preferences for querying the vector database.

In [10]:
# Step 1: Load the previously saved ChromaDB
persist_directory = "./chroma_db"  # Directory where the database is stored
client = chromadb.PersistentClient(path=persist_directory)

# Load the existing collection
collection_name = "real_estate_listings"
collection = client.get_or_create_collection(
    collection_name, metadata={"distance_function": "cosine"})

print(f"Loaded collection '{collection_name}' from {persist_directory}.")

# Step 2: Load the embedding model for querying
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Step 3: Collect buyer preferences
questions = [
    "How big do you want your house to be?",
    "What are 3 most important things for you in choosing this property?",
    "Which amenities would you like?",
    "Which transportation options are important to you?",
    "How urban do you want your neighborhood to be?",
    "How expensive are you willing to go?"
]

# You can use input() for real interaction, or use hardcoded answers
answers = [
    "A comfortable three-bedroom house with a spacious kitchen and a cozy living room.",
    "A quiet neighborhood, good local schools, and convenient shopping options.",
    "A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.",
    "Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
    "A balance between suburban tranquility and access to urban amenities like restaurants and theaters.",
    "I'm looking for a home which costs less than $650,000."
]

# Combine answers into a single preference statement
preference_statement = " ".join(answers)

print("\nCollected Buyer Preferences:")
for question, answer in zip(questions, answers):
    print(f"{question}\n{answer}\n")

Loaded collection 'real_estate_listings' from ./chroma_db.

Collected Buyer Preferences:
How big do you want your house to be?
A comfortable three-bedroom house with a spacious kitchen and a cozy living room.

What are 3 most important things for you in choosing this property?
A quiet neighborhood, good local schools, and convenient shopping options.

Which amenities would you like?
A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.

Which transportation options are important to you?
Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.

How urban do you want your neighborhood to be?
A balance between suburban tranquility and access to urban amenities like restaurants and theaters.

How expensive are you willing to go?
I'm looking for a home which costs less than $650,000.



## Step 5: Searching Based on Preferences

- Semantic Search Implementation: Use the structured buyer preferences to perform a semantic search on the vector database, retrieving listings that most closely match the user's requirements.

- Listing Retrieval Logic: Fine-tune the retrieval algorithm to ensure that the most relevant listings are selected based on the semantic closeness to the buyer’s preferences.

In [None]:
# Step 1: Generate an embedding for the buyer's preferences
preference_embedding = model.encode([preference_statement], convert_to_numpy=True)

# Step 2: Define filters based on the buyer's preferences. We use an LLM for this, so the user can input their preferences in clear text.

# Initialize the LLM
llm = ChatOpenAI(temperature=0.0, model_name="gpt-3.5-turbo")

# Create a prompt for parsing (don't forget to escape the curly braces)
prompt_template = PromptTemplate(
    input_variables=["user_input"],
    template="""
You are tasked with extracting structured filters for querying a vector database. The filters must include only the following elements:

1. **Bedrooms**: Minimum number of bedrooms, formatted as {{"bedrooms": {{"$gte": <number>}}}}.
2. **Bathrooms**: Minimum number of bathrooms, formatted as {{"bathrooms": {{"$gte": <number>}}}}.
3. **Price**: Maximum price, formatted as {{"price": {{"$lte": <number>}}}}.

Rules:
- If only one filter (e.g., bedrooms) is mentioned, return it as a single dictionary.
- If multiple filters are mentioned, combine them using the `$and` operator, for example:
  {{
    "$and": [
        {{"bedrooms": {{"$gte": 3}}}},
        {{"bathrooms": {{"$gte": 2}}}},
        {{"price": {{"$lte": 500000}}}}
    ]
  }}
- If a field is not mentioned, exclude it from the output.
- Only return filters in the format specified above, without extra text.

Input Example:
"I need a house with at least 3 bedrooms and 2 bathrooms, and my budget is $500,000."

Expected Output:
{{
    "$and": [
        {{"bedrooms": {{"$gte": 3}}}},
        {{"bathrooms": {{"$gte": 2}}}},
        {{"price": {{"$lte": 500000}}}}
    ]
}}

Input: {user_input}

Output:
Provide the filters as valid JSON, formatted according to the above rules. Ensure the JSON is valid and do not include any extra text.
"""
)

# Set up the chain
chain = LLMChain(llm=llm, prompt=prompt_template)

# Example user input
user_input = preference_statement

# Run the chain
response = chain.run({"user_input": user_input})

# Parse the LLM output
filters = json.loads(response)
print(filters)

where_filters = filters

# Step 3: Query ChromaDB using the buyer's preferences
results = collection.query(
    query_embeddings=normalize_embeddings(preference_embedding),
    n_results=3,  # Return the top 3 matches,
    where=where_filters
)

print("\nTop 3 Matching Listings:")
for i, (doc, score) in enumerate(zip(results["documents"][0], results["distances"][0])):
    print(f"Match {i+1}:")
    print(f"Similarity Score: {score}")
    print(f"Listing:\n{doc}\n")

{'$and': [{'bedrooms': {'$gte': 3}}, {'price': {'$lte': 650000}}]}

Top 3 Matching Listings:
Match 1:
Similarity Score: 0.7345886350609102
Listing:
Neighborhood: Hillside Heights
Price: $550000
Bedrooms: 3
Bathrooms: 2
House Size: 2000 sqft
Description: Perched on a hilltop in Hillside Heights, this mid-century modern home offers panoramic views of the city skyline. The house features a sunlit living room with vaulted ceilings, a gourmet kitchen with quartz countertops, and a private deck for al fresco dining. The master suite boasts a walk-in closet and a spa-like bathroom with a soaking tub.
Neighborhood Description: Hillside Heights is a sought-after neighborhood with winding streets, mature trees, and upscale homes. Residents enjoy easy access to hiking trails, parks, and trendy cafes in this vibrant and desirable community.

Match 2:
Similarity Score: 0.7525908373073091
Listing:
Neighborhood: Garden Grove
Price: $600000
Bedrooms: 3
Bathrooms: 2
House Size: 2200 sqft
Description: E

## Step 6: Personalizing Listing Descriptions

- LLM Augmentation: For each retrieved listing, use the LLM to augment the description, tailoring it to resonate with the buyer’s specific preferences. This involves subtly emphasizing aspects of the property that align with what the buyer is looking for.

- Maintaining Factual Integrity: Ensure that the augmentation process enhances the appeal of the listing without altering factual information.

In [19]:
# Initialize the LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)

# Create a prompt template for personalizing descriptions
prompt_template = PromptTemplate(
    input_variables=["buyer_preferences", "listing_description", "listing_metadata"],
    template="""
You are an expert real estate agent. You have factual information about a property in the form of an original listing description and associated metadata. 
Your task: rewrite the description to better resonate with the buyer's specific preferences. 

Important Instructions:
- Do NOT add or invent any information that is not already present in the original listing description or metadata.
- Only emphasize and rearrange details that are already given.
- If a certain preference of the buyer does not appear in the listing's information, simply do not mention it.
- Keep all numerical facts (like price, bedrooms, bathrooms, and house size) accurate.
- The listing's metadata and original description represent the ground truth. Do not contradict or alter factual details.

Here are the buyer's preferences:
{buyer_preferences}

Here is the original listing description:
{listing_description}

Here is the factual metadata associated with the listing:
{listing_metadata}

Based on the buyer's preferences and the provided factual information, rewrite the description to emphasize the aspects that align with what they are looking for, without introducing any new, unverified details. The final output should be engaging, accurate, and reflect only the information given.
"""
)

# Step 1: Personalize each listing
def personalize_descriptions(buyer_preferences, listings, llm, prompt_template):
    """
    Personalize listing descriptions using an LLM.
    
    Args:
        buyer_preferences (str): Buyer's preferences.
        listings (list of dict): Retrieved listings with original descriptions and metadata.
        llm (ChatOpenAI): The initialized LLM instance.
        prompt_template (PromptTemplate): The prompt template for description augmentation.
        
    Returns:
        list of dict: Listings with personalized descriptions added.
    """

    personalized_listings = []
    chain = LLMChain(llm=llm, prompt=prompt_template)

    for listing in listings:
        # Original listing description
        original_description = listing["documents"][0]

        # Remove the 'text_representation' field from the metadata
        metadata = listing["metadata"].copy()  # Create a copy to avoid modifying the original
        metadata.pop("text_representation", None)  # Remove the field; 'None' ensures no error if the field doesn't exist

        # Convert metadata to a JSON string for clarity
        listing_metadata_json = json.dumps(metadata, indent=2)

        # Generate the prompt text using the template
        filled_prompt = prompt_template.format(
            buyer_preferences=buyer_preferences,
            listing_description=original_description,
            listing_metadata=listing_metadata_json
        )

        # Print the filled-out prompt
        print("\n=== Filled Prompt ===")
        print(filled_prompt)
        print("=====================\n")
        
        # Use the LLM to generate a personalized description
        personalized_description = chain.run(
            buyer_preferences=buyer_preferences,
            listing_description=original_description,
            listing_metadata=listing_metadata_json
        )
        
        # Add the personalized description to the listing metadata
        listing_with_personalization = listing.copy()
        listing_with_personalization["personalized_description"] = personalized_description
        personalized_listings.append(listing_with_personalization)

    return personalized_listings

retrieved_listings = []
for i in range(len(results["documents"][0])):
    retrieved_listings.append({
        "documents": [results["documents"][0][i]],  # The matched listing's description
        "distances": [results["distances"][0][i]],  # The similarity score
        "metadata": results["metadatas"][0][i]     # Associated metadata (e.g., price, neighborhood)
    })

# print("\nTransformed Retrieved Listings:")
# for listing in retrieved_listings:
#     print(listing)

# Step 2: Generate personalized descriptions
personalized_listings = personalize_descriptions(
    preference_statement, retrieved_listings, llm, prompt_template
)

# Step 3: Display personalized listings
for listing in personalized_listings:
    print("Original Description:\n", listing["documents"][0])
    print("\nPersonalized Description:\n", listing["personalized_description"])
    print("-" * 80)



=== Filled Prompt ===

You are an expert real estate agent. You have factual information about a property in the form of an original listing description and associated metadata. 
Your task: rewrite the description to better resonate with the buyer's specific preferences. 

Important Instructions:
- Do NOT add or invent any information that is not already present in the original listing description or metadata.
- Only emphasize and rearrange details that are already given.
- If a certain preference of the buyer does not appear in the listing's information, simply do not mention it.
- Keep all numerical facts (like price, bedrooms, bathrooms, and house size) accurate.
- The listing's metadata and original description represent the ground truth. Do not contradict or alter factual details.

Here are the buyer's preferences:
A comfortable three-bedroom house with a spacious kitchen and a cozy living room. A quiet neighborhood, good local schools, and convenient shopping options. A backyard