This is a starter notebook for the project, you'll have to import the libraries you'll need, you can find a list of the ones available in this workspace in the requirements.txt file in this workspace. 

In [5]:
import os
import openai
import pandas as pd
import chromadb
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Set API Key for OpenAI
openai.api_key = "API KEY HERE"
openai.api_base = "https://openai.vocareum.com/v1"

In [7]:
# Prompt for generating listings
listing_prompt = """
Generate 10 realistic real estate listings in the following format:
Neighborhood: [Neighborhood Name]
Price: [Price]
Bedrooms: [Number of Bedrooms]
Bathrooms: [Number of Bathrooms]
House Size: [Size in sqft]
Description: [Brief description of the property]
"""

# Call OpenAI to generate listings
response = openai.ChatCompletion.create(
    model="gpt-4",  # Use GPT-4 or GPT-3.5
    messages=[{"role": "user", "content": listing_prompt}]
)

# Extract generated text
generated_text = response["choices"][0]["message"]["content"]

# Split text into listings (assuming listings are separated by double newlines)
listings = generated_text.strip().split("\n\n")

# Process listings into a structured DataFrame
data = []
for listing in listings:
    lines = listing.split("\n")
    entry = {}
    for line in lines:
        if ": " in line:  # Ensure line contains a key-value pair
            key, value = line.split(": ", 1)
            # Strip extra spaces
            entry[key.strip()] = value.strip()
    
    # Only add non-empty entries
    if entry:
        data.append(entry)

# Convert to DataFrame
listings_df = pd.DataFrame(data)

# Clean up DataFrame by removing any extra or invalid columns
listings_df = listings_df.loc[:, ~listings_df.columns.str.contains('^Unnamed')]

# Save to CSV (optional)
listings_df.to_csv("listings.csv", index=False, encoding="utf-8")

# Display first few rows
print(listings_df.head())

         Neighborhood       Price Bedrooms Bathrooms  House Size  \
0    Uptown Manhattan  $1,250,000        3         2  1,500 sqft   
1  Palm Beach Gardens    $850,000        4       3.5  3,000 sqft   
2       Beverly Hills  $8,500,000        5         6  6,900 sqft   
3        Lincoln Park    $550,000        2         2  1,300 sqft   
4    Westlake, Austin  $1,100,000        4       3.5  2,800 sqft   

                                         Description  
0  Luxury apartment with beautiful city views, fe...  
1  Charming single-family home with a large backy...  
2  Elegant mansion with a guest house, luxurious ...  
3  Warm and welcoming condo featuring a modern de...  
4  Beautiful traditional brick home with a spacio...  


In [8]:
# Initialize Chroma client
client = chromadb.Client()

# Define collection name
collection_name = "realestate_listings"

# Delete collection if it already exists
try:
    client.delete_collection(collection_name)
    print(f"Collection '{collection_name}' deleted.")
except Exception:
    print(f"Collection '{collection_name}' does not exist, creating a new one.")

# Create a new collection
collection = client.create_collection(name=collection_name)
print(f"Collection '{collection_name}' created successfully.")


Collection 'realestate_listings' does not exist, creating a new one.
Collection 'realestate_listings' created successfully.


In [9]:
# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings()

# Function to generate embeddings (this uses the correct method in langchain)
def generate_embeddings(text):
    # Generate embedding using langchain's OpenAIEmbeddings
    return embeddings.embed_query(text)  # Correct method to generate embeddings

# Generate embeddings for each listing description
listings_df['embedding'] = listings_df['Description'].apply(generate_embeddings)

# Check the DataFrame with embeddings
listings_df.head()

Unnamed: 0,Neighborhood,Price,Bedrooms,Bathrooms,House Size,Description,embedding
0,Uptown Manhattan,"$1,250,000",3,2.0,"1,500 sqft","Luxury apartment with beautiful city views, fe...","[0.009076486664536466, 0.004290182761727325, 0..."
1,Palm Beach Gardens,"$850,000",4,3.5,"3,000 sqft",Charming single-family home with a large backy...,"[-0.0023595419591868325, 0.030821117129888328,..."
2,Beverly Hills,"$8,500,000",5,6.0,"6,900 sqft","Elegant mansion with a guest house, luxurious ...","[-0.009865948096208596, 0.013545541226134229, ..."
3,Lincoln Park,"$550,000",2,2.0,"1,300 sqft",Warm and welcoming condo featuring a modern de...,"[0.0042718300056201084, 0.011894114424555798, ..."
4,"Westlake, Austin","$1,100,000",4,3.5,"2,800 sqft",Beautiful traditional brick home with a spacio...,"[-0.019723996592158545, 0.02182339678779089, -..."


In [10]:
# Add data to ChromaDB (documents, embeddings, and metadata)
documents = listings_df['Description'].tolist()
embeddings_list = listings_df['embedding'].tolist()
metadatas = [{'description': desc} for desc in documents]  # Metadata as a simple dictionary

# Create a unique ID for each document, using the DataFrame index
ids = [str(i) for i in listings_df.index.tolist()]

# Add the documents, metadata, embeddings, and ids to the Chroma collection
collection.add(
    documents=documents,
    metadatas=metadatas,
    embeddings=embeddings_list,
    ids=ids  # Make sure to include the ids
)

In [11]:
# Confirm data has been added by checking the collection
print(collection.get())

{'ids': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], 'embeddings': None, 'metadatas': [{'description': 'Luxury apartment with beautiful city views, featuring a modern kitchen, spacious living area, rooftop access, and a fitness center in the building.'}, {'description': 'Charming single-family home with a large backyard, updated kitchen, main level master suite, swimming pool and a 2-car garage, minutes away from the beach.'}, {'description': 'Elegant mansion with a guest house, luxurious finishes, a swimming pool, home theater, wine cellar, and a 3-car garage, located in a prestigious gated community.'}, {'description': 'Warm and welcoming condo featuring a modern design, upgraded kitchen with stainless steel appliances, hardwood floors, and just walking distance to shopping and dining.'}, {'description': 'Beautiful traditional brick home with a spacious yard, open floor plan, chic fireplace, and an expansive deck with panoramic views.'}, {'description': 'Charming townhome in t

In [12]:
# Define the user preferences and perform the search
questions = [
    "How big do you want your house to be?", 
    "What are 3 most important things for you in choosing this property?", 
    "Which amenities would you like?", 
    "Which transportation options are important to you?",
    "How urban do you want your neighborhood to be?"
]

answers = [
    "A comfortable three-bedroom house with a spacious kitchen and a cozy living room.",
    "A quiet neighborhood, good local schools, and convenient shopping options.",
    "A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.",
    "Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
    "A balance between suburban tranquility and access to urban amenities like restaurants and theaters."
]

# Combine the answers into a single string for semantic search
preferences = " ".join(answers)

In [13]:
# Perform the semantic search based on the preferences

# Combine the answers into a single string for semantic search
preferences = " ".join(answers)

# Convert the preferences into an embedding using the embed_query method
preference_embedding = embeddings.embed_query(preferences)

# Perform the search with the embedding for the user's preferences
results = collection.query(
    query_embeddings=[preference_embedding],
    n_results=3  # Number of results to return
)

# Display the results
print("Top matching listings based on user preferences:")
for result_list in results['documents']:
    for result in result_list:
        print(f"Description: {result}\n")



Top matching listings based on user preferences:
Description: Charming townhome in the heart of the city with gourmet kitchen, fenced backyard, and off-street parking, close to public transportation and nightlife.

Description: Charming single-family home with a large backyard, updated kitchen, main level master suite, swimming pool and a 2-car garage, minutes away from the beach.

Description: A gorgeous row house featuring a gourmet kitchen, hardwood floors, private patio, and off-street parking located in a renowned historic neighborhood.



In [14]:
# Function to augment the listing description using LLM
def augment_listing_description(description, preferences):
    # Shorten the prompt to only ask for key details
    prompt = f"""
    Enhance the description for a buyer with the following preferences:

    Preferences: {preferences}
    
    Listing: {description}
    
    Make it engaging but short, simple and concise, focusing on key features that appeal most.
    """
    
    # Call OpenAI API to augment the description
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    
    # Extract and return a short version of the augmented description
    augmented_description = response['choices'][0]['message']['content']
    return augmented_description

# Loop through the results and augment descriptions
for result in results['documents']:
    for description in result:  # Loop through the list of descriptions
        augmented_description = augment_listing_description(description, preferences)
        print(f"Augmented Description: {augmented_description[:100]}...\n")


Augmented Description: "Discover urban suburban bliss in this modern, three-bedroom townhome nestled in a quiet, family-fri...

Augmented Description: Ideal for those seeking suburban tranquility, this charming three-bedroom home perfectly balances co...

Augmented Description: Listing: Discover your dream home nestled in a serene historic neighborhood! This beautiful three-be...



#### Other Test #1

In [15]:
# Other Test #1
questions = [
    "How big do you want your house to be?", 
    "What are 3 most important things for you in choosing this property?", 
    "Which amenities would you like?", 
    "Which transportation options are important to you?",
    "How urban do you want your neighborhood to be?"
]

answers = [
    "I need a 3-bedroom house, preferably near a beach with great ocean views.",
    "Safety, proximity to good schools, and a quiet neighborhood for families.",
    "An energy-efficient home with solar panels, a large backyard for gardening, and a two-car garage.",
    "Easy access to public transport, especially a metro station or bus routes.",
    "I want a neighborhood with lots of green spaces but also close to cafes and restaurants."
]

preferences = " ".join(answers)


In [16]:
# Perform the semantic search based on the preferences

# Combine the answers into a single string for semantic search
preferences = " ".join(answers)

# Convert the preferences into an embedding using the embed_query method
preference_embedding = embeddings.embed_query(preferences)

# Perform the search with the embedding for the user's preferences
results = collection.query(
    query_embeddings=[preference_embedding],
    n_results=3  # Number of results to return
)

# Display the results
print("Top matching listings based on user preferences:")
for result_list in results['documents']:
    for result in result_list:
        print(f"Description: {result}\n")

Top matching listings based on user preferences:
Description: Charming single-family home with a large backyard, updated kitchen, main level master suite, swimming pool and a 2-car garage, minutes away from the beach.

Description: Charming townhome in the heart of the city with gourmet kitchen, fenced backyard, and off-street parking, close to public transportation and nightlife.

Description: Elegant mansion with a guest house, luxurious finishes, a swimming pool, home theater, wine cellar, and a 3-car garage, located in a prestigious gated community.



In [17]:
# Augment descriptions for each listing and show shorter versions
for listing in listings_df["Description"]:  # Directly iterate over descriptions
    augmented_description = augment_listing_description(listing, preferences)
    print(f"Augmented Description: {augmented_description[:100]}...\n")

Augmented Description: This luxurious 3-bedroom beachfront house cuts a perfect balance of comfort and convenience for your...

Augmented Description: Listing: Delight in this charming, energy-efficient, 3-bedroom family home with solar panels. Nestle...

Augmented Description: Listing: Discover your dream home! A 3-bedroom beachfront charmer nestled in a safe, quiet family ne...

Augmented Description: Envision your dream home with this stunning 3-bedroom condo, nestling close to a tranquil and pictur...

Augmented Description: Listing: Embrace coastal living in a safe, family-friendly neighborhood with this stunning 3-bedroom...

Augmented Description: Listing: Discover your dream 3-bedroom house, beautifully located near an awe-inspiring beach perfec...

Augmented Description: Immerse yourself in this breathtakingly beautiful 3-bedroom beach house! Tailored for your desires, ...

Augmented Description: Listing: A charming 3-bedroom energy-efficient house, oozing with family-friendly

#### Other Test #2

In [18]:
# Other Test #2
questions = [
    "How big do you want your house to be?", 
    "What are 3 most important things for you in choosing this property?", 
    "Which amenities would you like?", 
    "Which transportation options are important to you?",
    "How urban do you want your neighborhood to be?"
]

answers = [
    "I need a 5-bedroom mansion with a home theater, gym, and a bowling alley.",
    "The most important things are extreme privacy, no neighbors within a mile, and proximity to nature.",
    "A personal spa, a wine cellar, and a private boat dock.",
    "Helicopter access, a private road, and a remote location with no nearby public transport.",
    "A completely isolated, ultra-rural location with no urban influence at all."
]

preferences = " ".join(answers)


In [19]:
# Perform the semantic search based on the preferences

# Combine the answers into a single string for semantic search
preferences = " ".join(answers)

# Convert the preferences into an embedding using the embed_query method
preference_embedding = embeddings.embed_query(preferences)

# Perform the search with the embedding for the user's preferences
results = collection.query(
    query_embeddings=[preference_embedding],
    n_results=3  # Number of results to return
)

# Display the results
print("Top matching listings based on user preferences:")
for result_list in results['documents']:
    for result in result_list:
        print(f"Description: {result}\n")

Top matching listings based on user preferences:
Description: Elegant mansion with a guest house, luxurious finishes, a swimming pool, home theater, wine cellar, and a 3-car garage, located in a prestigious gated community.

Description: A gorgeous row house featuring a gourmet kitchen, hardwood floors, private patio, and off-street parking located in a renowned historic neighborhood.

Description: Charming single-family home with a large backyard, updated kitchen, main level master suite, swimming pool and a 2-car garage, minutes away from the beach.



In [20]:
# Augment descriptions for each listing and show shorter versions
for listing in listings_df["Description"]:  # Directly iterate over descriptions
    augmented_description = augment_listing_description(listing, preferences)
    print(f"Augmented Description: {augmented_description[:100]}...\n")

Augmented Description: Listing: A pristine 5-bedroom countryside mansion, hidden away in a private, ultra-rural location. T...

Augmented Description: Remarkable secluded haven nestled amidst nature intended for the utmost privacy and tranquility. Thi...

Augmented Description: Experience absolute tranquility in this opulent 5-bedroom mansion, nestled in an ultra-rural, remote...

Augmented Description: Listing: Experience unparalleled luxury and privacy in this 5-bedroom mansion, nestled in an ultra-r...

Augmented Description: Listing: Step into ultimate luxury with this ultra-private, 5-bedroom brick mansion nestled in the h...

Augmented Description: Listing: Discover your personal paradise in this expansive 5-bedroom ultra-luxurious mansion, nestle...

Augmented Description: Discover the epitome of opulent and private living with this 5-bedroom mansion, nestled amidst lush ...

Augmented Description: Listing: Welcome to your bespoke, ultra-private 5-bedroom mansion nestled amidst 