This is a starter notebook for the project. You'll have to import the libraries you need; the ones available in this workspace are listed in its requirements.txt file.

# Environment Setup

In [23]:
# Best-effort dependency bootstrap: try to import each required module and
# pip-install it into the running kernel's interpreter if the import fails.
try:
    import importlib
    import subprocess
    import sys

    # Import name -> pip distribution name. The two differ for python-dotenv
    # (imported as `dotenv` by the setup cell below), so the install name cannot
    # simply be derived from the import name.
    _REQUIRED_PACKAGES = {
        "langchain": "langchain",
        "langchain_openai": "langchain_openai",
        "langchain_chroma": "langchain_chroma",
        "openai": "openai",
        "pydantic": "pydantic",
        "chromadb": "chromadb",
        "jupyter": "jupyter",
        "dotenv": "python-dotenv",
    }

    def _pip(pkg):
        """Install `pkg` with pip, using the interpreter that runs this kernel."""
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])

    for _module_name, _dist_name in _REQUIRED_PACKAGES.items():
        try:
            importlib.import_module(_module_name)
        except Exception:
            # Any import failure (missing or broken install) triggers an install attempt.
            _pip(_dist_name)
except Exception as e:
    # Deliberately non-fatal: a failed install is reported and the notebook
    # continues, so the import cell surfaces the real error if one remains.
    print("Non-fatal install warning:", e)

In [24]:
import getpass
import json
import os
import warnings
from typing import List, Dict, Any
warnings.filterwarnings('ignore')

# LangChain v0.3.x
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Pydantic Stuff
from pydantic import BaseModel, Field

# Set up ENV variables from .env file
from dotenv import load_dotenv
load_dotenv()

# Ask for the key only when it is missing or blank. getpass keeps the secret
# out of the rendered (and saved) cell output, which input() would echo.
if not os.environ.get("OPENAI_API_KEY"):
    print("Please set your OPENAI_API_KEY environment variable")
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

# Define Data Models
- Here I define two Pydantic models: one for the listing itself and another for the buyer preferences.

In [25]:
class RealEstateListing(BaseModel):
    """Schema for a real estate listing"""
    # The field names mirror the JSON keys that generate_real_estate_listings
    # asks the LLM to return, and the keys create_vector_database reads.
    # NOTE(review): no code in the reviewed part of the notebook instantiates
    # this model; listings are passed around as plain dicts.

    # Identity and location
    id: int = Field(description="Unique identifier for the listing")
    neighborhood: str = Field(description="Name of the neighborhood")
    # Numeric facts; bathrooms is a float so half baths (e.g. 2.5) fit
    price: int = Field(description="Price of the property in USD")
    bedrooms: int = Field(description="Number of bedrooms")
    bathrooms: float = Field(description="Number of bathrooms")
    house_size: int = Field(description="Size of the house in square feet")
    # Free-text fields
    description: str = Field(description="Detailed description of the property")
    neighborhood_description: str = Field(description="Description of the neighborhood")


class BuyerPreferences(BaseModel):
    """Schema for buyer preferences"""
    # The field names match the keys of the dict returned by
    # collect_buyer_preferences and the placeholders used in the prompts.
    # NOTE(review): collect_buyer_preferences stores each pre-defined answer as
    # one string, while three fields below are typed List[str] - confirm the
    # intended shape before validating real data against this model.
    size_preference: str = Field(description="Preferred size of the house")
    important_features: List[str] = Field(description="Most important features")
    amenities: List[str] = Field(description="Desired amenities")
    transportation: List[str] = Field(description="Important transportation options")
    neighborhood_type: str = Field(description="Preferred neighborhood type")

# Initialize LLM and Embeddings

In [26]:
# Model settings gathered in one place so they are easy to find and tune
CHAT_MODEL_NAME = "gpt-4"
CHAT_TEMPERATURE = 0.7
CHAT_MAX_TOKENS = 1000
EMBEDDING_MODEL_NAME = "text-embedding-3-small"

# Chat model shared by every prompt chain defined in this notebook
llm = ChatOpenAI(
    model_name=CHAT_MODEL_NAME,
    temperature=CHAT_TEMPERATURE,
    max_tokens=CHAT_MAX_TOKENS,
)

# Embedding model handed to the vector store when listings are indexed
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

print("✅ LLM and embeddings initialized successfully")

✅ LLM and embeddings initialized successfully


# Synthetic Data Generation
- RUBRIC: The submission must demonstrate using a Large Language Model (LLM) to generate at least 10 diverse and realistic real estate listings containing facts about the real estate.



In [27]:
# Keys every listing must carry; create_vector_database reads all of them.
_REQUIRED_LISTING_FIELDS = (
    "id",
    "neighborhood",
    "price",
    "bedrooms",
    "bathrooms",
    "house_size",
    "description",
    "neighborhood_description",
)

# Fields that are later formatted as numbers (e.g. `${price:,}`), so a string
# value here would only fail much further down the pipeline.
_NUMERIC_LISTING_FIELDS = ("price", "bedrooms", "bathrooms", "house_size")


def _is_valid_listing(candidate) -> bool:
    """Return True when `candidate` is a dict with every required key and numeric facts."""
    if not isinstance(candidate, dict):
        return False
    if any(field not in candidate for field in _REQUIRED_LISTING_FIELDS):
        return False
    return all(
        isinstance(candidate[field], (int, float)) and not isinstance(candidate[field], bool)
        for field in _NUMERIC_LISTING_FIELDS
    )


def _fallback_listing(index: int, style: str) -> Dict:
    """Build a deterministic placeholder listing for zero-based position `index`."""
    return {
        "id": index + 1,
        "neighborhood": style.title(),
        "price": 500000 + (index * 100000),
        "bedrooms": 3 + (index % 3),
        "bathrooms": 2 + (index % 2) * 0.5,
        "house_size": 2000 + (index * 200),
        "description": f"Beautiful home in {style} with modern amenities and spacious layout. Features include hardwood floors, updated kitchen, and energy-efficient appliances.",
        "neighborhood_description": f"A vibrant {style} with excellent schools, convenient shopping, and community parks. Perfect for families and professionals alike."
    }


def generate_real_estate_listings(num_listings: int = 10) -> List[Dict]:
    """Generate synthetic real estate listings using LLM.

    One LLM call is made per listing. Each call is independent (no history is
    sent), so a neighborhood style is rotated into the prompt to give the model
    a concrete source of variety.

    Args:
        num_listings: How many listings to produce. Zero or a negative number
            yields an empty list.

    Returns:
        A list of `num_listings` dicts, each holding every key in
        `_REQUIRED_LISTING_FIELDS`. The `id` of the listing at position i is
        always i + 1, regardless of what the model returned.

    Notes:
        A failed call or a malformed/incomplete response never raises: it is
        reported on stdout and replaced by a deterministic fallback listing.
    """

    listing_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a real estate data generator. Generate diverse and realistic property listings."),
        ("user", """
        Generate a unique and realistic real estate listing with the following requirements:
        - Listing number: {listing_number}
        - Neighborhood style for this listing: {neighborhood_style}
        - Make it diverse from previous listings
        - Include varied price ranges ($300,000 to $2,000,000)
        - Include different neighborhood types (urban, suburban, rural)
        - Vary the number of bedrooms (2-5) and bathrooms (1-4)
        - House sizes should range from 1,200 to 4,500 sqft
        
        Provide the listing in the following JSON format:
        {{
            "id": {listing_number},
            "neighborhood": "neighborhood name",
            "price": price_in_dollars,
            "bedrooms": number_of_bedrooms,
            "bathrooms": number_of_bathrooms,
            "house_size": size_in_sqft,
            "description": "detailed property description focusing on unique features",
            "neighborhood_description": "description of the neighborhood and local amenities"
        }}
        
        Make each listing unique with different architectural styles, features, and neighborhood characteristics.
        Return ONLY the JSON object, no additional text.
        """)
    ])

    # Prompt -> chat model -> JSON parser, composed with LCEL
    listing_chain = listing_prompt | llm | JsonOutputParser()

    # Rotated through the prompt (and the fallback) so consecutive listings differ
    neighborhood_styles = [
        "eco-friendly community", "historic district", "modern downtown",
        "family-oriented suburb", "artistic quarter", "waterfront area",
        "mountain view region", "tech hub vicinity", "quiet countryside",
        "trendy urban district", "golf course community", "university area"
    ]

    listings = []
    for i in range(num_listings):
        listing_number = i + 1
        style = neighborhood_styles[i % len(neighborhood_styles)]
        try:
            listing_data = listing_chain.invoke({
                "listing_number": listing_number,
                "neighborhood_style": style,
            })

            # Reject anything the downstream steps could not consume
            if not _is_valid_listing(listing_data):
                raise ValueError("Invalid listing format")

            # Do not trust the model's id: do_run looks listings up by it,
            # so it has to be unique and predictable.
            listing_data["id"] = listing_number
            listings.append(listing_data)
            print(f"✅ Generated listing {listing_number}/{num_listings}")

        except Exception as e:
            # Best effort by design: keep the run going with a placeholder
            print(f"⚠️ Error generating listing {listing_number}: {str(e)}")
            listings.append(_fallback_listing(i, style))

    return listings


# Semantic Search
- RUBRIC: The project must demonstrate the creation of a vector database and successfully storing real estate listing embeddings within it. The database should effectively store and organize the embeddings generated from the LLM-created listings.
- RUBRIC: The application must include a functionality where listings are semantically searched based on given buyer preferences. The search should return listings that closely match the input preferences.

In [28]:
def create_vector_database(listings: List[Dict]) -> Chroma:
    """Create ChromaDB vector database and store listing embeddings.

    Each listing becomes one Document: its text combines the facts and both
    descriptions (this is what gets embedded), and its metadata keeps the
    structured facts so they can be shown without re-parsing the text.

    Args:
        listings: Listing dicts containing the keys `id`, `neighborhood`,
            `price`, `bedrooms`, `bathrooms`, `house_size`, `description` and
            `neighborhood_description`.

    Returns:
        The Chroma vector store for collection "real_estate_listings",
        persisted under "./chroma_db".

    Raises:
        KeyError: If a listing is missing one of the keys above.
    """

    documents = []
    document_ids = []

    for position, listing in enumerate(listings):
        # Combine all relevant information for embedding
        content = f"""
        Neighborhood: {listing['neighborhood']}
        Price: ${listing['price']}
        Bedrooms: {listing['bedrooms']}
        Bathrooms: {listing['bathrooms']}
        House Size: {listing['house_size']} sqft
        
        Property Description: {listing['description']}
        
        Neighborhood Description: {listing['neighborhood_description']}
        """

        documents.append(
            Document(
                page_content=content,
                metadata={
                    "id": listing['id'],
                    "neighborhood": listing['neighborhood'],
                    "price": listing['price'],
                    "bedrooms": listing['bedrooms'],
                    "bathrooms": listing['bathrooms'],
                    "house_size": listing['house_size']
                }
            )
        )
        # Position-based ids are unique within one call even if two listings
        # share an `id`, and identical from run to run.
        document_ids.append(f"listing-{position}")

    # The collection is persisted on disk. Without explicit ids every re-run of
    # the notebook would append another copy of each listing; with stable ids a
    # re-run overwrites the previous entries instead.
    # NOTE(review): entries left by an earlier run with MORE listings are not
    # removed - delete ./chroma_db if the listing count shrinks.
    vectordb = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        ids=document_ids,
        collection_name="real_estate_listings",
        persist_directory="./chroma_db"
    )

    print(f"✅ Vector database created with {len(documents)} listings")
    print("💾 Vector database persisted to './chroma_db'")
    return vectordb


In [29]:
# Collect and Parse buyer preferences
def collect_buyer_preferences(answers=None) -> Dict[str, Any]:
    """Collect buyer preferences through questions or pre-defined responses.

    Args:
        answers: Optional sequence of pre-defined answers, in question order.
            When it holds at least five entries the first five are used as-is.
            When it is None or shorter than that, all five questions are asked
            interactively via input() and `answers` is ignored.

    Returns:
        A dict with the keys `size_preference`, `important_features`,
        `amenities`, `transportation` and `neighborhood_type`, each mapped to
        the corresponding answer.
    """

    # Define questions
    questions = [
        "How big do you want your house to be?",
        "What are 3 most important things for you in choosing this property?",
        "Which amenities would you like?",
        "Which transportation options are important to you?",
        "How urban do you want your neighborhood to be?"
    ]

    # One key per question, in the same order
    preference_keys = [
        "size_preference",
        "important_features",
        "amenities",
        "transportation",
        "neighborhood_type"
    ]

    supplied = list(answers) if answers is not None else []

    if len(supplied) < len(questions):
        # Not enough canned answers: ask every question interactively
        responses = [input(question) for question in questions]
    else:
        responses = supplied[:len(questions)]

    preferences = dict(zip(preference_keys, responses))

    # Display collected preferences (the ones actually used, whichever path produced them)
    print("👤 Buyer Preferences Collected:")
    print("-" * 50)
    for i, (q, a) in enumerate(zip(questions, responses)):
        print(f"Q{i+1}: {q}")
        print(f"A{i+1}: {a}\n")

    return preferences


# Parse and Structure Buyer Preferences
def parse_buyer_preferences(preferences: Dict[str, Any]) -> str:
    """Turn the buyer's answers into a single free-text search query via the LLM.

    Args:
        preferences: Mapping that supplies the prompt placeholders
            `size_preference`, `important_features`, `amenities`,
            `transportation` and `neighborhood_type`.

    Returns:
        The text content of the model's reply, which is also printed.
    """
    system_message = "You are an expert at understanding buyer preferences and creating search queries."
    user_template = """
        Based on the following buyer preferences, create a comprehensive search query 
        that captures what they're looking for in a property:
        
        Size preference: {size_preference}
        Important features: {important_features}
        Desired amenities: {amenities}
        Transportation needs: {transportation}
        Neighborhood type: {neighborhood_type}
        
        Synthesize these preferences into a detailed description of their ideal property
        and neighborhood, focusing on the key aspects they value most.
        """

    # Prompt piped straight into the chat model (LCEL)
    query_chain = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("user", user_template),
    ]) | llm

    # The reply object carries its text on `.content`
    search_query = query_chain.invoke(preferences).content

    for line in ("🔍 Processed Search Query:", search_query, "-" * 50):
        print(line)

    return search_query


In [30]:
# Semantic Search of Listings Based on Buyer Preferences
def search_listings(vector_db: Chroma, query: str, k: int = 5) -> List[Document]:
    """Run a similarity search over the vector store and summarise the hits.

    Args:
        vector_db: Vector store to query.
        query: Free-text description of what the buyer is looking for.
        k: Number of results requested from the retriever.

    Returns:
        The documents returned by the retriever, in the order it produced them.
    """
    print(f"\n🔎 Searching for top {k} matching properties...")

    # Build a similarity retriever limited to k results and query it in one go
    matching_docs = vector_db.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    ).invoke(query)

    print(f"✅ Found {len(matching_docs)} matching properties")

    # One summary line per hit, taken from the structured metadata
    print("\n📊 Matching Properties Overview:")
    for rank, match in enumerate(matching_docs, start=1):
        info = match.metadata
        print(
            f"{rank}. {info['neighborhood']} - ${info['price']:,} | "
            f"{info['bedrooms']}BR/{info['bathrooms']}BA | {info['house_size']} sqft"
        )

    return matching_docs


# Augmented Response Generation
- RUBRIC: The project must demonstrate a logical flow where buyer preferences are used to search and then augment the description of real estate listings. The augmentation should personalize the listing without changing factual information.
- RUBRIC: The submission must utilize an LLM to generate personalized descriptions for the real estate listings based on buyer preferences. The descriptions should be unique, appealing, and tailored to the preferences provided.

In [31]:
# Augment a listing using an LLM
def personalize_listing_description(
    listing_content: str,
    listing_metadata: Dict,
    buyer_preferences: Dict,
) -> str:
    """Rewrite one listing's description so it speaks to a specific buyer.

    Args:
        listing_content: Full text of the listing as stored in the vector store.
        listing_metadata: Structured facts; `price`, `bedrooms`, `bathrooms`
            and `house_size` are read from it.
        buyer_preferences: Mapping that supplies the prompt placeholders
            `size_preference`, `important_features`, `amenities`,
            `transportation` and `neighborhood_type`. Its entries are merged in
            last, so they win over a fact of the same name.

    Returns:
        The text content of the model's reply.
    """
    system_message = "You are a skilled real estate agent creating personalized property descriptions."
    user_template = """
        You are creating a personalized property description for a specific buyer.
        
        Original Property Information:
        {listing_content}
        
        Property Details:
        - Price: ${price:,}
        - Bedrooms: {bedrooms}
        - Bathrooms: {bathrooms}
        - Size: {house_size} sqft
        
        Buyer's Preferences:
        - Size: {size_preference}
        - Important Features: {important_features}
        - Desired Amenities: {amenities}
        - Transportation: {transportation}
        - Neighborhood Type: {neighborhood_type}
        
        Task: Rewrite the property description to emphasize aspects that align with the buyer's 
        preferences. Highlight relevant features they mentioned, but DO NOT change any factual 
        information about the property. Make the description feel personally crafted for this buyer.
        
        Keep all facts accurate while making the narrative resonate with their specific needs.
        """

    # Prompt piped straight into the chat model (LCEL)
    rewrite_chain = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("user", user_template),
    ]) | llm

    # Listing text first, then the structured facts, then the buyer's answers
    prompt_values = {"listing_content": listing_content}
    for fact in ("price", "bedrooms", "bathrooms", "house_size"):
        prompt_values[fact] = listing_metadata[fact]
    prompt_values.update(buyer_preferences)

    return rewrite_chain.invoke(prompt_values).content



# Run Steps

In [32]:
def do_run(answers=None, num_listings: int = 12, top_k: int = 3):
    """Run the whole pipeline once: generate, index, search and personalize.

    Steps: generate listings with the LLM, save them to 'listings.txt', build
    the vector database, collect the buyer's preferences, turn them into a
    search query, retrieve the best matches and print a personalized
    description for each match.

    Args:
        answers: Pre-defined buyer answers, forwarded unchanged to
            collect_buyer_preferences. Pass None (or fewer than five answers)
            to be asked the questions interactively.
        num_listings: Number of listings to generate.
        top_k: Number of matching listings to retrieve and personalize.

    Returns:
        A list with one dict per match, holding `original` (the generated
        listing with the same id, or None when no generated listing has that
        id), `personalized_description` and `metadata`.
    """
    # Generate listings
    print("🏠 Generating real estate listings...")
    listings_data = generate_real_estate_listings(num_listings)
    print(f"\n✅ Successfully generated {len(listings_data)} listings")

    # Save listings to txt file
    with open('listings.txt', 'w', encoding="utf-8") as f:
        json.dump(listings_data, f, ensure_ascii=False, indent=2)
    print("💾 Listings saved to 'listings.txt'")

    # Create vector database
    print("\n🗄️ Creating vector database...")
    vector_db = create_vector_database(listings_data)

    # Collect buyer preferences from the caller's answers. They are passed
    # through as given, so an empty/short list switches to interactive mode.
    buyer_prefs = collect_buyer_preferences(list(answers) if answers is not None else [])

    # Parse preferences into search query
    search_query = parse_buyer_preferences(buyer_prefs)

    # Search for matching listings
    matching_listings = search_listings(vector_db, search_query, k=top_k)

    # Index generated listings by id once; the first listing wins on duplicate ids
    listings_by_id = {}
    for listing in listings_data:
        listings_by_id.setdefault(listing['id'], listing)

    # Generate personalized descriptions for matched listings
    print("\n✨ Generating Personalized Listing Descriptions:\n")
    print("=" * 80)

    personalized_listings = []
    for i, doc in enumerate(matching_listings, 1):
        print(f"\n🏠 PERSONALIZED LISTING {i}")
        print("-" * 80)

        # A retrieved document may not correspond to a listing from this run
        # (the store is persisted on disk), so a missing id yields None
        # instead of aborting the loop.
        original_listing = listings_by_id.get(doc.metadata['id'])

        # Generate personalized description
        personalized_desc = personalize_listing_description(
            doc.page_content,
            doc.metadata,
            buyer_prefs
        )

        # Display personalized listing
        print(f"📍 Neighborhood: {doc.metadata['neighborhood']}")
        print(f"💰 Price: ${doc.metadata['price']:,}")
        print(f"🛏️ Bedrooms: {doc.metadata['bedrooms']} | 🚿 Bathrooms: {doc.metadata['bathrooms']}")
        print(f"📐 Size: {doc.metadata['house_size']} sqft")
        print("\n📝 Personalized Description:")
        print(personalized_desc)

        # Store personalized listing
        personalized_listings.append({
            "original": original_listing,
            "personalized_description": personalized_desc,
            "metadata": doc.metadata
        })

        print("\n" + "=" * 80)

    return personalized_listings

# Do Some Runs !

In [33]:
# Pre-defined buyer answers, one per question and in the order that
# collect_buyer_preferences asks them: size, most important things,
# amenities, transportation, neighborhood type.
answers = [
    "A comfortable three-bedroom house with a spacious kitchen and a cozy living room.",
    "A quiet neighborhood, good local schools, and convenient shopping options.",
    "A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.",
    "Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
    "A balance between suburban tranquility and access to urban amenities like restaurants and theaters."
]

# Run the full pipeline once, passing the buyer answers defined above
do_run(answers)

# Uncomment the next line for a second run without pre-defined answers
# (intended to prompt the user for each answer interactively)
#do_run()

🏠 Generating real estate listings...
✅ Generated listing 1/12
✅ Generated listing 2/12
✅ Generated listing 3/12
✅ Generated listing 4/12
✅ Generated listing 5/12
✅ Generated listing 6/12
✅ Generated listing 7/12
✅ Generated listing 8/12
✅ Generated listing 9/12
✅ Generated listing 10/12
✅ Generated listing 11/12
✅ Generated listing 12/12

✅ Successfully generated 12 listings
💾 Listings saved to 'listings.txt'

🗄️ Creating vector database...
✅ Vector database created with 12 listings
💾 Vector database persisted to './chroma_db'
👤 Buyer Preferences Collected:
--------------------------------------------------
Q1: How big do you want your house to be?
A1: A comfortable three-bedroom house with a spacious kitchen and a cozy living room.

Q2: What are 3 most important things for you in choosing this property?
A2: A quiet neighborhood, good local schools, and convenient shopping options.

Q3: Which amenities would you like?
A3: A backyard for gardening, a two-car garage, and a modern, energy