This is a starter notebook for the project, you'll have to import the libraries you'll need, you can find a list of the ones available in this workspace in the requirements.txt file in this workspace. 

In [1]:
!pip install -U chromadb pandas langchain langchain-openai langchain-chroma langchain-community

Defaulting to user installation because normal site-packages is not writeable




In [2]:
import os
import re
import json
import pandas as pd
import chromadb
from io import StringIO
from langchain_openai import OpenAI
from langchain_community.document_loaders import CSVLoader
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.schema import SystemMessage, HumanMessage


os.environ["OPENAI_API_KEY"] = "Place your API_KEY Here"
os.environ["OPENAI_API_BASE"] = "https://openai.vocareum.com/v1"
client = chromadb.PersistentClient(path="./chroma_db") # Stores data in a local directory

llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.7,
    max_tokens=2500,
    api_key=os.getenv("OPENAI_API_KEY")
)

def personalize(content, personality="friendly, benefit-focused, concise"):

    prompt = f"""
    You are editing real-estate copy. Personalize the wording while keeping all facts unchanged.

    Persona/tone: {personality}
    Allowed changes:
    - Improve clarity, friendliness, flow, and appeal for a homebuyer
    - Highlight benefits implied by the facts
    - Keep it concise (2–3 sentences each)
    - Include Price

    Forbidden changes:
    - Do NOT change numeric values (price, beds, baths, size, neighborhood name)
    - Do NOT invent amenities
    - Do NOT change units/currency

    Facts:
    {content}

    Return ONLY valid string with exactly these keys:
      "Description": "...",
      "Neighborhood Description": "..."
    """.strip()

    out = llm.invoke(prompt).strip()
    return out

instruction = """
Generate 35 Real Estate lisitngs in CSV format with the following fields:Neighborhood, Price, Bedrooms, Bathrooms, House Size, Description.
Follow this format
Neighborhood: Green Oaks
Price: $800,000
Bedrooms: 3
Bathrooms: 2
House Size: 2,000 sqft

Description: Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. 
This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels 
and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful 
hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious 
backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without 
compromising on style in this Green Oaks gem.

Neighborhood Description: Green Oaks is a close-knit, environmentally-conscious community with access to 
organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park
or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, 
commuting is a breeze.
"""

RE_listings = llm.invoke(instruction)

# Convert RE_listings->DataFrame
df = pd.read_csv(StringIO(RE_listings), sep=';')

# Normalize column names (strip spaces and colons)
df.columns = [c.strip().replace(":", "") for c in df.columns]


df.to_csv('RE_listings.csv', index=False)

# Display All Listings
print("All HomeMatch Listings")
print(df)
print("--------HOMEMATCH--------")
print("\n")

# Load CSV
csv_loader = CSVLoader(file_path='./RE_listings.csv')
RE_listing_docs = csv_loader.load()

# Embeddings created
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",   
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE")  
)

# Build Chroma Database
db = Chroma.from_documents(
    documents=RE_listing_docs,
    embedding=embeddings,
    collection_name="RE_listings",
    persist_directory="data",
)
#db.persist() not needed in current version; writes are automatically persisted 

All HomeMatch Listings
                                                  ```
0   Neighborhood,Price,Bedrooms,Bathrooms,House Si...
1   Green Oaks,$800,000,3,2,2,000 sqft,"Welcome to...
2   Sunnyvale,$1,200,000,4,3,2,800 sqft,"Discover ...
3   Maplewood,$650,000,3,2,1,800 sqft,"Welcome to ...
4   Riverside,$1,000,000,5,4,3,500 sqft,"Experienc...
5   Highland Park,$700,000,4,2,2,200 sqft,"Nestled...
6   Cedar Creek,$850,000,3,3,2,500 sqft,"Welcome t...
7   Lakeside,$1,500,000,6,5,4,000 sqft,"Indulge in...
8   Willow Creek,$950,000,4,3,3,200 sqft,"Discover...
9   Oak Ridge,$775,000,3,2,2,100 sqft,"Welcome to ...
10  Parkview,$1,100,000,5,4,3,000 sqft,"Experience...
11  Elmwood,$600,000,3,2,1,600 sqft,"Welcome to El...
12  Cherry Hill,$1,350,000,5,4,3,800 sqft,"Indulge...
13  Sunny Hills,$825,000,4,3,2,500 sqft,"Discover ...
14  Pine Valley,$1,100,000,5,4,3,200 sqft,"Welcome...
15  Silver Lake,$950,000,4,3,2,800 sqft,"Experienc...
16  Golden Valley,$740,000,3,2,1,900 sqft,"Welcome...
17  B

In [4]:
# Client Input
client_input = input("Welcome to House Match; Please enter what you are looking for: ")
print("\nYou entered:", client_input)

# HomeMatch Search
results = db.similarity_search(client_input, k=5)

for i, d in enumerate(results, 0):
    #print(f"\nHome Result {i}:\n{d.page_content}\nDetails: {d.metadata}")
    
    # Go through results
    doc = results[i]
    
    # Get the original DataFrame row using the loader’s metadata
    row_idx = doc.metadata["row"]        # which row in df
    #row = df.iloc[row_idx]               # actual pandas.Series
    
    # Run it through personalization & Original Message
    
    personalized = personalize(doc.page_content, personality="friendly, benefit-focused, concise")
    
    
    print(f"\nDetailed Home Result (Original) {i+1}:\n\n{d.page_content}\nSource: {d.metadata}")
    print("\nPersonalized Description:\n")
    
    # Remove leading and trailing braces
    cleaned_personalized = personalized.strip('{} \n\t')

    # Remove all internal double quotes
    cleaned_personalized = cleaned_personalized.replace('"', '')

    print(cleaned_personalized)

Welcome to House Match; Please enter what you are looking for: solar

You entered: solar

Detailed Home Result (Original) 1:

```: Sunnyvale,$1,200,000,4,3,2,800 sqft,"Discover this stunning 4-bedroom, 3-bathroom home in Sunnyvale, where modern elegance meets comfort. The spacious living room features floor-to-ceiling windows that provide breathtaking views of the surrounding hills. The gourmet kitchen is a chef's dream, complete with high-end appliances and a large island. The master suite offers a private balcony and luxurious ensuite bathroom. Enjoy outdoor living in the beautifully landscaped backyard with a built-in barbecue area, perfect for entertaining friends and family."
Source: {'row': 2, 'source': './RE_listings.csv'}

Personalized Description:

```
{
  Description: Explore this beautiful 4-bedroom, 3-bathroom home in Sunnyvale, where modern elegance blends seamlessly with comfort. You'll love the spacious living room with floor-to-ceiling windows that frame stunning views 

In [None]:
print(personalized)