# Imports & Setup

In [21]:
import openai
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import googlemaps
import math
from dotenv import load_dotenv
import os
import requests
import time
import nltk
import json
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
from folium.plugins import MarkerCluster
import folium
import webbrowser

# Download NLTK data
nltk.download('vader_lexicon', quiet=True)

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Set up plotting
%matplotlib inline
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.1)

# Configuration

In [22]:
# Load environment variables from the .env file
load_dotenv(dotenv_path='/Users/irisyu/Desktop/Project/virtual-housing-agent/.env')

# Access the variables
DATA_API_KEY = os.getenv('DATA_API_KEY')
PLACES_API_KEY = os.getenv('PLACES_API_KEY')
GEOCODING_KEY = os.getenv('GEOCODING_API_KEY')
OPENAI_KEY = os.getenv('OPENAI_API_KEY')

gmaps_places = googlemaps.Client(key=PLACES_API_KEY)
gmaps_geocoding = googlemaps.Client(key=GEOCODING_KEY)

### Set up dataset

Fetch Rental Listing API

In [23]:
def fetch_sf_rental_listings(limit=500, max_requests=500):
    """
    Fetch rental listings from the Rentcast API for San Francisco, CA.
    
    Args:
    limit (int): The number of listings to fetch per request.
    max_requests (int): The maximum number of API requests to make.
    
    Returns:
    pandas.DataFrame: A DataFrame containing all fetched listings.
    """
    Data_URL = 'https://api.rentcast.io/v1/listings/rental/long-term'
    params = {
        'city': 'San Francisco',
        'state': 'CA',
        'limit': limit,
        'status': 'Active', 
        'offset': 0
    }

    headers = {
        'Accept': 'application/json',
        'X-Api-Key': DATA_API_KEY
    }

    all_listings = []
    request_count = 0

    try:
        for _ in range(max_requests):
            response = requests.get(Data_URL, headers=headers, params=params)
            request_count += 1
            
            if response.status_code == 200:
                listings = response.json() 
                if not isinstance(listings, list):
                    print("Unexpected data format received.")
                    break
                
                all_listings.extend(listings)
                
                print(f"Retrieved {len(listings)} rental listings. Total: {len(all_listings)}")
                
                if len(listings) < params['limit']:
                    break
                
                params['offset'] += len(listings)
            else:
                print(f"Error: {response.status_code}")
                print(response.text)
                break
            
            time.sleep(1)

    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

    return pd.DataFrame(all_listings)

# Usage
df = fetch_sf_rental_listings()

Retrieved 500 rental listings. Total: 500
Retrieved 500 rental listings. Total: 1000
Retrieved 298 rental listings. Total: 1298


# Google Maps API

In [24]:
# Vectorize the vague address from user input 
def autocomplete_place(input_text):
    predictions = gmaps_places.places_autocomplete(input_text, types='geocode')
    return predictions

# Help locate the target address
def get_lat_lng_from_place_id(place_id):
    result = gmaps_geocoding.place(place_id=place_id)
    location = result['result']['geometry']['location']
    return location['lat'], location['lng']

# Helper Function: Haversine function to calculate distance
def haversine(lat1, lon1, lat2, lon2):
    R = 6371
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c

# Google Maps Reviews API

In [25]:
# Fetch Google Maps reviews
def fetch_reviews(place_id, max_reviews=100):
    reviews = []
    place_details = gmaps_places.place(place_id=place_id)
    if 'reviews' in place_details['result']:
        reviews.extend(place_details['result']['reviews'][:max_reviews])
    return reviews# NLP Methods


nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Helper Function: Calculate sentiment score
def sentiment_score(text):
    return sia.polarity_scores(text)["compound"]


# Helper Function: Locate the target address for its reviews 
def search_place(query):
    places_result = gmaps_places.places(query)
    if places_result['results']:
        return places_result['results'][0]['place_id']
    return None


# Helper Function: Analyze reviews and get sentiment scores for relevant features
def analyze_reviews(reviews):
    review_data = {
        "review_text": [review['text'] for review in reviews],
        "sentiment_score": [sentiment_score(review['text']) for review in reviews]
    }
    reviews_df = pd.DataFrame(review_data)
    avg_sentiment_score = reviews_df['sentiment_score'].mean()
    return avg_sentiment_score

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/irisyu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Recomendation System

This rental property recommendation system follows a structured and logical flow to provide users with personalized rental listings based on their preferences and criteria.

1. User Input Collection: The system prompts users to enter their rental preferences, such as the number of bedrooms and bathrooms, rent price range, specific location, move-in date, lease term, and maximum distance from the desired location. Users also rate the importance of each criterion on a scale from 1 to 10 to establish the weights for each factor.

2. Initial Property Filtering: The system filters the property listings based on the user’s basic requirements (bedrooms, bathrooms, and rent price range). The system retrieves the latitude and longitude of the user-specified location. It calculates the distance of each property from the specified location and filters out properties beyond the maximum distance. The prices and distances are normalized, and an initial score is calculated for each property based on the user-defined weights. The properties are then sorted by this initial score.

3. Enhanced Filtering and Sentiment Analysis: The system retrieves user reviews for each property and performs sentiment analysis to obtain sentiment scores. It calculates an enhanced score by combining the initial score with the sentiment score. Properties are then sorted based on the enhanced score.

4. Result Display and Map Visualization: The filtered and sorted properties are displayed in the console, showing details such as address, price, bedrooms, bathrooms, distance, and scores. The top properties are visualized on a map using a mapping tool, and the map is saved as an HTML file.

### Helper Functions

In [26]:
# Grab user input from Terminal
def get_user_input():
    print("Let's find your ideal rental property!")

    criteria = {}
    weights = {}

    try:
        criteria['bedrooms'] = float(input("Enter number of bedrooms (e.g., 1, 2, 3): "))
        criteria['bathrooms'] = float(input("Enter number of bathrooms (e.g., 1, 1.5, 2): "))

        criteria['min_rent'] = float(input("Minimum rent price: "))
        criteria['max_rent'] = float(input("Maximum rent price: "))

        criteria['location'] = input("Enter desired location (you can input zip code, neighborhood, building name, or landmark): ")

        criteria['move_in_date'] = input("Enter moving-in date (YYYY-MM-DD): ")
        criteria['lease_term'] = int(input("Enter lease term (in months): "))

        criteria['max_distance_km'] = float(input("Enter maximum distance from the location (in kilometers): "))

        print("\nNow, let's set the importance of each criterion.")
        print("Please rate the importance of each criterion from 1 (least important) to 10 (most important):")

        weights['bedrooms'] = float(input("Importance of number of bedrooms: ")) / 10
        weights['bathrooms'] = float(input("Importance of number of bathrooms: ")) / 10
        weights['price'] = float(input("Importance of price: ")) / 10
        weights['distance'] = float(input("Importance of distance from desired location: ")) / 10
    except ValueError as e:
        print(f"Invalid input: {e}. Please enter numeric values where required.")
        return None, None

    print("criteria from input",criteria, weights)

    return criteria, weights


# Calculate the scores with weights for each property 
def calculate_score(row, criteria, weights):
    score = 0
    score += weights['bedrooms'] * (1 if row['bedrooms'] >= criteria['bedrooms'] else 0)
    score += weights['bathrooms'] * (1 if row['bathrooms'] >= criteria['bathrooms'] else 0)
    score += weights['price'] * (1 - row['price'])
    score += weights['distance'] * (1 - row['distance'])
    return score


# Initial Property Filtering 
def filter_listings(df, criteria = {'bedrooms': 1.0, 'bathrooms': 1.0, 'min_rent': 1000.0, 'max_rent': 8000.0, 'location': 'San Francisco', 'move_in_date': '2024-08-01', 'lease_term': 12, 'max_distance_km': 20.0},  weights = {'bedrooms': 1.0, 'bathrooms': 1.0, 'price': 1.0, 'distance': 2.0 }):
    df['original_price'] = df['price'].copy()
    filtered_df = df[
        (df['bedrooms'] >= criteria['bedrooms']) &
        (df['bathrooms'] >= criteria['bathrooms']) &
        (df['price'] >= criteria['min_rent']) &
        (df['price'] <= criteria['max_rent'])
    ].copy()

    if filtered_df.empty:
        print("No listings found that match your criteria.")
        return filtered_df

    try:
        predictions = autocomplete_place(criteria['location'])
        if predictions:
            place_id = predictions[0]['place_id']
            lat, lng = get_lat_lng_from_place_id(place_id)

            filtered_df['distance'] = filtered_df.apply(lambda row: haversine(lat, lng, row['latitude'], row['longitude']), axis=1)
            filtered_df = filtered_df[filtered_df['distance'] <= criteria['max_distance_km']]

        if not filtered_df.empty:
            scaler = MinMaxScaler()
            filtered_df[['price', 'distance']] = scaler.fit_transform(filtered_df[['price', 'distance']])

            filtered_df['score'] = filtered_df.apply(lambda row: calculate_score(row, criteria, weights), axis=1)
            filtered_df = filtered_df.sort_values('score', ascending=False)
        else:
            print("No listings found after applying distance filter.")
            return filtered_df
    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()

    return filtered_df



# Cache to store place IDs and their sentiment scores
place_cache = {}


# Get Google Reviews for target listings for the recommendation system
def get_reviews_for_listings(filtered_df, max_reviews_per_listing=100):
    filtered_df['place_id'] = filtered_df['formattedAddress'].apply(lambda x: place_cache.get(x, search_place(x)))
    filtered_df['sentiment_score'] = 0.0

    for index, row in filtered_df.iterrows():
        if row['place_id']:
            if row['place_id'] not in place_cache:
                reviews = fetch_reviews(row['place_id'], max_reviews=max_reviews_per_listing)
                if reviews:
                    avg_sentiment_score = analyze_reviews(reviews)
                    place_cache[row['place_id']] = avg_sentiment_score
                else:
                    place_cache[row['place_id']] = 0.0
            filtered_df.at[index, 'sentiment_score'] = place_cache[row['place_id']]
    return filtered_df


# Enhanced filterings by analyze google reviews
def enhanced_filter_listings(filtered_df, sentiment):
    if filtered_df.empty:
        print("No listings available for further filtering.")
        return filtered_df

    # Get reviews and analyze sentiment
    filtered_df = get_reviews_for_listings(filtered_df)

    # Calculate enhanced score
    filtered_df['enhanced_score'] = (
        filtered_df['score'] +
        filtered_df['sentiment_score'] * sentiment)

    return filtered_df.sort_values('enhanced_score', ascending=False)


# Display the recommendation listings
def display_results(filtered_df):
    if filtered_df.empty:
        print("No listings found that match your criteria.")
    else:
        print(f"Found{len(filtered_df)} listings that match your criteria:")
        print(filtered_df[['formattedAddress','price','bedrooms','bathrooms','squareFootage','distance','score']])

# Visualize the recommendations using HTML Maps
def visualize_results_on_map(df):
    sf_map = folium.Map(location=[37.7749, -122.4194], zoom_start=12)
    marker_cluster = MarkerCluster().add_to(sf_map)

    for rank, (index, row) in enumerate(df.iterrows(), start=1):
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            popup=(
                f"Rank: {rank}<br>"
                f"Address: {row['formattedAddress']}<br>"
                f"Price: ${row['original_price']}<br>"
                f"Bedrooms: {row['bedrooms']}<br>"
                f"Bathrooms: {row['bathrooms']}<br>"
                f"Score: {row['enhanced_score']:.2f}"
            ),
            icon=folium.Icon(color='blue')
        ).add_to(marker_cluster)

    # Save the map as HTML file
    map_file = "recommended_listings_map.html"
    sf_map.save(map_file)
    print(f"Map saved to {map_file}")

    return map_file

def open_map(map_file):
    webbrowser.open('file://' + os.path.realpath(map_file))


# Main Recommendation System Function
def run_recommendation_system(criteria, weights):
        filtered_listings = filter_listings(df, criteria, weights)
        if not filtered_listings.empty:
            recommended_listings = enhanced_filter_listings(filtered_listings.head(20), 0.5) # Use hardcode for the sentiment value
            return recommended_listings
        
        else:
            print("No listings found that match your criteria.")


# Test Recommendation System

In [27]:
example_criteria = {'bedrooms': 1.0, 'bathrooms': 1.0, 'min_rent': 2000.0,'max_rent': 4000.0, 'location': 'University of San Francisco', 'move_in_date': '2024-08-21', 'lease_term': 12, 'max_distance_km': 12.0}
example_weights = {'bedrooms': 1.0, 'bathrooms': 1.0, 'price': 1.0, 'distance': 2.0 }
recommended_results = run_recommendation_system(example_criteria, example_weights)

display_results(recommended_results)
map_file = visualize_results_on_map(recommended_results)
open_map(map_file)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['place_id'] = filtered_df['formattedAddress'].apply(lambda x: place_cache.get(x, search_place(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['sentiment_score'] = 0.0


Found20 listings that match your criteria:
                                       formattedAddress   price  bedrooms  \
985          840 California St, San Francisco, CA 94108  0.0475       1.0   
277              634 Powell St, San Francisco, CA 94108  0.0975       1.0   
526       1140 Clay St, Apt 10, San Francisco, CA 94108  0.0475       1.0   
776      424 Jones St, Apt 405, San Francisco, CA 94102  0.0475       2.0   
227     550 Jessie St, Apt 115, San Francisco, CA 94103  0.0475       1.0   
491       1073 Bush St, Apt 11, San Francisco, CA 94109  0.0450       1.0   
306              655 Powell St, San Francisco, CA 94108  0.1475       2.0   
1002            1364 Kearny St, San Francisco, CA 94133  0.1250       1.0   
255     550 Jessie St, Apt 118, San Francisco, CA 94103  0.0750       1.0   
617          722 Montgomery St, San Francisco, CA 94111  0.2250       1.0   
39    631 Ofarrell St, Apt 616, San Francisco, CA 94109  0.0500       1.0   
722     930 Sutter St, Apt 602, S

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['enhanced_score'] = (


# Function Calling

This part is to do function calling between OpenAI API and my other functions. I will define a function to describe my function, and pass the question to the OpenAI Model with user prompts.

In [29]:
openai_api_key=OPENAI_KEY

# User prompt
user_prompt = "Find me a 1-bedroom apartment in San Francisco with a budget of $3000 per month"

# Function description as provided by you
func_descrip = [{
    "name": "run_recommendation_system",
    "description": "Filters and recommends property listings based on user criteria and weights",
    "parameters": {
        "type": "object",
        "properties": {
            "criteria": {
                "type": "object",
                "description": "Dictionary of filtering criteria for property listings",
                "properties": {
                    "bedrooms": {"type": "number", "description": "Number of bedrooms"},
                    "bathrooms": {"type": "number", "description": "Number of bathrooms"},
                    "min_rent": {"type": "number", "description": "Minimum rent price"},
                    "max_rent": {"type": "number", "description": "Maximum rent price"},
                    "location": {"type": "string", "description": "Desired location"},
                    "move_in_date": {"type": "string", "description": "Move-in date (YYYY-MM-DD)"},
                    "lease_term": {"type": "number", "description": "Lease term in months"},
                    "max_distance_km": {"type": "number", "description": "Maximum distance from the location in kilometers"}
                },
                "required": ["bedrooms", "bathrooms", "min_rent", "max_rent", "location", "move_in_date", "lease_term", "max_distance_km"]
            },
            "weights": {
                "type": "object",
                "description": "Dictionary of importance weights for different criteria",
                "properties": {
                    "bedrooms": {"type": "number", "description": "Importance of number of bedrooms"},
                    "bathrooms": {"type": "number", "description": "Importance of number of bathrooms"},
                    "price": {"type": "number", "description": "Importance of price"},
                    "distance": {"type": "number", "description": "Importance of distance from desired location"}
                },
                "required": ["bedrooms", "bathrooms", "price", "distance"]
            }
        },
        "required": ["criteria", "weights"]
    }
}]

# Create a chat completion with the new API method
chat_completion =openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "user",
            "content": user_prompt
        }
    ],
    functions=func_descrip,
    function_call="auto"
)
# Check if a function call is returned
response_message = chat_completion.choices[0].message

if 'function_call' in response_message:
    function_call = response_message['function_call']
    print(f"Function Name: {function_call['name']}")
    print(f"Function Arguments: {function_call['arguments']}")
else:
    output = response_message['content']
    print(output)



Function Name: run_recommendation_system
Function Arguments: {"criteria":{"bedrooms":1,"bathrooms":1,"min_rent":0,"max_rent":3000,"location":"San Francisco","move_in_date":"2023-01-01","lease_term":12,"max_distance_km":10},"weights":{"bedrooms":1,"bathrooms":1,"price":3,"distance":2}}
