In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import requests as requests
import json

## Step 1: How to get the data? ##

#1A: Working with Zillo API from RapidAPI

In [None]:
import requests

url = "https://zillow-working-api.p.rapidapi.com/byaddress"

querystring = {"propertyaddress":"701 Royal Ct APT 304, Charlotte, NC 28202"}	#My condo address

headers = {
	"X-RapidAPI-Key": "e9cf3e65c2msh0de45007e2bea81p141686jsnc588f53da2b6",
	"X-RapidAPI-Host": "zillow-working-api.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)

print(response.json())

In [None]:
data = response.json()

In [None]:
print(json.dumps(data, indent=4))

#1B: Working with HomeHarvest 

https://github.com/Bunsly/HomeHarvest?tab=readme-ov-file

In [None]:
pip install homeharvest

In [None]:
from homeharvest import scrape_property
from datetime import datetime

# Loop through years from 2024 to 2004
for year in range(2020, 2000, -1):
    try:
        
        filename = f"HomeHarvest_{year}_10_Miles_From_Home.csv"
        
        # Set the date range for the current year
        date_from = f"{year}-01-01"
        date_to = f"{year}-12-31"
        
        # Call scrape_property for the current year
        properties = scrape_property(
            location="701 Royal Ct, Apt 304, Charlotte, NC",
            radius=10,
            listing_type="sold",
            date_from=date_from,
            date_to=date_to,
            foreclosure=False
        )
        
        # Check if properties were returned before attempting to save to a CSV
        if len(properties) > 0:
            print(f"Number of properties for {year}: {len(properties)}")
            
            # Save the properties to a CSV file
            properties.to_csv(filename, index=False)
            print(f"Data saved to {filename}")
            print(properties.head())
        else:
            print(f"No properties found for {year}.")
    except Exception as e:
        print(f"An error occurred for {year}: {e}. Skipping to the next year.")


In [None]:
from homeharvest import scrape_property
from datetime import datetime

current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"
#filename = "HomeHarvest_2023_10_Miles_From_Home.csv"

properties = scrape_property(
  location="701 Royal Ct,Apt 304,Charlotte,NC",
  #radius=10,
  #location="Charlotte, NC",  

  listing_type="sold",  # or (for_sale, for_rent, pending)
  #past_days=10,  # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
  
  date_from="2022-01-01", # alternative to past_days 
  date_to="2022-03-31", 
  foreclosure=False
  
  # mls_only=True,  # only fetch MLS listings
)
print(f"Number of properties: {len(properties)}")

properties.to_csv(filename, index=False)
print(properties.head())

### Step 2: Pull data and start to play with it

In [None]:
import pandas as pd
filename = "HomeHarvest_2022_10_Miles_From_Home.csv"    #Lets just work with 2020 data
df_2020 = pd.read_csv(filename)
#print(df_2020.head())

In [124]:
#df_2020_Condos = df_2020[df_2020['style'] == 'PropertyType.CONDOS']
df_2020_Condos = df_2020[df_2020['style'] == 'PropertyType.CONDOS']

columns_to_drop = ['property_url', 'mls', 'mls_id', 'status', 'style', 'street', 'unit', 'city', 'state', 'zip_code',
                   'list_date', 'list_price', 'lot_sqft', 'stories', 'hoa_fee', 'parking_garage', 
                   'primary_photo', 'alt_photos', 'last_sold_date', 'days_on_mls', 'price_per_sqft']

df_2020_Condos = df_2020_Condos.drop(columns=columns_to_drop)

# Half baths are put as null, we dont want to drop ALL of these
df_2020_Condos['half_baths'].fillna(0, inplace=True)    # set them all to 0 if they are null
df_2020_Condos['full_baths'] = df_2020_Condos['full_baths'] + 0.5 * df_2020_Condos['half_baths']        # Change the full baths to include the half baths

# Drop the half baths because they are now accounted for in the full baths column
df_2020_Condos.drop(columns=['half_baths'], inplace=True)

df_2020_Condos = df_2020_Condos.dropna()

#Define our data and target
T = df_2020_Condos['sold_price']
X = df_2020_Condos.drop(columns=['sold_price'])

from sklearn.model_selection import train_test_split

X_train, X_test, t_train, t_test = train_test_split(X, T, test_size=0.2, random_state=27)

print(X_train.shape, X_test.shape, t_train.shape, t_test.shape)

(703, 6) (176, 6) (703,) (176,)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_2020_Condos['half_baths'].fillna(0, inplace=True)    # set them all to 0 if they are null


In [None]:
# Create our SVC!
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

hyper_parameters = {
    'C': [0.1, 1, 10, 100], 
    'gamma': [1, 0.1, 0.01, 0.001], 
    'kernel': ['rbf']
}

gscv = GridSearchCV(estimator=clf, param_grid=hyper_parameters)
gscv.fit(X_train, t_train)
best_params = gscv.best_params_

print(f"Best params: {best_params}")

In [125]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, random_state=27)

# Fit the model to the training data
clf.fit(X_train_scaled, t_train)

# Evaluate the model
train_score = clf.score(X_train_scaled, t_train)
test_score = clf.score(X_test_scaled, t_test)

print(f"Train Accuracy: {train_score}, Test Accuracy: {test_score}")

Train Accuracy: 0.05689900426742532, Test Accuracy: 0.022727272727272728


In [127]:
from sklearn.metrics import r2_score

# Assuming you have a regression model and predictions
# For demonstration, let's say y_pred are your model's predictions, and y_test are the true values
y_pred = clf.predict(X_test_scaled)

# Calculate the R^2 score
r2_score_value = r2_score(t_test, y_pred)

print(f"R^2 Score: {r2_score_value}")

R^2 Score: 0.13320740194018443


In [126]:
df_condo = pd.read_csv("HomeHarvest_20240328_180841.csv")
df_condo = df_condo.drop(columns=columns_to_drop)

# Drop the half baths because they are now accounted for in the full baths column
df_condo.drop(columns=['half_baths'], inplace=True)
df_condo.drop(columns=['sold_price'], inplace=True)

df_condo['full_baths'] = 1

predicted_sold_price = clf.predict(df_condo)

print(f"Predicted Sold Price: {predicted_sold_price}")

Predicted Sold Price: [440000]


