In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import requests as requests
import json

## Step 1: How to get the data? ##

#1A: Working with Zillo API from RapidAPI

In [None]:
import requests

url = "https://zillow-working-api.p.rapidapi.com/byaddress"

querystring = {"propertyaddress":"701 Royal Ct APT 304, Charlotte, NC 28202"}	#My condo address

headers = {
	"X-RapidAPI-Key": "e9cf3e65c2msh0de45007e2bea81p141686jsnc588f53da2b6",
	"X-RapidAPI-Host": "zillow-working-api.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)

print(response.json())

In [None]:
data = response.json()

In [None]:
print(json.dumps(data, indent=4))

#1B: Working with HomeHarvest 

https://github.com/Bunsly/HomeHarvest?tab=readme-ov-file

In [None]:
pip install homeharvest

In [33]:
from homeharvest import scrape_property
from datetime import datetime

# Loop through years from 2024 to 2004
for year in range(2024, 2015, -1):
    try:
        
        filename = f"HomeHarvest_{year}_5_Miles_From_Home.csv"
        
        # Set the date range for the current year
        date_from = f"{year}-01-01"
        date_to = f"{year}-12-31"
        
        # Call scrape_property for the current year
        properties = scrape_property(
            location="701 Royal Ct, Apt 304, Charlotte, NC",
            radius=5,
            listing_type="sold",
            date_from=date_from,
            date_to=date_to,
            foreclosure=False
        )
        
        # Check if properties were returned before attempting to save to a CSV
        if len(properties) > 0:
            print(f"Number of properties for {year}: {len(properties)}")
            
            # Save the properties to a CSV file
            properties.to_csv(filename, index=False)
            print(f"Data saved to {filename}")
            print(properties.head())
        else:
            print(f"No properties found for {year}.")
    except Exception as e:
        print(f"An error occurred for {year}: {e}. Skipping to the next year.")


Number of properties for 2024: 899
Data saved to HomeHarvest_2024_5_Miles_From_Home.csv
                                        property_url   mls   mls_id status  \
0  https://www.realtor.com/realestateandhomes-det...  CHNC  4105984   SOLD   
1  https://www.realtor.com/realestateandhomes-det...  CHNC  4117761   SOLD   
2  https://www.realtor.com/realestateandhomes-det...  CHNC  4112620   SOLD   
3  https://www.realtor.com/realestateandhomes-det...  CHNC  4107609   SOLD   
4  https://www.realtor.com/realestateandhomes-det...  CHNC  4113947   SOLD   

                        style               street  unit       city state  \
0      PropertyType.TOWNHOMES  12005  Brooklyn Ave  None  Charlotte    NC   
1      PropertyType.TOWNHOMES       436  Belton St  None  Charlotte    NC   
2  PropertyType.SINGLE_FAMILY    1837  Kenwood Ave  None  Charlotte    NC   
3  PropertyType.SINGLE_FAMILY      410  Nelson Ave  None  Charlotte    NC   
4  PropertyType.SINGLE_FAMILY   716  Lexington Ave  None  

In [None]:
from homeharvest import scrape_property
from datetime import datetime

current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"HomeHarvest_{current_timestamp}.csv"
#filename = "HomeHarvest_2023_10_Miles_From_Home.csv"

properties = scrape_property(
  location="701 Royal Ct,Apt 304,Charlotte,NC",
  #radius=10,
  #location="Charlotte, NC",  

  listing_type="sold",  # or (for_sale, for_rent, pending)
  #past_days=10,  # sold in last 30 days - listed in last 30 days if (for_sale, for_rent)
  
  date_from="2022-01-01", # alternative to past_days 
  date_to="2022-03-31", 
  foreclosure=False
  
  # mls_only=True,  # only fetch MLS listings
)
print(f"Number of properties: {len(properties)}")

properties.to_csv(filename, index=False)
print(properties.head())

### Step 2: Pull data and start to play with it

In [18]:
# Method for taking raw home harvest data and returning X_train, X_test, t_train, t_test so we can make a model!
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

def getDataSetFromHomeHarvest(df_Raw, columns_to_drop, propertyType):

    # Filter by propety type
    df_Rtn = df_Raw[df_Raw['style'] == propertyType]

    # Drop what we dont want
    df_Rtn = df_Rtn.drop(columns=columns_to_drop)

    # Combine half and full baths
    df_Rtn['half_baths'] = df_Rtn['half_baths'].fillna(0)
    df_Rtn['full_baths'] = df_Rtn['full_baths'] + 0.5 * df_Rtn['half_baths']        
    df_Rtn.drop(columns=['half_baths'], inplace=True)

    # Get em nulls outta here
    df_Rtn = df_Rtn.dropna()

    # Make these both positive, hopefully the data we are looking at is in relatively the same area, dont want negatives here to mess up the model
    df_Rtn['latitude'] = df_Rtn['latitude'].abs()
    df_Rtn['longitude'] = df_Rtn['longitude'].abs()

    scaler = StandardScaler()
    # Scale the year built to a new column, drop the old year built
    df_Rtn['year_built_scaled'] = scaler.fit_transform(df_Rtn[['year_built']])
    df_Rtn.drop(columns=['year_built'], inplace=True)

    # Scale the year sold to a new column, drop the old year built
    df_Rtn['last_sold_date'] = pd.to_datetime(df_Rtn['last_sold_date'])                 # Convert to date time
    df_Rtn['year_sold'] = df_Rtn['last_sold_date'].dt.year                              # Just use year sold
    df_Rtn['year_sold_scaled'] = scaler.fit_transform(df_Rtn[['year_sold']])            # Scale the year sold
    df_Rtn.drop(columns=['last_sold_date', 'year_sold'], inplace=True)                  # Drop what we dont need

    #Define our data and target
    T = df_Rtn['sold_price']
    X = df_Rtn.drop(columns=['sold_price'])

    return(train_test_split(X, T, test_size=0.2, random_state=27))

In [19]:
import os
import pandas as pd

# Define the directory path
directory = "10 Miles From Home"
dfs = []

# Iterate over each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dfs, ignore_index=True)

propertyType = 'PropertyType.SINGLE_FAMILY'
columns_to_drop = ['property_url', 'mls', 'mls_id', 'status', 'style', 'street', 'unit', 'city', 'state', 'zip_code',
                   'list_date', 'list_price', 'lot_sqft', 'stories', 'hoa_fee', 'parking_garage', 
                   'primary_photo', 'alt_photos', 'days_on_mls', 'price_per_sqft']

X_train, X_test, t_train, t_test = getDataSetFromHomeHarvest(combined_df, columns_to_drop, propertyType)

print(X_train.shape, X_test.shape, t_train.shape, t_test.shape)

(36110, 7) (9028, 7) (36110,) (9028,)


In [20]:
train_data = pd.concat([X_train, t_train], axis=1)
sampled_train_data = train_data.sample(n=10000, random_state=42)
X_train_sampled = sampled_train_data.drop(columns=['sold_price'])
t_train_sampled = sampled_train_data['sold_price']

print(X_train_sampled.shape, t_train_sampled.shape)

(10000, 7) (10000,)


In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, r2_score

# Define the SVC model
svc = SVC()

# Define the parameter grid to search
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Define scoring function
scoring = {'R2': make_scorer(r2_score)}

# Perform Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring=scoring, refit='R2', verbose=2)

# Fit the grid search to find the best parameters
grid_search.fit(X_train_sampled, t_train_sampled)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best R2 Score:", best_score)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




In [5]:
from sklearn.svm import SVC
from sklearn.metrics import r2_score

svc = SVC()

# Train the SVC
svc.fit(X_train, t_train)

# Make predictions
predictions = svc.predict(X_test)

# Calculate the R2 score
r2 = r2_score(t_test, predictions)
print("R2 Score:", r2)

R2 Score: 0.2975093849803445
