# Set Up 2023 Data

In [46]:
import json
import pandas as pd

filepath = "Zillow-ChapelHill-Spring2023-18k-Properties.json"

# First, Load the JSON file and convery it to a pandas data frame
with open(filepath) as f:
  data2023 = json.load(f)
data2023.keys()


df_2023 = pd.DataFrame(data2023)

# Display the first few rows of the DataFrame
print(df_2023.head())


      zpid                                            address  bedrooms  \
0   914857  {'streetAddress': '2308 Ridgewood Rd', 'city':...       4.0   
1  1176855  {'streetAddress': '122 Dixie Dr', 'city': 'Ind...       3.0   
2  2143678  {'streetAddress': '19 Oakwood Dr', 'city': 'Ne...       4.0   
3  5614449  {'streetAddress': '260 Edgewood Rd', 'city': '...       2.0   
4  5642139  {'streetAddress': '115 Windsor Rd', 'city': 'A...       4.0   

   bathrooms   price  yearBuilt  longitude   latitude     homeStatus  \
0        2.0  230000     1961.0 -86.810970  33.618390  RECENTLY_SOLD   
1        2.0  210000     1962.0 -86.156540  39.637882  RECENTLY_SOLD   
2        2.0  365000     1962.0 -93.210335  45.049557           SOLD   
3        1.0  484600     1975.0 -82.560425  35.620270          OTHER   
4        3.0  680000     1951.0 -82.552260  35.637710           SOLD   

                                         description  ...  chanceToSellFaster  \
0  This home has been lovingly care

# Set up 2025 Data

In [47]:

filepath0 = 'Zillow-March2025-dataset_part0.json'
filepath1 = 'Zillow-March2025-dataset_part1.json'
filepath2 = 'Zillow-March2025-dataset_part2.json'
filepath3 = 'Zillow-March2025-dataset_part3.json'

# Load the JSON files and convert them to pandas data frames
with open(filepath0, encoding='utf-8') as f:
  data0 = json.load(f)
with open(filepath1, encoding='utf-8') as f:
  data1 = json.load(f)
with open(filepath2, encoding='utf-8') as f:
  data2 = json.load(f)
with open(filepath3, encoding='utf-8') as f:
  data3 = json.load(f)
# Combine the data from all files into a single DataFrame
df0 = pd.DataFrame(data0)
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
df3 = pd.DataFrame(data3)
df_2025 = pd.concat([df0, df1, df2, df3], ignore_index=True)

print(df_2025[['streetAddress', 'city', 'state', 'price', 'bedrooms', 'bathrooms', 'daysOnZillow', 'homeStatus']])


                   streetAddress         city state   price  bedrooms  \
0             302 Orchard Ln #92     Carrboro    NC  764100       5.0   
1         232 McCauley St UNIT A  Chapel Hill    NC  829100       8.0   
2          2034 Foxwood Farm Trl  Chapel Hill    NC  965600       4.0   
3      1509 Partridgeberry Rd #A  Chapel Hill    NC  453400       4.0   
4                  308 Ransom St  Chapel Hill    NC  701600       2.0   
...                          ...          ...   ...     ...       ...   
32028             807 Kenmore Rd  Chapel Hill    NC  788700       3.0   
32029       195 Old Piedmont Cir  Chapel Hill    NC  549900       3.0   
32030        114 Weaver Dairy Rd  Chapel Hill    NC  433500       NaN   
32031               46 Davie Cir  Chapel Hill    NC  679700       4.0   
32032       317 Charleston Ln #1  Chapel Hill    NC  419400       2.0   

       bathrooms  daysOnZillow     homeStatus  
0            2.5       12234.0          OTHER  
1            6.0        656

# 2025 Walkable Locations

reduce data to locations that are <=2 miles from center of campus 

In [48]:
from geopy.distance import geodesic

# Define key campus locations (e.g., The Pit, Franklin Street)
unc_coordinates = (35.9106, -79.0472)
maximum_distance = 2  # in miles

# Calculate walking/biking distance for each property
def calculate_distance_to_location(row, location_coords):
    property_coords = (row['latitude'], row['longitude'])
    return geodesic(property_coords, location_coords).miles

# Remove rows with missing or invalid latitude/longitude
df_2025 = df_2025.dropna(subset=['latitude', 'longitude'])

# Calculate distance from campus for each property
df_2025['distance_from_campus'] = df_2025.apply(
    lambda row: calculate_distance_to_location(row, unc_coordinates), axis=1
)

# Filter for properties within 5 miles of campus
df_2025_nearby = df_2025[df_2025['distance_from_campus'] <= maximum_distance]

df_2025 = df_2025_nearby

# Display the filtered properties
print(df_2025[['streetAddress', 'city', 'state', 'price', 'bedrooms', 'bathrooms', 'distance_from_campus', 'daysOnZillow', 'homeStatus']])



                    streetAddress         city state   price  bedrooms  \
0              302 Orchard Ln #92     Carrboro    NC  764100       5.0   
1          232 McCauley St UNIT A  Chapel Hill    NC  829100       8.0   
4                   308 Ransom St  Chapel Hill    NC  701600       2.0   
11          1002 Willow Dr APT 17  Chapel Hill    NC  133100       2.0   
13              500 Umstead Dr #B  Chapel Hill    NC  205500       3.0   
...                           ...          ...   ...     ...       ...   
32007         100 Brandon Rd #323  Chapel Hill    NC    1684       1.0   
32010  140 W Franklin St UNIT 201  Chapel Hill    NC  859900       2.0   
32012     401 Umstead Dr UNIT 100  Chapel Hill    NC  290300       NaN   
32017     143 Oldham Estate Dr #4  Chapel Hill    NC  350000       NaN   
32031                46 Davie Cir  Chapel Hill    NC  679700       4.0   

       bathrooms  distance_from_campus  daysOnZillow homeStatus  
0            2.5              1.943333       

# Property Type Filter

Focus on:
- Condos, townhomes, single family homes
- Units with 2–3 bedrooms (for possible roommates)





### Posible properties
- 'apartment' 
- 'single_family' 
- 'townhouse' 
- 'multi_family' 
- 'condo'
- 'home_type_unknown' 
- 'lot'

In [49]:

home_types = ['single_family', 'townhouse', 'condo']

# Clean up the 'homeType' column
df_2025.loc[:, 'homeType'] = df_2025['homeType'].str.lower() # this just makes sure all home types are in lower case
df_2025_expected_home_type = df_2025[
    (df_2025_nearby['homeType'].isin(home_types)) &
    (df_2025_nearby['bedrooms'].between(2, 3))
]

df_2025 = df_2025_expected_home_type
# Display the filtered properties
print(df_2025[['streetAddress', 'city', 'homeType', 'price', 'bedrooms', 'bathrooms', 'distance_from_campus', 'daysOnZillow']])

                    streetAddress         city       homeType    price  \
11          1002 Willow Dr APT 17  Chapel Hill      townhouse   133100   
13              500 Umstead Dr #B  Chapel Hill          condo   205500   
17              375 Umstead Dr #A  Chapel Hill          condo   250000   
21          505 Coolidge St APT C  Chapel Hill      townhouse   597100   
27              371 Umstead Dr #B  Chapel Hill          condo   249100   
...                           ...          ...            ...      ...   
31967           130 E Longview St     Carrboro      townhouse   269600   
31973                514 North St  Chapel Hill  single_family  1234100   
31991           10 Mount Bolus Rd  Chapel Hill  single_family   762300   
32005           117 Barclay Rd #A  Chapel Hill  single_family   377400   
32010  140 W Franklin St UNIT 201  Chapel Hill          condo   859900   

       bedrooms  bathrooms  distance_from_campus  daysOnZillow  
11          2.0        2.0              1.7913

# Tax and Value analysis

In [50]:
def analyze_tax(tax_history):
    if not tax_history or len(tax_history) < 2:
        return {
            "average_annual_tax_increase_rate": None,
            "average_value_increase_rate": None,
        }
    # Convert timestamps to years
    for entry in tax_history:
        entry['year'] = pd.to_datetime(entry['time'], unit='ms').year

    # Calculate average annual tax increase rate
    tax_increase_rates = [entry['taxIncreaseRate'] for entry in tax_history]
    avg_tax_increase_rate = sum(tax_increase_rates) / len(tax_increase_rates)

    # Calculate average value increase rate
    value_increase_rates = [entry['valueIncreaseRate'] for entry in tax_history]
    avg_value_increase_rate = sum(value_increase_rates) / len(value_increase_rates)


    return {
        "average_annual_tax_increase_rate": avg_tax_increase_rate,
        "average_value_increase_rate": avg_value_increase_rate,
    }


In [51]:
max_tax_history_increase_rate = 0.02
max_value_increase_rate = 0.02

# Analyze tax on all properties
df_2025['taxAnalysis'] = df_2025['taxHistory'].apply(analyze_tax)

# Define a helper function to check the filters safely
def is_within_tax_range(tax_info):
    if not tax_info or not isinstance(tax_info, dict):
        return False
    avg_tax = tax_info.get('average_annual_tax_increase_rate')
    avg_val = tax_info.get('average_value_increase_rate')
    
    # Make sure both are numbers before comparing
    if isinstance(avg_tax, (int, float)) and isinstance(avg_val, (int, float)):
        return avg_tax <= max_tax_history_increase_rate and avg_val <= max_value_increase_rate
    return False

# Apply the filter
df_2025_expected_tax_range = df_2025[df_2025['taxAnalysis'].apply(is_within_tax_range)]

df_2025 = df_2025_expected_tax_range

# Display results
print(df_2025_expected_tax_range[['streetAddress', 'city', 'state', 'price', 'bedrooms', 'bathrooms', 'distance_from_campus', 'daysOnZillow', 'homeStatus', 'taxAnalysis']])


                    streetAddress         city state    price  bedrooms  \
17              375 Umstead Dr #A  Chapel Hill    NC   250000       2.0   
78              130 E Longview St     Carrboro    NC   269600       2.0   
198     513 W Barbee Chapel Rd #2  Chapel Hill    NC   814600       3.0   
218              3204 Environ Way  Chapel Hill    NC   680000       2.0   
323    213 E Franklin St UNIT 104  Chapel Hill    NC  1964600       3.0   
...                           ...          ...   ...      ...       ...   
31844   724 W Barbee Chapel Rd #4  Chapel Hill    NC   830200       3.0   
31848            2405 Environ Way  Chapel Hill    NC   233000       2.0   
31871  140 W Franklin St UNIT 407  Chapel Hill    NC   805000       2.0   
31967           130 E Longview St     Carrboro    NC   269600       2.0   
32010  140 W Franklin St UNIT 201  Chapel Hill    NC   859900       2.0   

       bathrooms  distance_from_campus  daysOnZillow homeStatus  \
17           1.5              1.

  df_2025['taxAnalysis'] = df_2025['taxHistory'].apply(analyze_tax)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2025['taxAnalysis'] = df_2025['taxHistory'].apply(analyze_tax)


# Time on Market / Turnover
- Look at average days on market for each property
- Higher turnover may signal student-friendly properties
- Low turnover could signal quiet, long-term-owner zones ideal for parents seeking stability

In [52]:
max_time_on_market = 60 # days

# Filter for properties that have been on the market for less than 1000 days
df_2025_high_turnover = df_2025[df_2025['daysOnZillow'] <= max_time_on_market]

df_2025 = df_2025_high_turnover

# Display the filtered properties
print(df_2025[['streetAddress', 'city', 'homeType', 'price', 'bedrooms', 'bathrooms', 'distance_from_campus', 'daysOnZillow']])


                    streetAddress         city       homeType   price  \
6666               118 Purefoy Rd  Chapel Hill  single_family  303800   
7030              203 Oak Tree Dr  Chapel Hill          condo  330000   
12359             603 Oak Tree Dr  Chapel Hill  single_family    1750   
18309                103 Inara Ct     Carrboro  single_family  869000   
21097             906 Oak Tree Dr  Chapel Hill          condo  299000   
22909            1206 Oak Tree Dr  Chapel Hill  single_family    1900   
26191            4416 Environ Way  Chapel Hill          condo  510000   
29383        601 Rosemary St #403  Chapel Hill          condo  719000   
30030  140 W Franklin St UNIT 406  Chapel Hill          condo  930000   
32010  140 W Franklin St UNIT 201  Chapel Hill          condo  859900   

       bedrooms  bathrooms  distance_from_campus  daysOnZillow  
6666        2.0        1.0              0.957789          40.0  
7030        3.0        2.0              1.582687          12.0  
1

# Filter out rentals
rentals and listing price are under the same value in zillow data. To filter out rentals, we will filter out properties with 'price' set to lower than 100.000, since this will signify its not rent and an actual listing price.

In [53]:
min_price = 250000

not_rentals = df_2025[
    (df_2025['price'] >= min_price)
]

df_2025 = not_rentals


# MAP


In [54]:
import folium
# Create a base map centered around Chapel Hill
map_center = [35.9106, -79.0472]
map_2025 = folium.Map(location=map_center, zoom_start=13)
# Add markers for each property
for _, row in df_2025.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=(
            f"Address: {row['streetAddress']}, {row['city']}, {row['state']}<br>"
            f"Price: ${row['price']}<br>"
            f"Bedrooms: {row['bedrooms']}<br>"
            f"Bathrooms: {row['bathrooms']}<br>"
            f"Distance from Campus: {row['distance_from_campus']:.2f} miles<br>"
            f"Days on Zillow: {row['daysOnZillow']}"
        ),
        icon=folium.Icon(color='blue')
    ).add_to(map_2025)
# Save the map to an HTML file
# Display the map
map_2025




# Final List

In [55]:
# Print Final List
print(f"Final List of Properties ({len(df_2025)}):")
print(df_2025[['streetAddress', 'city', 'state', 'price', 'bedrooms', 'bathrooms', 'distance_from_campus', 'daysOnZillow']])

Final List of Properties (8):
                    streetAddress         city state   price  bedrooms  \
6666               118 Purefoy Rd  Chapel Hill    NC  303800       2.0   
7030              203 Oak Tree Dr  Chapel Hill    NC  330000       3.0   
18309                103 Inara Ct     Carrboro    NC  869000       3.0   
21097             906 Oak Tree Dr  Chapel Hill    NC  299000       2.0   
26191            4416 Environ Way  Chapel Hill    NC  510000       2.0   
29383        601 Rosemary St #403  Chapel Hill    NC  719000       2.0   
30030  140 W Franklin St UNIT 406  Chapel Hill    NC  930000       2.0   
32010  140 W Franklin St UNIT 201  Chapel Hill    NC  859900       2.0   

       bathrooms  distance_from_campus  daysOnZillow  
6666         1.0              0.957789          40.0  
7030         2.0              1.582687          12.0  
18309        3.0              1.839623          17.0  
21097        2.0              1.556081           2.0  
26191        2.0            

# Appreciation Rate

- Compare what properties went up 20% between 2023 and 2025 sets and filter

In [56]:
# Clean up price and zpid columns
df_2023['price'] = pd.to_numeric(df_2023['price'], errors='coerce')
df_2025['price'] = pd.to_numeric(df_2025['price'], errors='coerce')
df_2023['zpid'] = pd.to_numeric(df_2023['zpid'], errors='coerce')
df_2025['zpid'] = pd.to_numeric(df_2025['zpid'], errors='coerce')

# Merge 2023 and 2025 data on 'zpid'
merged_df = pd.merge(
    df_2025[['zpid', 'price', 'streetAddress']],  # keep zpid, price, streetAddress from 2025
    df_2023[['zpid', 'price']],  # keep zpid and price from 2023
    on='zpid',
    suffixes=('_2025', '_2023')
)

# Calculate percentage price increase
merged_df['price_increase_percentage'] = ((merged_df['price_2025'] - merged_df['price_2023']) / merged_df['price_2023']) * 100

# Filter properties with price appreciation ≥ 20%
filtered_properties = merged_df[merged_df['price_increase_percentage'] >= 20]

# Show the result
print(filtered_properties[['zpid', 'streetAddress', 'price_2023', 'price_2025', 'price_increase_percentage']])

         zpid    streetAddress  price_2023  price_2025  \
1  60072594.0  203 Oak Tree Dr      230000      330000   

   price_increase_percentage  
1                  43.478261  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2025['price'] = pd.to_numeric(df_2025['price'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2025['zpid'] = pd.to_numeric(df_2025['zpid'], errors='coerce')
