In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from geocoding_utils import Geocoder 
import geopandas as  gpd

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Two-letter state code.
# NOTE: the scraping loop further down (`for state in states`) rebinds this name, so after
# that cell has run it holds the last state scraped rather than the value set here.
state = "ia"

In [18]:
states = ["ia", "id", "mn", "wi"]

# Define bedroom and bathroom ranges
bedroom_range = [1, 2, 3, 4]
bathroom_range = [1, 2, 3]

# Column-oriented accumulator for listings across all states, URL variations and pages;
# it is turned into a DataFrame once, after scraping has finished.
data = {
    "Property Link": [],
    "Address": [],
    "Price": [],
    "Beds and Baths": []
}


def _first_text(element, class_names, default):
    """Return the text of the first child of `element` located by one of `class_names`.

    The class names are tried in order. `default` is returned when none of them can be
    located (or its text cannot be read). Only `Exception` is caught, so Ctrl-C and
    interpreter shutdown are not swallowed.
    """
    for class_name in class_names:
        try:
            return element.find_element(By.CLASS_NAME, class_name).text
        except Exception:
            continue
    return default


# Initialize the driver
driver = webdriver.Chrome()

# try/finally guarantees the browser is closed even when scraping fails or is interrupted.
try:
    # NOTE: `state` is a module-level name, so it keeps the last state code after the loop.
    for state in states:
        # Loop through each combination of bedroom and bathroom counts
        for bedrooms in bedroom_range:
            for bathrooms in bathroom_range:
                # Construct the URL for each combination
                url = f"https://www.apartments.com/houses/{state}/{bedrooms}-bedrooms-{bathrooms}-bathrooms/"
                driver.get(url)

                print(f"Scraping URL: {url}")

                # Wait for the page to load content; skip this combination if nothing shows up
                try:
                    WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "mortar-wrapper"))
                    )
                except Exception:
                    print(f"No listings found for {bedrooms} bedrooms and {bathrooms} bathrooms.")
                    continue

                page = 1

                # Pagination loop
                while True:
                    print(f"Scraping page {page} for {bedrooms} beds and {bathrooms} baths...")

                    # Find all apartment listings on the current page
                    listings = driver.find_elements(By.CLASS_NAME, "mortar-wrapper")

                    for listing in listings:
                        try:
                            # The property link carries both the URL and, in its aria-label, the address
                            link_element = listing.find_element(By.CSS_SELECTOR, ".property-link")
                            property_link = link_element.get_attribute("href")
                            property_address = link_element.get_attribute("aria-label")

                            # Price and beds/baths each have a primary and a fallback class name
                            price = _first_text(
                                listing, ("property-pricing", "price-range"), "Price not available"
                            )
                            beds_baths = _first_text(
                                listing, ("property-beds", "bed-range"), "Beds and baths not available"
                            )

                            data["Property Link"].append(property_link)
                            data["Address"].append(property_address)
                            data["Price"].append(price)
                            data["Beds and Baths"].append(beds_baths)

                            print(f"Scraped: {property_address}")

                        except Exception:
                            # Best effort: a wrapper without a property link is skipped
                            continue

                    # Check for the "Next" button to go to the next page
                    try:
                        next_button = WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, ".next"))
                        )

                        # Check if the button is disabled (meaning it's the last page)
                        if "disabled" in next_button.get_attribute("class"):
                            print("Reached the last page.")
                            break

                        # Click the "Next" button
                        next_button.click()
                        page += 1

                        # Wait for the new page content to load
                        WebDriverWait(driver, 15).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "mortar-wrapper"))
                        )
                        time.sleep(2)  # Small delay to let content fully load

                    except Exception:
                        print("No more pages to scrape or error navigating to next page:")
                        break
finally:
    driver.quit()

# Convert data dictionary to DataFrame
df = pd.DataFrame(data)
print(df)



Scraping URL: https://www.apartments.com/houses/ia/1-bedrooms-1-bathrooms/
Scraping page 1 for 1 beds and 1 baths...
Scraped: 440 Florence St, Waterloo, IA
Scraped: 4221 Douglas Ave, Des Moines, IA
Scraped: 204 E Harrison St, Exira, IA
Scraped: 205 W Benton St, Iowa City, IA
Scraped: 328 W Columbia Ave, Davenport, IA
Scraped: 2232 E 11th St, Davenport, IA
Scraped: 4706 El Rancho Dr, Davenport, IA
Scraped: 623 N 7th Ave, Washington, IA 52353
Scraped: 1321 S Nevada Ave, Davenport, IA 52802
Scraped: 213 N Main St, Roland, IA 50236
Scraped: 306 S Lucas St, Iowa City, IA 52240
Scraped: 3324 Story St, Ames, IA 50014
Scraped: 1308 Emma Ave, Des Moines, IA 50315
Scraped: 3322 Story St, Ames, IA 50014
Scraped: 2021 10th Ave N, Fort Dodge, IA 50501
Scraped: 1745 John F Kennedy Rd, Dubuque, IA 52002
Scraped: 624 S Gilbert St, Iowa City, IA 52240
Scraped: 5900 Osage Rd, Waterloo, IA 50703
Scraped: 1822 Belle Ave, Davenport, IA 52803
Scraped: 1825 Caleb Ct, North Liberty, IA 52317
Scraped: 1029 Ack

In [19]:
# Row/column count before de-duplication
print(f"{df.shape}")
# Drop rows that are identical in every column, keeping the first occurrence
df = df.drop_duplicates(keep="first")
# Row/column count after de-duplication
print(f"{df.shape}")

(5312, 4)
(3081, 4)


# Extract Beds and Baths

In [20]:
# Keep only listings whose price text contains no "-"; rows with a missing price are kept.
# .copy() makes df_filtered an independent frame, so the column assignments below do not
# write into a slice of `df`.
df_filtered = df[~df["Price"].str.contains('-', na=False)].copy()

# Pull "<n> Bed(s)", "<n> Bath(s)" and "<n> sq ft" out of the combined text; parts that are
# absent come back as NaN.
df_filtered[['Beds', 'Baths', 'Sqft']] = df_filtered['Beds and Baths'].str.extract(r'(?:(\d+\.?\d*) Beds?)?(?:, (\d+\.?\d*) Baths?)?(?:, ([\d,]+) sq ft)?')

# Convert columns to numeric types; values that cannot be parsed become NaN.
df_filtered['Beds'] = pd.to_numeric(df_filtered['Beds'], errors='coerce')
df_filtered['Baths'] = pd.to_numeric(df_filtered['Baths'], errors='coerce')
# Coerce Sqft the same way as Beds/Baths: a single unparseable value then yields one NaN
# instead of leaving the whole column as strings. astype(float) keeps the dtype float even
# when every value is a whole number.
df_filtered['Sqft'] = pd.to_numeric(
    df_filtered['Sqft'].str.replace(',', '', regex=False), errors='coerce'
).astype(float)

In [21]:
# Build the address table for the batch geocoder: Unique ID, Street address, City, State, ZIP.
# The pattern expects "<street>, <city>, <2-letter state>" with an optional 5-digit ZIP at the end.
address_parts = df_filtered["Address"].str.extract(
    r'(.+?),\s*(.+?),\s*(\w{2})(?:\s*(\d{5}))?$'
)

# Sequential 1-based record ID
df_filtered["Unique ID"] = range(1, len(df_filtered) + 1)
df_filtered[["Street address", "City", "State", "ZIP"]] = address_parts

df_api = df_filtered.loc[:, ["Unique ID", "Street address", "City", "State", "ZIP"]]

In [22]:
import requests
import pandas as pd
from io import StringIO
import math

def batch_geocode_census(df, batch_size=10000):
    """
    Geocodes addresses using the Census Bureau's batch geocoding API.

    The frame is split into consecutive chunks of at most `batch_size` rows. Each chunk is
    written as a header-less CSV and POSTed to the `addressbatch` endpoint. A batch that
    comes back with a non-200 status is reported on stdout and skipped.

    Parameters:
    - df: DataFrame with columns ["Unique ID", "Street address", "City", "State", "ZIP"]
    - batch_size: Maximum number of records per batch (default: 10,000)

    Returns:
    - DataFrame with one row per record returned by the API and the columns
      "Unique ID", "input_address", "matched_address", "match_type", "output_address",
      "coordinates", "tigerLine", "side", "longitude", "latitude". The input columns are
      not carried over; join back on "Unique ID". Rows without usable coordinates have
      NaN longitude/latitude. The frame is empty (same columns) when `df` is empty or
      every batch failed.
    """
    # Census Bureau API endpoint
    url = "https://geocoding.geo.census.gov/geocoder/locations/addressbatch"
    params = {
        "returntype": "locations",  # locations for geocoding
        "benchmark": "Public_AR_Current"  # Use the current benchmark
    }

    # Names applied positionally to the fields of the API's CSV response.
    # NOTE(review): going by the Census batch output layout, the third field looks like the
    # match indicator rather than an address - confirm before relying on the names.
    response_columns = [
        "Unique ID", "input_address", "matched_address", "match_type",
        "output_address", "coordinates", "tigerLine", "side"
    ]
    result_columns = response_columns + ["longitude", "latitude"]

    results = []  # To collect results from each batch
    total_records = len(df)
    num_batches = math.ceil(total_records / batch_size)

    print(f"Total records: {total_records}")
    print(f"Processing in {num_batches} batches of up to {batch_size} records each.")

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, total_records)
        batch_df = df.iloc[start_idx:end_idx]

        # Format batch without headers to match API's expected input format
        csv_buffer = StringIO()
        batch_df.to_csv(csv_buffer, index=False, header=False)
        csv_buffer.seek(0)

        # Submit the batch to the API
        response = requests.post(url, files={'addressFile': ('batch.csv', csv_buffer, 'text/csv')}, data=params)

        if response.status_code != 200:
            print(f"Error in batch {i+1}/{num_batches}: {response.status_code} - {response.text}")
            continue

        # Parse response text as a DataFrame
        batch_results = pd.read_csv(StringIO(response.text), header=None, names=response_columns)

        # "coordinates" is "<longitude>,<latitude>" for located records and empty otherwise.
        # Going through str.extract + to_numeric keeps this working when no record in the
        # batch was located (all-NaN column) and turns anything malformed into NaN.
        coords = batch_results["coordinates"].astype(str).str.extract(
            r"^\s*([^,\s]+)\s*,\s*([^,\s]+)\s*$"
        )
        batch_results["longitude"] = pd.to_numeric(coords[0], errors="coerce")
        batch_results["latitude"] = pd.to_numeric(coords[1], errors="coerce")

        results.append(batch_results)
        print(f"Batch {i+1}/{num_batches} - Sent {len(batch_df)} records, Received {len(batch_results)} records.")

    # pd.concat refuses an empty list, so return an empty frame with the usual columns
    # when there was nothing to send or every batch failed.
    if not results:
        return pd.DataFrame(columns=result_columns)

    # Combine all batch results into a single DataFrame
    return pd.concat(results, ignore_index=True)

In [23]:
# Send the address table (df_api) through the Census batch geocoder; the result is keyed
# by "Unique ID" and is merged back onto the listings in a later cell.
geocoded_df = batch_geocode_census(df_api)

Total records: 3039
Processing in 1 batches of up to 10000 records each.
Batch 1/1 - Sent 3039 records, Received 3039 records.


In [24]:
# Attach the geocoder output to each listing by "Unique ID", keep only the reporting
# columns, and discard listings for which no latitude/longitude came back.
df_final = (
    df_filtered
    .merge(geocoded_df, how="left", on="Unique ID")
    .loc[:, ["Property Link", "Address", "output_address", "latitude", "longitude", "Beds", "Baths", "Sqft", "Price"]]
    .dropna(subset=["latitude", "longitude"])
)


In [25]:
# Hand the located listings to the project-local Geocoder, telling it which columns hold
# the coordinates.
geocoder = Geocoder(
    df_final,
    latitude_col='latitude',
    longitude_col='longitude'
)

# df_final covers every scraped state, so report the whole list. `state` would only show
# whichever value was assigned to it last.
print(f"Geocoding {', '.join(states)}")

# NOTE(review): judging by the argument names this joins census block group, CBSA and state
# areas onto the points - confirm against geocoding_utils.
# The paths are absolute and machine-specific.
df_geocoded = geocoder.geocode_all(
    demographic_areas_path=r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\census_block_group_source_nationwide\v107\blkgrp.gdb",
    cbsa_source_path=r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\cbsa_source\tl_2020_us_cbsa.shp",
    state_source_path=r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\state_source\States_shapefile.shp"
)


Geocoding wi


In [28]:
# Strip "$" and "," from the price text, then parse it as a number; anything that is still
# not numeric becomes NaN. The pattern is a raw string so that "\$" is not treated as an
# (invalid) string escape sequence; the regex itself is unchanged.
df_geocoded['Price'] = pd.to_numeric(
    df_geocoded['Price'].replace(r'[\$,]', '', regex=True), errors='coerce'
)

In [29]:
# Persist the enriched listings as CSV. `index` is left at its default, so the DataFrame
# index is written as the first column.
# NOTE(review): absolute, machine-specific output path.
df_geocoded.to_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\aptscom_scraped_1029.csv")

In [1]:
# Scratch cell: prints a fixed string and does not touch any of the data above.
print("Test")

Test
