In [1]:
# Install Necessary Packages
!pip install dotenv pandas requests psycopg2-binary sqlalchemy

Defaulting to user installation because normal site-packages is not writeable


In [24]:

# Import the libraries needed by this notebook
import psycopg2
import requests
import pandas as pd
import psycopg2
from sqlalchemy import create_engine, Table, Column, Integer, String, Float, DateTime, MetaData, ForeignKey
from sqlalchemy.dialects.postgresql import insert
import os
import dotenv
from datetime import datetime

In [25]:
# Extraction Phase: 
# Extract sales and rental data from the Rentcast API

# Define API key and URLs
# The key is read from the RENTCAST_API_KEY environment variable so that the
# credential never lives in the notebook or its version history. Set it before
# starting the kernel (e.g. `export RENTCAST_API_KEY=...`).
# NOTE(review): a key was previously hardcoded in this cell; it should be
# treated as leaked and revoked/rotated with the API provider.
# An unset variable yields an empty key so this cell still runs; extract_data
# raises on any non-200 response, which is where a rejected key would surface.
API_KEY = os.environ.get("RENTCAST_API_KEY", "")
HEADERS = {"X-Api-Key": API_KEY, "accept": "application/json"}

# Active Austin, TX listings, capped at 500 records per request.
SALE_URL = "https://api.rentcast.io/v1/listings/sale?city=Austin&state=TX&status=Active&limit=500"
RENTAL_URL = "https://api.rentcast.io/v1/listings/rental/long-term?city=Austin&state=TX&status=Active&limit=500"

# Extraction function
def extract_data(url, category, timeout=30):
    """Fetch one listings endpoint and return its payload as a DataFrame.

    Args:
        url: Full request URL, including the query string.
        category: Label stored in the ``listing_category`` column of every
            returned row (the notebook uses "sale" and "rental").
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        A DataFrame built from the JSON response body, with an added
        ``listing_category`` column.

    Raises:
        Exception: If the response status code is not 200. The message
            carries the status code and the response text.
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)

    # Fail loudly on anything but success so bad data never reaches the
    # transformation step.
    if response.status_code != 200:
        raise Exception(f"API request failed: {response.status_code} - {response.text}")

    df = pd.DataFrame(response.json())
    df["listing_category"] = category  # add category (sale or rental)
    print(f"Extracted {len(df)} records for {category}.")
    return df

# Pull each listing category from the API: sales first, then rentals
sales_df = extract_data(url=SALE_URL, category="sale")
rentals_df = extract_data(url=RENTAL_URL, category="rental")

# Stack both frames into a single dataset with a fresh 0..n-1 index
combined_df = pd.concat(
    (sales_df, rentals_df),
    ignore_index=True,
)
print(f"Total records extracted: {combined_df.shape[0]}")

# Persist the raw extract; the file is rewritten in full on every run,
# so re-running this cell does not append duplicate rows
combined_df.to_csv(
    "austin_listings.csv",
    index=False,
)



Extracted 500 records for sale.
Extracted 500 records for rental.
Total records extracted: 1000


In [26]:

# Transformation function:

def _dict_or_default(value, default):
    """Return ``value`` when it is a dict, otherwise a fresh copy of ``default``."""
    return value if isinstance(value, dict) else dict(default)


def _nested_values(df, column, key, default):
    """List ``key`` from each dict in ``df[column]``; ``default`` where unavailable."""
    if column not in df.columns:
        return [default] * len(df)
    return [
        cell.get(key, default) if isinstance(cell, dict) else default
        for cell in df[column]
    ]


def transform_data(sales_df, rentals_df):
    """Combine sale and rental listings, clean them, and flatten their history.

    Steps performed:
      1. Concatenate both frames (index is reset).
      2. Fill missing scalar fields with defaults; replace missing nested
         objects (hoa, listingAgent, listingOffice, builder, history) with
         default dicts.
      3. Extract nested HOA / agent / office fields into flat columns. For
         rows whose ``listingType`` is 'New Construction', the builder's
         name (when present) replaces ``agent_name``.
      4. Parse the date columns, coercing unparseable values to NaT.
      5. Flatten each listing's ``history`` mapping into one row per event.

    Args:
        sales_df: DataFrame of sale listings.
        rentals_df: DataFrame of rental listings.

    Returns:
        Tuple ``(combined_df, history_df)``. ``history_df`` has one row per
        history event with added ``listing_id`` and ``history_date`` columns;
        its ``listedDate`` and ``removedDate`` columns always exist and are
        datetime-typed, even when there are no events.

    Raises:
        KeyError: If a ``history`` column is present but ``id`` is not.
    """
    # Combine dataframes (the original referenced an undefined `rental_df`).
    combined_df = pd.concat([sales_df, rentals_df], ignore_index=True)

    # Handle missing scalar values (based on schema). `removedDate` is left
    # out on purpose: None is not a usable fill value, and missing dates are
    # turned into NaT by the date conversion below.
    scalar_defaults = {
        'addressLine2': '', 'county': '', 'lotSize': 0, 'yearBuilt': 0,
        'listingType': 'Standard', 'daysOnMarket': 0, 'mlsName': '', 'mlsNumber': '',
    }
    present_defaults = {
        column: default
        for column, default in scalar_defaults.items()
        if column in combined_df.columns
    }
    if present_defaults:
        combined_df = combined_df.fillna(present_defaults)

    # Handle missing nested objects. fillna() cannot do this: a dict given as
    # a column's fill value is read as an index -> value mapping, so nothing
    # gets filled. Each row receives its own dict copy to avoid shared state.
    nested_defaults = {
        'hoa': {'fee': 0}, 'listingAgent': {}, 'listingOffice': {},
        'builder': {}, 'history': {},
    }
    for column, default in nested_defaults.items():
        if column in combined_df.columns:
            combined_df[column] = pd.Series(
                [_dict_or_default(cell, default) for cell in combined_df[column]],
                index=combined_df.index,
                dtype=object,
            )

    # Extract nested fields
    combined_df['hoa_fee'] = _nested_values(combined_df, 'hoa', 'fee', 0)
    flat_fields = (
        ('agent_name', 'listingAgent', 'name'),
        ('agent_phone', 'listingAgent', 'phone'),
        ('agent_email', 'listingAgent', 'email'),
        ('agent_website', 'listingAgent', 'website'),
        ('office_name', 'listingOffice', 'name'),
        ('office_phone', 'listingOffice', 'phone'),
    )
    for target, source, key in flat_fields:
        combined_df[target] = _nested_values(combined_df, source, key, '')

    # Handle builder for 'New Construction': the builder's name, when there
    # is one, takes the place of the agent's name.
    row_count = len(combined_df)
    listing_types = (
        combined_df['listingType'] if 'listingType' in combined_df.columns
        else [None] * row_count
    )
    builders = (
        combined_df['builder'] if 'builder' in combined_df.columns
        else [None] * row_count
    )
    combined_df['agent_name'] = [
        builder.get('name', agent)
        if listing_type == 'New Construction' and isinstance(builder, dict)
        else agent
        for listing_type, builder, agent in zip(listing_types, builders, combined_df['agent_name'])
    ]
    # Extend similarly for other builder fields if present

    # Convert dates
    date_cols = ['listedDate', 'removedDate', 'createdDate', 'lastSeenDate']
    for col in date_cols:
        if col in combined_df.columns:
            combined_df[col] = pd.to_datetime(combined_df[col], errors='coerce')

    # Flatten history object. Events are copied rather than annotated in
    # place so the dicts held by the input frames stay untouched.
    history_records = []
    if 'history' in combined_df.columns:
        for listing_id, history in zip(combined_df['id'], combined_df['history']):
            for history_date, event in history.items():
                if isinstance(event, dict):
                    history_records.append(
                        {**event, 'listing_id': listing_id, 'history_date': history_date}
                    )
    history_df = pd.DataFrame(history_records)

    # Guarantee both date columns exist (all NaT when no event carried them)
    # so callers can rely on them even for an empty history.
    for col in ('listedDate', 'removedDate'):
        if col in history_df.columns:
            raw_dates = history_df[col]
        else:
            raw_dates = pd.Series(index=history_df.index, dtype='object')
        history_df[col] = pd.to_datetime(raw_dates, errors='coerce')

    return combined_df, history_df

In [28]:
# Create snowflake schema in PostgreSQL (using SQLAlchemy)
def create_snowflake_schema(engine):
    """Declare the warehouse tables and create them through ``engine``.

    Twelve tables are registered on a fresh ``MetaData`` object: ten
    dimension tables (location, property, date and agent hierarchies), the
    ``fact_listings`` fact table, and ``listing_history``, which references
    the fact table. ``metadata.create_all(engine)`` then creates them, and a
    confirmation line is printed.

    Args:
        engine: SQLAlchemy engine (or connectable) passed to ``create_all``.

    Returns:
        None.
    """
    schema = MetaData()

    def surrogate_key(column_name):
        # Auto-incrementing integer primary key.
        return Column(column_name, Integer, primary_key=True, autoincrement=True)

    def reference(column_name, target):
        # Integer foreign key pointing at "table.column".
        return Column(column_name, Integer, ForeignKey(target))

    # --- Location hierarchy: state -> city -> zip -> address -> location ---
    Table(
        'dim_state', schema,
        surrogate_key('state_id'),
        Column('state', String, unique=True),
    )
    Table(
        'dim_city', schema,
        surrogate_key('city_id'),
        Column('city', String),
        Column('county', String),
        reference('state_id', 'dim_state.state_id'),
    )
    Table(
        'dim_zip', schema,
        surrogate_key('zip_id'),
        Column('zip_code', String, unique=True),
        reference('city_id', 'dim_city.city_id'),
    )
    Table(
        'dim_address', schema,
        surrogate_key('address_id'),
        Column('address_line1', String),
        Column('address_line2', String),
        Column('formatted_address', String),
        reference('zip_id', 'dim_zip.zip_id'),
    )
    Table(
        'dim_location', schema,
        surrogate_key('location_id'),
        Column('latitude', Float),
        Column('longitude', Float),
        reference('address_id', 'dim_address.address_id'),
    )

    # --- Property hierarchy: hoa -> property ---
    Table(
        'dim_hoa', schema,
        surrogate_key('hoa_id'),
        Column('fee', Float),
    )
    Table(
        'dim_property', schema,
        surrogate_key('property_id'),
        Column('property_type', String),
        Column('bedrooms', Float),
        Column('bathrooms', Float),
        Column('square_footage', Float),
        Column('lot_size', Float),
        Column('year_built', Integer),
        reference('hoa_id', 'dim_hoa.hoa_id'),
    )

    # --- Date dimension ---
    Table(
        'dim_date', schema,
        surrogate_key('date_id'),
        Column('full_date', DateTime),
        Column('year', Integer),
        Column('month', Integer),
        Column('day', Integer),
    )

    # --- Agent hierarchy: office -> agent ---
    Table(
        'dim_office', schema,
        surrogate_key('office_id'),
        Column('name', String),
        Column('phone', String),
    )
    Table(
        'dim_agent', schema,
        surrogate_key('agent_id'),
        Column('name', String),
        Column('phone', String),
        Column('email', String),
        Column('website', String),
        Column('type', String),  # 'agent' or 'builder'
        reference('office_id', 'dim_office.office_id'),
    )

    # --- Fact table: one row per listing, keyed by the listing's string id ---
    Table(
        'fact_listings', schema,
        Column('listing_id', String, primary_key=True),
        Column('listing_category', String),  # 'sale' or 'rental'
        Column('price', Float),
        Column('status', String),
        Column('listing_type', String),
        Column('days_on_market', Integer),
        Column('mls_name', String),
        Column('mls_number', String),
        Column('created_date', DateTime),
        Column('last_seen_date', DateTime),
        reference('location_id', 'dim_location.location_id'),
        reference('property_id', 'dim_property.property_id'),
        reference('listed_date_id', 'dim_date.date_id'),
        reference('agent_id', 'dim_agent.agent_id'),
    )

    # --- History table (snowflaked from fact) ---
    Table(
        'listing_history', schema,
        surrogate_key('history_id'),
        Column('listing_id', String, ForeignKey('fact_listings.listing_id')),
        Column('history_date', String),
        Column('event', String),
        Column('price', Float),
        Column('listing_type', String),
        Column('days_on_market', Integer),
        Column('listed_date', DateTime),
        Column('removed_date', DateTime),
    )

    schema.create_all(engine)
    print("Snowflake schema created successfully.")
                           