In [1]:
from concurrent.futures import ProcessPoolExecutor
import logging
import pandas as pd
from pyspark.sql import SparkSession
from delta import *

In [2]:
import sys
sys.path.insert(1, '/opt/notebooks/input')
import real_estate_utils


In [3]:
# Parameters
# NOTE: city, radius and date are all reassigned in the next cell, so when the
# cells run in order the values below (including radius = 4) are superseded.
city = 'bern'
radius = 4
date = '2022-07-09'

In [4]:
# Parameters
# These assignments run after the previous cell, so they decide the values used
# by the rest of the notebook (radius = 1 matches the "1KM" in the log output).
# NOTE(review): two consecutive "# Parameters" cells look like a papermill
# defaults cell followed by an injected-parameters cell — confirm.
date = "2022-07-09"
city = "bern"
radius = 1


In [5]:
# Storage locations
# Directory of the bronze Delta table that the final cell of this notebook writes to.
BRONZE_TABLE_PATH = "/opt/data_lake/bronze/house_prices_raw"

In [6]:
# Send INFO-and-above records to a stream handler, formatted as
# "<timestamp> [<LEVEL>] <message>".
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Logger shared by the remaining cells of the notebook.
logger = logging.getLogger(__name__)

In [7]:
# Announce which city and search radius this run covers.
logger.info(f"Getting houses from {city} ({radius}KM around) ...")

2022-07-10 22:04:58,362 [INFO] Getting houses from bern (1KM around) ...


In [None]:
# List the property ids
# The ids come from the project-local real_estate_utils module, called with the
# city and radius parameters; presumably they identify listings within `radius`
# km of `city` — confirm against real_estate_utils.get_properties_ids.
properties_ids = real_estate_utils.get_properties_ids(city, radius)

In [None]:
logger.info(f"Searching {len(properties_ids)} houses metadata ...")

# Build a Pandas DataFrame with the responses.
#
# Each property id is parsed in a separate worker process
# (max_workers=None lets the executor pick the worker count). The per-property
# results are then concatenated, tagged with the run's date and city, and
# stripped of columns that are entirely empty.
#
# Any failure is logged with its traceback and re-raised. Swallowing it would
# leave `results_total` undefined (or stale from an earlier kernel run), and the
# later cells would then fail with a NameError or overwrite the Delta partition
# with outdated data.
try:
    with ProcessPoolExecutor(max_workers=None) as executor:
        results = list(executor.map(
            real_estate_utils.parse_property_metadata,
            properties_ids
        ))

    # pd.concat raises an uninformative ValueError on an empty list, so report
    # the actual situation instead.
    if not results:
        raise ValueError(
            f"No property metadata to load for {city} ({radius}KM around)"
        )

    # assumes parse_property_metadata returns a pandas object per property — TODO confirm
    results_total = pd.concat(results)
    results_total['date'] = date
    results_total['city'] = city
    results_total = results_total.dropna(axis='columns', how='all').reset_index(drop=True)

except Exception:
    logger.exception("Failed to collect the houses metadata")
    raise

In [None]:
# Print a summary of the assembled pandas DataFrame (columns, dtypes, non-null counts).
results_total.info()

In [None]:
# Spark session for the ETL job, with the Delta Lake SQL extension and the
# Delta catalog registered on the builder before the session is created.
builder = SparkSession.builder.appName("real-estate-etl")
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# configure_spark_with_delta_pip wraps the builder before the session is obtained.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Create a Spark DataFrame
# This cell rebinds `results_total` from a pandas DataFrame to a Spark DataFrame.
# The isinstance guard keeps it idempotent: on a re-run the name already holds
# the Spark DataFrame, which must not be passed to createDataFrame again.
if isinstance(results_total, pd.DataFrame):
    results_total = spark.createDataFrame(results_total)
results_total.printSchema()

In [None]:
# Persist the run into the bronze Delta table.
# The table is partitioned by date and city. The write is an overwrite limited
# by replaceWhere to this run's date/city, and mergeSchema is enabled for the write.
(
    results_total.write
    .format('delta')
    .mode('overwrite')
    .partitionBy('date', 'city')
    .options(
        mergeSchema='true',
        replaceWhere=f"date=='{date}' and city=='{city}'",
    )
    .save(BRONZE_TABLE_PATH)
)

In [None]:
# Vacuum table (currently disabled; uncomment the two lines below to run it)
# delta_table = DeltaTable.forPath(spark, BRONZE_TABLE_PATH)
# delta_table.vacuum()
