In [1]:
from concurrent.futures import ProcessPoolExecutor
import logging
import pandas as pd
from pyspark.sql import SparkSession
from delta import *

In [2]:
import sys
sys.path.insert(1, '/opt/notebooks/input')
import real_estate_utils


In [8]:
# Parameters
city = 'bern'
radius = 4
date = '2022-07-09'

In [4]:
# Paths
BRONZE_TABLE_PATH = '/opt/data_lake/bronze/house_prices_raw'

In [9]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

In [10]:
logger.info(f"Getting houses from {city} ({radius}KM around) ...")

2022-07-09 23:20:04,586 [INFO] Getting houses from locarno (1KM around) ...


In [11]:
# List the property ids
properties_ids = real_estate_utils.get_properties_ids(city, radius)

In [12]:
logger.info(f"Searching {len(properties_ids)} houses metadata ...")

# Build a Pandas DataFrame with the responses
with ProcessPoolExecutor(max_workers = None) as executor:
    try:
        results = list(executor.map(
            real_estate_utils.parse_property_metadata, 
            properties_ids
        ))

        results_total = pd.concat(results)
        results_total['date'] = date
        results_total['city'] = city
        results_total = results_total.dropna(axis='columns', how='all').reset_index(drop=True)

    except Exception as e:
        logger.error(e)
        pass

2022-07-09 23:20:17,055 [INFO] Searching 48 houses metadata ...
2022-07-09 23:20:18,801 [INFO] 7213986 ready.
2022-07-09 23:20:18,806 [INFO] 7203448 ready.
2022-07-09 23:20:18,809 [INFO] 7204253 ready.
2022-07-09 23:20:18,805 [INFO] 7203447 ready.
2022-07-09 23:20:19,610 [INFO] 7145725 ready.
2022-07-09 23:20:19,610 [INFO] 7165658 ready.
2022-07-09 23:20:19,629 [INFO] 7131431 ready.
2022-07-09 23:20:19,659 [INFO] 7187137 ready.
2022-07-09 23:20:20,772 [INFO] 7187135 ready.
2022-07-09 23:20:20,791 [INFO] 7163974 ready.
2022-07-09 23:20:20,794 [INFO] 7112827 ready.
2022-07-09 23:20:20,795 [INFO] 7110124 ready.
2022-07-09 23:20:21,739 [INFO] 6999908 ready.
2022-07-09 23:20:21,741 [INFO] 7173867 ready.
2022-07-09 23:20:21,738 [INFO] 7093642 ready.
2022-07-09 23:20:21,740 [INFO] 7149544 ready.
2022-07-09 23:20:22,639 [INFO] 7162928 ready.
2022-07-09 23:20:22,639 [INFO] 6667520 ready.
2022-07-09 23:20:22,696 [INFO] 7154723 ready.
2022-07-09 23:20:22,750 [INFO] 7082624 ready.
2022-07-09 23:20

In [13]:
results_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 25 columns):
 #   Column                                            Non-Null Count  Dtype 
---  ------                                            --------------  ----- 
 0   property_id                                       48 non-null     object
 1   attributes_size_volume                            7 non-null      object
 2   attributes_size_number_of_floors                  21 non-null     object
 3   attributes_inside_animal_allowed                  15 non-null     object
 4   attributes_inside_bathrooms                       5 non-null      object
 5   attributes_inside_cellar                          4 non-null      object
 6   attributes_technology_dishwasher                  5 non-null      object
 7   attributes_technology_cable_tv                    12 non-null     object
 8   attributes_outside_balcony                        27 non-null     object
 9   attributes_outside_parking        

In [14]:
# Create Spark Session
builder = (
    SparkSession
    .builder
    .appName("real-estate-etl")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-08776f7e-413a-4373-9f69-bcfc12bc2143;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.0.0 in central
	found org.antlr#antlr4;4.7 in central
	found org.antlr#antlr4-runtime;4.7 in central
	found org.antlr#antlr-runtime;3.5.2 in central
	found org.antlr#ST4;4.0.8 in central
	found org.abego.treelayout#org.abego.treelayout.core;1.0.3 in central
	found org.glassfish#javax.json;1.0.4 in central
	found com.ibm.icu#icu4j;58.2 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-core_2.12/1.0.0/delta-core_2.12-1.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-core_2.12;1.0.0!delta-core_2.12.jar (2021ms)
downloading https://repo1.maven.org/maven2/org/antlr/antlr4/4.7/antlr4-4.7.jar ...
	[SUCCESSFUL ] org.antlr#antlr4;4.7!antlr4.jar (511ms)
downloading https

In [16]:
# Create a Spark DataFrame
results_total=spark.createDataFrame(results_total) 
results_total.printSchema()

TypeError: data is already a DataFrame

In [17]:
# Store it on bronze Delta Table
(
    results_total
    .write
    .format('delta')
    .partitionBy(['date', 'city'])
    .mode('overwrite')
    .option('mergeSchema', 'true')
    .option('replaceWhere', f"date=='{date}' and city=='{city}'")
    .save(BRONZE_TABLE_PATH)
)

                                                                                

In [18]:
# # Vacuum table
# delta_table = DeltaTable.forPath(spark, BRONZE_TABLE_PATH)
# delta_table.vacuum()
