In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, when 

In [2]:
# set a Spark session
spark = SparkSession.builder \
    .appName("DataFormatting") \
    .master("local[*]") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/23 11:55:13 WARN Utils: Your hostname, MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.134 instead (on interface en0)
25/06/23 11:55:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/23 11:55:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/23 11:55:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# set the paths for the input and output data
landing_zone = ("landing_zone")
formatted_zone = ("formatted_zone")

In [4]:
# load all files from each folder of the landing zone
df_idealista = spark.read.option("multiline", True).json(f"{landing_zone}/idealista")
df_income = spark.read.option("header", True).csv(f"{landing_zone}/Income")
df_lookup = spark.read.option("header", True).csv(f"{landing_zone}/lookup_tables")

                                                                                

In [5]:
df_idealista.show()

25/06/23 11:55:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+---------+-------+--------------------+--------+--------------+--------+-----------------+-----+------+---------+-------+-------+----------+--------+----------+---------+--------------------+--------------------+--------------+----------------------+---------+---------+------------------+---------+-----------+------------+------------+---------+-----+-----------+-----+------+--------------------+--------------------+-----------------+--------------------+
|             address|bathrooms|country|        detailedType|distance|      district|exterior|externalReference|floor|has360|has3DTour|hasLift|hasPlan|hasStaging|hasVideo|  latitude|longitude|        municipality|        neighborhood|newDevelopment|newDevelopmentFinished|numPhotos|operation|      parkingSpace|    price|priceByArea|propertyCode|propertyType| province|rooms|showAddress| size|status|      suggestedTexts|           thumbnail|topNewDevelopment|                 url|
+--------------------+---------+----

In [6]:
# we check for duplicates - same house observed over more years 
df_idealista.groupBy(df_idealista.columns) \
  .agg(count("*").alias("occurrences")) \
  .filter("occurrences > 1") \
  .show(truncate=False)



+----------------------------------+---------+-------+--------------------------+--------+----------------------------------+--------+-----------------+-----+------+---------+-------+-------+----------+--------+----------+---------+-----------------------+--------------------------------------+--------------+----------------------+---------+---------+------------------+---------+-----------+------------+------------+---------+-----+-----------+------+------+------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------+-----------------+--------------------------------------------+-----------+
|address                           |bathrooms|country|detailedType              |distance|district                          |exterior|externalReference|floor|has360|has3DTour|hasLift|hasPlan|hasStaging|hasVideo|latitude  |longitude|municipality           |neighborhood                 

                                                                                

The Idealista JSON files were originally split by year, but since the year column is not included in the data, we are unable to distinguish between them. This leads to duplicate entries across files, which we will remove. Moreover, we cannot leverage temporal information for the analysis, as the Idealista dataset spans from 2020 to 2021, while the income dataset covers 2007 to 2017. 

As a result, we will ignore the year dimension entirely and proceed with an analysis that does not consider the time component.

In [7]:
# remove duplicates
df_idealista = df_idealista.dropDuplicates()

In [8]:
df_income.show()

+----+--------------+--------------+----------+--------------------+--------+-------------------------+
| Any|Codi_Districte| Nom_Districte|Codi_Barri|           Nom_Barri|Població|Índex RFD Barcelona = 100|
+----+--------------+--------------+----------+--------------------+--------+-------------------------+
|2013|             1|  Ciutat Vella|         1|            el Raval|   49225|                     60.3|
|2013|             1|  Ciutat Vella|         2|      el Barri Gòtic|   16327|                    103.6|
|2013|             1|  Ciutat Vella|         3|      la Barceloneta|   15571|                     82.1|
|2013|             1|  Ciutat Vella|         4|Sant Pere, Santa ...|   22821|                     91.2|
|2013|             2|      Eixample|         5|       el Fort Pienc|   31754|                     99.0|
|2013|             2|      Eixample|         6|  la Sagrada Família|   51725|                     97.5|
|2013|             2|      Eixample|         7|la Dreta de l'Eix

In the 'income' dataframe we won't have exact duplicates since the population and the RDF index change over the years, so we used a different mechanism to remove the time dimension:
- we grouped by the neighborhood,
- we computed average population and average RDF index

In [9]:
# we tried to normally groupby 'Nom_Barri' and then average the 'Població' and 'Índex RFD Barcelona = 100' columns, 
# but we found that some values were not numeric, so here we handle this. 
df_income_cleaned = df_income.withColumn(
    "Poblacio_num", when(col("Població").rlike("^\d+$"), col("Població").cast("double"))).withColumn(
    "Index_RFD_num", when(col("Índex RFD Barcelona = 100").rlike("^\d+(\.\d+)?$"), col("Índex RFD Barcelona = 100").cast("double")))

# normal groupby and average
df_income = df_income_cleaned.groupBy("Nom_Barri").agg(
    avg("Poblacio_num").alias("Poblacio_average"),
    avg("Index_RFD_num").alias("Index_RFD_average"))

In [10]:
df_income.show()

+--------------------+------------------+------------------+
|           Nom_Barri|  Poblacio_average| Index_RFD_average|
+--------------------+------------------+------------------+
|         el Poblenou|32450.454545454544| 92.63636363636364|
|   la Vila de Gràcia| 51166.63636363636|104.81818181818181|
|el Besòs i el Mar...|23435.454545454544| 56.07272727272728|
|        la Guineueta|15231.727272727272|63.818181818181806|
|        la Teixonera|11400.727272727272| 71.89999999999999|
|la Dreta de l'Eix...| 43410.36363636364|155.25454545454548|
|      el Barri Gòtic| 18795.81818181818| 98.14545454545454|
|         el Guinardó|35770.818181818184| 85.58181818181818|
|            Vallbona|1338.1818181818182| 48.96363636363637|
|           Canyelles| 7169.181818181818| 64.53636363636363|
|Provençals del Po...|19809.090909090908| 87.87272727272727|
| la Verneda i la Pau|29134.545454545456| 62.77272727272727|
|Vilapicina i la T...| 25575.81818181818| 72.43636363636364|
|l'Antiga Esquerra...| 4

In [11]:
df_lookup.show()

+-------------------+--------------------+---------------------+-------------------+-----------+-------------------------+--------------------+---------------+
|           district|        neighborhood|district_n_reconciled|         district_n|district_id|neighborhood_n_reconciled|      neighborhood_n|neighborhood_id|
+-------------------+--------------------+---------------------+-------------------+-----------+-------------------------+--------------------+---------------+
|       Ciutat Vella|      el Barri Gòtic|         Ciutat Vella|       ciutat vella|    Q941385|           Gothic Quarter|      el barri gotic|         Q17154|
|         Nou Barris|         Can Peguera|           Nou Barris|         nou barris|   Q1641049|              Can Peguera|         can peguera|       Q3320716|
|         Nou Barris|    la Trinitat Nova|           Nou Barris|         nou barris|   Q1641049|         La Trinitat Nova|    la trinitat nova|       Q3750932|
|Sarrià-Sant Gervasi|Sant Gervasi - la..

In [12]:
# merge idealista and lookup 
df_1 = df_idealista.join(
    df_lookup,
    df_idealista["neighborhood"] == df_lookup["neighborhood_n_reconciled"],
    "inner")

In [13]:
# merge income and dataset just created 
final_df = df_income.join(
    df_1,
    df_income["Nom_Barri"] == df_1["neighborhood_n_reconciled"],
    "inner")

In [14]:
final_df.show()

+-----------+-----------------+-----------------+--------------------+---------+-------+------------+--------+--------------+--------+------------------+-----+------+---------+-------+-------+----------+--------+----------+---------+------------+------------+--------------+----------------------+---------+---------+------------------+--------+-----------+------------+------------+---------+-----+-----------+-----+------+--------------------+--------------------+-----------------+--------------------+--------------+------------+---------------------+--------------+-----------+-------------------------+--------------+---------------+
|  Nom_Barri| Poblacio_average|Index_RFD_average|             address|bathrooms|country|detailedType|distance|      district|exterior| externalReference|floor|has360|has3DTour|hasLift|hasPlan|hasStaging|hasVideo|  latitude|longitude|municipality|neighborhood|newDevelopment|newDevelopmentFinished|numPhotos|operation|      parkingSpace|   price|priceByArea|pro

In [15]:
final_cleaned = final_df \
    .drop("district_n", "district", "district_id", "neighborhood_n", "thumbnail", "url", 
    "neighborhood", "externalReference", "district_n_reconciled", "Nom_Barri") 

In [16]:
final_cleaned.show()

+-----------------+-----------------+--------------------+---------+-------+------------+--------+--------+-----+------+---------+-------+-------+----------+--------+----------+---------+------------+--------------+----------------------+---------+---------+------------------+--------+-----------+------------+------------+---------+-----+-----------+-----+------+--------------------+-----------------+-------------------------+---------------+
| Poblacio_average|Index_RFD_average|             address|bathrooms|country|detailedType|distance|exterior|floor|has360|has3DTour|hasLift|hasPlan|hasStaging|hasVideo|  latitude|longitude|municipality|newDevelopment|newDevelopmentFinished|numPhotos|operation|      parkingSpace|   price|priceByArea|propertyCode|propertyType| province|rooms|showAddress| size|status|      suggestedTexts|topNewDevelopment|neighborhood_n_reconciled|neighborhood_id|
+-----------------+-----------------+--------------------+---------+-------+------------+--------+--------

In [17]:
final_cleaned.select("neighborhood_n_reconciled", "price", "Index_RFD_average", "priceByArea").show()

+-------------------------+--------+-----------------+-----------+
|neighborhood_n_reconciled|   price|Index_RFD_average|priceByArea|
+-------------------------+--------+-----------------+-----------+
|                    Sants|219000.0|87.79999999999998|     3369.0|
|                    Sants|219000.0|87.79999999999998|     3369.0|
|                    Sants|375000.0|87.79999999999998|     3049.0|
|                    Sants|375000.0|87.79999999999998|     3049.0|
|              Sant Antoni|445000.0|99.94545454545455|     3134.0|
|              Sant Antoni|445000.0|99.94545454545455|     3134.0|
|                    Sants|359000.0|87.79999999999998|     2849.0|
|                    Sants|359000.0|87.79999999999998|     2849.0|
|                    Sants|230000.0|87.79999999999998|     3108.0|
|                    Sants|230000.0|87.79999999999998|     3108.0|
|              Sant Antoni|325000.0|99.94545454545455|     5000.0|
|              Sant Antoni|325000.0|99.94545454545455|     500

In [20]:
# transform in parquet file
final_cleaned.write.mode("overwrite").parquet(f"{formatted_zone}/formatted_data")
spark.stop()