In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum as spark_sum, avg, min as spark_min, max as spark_max, stddev, isnan, isnull, when
from pyspark.sql.types import NumericType

In [2]:
# Build (or reuse) the Spark session for this notebook: app name
# "DataValidation", master "local[*]".
spark = (
    SparkSession.builder
    .appName("DataValidation")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/23 14:40:07 WARN Utils: Your hostname, MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.134 instead (on interface en0)
25/06/23 14:40:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/23 14:40:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Directory names of the two data zones read by this notebook
# (formatted zone first, exploitation zone second).
formatted_zone, exploitation_zone = "formatted_zone", "exploitation_zone"

In [4]:
# Formatted zone validation: read the dataset and list its schema.

# read the parquet dataset stored under the formatted zone
formatted_data = spark.read.parquet(f"{formatted_zone}/formatted_data")

# numbered listing of every column together with its Spark type
print("Columns in formatted data:")
schema_lines = [
    f"  {position:2d}. {name:<25} : {dtype}"
    for position, (name, dtype) in enumerate(formatted_data.dtypes, start=1)
]
for schema_line in schema_lines:
    print(schema_line)

column_total = len(formatted_data.columns)
print(f"\ntotal columns: {column_total}")

Columns in formatted data:
   1. Poblacio_average          : double
   2. Index_RFD_average         : double
   3. address                   : string
   4. bathrooms                 : bigint
   5. country                   : string
   6. detailedType              : struct<subTypology:string,typology:string>
   7. distance                  : string
   8. exterior                  : boolean
   9. floor                     : string
  10. has360                    : boolean
  11. has3DTour                 : boolean
  12. hasLift                   : boolean
  13. hasPlan                   : boolean
  14. hasStaging                : boolean
  15. hasVideo                  : boolean
  16. latitude                  : double
  17. longitude                 : double
  18. municipality              : string
  19. newDevelopment            : boolean
  20. newDevelopmentFinished    : boolean
  21. numPhotos                 : bigint
  22. operation                 : string
  23. parkingSpace        

In [5]:
# Null-value report: one line per column with its non-null count, null count,
# null percentage and data type.
print(f"{'Column Name':<25} {'Non-Null Count':<15} {'Null Count':<12} {'Null %':<8} {'Data Type':<15}")
print("-" * 80)
total_records = formatted_data.count()

# count(<column>) skips nulls, so a single aggregation over all columns returns
# every non-null count in one Spark job instead of one filter + count job per
# column. The resulting Row is positional, in the same order as `dtypes`.
column_types = formatted_data.dtypes
non_null_counts = (
    formatted_data.agg(*[count(col(name)) for name, _ in column_types]).first()
    if column_types  # agg() rejects an empty expression list
    else ()
)

for (col_name, col_type), non_null_count in zip(column_types, non_null_counts):
    null_count = total_records - non_null_count
    # guard against an empty dataset, where the ratio would divide by zero
    null_percentage = (null_count / total_records) * 100 if total_records else 0.0

    print(f"{col_name:<25} {non_null_count:<15,} {null_count:<12,} {null_percentage:<7.1f}% {col_type:<15}")

Column Name               Non-Null Count  Null Count   Null %   Data Type      
--------------------------------------------------------------------------------
Poblacio_average          2,812           0            0.0    % double         
Index_RFD_average         2,812           0            0.0    % double         
address                   2,812           0            0.0    % string         
bathrooms                 2,812           0            0.0    % bigint         
country                   2,812           0            0.0    % string         
detailedType              2,812           0            0.0    % struct<subTypology:string,typology:string>
distance                  2,812           0            0.0    % string         
exterior                  2,812           0            0.0    % boolean        
floor                     2,262           550          19.6   % string         
has360                    2,812           0            0.0    % boolean        
has3DTour   

In [6]:
# Summary statistics of the "price" column in the formatted data:
# non-null count, mean, minimum, maximum and standard deviation.
price_aggregates = [
    count("price").alias("count"),
    avg("price").alias("avg_price"),
    spark_min("price").alias("min_price"),
    spark_max("price").alias("max_price"),
    stddev("price").alias("std_price"),
]
# an aggregate-only select yields exactly one row
price_stats = formatted_data.select(*price_aggregates).collect()[0]

print("Price statistics:")
print(f"  Count: {price_stats['count']:,}")
for label, field in (
    ("Average", "avg_price"),
    ("Min", "min_price"),
    ("Max", "max_price"),
    ("Std Dev", "std_price"),
):
    print(f"  {label}: €{price_stats[field]:,.0f}")

Price statistics:
  Count: 2,812
  Average: €791,569
  Min: €69,500
  Max: €10,000,000
  Std Dev: €1,074,255


In [7]:
# Summary statistics of the "Index_RFD_average" column (income index):
# non-null count, mean, minimum and maximum.
income_column = "Index_RFD_average"
income_stats = formatted_data.select(
    count(income_column).alias("count"),
    avg(income_column).alias("avg_income"),
    spark_min(income_column).alias("min_income"),
    spark_max(income_column).alias("max_income"),
).collect()[0]

print("\nIncome Index statistics:")
print(f"  Count: {income_stats['count']:,}")
for label, field in (
    ("Average Index", "avg_income"),
    ("Min Index", "min_income"),
    ("Max Index", "max_income"),
):
    print(f"  {label}: {income_stats[field]:.1f}")


Income Index statistics:
  Count: 2,812
  Average Index: 124.7
  Min Index: 55.2
  Max Index: 229.0


In [8]:
# Number of records per neighborhood, most frequent first; show the top 10.
print("\n>>> Neighborhood distribution (top 10):")
records_per_neighborhood = formatted_data.groupBy("neighborhood_n_reconciled").count()
neighborhood_dist = records_per_neighborhood.orderBy(col("count").desc())

neighborhood_dist.show(n=10, truncate=False)


>>> Neighborhood distribution (top 10):
+-----------------------------+-----+
|neighborhood_n_reconciled    |count|
+-----------------------------+-----+
|Sants                        |914  |
|Sant Antoni                  |520  |
|Sarrià                       |412  |
|Pedralbes                    |404  |
|Hostafrancs                  |288  |
|Vallcarca i els Penitents    |158  |
|Can Baró                     |70   |
|Porta                        |24   |
|Vilapicina i la Torre Llobeta|8    |
|Verdun                       |8    |
+-----------------------------+-----+
only showing top 10 rows


In [9]:
# Number of records per property type, most frequent first.
print(">>> Property type distribution:")
property_dist = (
    formatted_data
    .groupBy("propertyType")
    .count()
    .orderBy(col("count").desc())
)

property_dist.show(truncate=False)

>>> Property type distribution:
+------------+-----+
|propertyType|count|
+------------+-----+
|flat        |2162 |
|chalet      |326  |
|penthouse   |222  |
|duplex      |76   |
|studio      |24   |
|countryHouse|2    |
+------------+-----+



In [10]:
# Exploitation zone validation.

# load the three parquet datasets stored under the exploitation zone
train_data, test_data, ml_ready_data = (
    spark.read.parquet(f"{exploitation_zone}/{dataset_name}")
    for dataset_name in ("train_data", "test_data", "ml_ready_data")
)

In [11]:
# Size of each split, with its share of the ML-ready dataset.
total_ml_records, train_records, test_records = (
    dataset.count() for dataset in (ml_ready_data, train_data, test_data)
)

print(f"  Total ML records: {total_ml_records:,}")
for split_label, split_size in (("Train", train_records), ("Test", test_records)):
    print(f"  {split_label} records: {split_size:,} ({split_size/total_ml_records*100:.1f}%)")

  Total ML records: 2,720
  Train records: 2,198 (80.8%)
  Test records: 522 (19.2%)


In [12]:
# Schema of the training data.
print("Train data columns:")
train_data.printSchema()

# Show the first row of the "features" column untruncated.
features_only = train_data.select("features")
print("\nFirst feature vector:")
features_only.show(1, truncate=False)

# Length of the first feature vector.
first_vector = features_only.first()["features"]
vector_size = len(first_vector)
print(f"Vector has {vector_size} dimensions")

Train data columns:
root
 |-- price_clean: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- neighborhood_n_reconciled: string (nullable = true)


First feature vector:
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                   |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(50,[0,1,2,3,4,5,6,7,8,12,16,18,26,27,32,34,46,49],[33.0,1.0,1.0,87.79999999999998,41356.72727272727,41.3743657,2.1294698,1959.0,18.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+------------------------------------------------------------------------------------------------------------------

In [13]:
# Summary statistics of the target column "price_clean" in the training data:
# non-null count, mean, minimum, maximum and standard deviation.
target_column = "price_clean"
target_aggregates = (
    count(target_column).alias("count"),
    avg(target_column).alias("avg_price"),
    spark_min(target_column).alias("min_price"),
    spark_max(target_column).alias("max_price"),
    stddev(target_column).alias("std_price"),
)
target_stats = train_data.select(*target_aggregates).collect()[0]

print("\nTarget variable (=price_clean) statistics:")
print(f"  Count: {target_stats['count']:,}")
euro_fields = {
    "Average": "avg_price",
    "Min": "min_price",
    "Max": "max_price",
    "Std Dev": "std_price",
}
for label, field in euro_fields.items():
    print(f"  {label}: €{target_stats[field]:,.0f}")


Target variable (=price_clean) statistics:
  Count: 2,198
  Average: €637,608
  Min: €69,500
  Max: €3,900,000
  Std Dev: €620,758


In [14]:
# Count the training rows whose target ("price_clean") is null.
target_is_missing = col("price_clean").isNull()
null_targets = train_data.where(target_is_missing).count()
print(f"  Null values: {null_targets}")

  Null values: 0


In [15]:
# FROM HERE (translated from Italian "DA QUA"; leftover working marker, TODO confirm and remove)

In [16]:
def _min_max(frame, column):
    """Return a one-row result holding the "min" and "max" of `column` in `frame`."""
    return frame.select(
        spark_min(column).alias("min"),
        spark_max(column).alias("max"),
    ).collect()[0]


# Record counts in each zone and how many records were dropped in between.
formatted_count = formatted_data.count()
ml_ready_count = ml_ready_data.count()
records_removed = formatted_count - ml_ready_count
removal_percentage = (records_removed / formatted_count) * 100

for zone_label, zone_count in (
    ("Formatted Zone", formatted_count),
    ("Exploitation Zone", ml_ready_count),
):
    print(f"  {zone_label}: {zone_count:,} records")
print(f"  Records removed: {records_removed:,} ({removal_percentage:.1f}%)")

# Price range per zone: "price" in the formatted data, "price_clean" in the ML-ready data.
formatted_price_range = _min_max(formatted_data, "price")
ml_price_range = _min_max(ml_ready_data, "price_clean")

print(f"  Formatted Zone price range: €{formatted_price_range['min']:,.0f} - €{formatted_price_range['max']:,.0f}")
print(f"  ML Zone price range: €{ml_price_range['min']:,.0f} - €{ml_price_range['max']:,.0f}")

  Formatted Zone: 2,812 records
  Exploitation Zone: 2,720 records
  Records removed: 92 (3.3%)
  Formatted Zone price range: €69,500 - €10,000,000
  ML Zone price range: €69,500 - €3,900,000


In [17]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` or its DataFrames must be re-run after a new session is created.
spark.stop()