In [10]:
import os

# Windows-only Spark workaround: point Hadoop at a local winutils install.
# HADOOP_HOME must be the parent folder of the 'bin' directory with winutils.exe.
# NOTE(review): these are machine-specific absolute paths — adjust per machine.
HADOOP_HOME_DIR = r"C:\winutils-master\hadoop-3.0.0"
HADOOP_BIN_DIR = HADOOP_HOME_DIR + r"\bin"
DEFAULT_JAVA_HOME = r"C:\Program Files\Eclipse Adoptium\jdk-17.0.15.6-hotspot"

os.environ["HADOOP_HOME"] = HADOOP_HOME_DIR

# Append the bin folder only when it is missing, so re-running this cell does
# not keep growing PATH; also tolerate PATH being unset.
current_path = os.environ.get("PATH", "")
if HADOOP_BIN_DIR not in current_path.split(";"):
    os.environ["PATH"] = f"{current_path};{HADOOP_BIN_DIR}" if current_path else HADOOP_BIN_DIR

# (Optional) Set JAVA_HOME only if it is not already set, so an existing
# JDK configuration is never overridden.
os.environ.setdefault("JAVA_HOME", DEFAULT_JAVA_HOME)

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum as spark_sum, avg, min as spark_min, max as spark_max, stddev, isnan, isnull, when
from pyspark.sql.types import NumericType

In [12]:
# Build the local Spark session (app name "DataValidation", all local cores).
spark = (
    SparkSession.builder
    .appName("DataValidation")
    .master("local[*]")
    .getOrCreate()
)

In [13]:
# Root folders of the two data zones read by this notebook.
formatted_zone, exploitation_zone = "formatted_zone", "exploitation_zone"

In [14]:
# Formatted zone validation: load the dataset and list its schema.

# Read the formatted dataset from parquet.
formatted_data = spark.read.parquet(f"{formatted_zone}/formatted_data")

# Print one numbered line per column with its name and Spark type.
print("Columns in formatted data:")
schema_lines = [
    f"  {position:2d}. {name:<25} : {dtype}"
    for position, (name, dtype) in enumerate(formatted_data.dtypes, start=1)
]
for schema_line in schema_lines:
    print(schema_line)

column_total = len(formatted_data.columns)
print(f"\ntotal columns: {column_total}")

Columns in formatted data:
   1. bathrooms                 : bigint
   2. distance                  : string
   3. district                  : string
   4. exterior                  : boolean
   5. floor                     : string
   6. has360                    : boolean
   7. has3DTour                 : boolean
   8. hasLift                   : boolean
   9. hasPlan                   : boolean
  10. hasStaging                : boolean
  11. hasVideo                  : boolean
  12. latitude                  : double
  13. longitude                 : double
  14. neighborhood              : string
  15. newDevelopment            : boolean
  16. newDevelopmentFinished    : boolean
  17. numPhotos                 : bigint
  18. parkingSpace              : struct<hasParkingSpace:boolean,isParkingSpaceIncludedInPrice:boolean,parkingSpacePrice:double>
  19. price                     : double
  20. priceByArea               : double
  21. propertyCode              : string
  22. propertyT

In [15]:
# Null-value report for the formatted zone: one printed row per column.
# Every count comes from a single aggregation (count(column) skips nulls,
# count("*") counts all rows) instead of one filter/count Spark job per column.
print(f"{'Column Name':<25} {'Non-Null Count':<15} {'Null Count':<12} {'Null %':<8} {'Data Type':<15}")
print("-" * 80)

column_types = formatted_data.dtypes
counts_row = formatted_data.agg(
    count("*"),
    *[count(col(name)) for name, _ in column_types]
).first()
total_records = counts_row[0]

for (col_name, col_type), non_null_count in zip(column_types, counts_row[1:]):
    null_count = total_records - non_null_count
    # Guard against an empty dataset, which would otherwise divide by zero.
    null_percentage = (null_count / total_records) * 100 if total_records else 0.0
    # Format the number together with '%' before padding, so the sign stays
    # attached to the value while the column keeps its 8-character width.
    null_pct_text = f"{null_percentage:.1f}%"

    print(f"{col_name:<25} {non_null_count:<15,} {null_count:<12,} {null_pct_text:<8} {col_type:<15}")

Column Name               Non-Null Count  Null Count   Null %   Data Type      
--------------------------------------------------------------------------------
bathrooms                 4,062           0            0.0    % bigint         
distance                  4,062           0            0.0    % string         
district                  4,062           0            0.0    % string         
exterior                  4,062           0            0.0    % boolean        
floor                     3,486           576          14.2   % string         
has360                    4,062           0            0.0    % boolean        
has3DTour                 4,062           0            0.0    % boolean        
hasLift                   3,736           326          8.0    % boolean        
hasPlan                   4,062           0            0.0    % boolean        
hasStaging                4,062           0            0.0    % boolean        
hasVideo                  4,062        

In [16]:
# Summary statistics of the "price" column in the formatted zone.
price_aggregates = [
    count("price").alias("count"),
    avg("price").alias("avg_price"),
    spark_min("price").alias("min_price"),
    spark_max("price").alias("max_price"),
    stddev("price").alias("std_price"),
]
price_stats = formatted_data.select(*price_aggregates).first()

print("Price statistics:")
print(f"  Count: {price_stats['count']:,}")
# The remaining figures share one euro format, so print them from a table.
for label, field in (
    ("Average", "avg_price"),
    ("Min", "min_price"),
    ("Max", "max_price"),
    ("Std Dev", "std_price"),
):
    print(f"  {label}: €{price_stats[field]:,.0f}")

Price statistics:
  Count: 4,062
  Average: €579,315
  Min: €34,000
  Max: €12,000,000
  Std Dev: €683,459


In [17]:
# Summary statistics of the "Index_RFD_average" column (income index).
income_aggregates = [
    count("Index_RFD_average").alias("count"),
    avg("Index_RFD_average").alias("avg_income"),
    spark_min("Index_RFD_average").alias("min_income"),
    spark_max("Index_RFD_average").alias("max_income"),
]
income_stats = formatted_data.select(*income_aggregates).first()

print("\nIncome Index statistics:")
print(f"  Count: {income_stats['count']:,}")
# Average / min / max share the same one-decimal format.
for label, field in (
    ("Average", "avg_income"),
    ("Min", "min_income"),
    ("Max", "max_income"),
):
    print(f"  {label} Index: {income_stats[field]:.1f}")


Income Index statistics:
  Count: 4,062
  Average Index: 108.0
  Min Index: 43.6
  Max Index: 229.0


In [18]:
# Number of rows per neighborhood, largest groups first; show the top 10.
print("\n>>> Neighborhood distribution (top 10):")
neighborhood_dist = (
    formatted_data
    .groupBy("neighborhood")
    .count()
    .sort(col("count").desc())
)

neighborhood_dist.show(n=10, truncate=False)


>>> Neighborhood distribution (top 10):
+-------------------------------+-----+
|neighborhood                   |count|
+-------------------------------+-----+
|La Dreta de l'Eixample         |352  |
|Sants                          |346  |
|El Poble Sec - Parc de Montjuïc|300  |
|La Nova Esquerra de l'Eixample |298  |
|La Marina del Port             |231  |
|La Maternitat i Sant Ramon     |229  |
|Sants - Badal                  |226  |
|El Gòtic                       |218  |
|Les Corts                      |214  |
|La Bordeta                     |201  |
+-------------------------------+-----+
only showing top 10 rows


In [19]:
# Number of rows per property type, largest groups first.
print(">>> Property type distribution:")
property_dist = (
    formatted_data
    .groupBy("propertyType")
    .count()
    .sort(col("count").desc())
)

property_dist.show(truncate=False)

>>> Property type distribution:
+------------+-----+
|propertyType|count|
+------------+-----+
|flat        |3421 |
|penthouse   |231  |
|chalet      |216  |
|duplex      |127  |
|studio      |66   |
|countryHouse|1    |
+------------+-----+



In [20]:
# Exploitation zone validation.

# Load the three parquet datasets (train split, test split, full ML-ready set).
train_data, test_data, ml_ready_data = (
    spark.read.parquet(f"{exploitation_zone}/{dataset_name}")
    for dataset_name in ("train_data", "test_data", "ml_ready_data")
)

In [21]:
# Size of each split and its share of the ML-ready dataset.
total_ml_records = ml_ready_data.count()
train_records = train_data.count()
test_records = test_data.count()

print(f"  Total ML records: {total_ml_records:,}")
for split_name, split_records in (("Train", train_records), ("Test", test_records)):
    split_share = split_records / total_ml_records * 100
    print(f"  {split_name} records: {split_records:,} ({split_share:.1f}%)")

  Total ML records: 3,950
  Train records: 3,219 (81.5%)
  Test records: 731 (18.5%)


In [22]:
# Print the schema of the training split.
print("Train data columns:")
train_data.printSchema()

# Show the first row of the "features" column untruncated.
print("\nFirst feature vector:")
features_only = train_data.select("features")
features_only.show(1, truncate=False)

# Length of the feature vector, measured on the first row.
first_vector = features_only.first()["features"]
vector_size = len(first_vector)
print(f"Vector has {vector_size} dimensions")

Train data columns:
root
 |-- price_clean: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- neighborhood: string (nullable = true)


First feature vector:
+---------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                               |
+---------------------------------------------------------------------------------------------------------------------------------------+
|(91,[0,3,4,5,6,7,8,12,22,26,28,32,37,89],[10.0,110.32727272727271,58016.0,41.3830747,2.1486212,3228.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+---------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row
Vector has 91 dimensions


In [23]:
# Summary statistics of the target column "price_clean" on the training split.
target_aggregates = [
    count("price_clean").alias("count"),
    avg("price_clean").alias("avg_price"),
    spark_min("price_clean").alias("min_price"),
    spark_max("price_clean").alias("max_price"),
    stddev("price_clean").alias("std_price"),
]
target_stats = train_data.select(*target_aggregates).first()

print("\nTarget variable (=price_clean) statistics:")
print(f"  Count: {target_stats['count']:,}")
# The remaining figures share one euro format, so print them from a table.
for label, field in (
    ("Average", "avg_price"),
    ("Min", "min_price"),
    ("Max", "max_price"),
    ("Std Dev", "std_price"),
):
    print(f"  {label}: €{target_stats[field]:,.0f}")


Target variable (=price_clean) statistics:
  Count: 3,219
  Average: €499,316
  Min: €34,000
  Max: €2,600,000
  Std Dev: €409,612


In [24]:
# Count training rows whose target "price_clean" is null.
target_is_null = col("price_clean").isNull()
null_targets = train_data.where(target_is_null).count()
print(f"  Null values: {null_targets}")

  Null values: 0


In [25]:
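# TODO(review): leftover work marker ("DA QUA" is Italian for "from here") — resume from here or remove this cell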

In [26]:
# Row counts in the formatted zone vs. the ML-ready data, and the difference.
formatted_count, ml_ready_count = formatted_data.count(), ml_ready_data.count()
records_removed = formatted_count - ml_ready_count
removal_percentage = (records_removed / formatted_count) * 100

for zone_label, zone_count in (
    ("Formatted Zone", formatted_count),
    ("Exploitation Zone", ml_ready_count),
):
    print(f"  {zone_label}: {zone_count:,} records")
print(f"  Records removed: {records_removed:,} ({removal_percentage:.1f}%)")

# Min/max of "price" (formatted zone) and "price_clean" (ML-ready data).
formatted_price_range = formatted_data.select(
    spark_min("price").alias("min"),
    spark_max("price").alias("max"),
).first()

ml_price_range = ml_ready_data.select(
    spark_min("price_clean").alias("min"),
    spark_max("price_clean").alias("max"),
).first()

formatted_low, formatted_high = formatted_price_range["min"], formatted_price_range["max"]
ml_low, ml_high = ml_price_range["min"], ml_price_range["max"]
print(f"  Formatted Zone price range: €{formatted_low:,.0f} - €{formatted_high:,.0f}")
print(f"  ML Zone price range: €{ml_low:,.0f} - €{ml_high:,.0f}")

  Formatted Zone: 4,062 records
  Exploitation Zone: 3,950 records
  Records removed: 112 (2.8%)
  Formatted Zone price range: €34,000 - €12,000,000
  ML Zone price range: €34,000 - €2,600,000


In [27]:
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()