In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max as spark_max, min as spark_min, desc


1. Read data into a DataFrame

In [34]:
# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("HousePriceAnalysis")
    .getOrCreate()
)

# Load the housing CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Housing.csv")
)

2. Average house price

In [35]:
# Mean of the `price` column over all rows, as a one-row DataFrame.
avg_price = df.agg(avg(col("price")).alias("average_price"))
avg_price.show()

+-----------------+
|    average_price|
+-----------------+
|4766729.247706422|
+-----------------+



3. Maximum and minimum area

In [38]:
# Largest and smallest `area` values, computed together in one aggregation.
area_stats = df.agg(
    spark_max("area").alias("max_area"),
    spark_min("area").alias("min_area"),
)
area_stats.show()


+--------+--------+
|max_area|min_area|
+--------+--------+
|   16200|    1650|
+--------+--------+



4. Number of houses with air conditioning


In [39]:
# Count the rows whose `airconditioning` column equals the string "yes".
air_cond_count = df.where(df["airconditioning"] == "yes").count()
print("Number of houses with air conditioning:", air_cond_count)

Number of houses with air conditioning: 172


5. Average price by furnishing status


In [40]:
# Mean `price` for each `furnishingstatus` value, highest average first.
avg_price_furnish = (
    df.groupBy(col("furnishingstatus"))
    .agg(avg(col("price")).alias("average_price"))
    .sort(col("average_price").desc())
)
avg_price_furnish.show()

+----------------+------------------+
|furnishingstatus|     average_price|
+----------------+------------------+
|       furnished|         5495696.0|
|  semi-furnished|  4907524.22907489|
|     unfurnished|4013831.4606741574|
+----------------+------------------+



6. Houses with more than 2 parking spots

In [41]:
# Count the rows whose `parking` value is strictly greater than 2.
parking_count = df.where(df["parking"] > 2).count()
print("Number of houses with more than 2 parking spots:", parking_count)

Number of houses with more than 2 parking spots: 12


7. Most common number of bedrooms


In [42]:
# Count rows per `bedrooms` value and keep the most frequent one.
# Ties on the count are broken by the smaller bedroom number, so the result
# does not depend on the order in which Spark happens to return equal rows.
common_bedroom = (
    df.groupBy("bedrooms")
      .count()
      .orderBy(desc("count"), col("bedrooms").asc())
      .first()
)
# `first()` returns a Row, or None when the DataFrame has no rows.
print("Most common number of bedrooms:", common_bedroom)

Most common number of bedrooms: Row(bedrooms=3, count=300)
