In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DateType, DecimalType

In [3]:
# Obtain the Spark session for this notebook: getOrCreate() returns the session
# that is already active, or starts a new one with the application name "Task2".
spark = (
    SparkSession.builder
    .appName("Task2")
    .getOrCreate()
)

In [4]:
# Explicit schema for the price-paid CSV. Every field is declared nullable
# (third StructField argument is True). Only `price` and `date_of_transfer`
# are typed; all remaining columns are read as plain strings.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in [
            ("transaction_unique_identifier", StringType()),
            ("price", DecimalType(10, 2)),
            ("date_of_transfer", DateType()),
            ("postcode", StringType()),
            ("property_type", StringType()),
            ("old_new", StringType()),
            ("duration", StringType()),
            ("paon", StringType()),
            ("saon", StringType()),
            ("street", StringType()),
            ("locality", StringType()),
            ("town_city", StringType()),
            ("district", StringType()),
            ("county", StringType()),
            ("ppd_category_type", StringType()),
            ("record_status", StringType()),
        ]
    ]
)

# Read the 2021 price-paid CSV with the explicit schema and keep only the three
# columns used by the rest of the notebook.
#
# NOTE(review): header="true" tells Spark to treat the first line of the file as
# a header row and skip it. Confirm that ../data/pp-2021.csv really starts with a
# header line; if it does not, the first transaction is silently dropped.
df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .format("csv")
    .load("../data/pp-2021.csv")
    .select("transaction_unique_identifier", "price", "postcode")
)

# Preview the first rows of the trimmed frame.
df.show()

+-----------------------------+----------+--------+
|transaction_unique_identifier|     price|postcode|
+-----------------------------+----------+--------+
|         {E53EDD2E-C6F1-83...| 150000.00|LS27 9AL|
|         {E53EDD2E-C6F3-83...| 430000.00| LS6 1BU|
|         {E53EDD2E-C6F4-83...| 135000.00|LS10 1LP|
|         {E53EDD2E-C704-83...| 131000.00| HD2 2SN|
|         {E53EDD2E-C705-83...| 180000.00|LS10 1NG|
|         {E53EDD2E-C707-83...|  65000.00| HD4 6DL|
|         {E53EDD2E-C708-83...| 180000.00|LS10 1NW|
|         {E53EDD2E-C709-83...|  60000.00| BD4 7EJ|
|         {E53EDD2E-C712-83...| 255000.00|BD10 0QZ|
|         {E53EDD2E-C716-83...|  93500.00|LS12 5LT|
|         {E53EDD2E-C71C-83...|6492000.00|WF13 2SU|
|         {E53EDD2E-C71E-83...| 300000.00| BD4 7HY|
|         {E53EDD2E-C725-83...| 127500.00| LS9 8FE|
|         {E53EDD2E-C72A-83...| 114000.00|HU14 3FJ|
|         {E53EDD2E-C72B-83...| 114000.00|HU14 3FJ|
|         {E53EDD2E-C72C-83...|  83500.00|HU14 3FJ|
|         {E

In [5]:

# Text before the first space of the postcode, e.g. "LS27" from "LS27 9AL".
# NOTE(review): the column is called `postcode_area`, but the value is the whole
# first half of the postcode (such as "CB9"), not just the leading letters.
outward_code = F.split(F.col("postcode"), " ").getItem(0)

# Keep only the rows whose first postcode half begins with "CB".
df_cb = (
    df.withColumn("postcode_area", outward_code)
    .filter(F.col("postcode_area").startswith("CB"))
)

# Average price per group, sorted ascending, so limit(5) keeps the five groups
# with the LOWEST mean price.
# NOTE(review): confirm lowest-first is the intended ranking for this task.
mean_prices = (
    df_cb.groupBy("postcode_area")
    .agg(F.avg("price").alias("mean_price"))
    .orderBy(F.col("mean_price").asc())
    .limit(5)
)

mean_prices.show()

+-------------+-------------+
|postcode_area|   mean_price|
+-------------+-------------+
|          CB9|303575.126582|
|          CB6|337535.547714|
|          CB7|341310.230950|
|         CB25|418859.199468|
|         CB24|435710.073770|
+-------------+-------------+

