In [23]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, col,regexp_replace
import pyspark.sql.functions as F
from pyspark.ml.feature import Imputer
from pyspark.ml.feature import VectorAssembler , StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [10]:
# Point JAVA_HOME at a Java 17 installation (presumably so the Spark session
# created later in this notebook runs on that JVM -- confirm if it matters).
# The path is looked up with the macOS-only `/usr/libexec/java_home` tool; if
# the tool is missing or prints nothing, the lookup yields an empty string.
# The pipe is opened in a `with` block so it is closed as soon as the output
# has been read instead of being left open for the lifetime of the kernel.
with os.popen("/usr/libexec/java_home -v 17") as java_home_pipe:
    java17_path = java_home_pipe.read().strip()

if java17_path:
    os.environ["JAVA_HOME"] = java17_path
    print(f"Successfully set JAVA_HOME to: {java17_path}")
else:
    # Any JAVA_HOME that is already set is left untouched; only report the miss.
    print("Error: Java 17 not found! Please verify installation.")

Successfully set JAVA_HOME to: /Library/Java/JavaVirtualMachines/jdk-17.jdk/Contents/Home


In [11]:
# Reuse the active Spark session if one exists; otherwise start a new one
# under the application name "LearnSpark".
session_builder = SparkSession.builder.appName("LearnSpark")
spark = session_builder.getOrCreate()

# Bare last expression so the notebook renders the session object.
spark

In [12]:
# Load the gzipped listings CSV into a DataFrame.
# The reader options are collected in one dict so they can be read at a glance:
# the first row is the header, column types are inferred, the double quote is
# used both as quote and as escape character, quoted fields may span several
# lines, and parsing runs in PERMISSIVE mode.
listings_csv_options = {
    "header": True,
    "inferSchema": True,
    "sep": ",",
    "quote": '"',
    "escape": '"',
    "multiLine": True,
    "mode": "PERMISSIVE",
}
listings = spark.read.csv('../data/raw/listings.csv.gz', **listings_csv_options)

                                                                                

In [13]:
# Print one StructField per line: column name, inferred type and nullability.
for struct_field in listings.schema.fields:
    print(struct_field)

StructField('id', LongType(), True)
StructField('listing_url', StringType(), True)
StructField('scrape_id', LongType(), True)
StructField('last_scraped', DateType(), True)
StructField('source', StringType(), True)
StructField('name', StringType(), True)
StructField('description', StringType(), True)
StructField('neighborhood_overview', StringType(), True)
StructField('picture_url', StringType(), True)
StructField('host_id', IntegerType(), True)
StructField('host_url', StringType(), True)
StructField('host_name', StringType(), True)
StructField('host_since', DateType(), True)
StructField('host_location', StringType(), True)
StructField('host_about', StringType(), True)
StructField('host_response_time', StringType(), True)
StructField('host_response_rate', StringType(), True)
StructField('host_acceptance_rate', StringType(), True)
StructField('host_is_superhost', StringType(), True)
StructField('host_thumbnail_url', StringType(), True)
StructField('host_picture_url', StringType(), True)


In [16]:
# Keep only the neighbourhood column and preview its first 20 values in full.
neighbourhoods = listings.select("neighbourhood_cleansed")
neighbourhoods.show(n=20, truncate=False)

+----------------------+
|neighbourhood_cleansed|
+----------------------+
|Islington             |
|Kensington and Chelsea|
|Westminster           |
|Wandsworth            |
|Tower Hamlets         |
|Richmond upon Thames  |
|Haringey              |
|Hammersmith and Fulham|
|Hammersmith and Fulham|
|Southwark             |
|Westminster           |
|Barnet                |
|Hounslow              |
|Southwark             |
|Waltham Forest        |
|Barnet                |
|Hammersmith and Fulham|
|Hammersmith and Fulham|
|Brent                 |
|Camden                |
+----------------------+
only showing top 20 rows


In [18]:
# Keep only the location review score and preview its first 20 values in full.
review_locations = listings.select(listings["review_scores_location"])
review_locations.show(n=20, truncate=False)

+----------------------+
|review_scores_location|
+----------------------+
|4.78                  |
|4.93                  |
|4.89                  |
|4.6                   |
|4.85                  |
|4.9                   |
|4.77                  |
|4.53                  |
|4.79                  |
|4.79                  |
|4.5                   |
|4.64                  |
|4.84                  |
|4.86                  |
|4.0                   |
|4.75                  |
|NULL                  |
|4.66                  |
|4.67                  |
|5.0                   |
+----------------------+
only showing top 20 rows


In [19]:
# Preview the location review score as one chained expression, using the
# default truncation setting of show().
(
    listings
    .select("review_scores_location")
    .show(n=20)
)

+----------------------+
|review_scores_location|
+----------------------+
|                  4.78|
|                  4.93|
|                  4.89|
|                   4.6|
|                  4.85|
|                   4.9|
|                  4.77|
|                  4.53|
|                  4.79|
|                  4.79|
|                   4.5|
|                  4.64|
|                  4.84|
|                  4.86|
|                   4.0|
|                  4.75|
|                  NULL|
|                  4.66|
|                  4.67|
|                   5.0|
+----------------------+
only showing top 20 rows


In [21]:
# Listings whose location review score is strictly above 4.5, reduced to the
# identifying columns plus the score itself.
location_score = listings["review_scores_location"]
high_score_listings = (
    listings
    .where(location_score > 4.5)
    .select("id", "price", "name", "review_scores_location")
)
high_score_listings.show(n=20, truncate=False)

+-----+-------+-------------------------------------------------+----------------------+
|id   |price  |name                                             |review_scores_location|
+-----+-------+-------------------------------------------------+----------------------+
|13913|$70.00 |Holiday London DB Room Let-on going              |4.78                  |
|15400|$149.00|Bright Chelsea  Apartment. Chelsea!              |4.93                  |
|17402|$411.00|Very Central Modern 3-Bed/2 Bath By Oxford St W1 |4.89                  |
|24328|NULL   |Battersea live/work artist house                 |4.6                   |
|36274|$210.00|Bright 1 bedroom apt off brick lane in Shoreditch|4.85                  |
|36299|$280.00|Kew Gardens 3BR house in cul-de-sac              |4.9                   |
|36660|$90.00 |You are GUARANTEED to love this                  |4.77                  |
|38605|$61.00 |SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST       |4.53                  |
|38610|$340.00|Short 

In [22]:
# Drop every row that has a NULL in any of the selected columns (for example a
# missing price), then preview the remaining rows in full.
high_score_listings.na.drop().show(n=20, truncate=False)

+-----+-------+--------------------------------------------------+----------------------+
|id   |price  |name                                              |review_scores_location|
+-----+-------+--------------------------------------------------+----------------------+
|13913|$70.00 |Holiday London DB Room Let-on going               |4.78                  |
|15400|$149.00|Bright Chelsea  Apartment. Chelsea!               |4.93                  |
|17402|$411.00|Very Central Modern 3-Bed/2 Bath By Oxford St W1  |4.89                  |
|36274|$210.00|Bright 1 bedroom apt off brick lane in Shoreditch |4.85                  |
|36299|$280.00|Kew Gardens 3BR house in cul-de-sac               |4.9                   |
|36660|$90.00 |You are GUARANTEED to love this                   |4.77                  |
|38605|$61.00 |SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST        |4.53                  |
|38610|$340.00|Short Term Home                                   |4.79                  |
|38995|$49

In [27]:
# Derive a numeric price column: strip "$" and "," from the text price and
# cast the remainder to float. The original `price` column is kept.
price_as_number = regexp_replace(col('price'), '[$,]', '').cast('float')
price_num_df = listings.withColumn('price_num', price_as_number)

# Bare last expression: show the schema entry of the new column.
price_num_df.schema['price_num']

StructField('price_num', FloatType(), True)

In [28]:
# Preview the parsed numeric price next to the listing name.
price_num_df.select(['price_num', 'name']).show(n=20, truncate=False)

+---------+-------------------------------------------------+
|price_num|name                                             |
+---------+-------------------------------------------------+
|70.0     |Holiday London DB Room Let-on going              |
|149.0    |Bright Chelsea  Apartment. Chelsea!              |
|411.0    |Very Central Modern 3-Bed/2 Bath By Oxford St W1 |
|NULL     |Battersea live/work artist house                 |
|210.0    |Bright 1 bedroom apt off brick lane in Shoreditch|
|280.0    |Kew Gardens 3BR house in cul-de-sac              |
|90.0     |You are GUARANTEED to love this                  |
|61.0     |SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST       |
|340.0    |Short Term Home                                  |
|49.0     |SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT         |
|NULL     |Stylish bedsit in Notting Hill ish flat.         |
|213.0    |2 Double bed apartment in quiet area North London|
|NULL     |Room in maisonette in chiswick                   |
|96.0   

In [30]:
# Columns to display for the filtered listings.
col_list = ['name', 'price', 'review_scores_location']

# Column-expression filter: price below 100 and location score above 4.5.
is_cheap = price_num_df['price_num'] < 100
is_well_located = price_num_df['review_scores_location'] > 4.5

(
    price_num_df
    .where(is_cheap & is_well_located)
    .select(col_list)
    .show(truncate=False)
)

+--------------------------------------------------+------+----------------------+
|name                                              |price |review_scores_location|
+--------------------------------------------------+------+----------------------+
|Holiday London DB Room Let-on going               |$70.00|4.78                  |
|You are GUARANTEED to love this                   |$90.00|4.77                  |
|SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST        |$61.00|4.53                  |
|SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT          |$49.00|4.79                  |
|Room with a view, shared flat,  central  Bankside |$96.00|4.86                  |
|You Will Save Money Here                          |$71.00|4.75                  |
|Quiet Comfortable Room in Fulham                  |$48.00|4.66                  |
|Room with a garden                                |$76.00|5.0                   |
|Pleasant Single Room in zone 1.                   |$50.00|4.77                  |
|Cos

In [31]:
# SQL-expression form of the filter: the condition is passed as a string and
# parsed by Spark, instead of being built from Column objects.
sql_condition = 'price_num < 100 and review_scores_location > 4.5'
price_num_df.where(sql_condition).select(col_list).show(truncate=False)

+--------------------------------------------------+------+----------------------+
|name                                              |price |review_scores_location|
+--------------------------------------------------+------+----------------------+
|Holiday London DB Room Let-on going               |$70.00|4.78                  |
|You are GUARANTEED to love this                   |$90.00|4.77                  |
|SUNNY ROOM PRIVATE BATHROOM PLUS BREAKFAST        |$61.00|4.53                  |
|SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT          |$49.00|4.79                  |
|Room with a view, shared flat,  central  Bankside |$96.00|4.86                  |
|You Will Save Money Here                          |$71.00|4.75                  |
|Quiet Comfortable Room in Fulham                  |$48.00|4.66                  |
|Room with a garden                                |$76.00|5.0                   |
|Pleasant Single Room in zone 1.                   |$50.00|4.77                  |
|Cos

In [34]:
# List the distinct (property_type, room_type) combinations in the data.
type_pairs = listings.select("property_type", "room_type").distinct()
type_pairs.show(n=20, truncate=False)

[Stage 15:>                                                         (0 + 1) / 1]

+----------------------------------+---------------+
|property_type                     |room_type      |
+----------------------------------+---------------+
|Room in hostel                    |Hotel room     |
|Private room in casa particular   |Private room   |
|Dome                              |Entire home/apt|
|Entire serviced apartment         |Entire home/apt|
|Private room in loft              |Private room   |
|Private room in villa             |Private room   |
|Farm stay                         |Entire home/apt|
|Room in hotel                     |Hotel room     |
|Shared room in rental unit        |Shared room    |
|Private room in guest suite       |Private room   |
|Room in rental unit               |Hotel room     |
|Room in serviced apartment        |Hotel room     |
|Private room in serviced apartment|Private room   |
|Private room in hostel            |Private room   |
|Shared room                       |Shared room    |
|Private room in yurt              |Private ro

                                                                                

In [35]:
listings \
    .select(listings.property_type) \
        .distinct() \
            .write \
                .csv('../data/processed/property_types.csv')

                                                                                

25/12/25 10:39:56 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 343349 ms exceeds timeout 120000 ms
25/12/25 10:39:56 WARN SparkContext: Killing executors is not supported by current scheduler.
25/12/25 10:39:56 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:359)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$