In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Read Inside Airbnb data") \
    .getOrCreate()

24/11/24 22:22:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
listings = spark.read.csv("../data/listings.csv.gz", 
    header=True,
    inferSchema=True,
    sep=",", 
    quote='"',
    escape='"', 
    multiLine=True,
    mode="PERMISSIVE" 
)

                                                                                

In [3]:
listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

In [4]:
# 1. Get a non-null picture URL for any property ("picture_url" field)
# Select any non-null picture URL
listings.filter(
    listings.picture_url.isNotNull()
) \
.select('picture_url') \
.limit(1) \
.show(truncate=False)

+------------------------------------------------------------------------------------------------------+
|picture_url                                                                                           |
+------------------------------------------------------------------------------------------------------+
|https://a0.muscache.com/pictures/miso/Hosting-13913/original/d755aa6d-cebb-4464-80be-2722c921e8d5.jpeg|
+------------------------------------------------------------------------------------------------------+



In [5]:
# 2. Get number of properties that get more than 10 reviews per month
listings.filter(
  listings.reviews_per_month > 10
) \
.count()

                                                                                

66

In [6]:
# 3. Get a property that has more bathrooms than bedrooms
listings.filter(
    (listings.bathrooms > listings.bedrooms)
) \
.select('name', 'bathrooms', 'bedrooms') \
.show(10, truncate=False)

+--------------------------------------------------+---------+--------+
|name                                              |bathrooms|bedrooms|
+--------------------------------------------------+---------+--------+
|Battersea live/work artist house, garden & parking|1.5      |1       |
|West London-W7, Hanwell(Area Ealing) Room (Female)|1.5      |1       |
|West London,loft ensuite, 5min2tube               |1.5      |1       |
|Large Bedroom with EnSuite Bathroom               |1.5      |1       |
|Cosy Double studio in Zone 2 Hammersmith (1)      |1.5      |1       |
|Shoreditch Loft                                   |1.5      |1       |
|Five minute walk to South Bank                    |1.5      |1       |
|Designer room Park Views 4 mins zone 1 station    |1.5      |1       |
|Cosy Double room London, River View               |1.5      |1       |
|Maisonette in Central London Zone 1               |1.5      |1       |
+--------------------------------------------------+---------+--

In [7]:
# 4. Get 10 properties where the price is greater than 5,000. Collect the result as a Python list
from pyspark.sql.functions import regexp_replace

listings_with_price = listings \
  .withColumn('price_numeric', regexp_replace('price', '[$,]', '').cast('float'))

res = listings_with_price.filter(
    (listings_with_price.price_numeric > 5000)
) \
.select('name', 'price') \
.collect()

res

                                                                                

[Row(name='Bright ,Modern, 12m to Bond street.', price='$5,500.00'),
 Row(name='Room in a cosy flat. Central, clean', price='$8,000.00'),
 Row(name='3 Bed Flat in South Hampstead with Large Garden!', price='$25,000.00'),
 Row(name='Spacious Private Ground Floor Room', price='$7,693.00'),
 Row(name='No Longer Available', price='$53,588.00'),
 Row(name='Very nice double room in the heart of Soho', price='$5,100.00'),
 Row(name='Knightsbridge Penthouse', price='$8,895.00'),
 Row(name='Luxury 5-star Flat with Art & Tech', price='$6,276.00'),
 Row(name='The Apartments by The Sloane Club, L 2 Bedroom Apt', price='$7,589.00'),
 Row(name='The Apartments by The Sloane Club, One Bedroom Apt', price='$7,589.00'),
 Row(name='Great Apartment next to Sloane Square', price='$6,250.00'),
 Row(name='Lovely Private double room at zone 2 Holloway Road', price='$5,147.00'),
 Row(name='Kensington- Luxury 2 bedroom ground floor flat', price='$8,000.00'),
 Row(name='Single room. 7ft x 9ft - Over looking gard

In [8]:
# 5. Get a list of properties with the following characteristics:
# * price < 150
# * more than 20 reviews
# * review_scores_rating > 4.5
# Consider using the "&" operator

listings_with_price.filter(
    (listings_with_price.price_numeric < 150) &
    (listings_with_price.number_of_reviews > 20) &
    (listings_with_price.review_scores_rating > 4.5)
  ) \
.select('name', 'price_numeric', 'number_of_reviews', 'review_scores_rating') \
.show(truncate=False)

+--------------------------------------------------+-------------+-----------------+--------------------+
|name                                              |price_numeric|number_of_reviews|review_scores_rating|
+--------------------------------------------------+-------------+-----------------+--------------------+
|Holiday London DB Room Let-on going               |59.0         |44               |4.82                |
|Bright Chelsea  Apartment. Chelsea!               |120.0        |96               |4.8                 |
|Double Room (Unavailable for check in 31Dec-1Jan) |40.0         |38               |4.89                |
|A stylish Victorian home in West London           |131.0        |91               |4.85                |
|I Bedroom flat Tower of London                    |145.0        |248              |4.9                 |
|You are GUARANTEED to love this                   |82.0         |691              |4.86                |
|SPACIOUS ROOM IN CONTEMPORARY STYLE FLAT     

In [9]:
# 6. Get a list of properties with the following characteristics:
# * price < 150 OR more than one bathroom
# Use the "|" operator to implement the OR operator

listings_with_price.filter(
    (listings_with_price.price_numeric < 150) | (listings_with_price.bedrooms > 1)
  ) \
.select('name', 'price_numeric', 'bedrooms') \
.show(truncate=False)

+--------------------------------------------------+-------------+--------+
|name                                              |price_numeric|bedrooms|
+--------------------------------------------------+-------------+--------+
|Holiday London DB Room Let-on going               |59.0         |1       |
|Bright Chelsea  Apartment. Chelsea!               |120.0        |1       |
|Fab 3-Bed/2 Bath & Wifi: Trendy W1                |493.0        |3       |
|Beautiful Ensuite Richmond-upon-Thames borough    |140.0        |1       |
|Double Room (Unavailable for check in 31Dec-1Jan) |40.0         |1       |
|A stylish Victorian home in West London           |131.0        |1       |
|Contemporary central London apt                   |215.0        |2       |
|I Bedroom flat Tower of London                    |145.0        |1       |
|Bright 1 bedroom off brick land                   |109.0        |1       |
|Kew Gardens 3BR house in cul-de-sac               |245.0        |3       |
|You are GUA

In [10]:
# 7. Get the highest listing price in this dataset
# Consider using the "max" function from "pyspark.sql.functions"

from pyspark.sql.functions import max
listings_with_price.select(max('price_numeric')).show()


[Stage 10:>                                                         (0 + 1) / 1]

+------------------+
|max(price_numeric)|
+------------------+
|           80000.0|
+------------------+



                                                                                

In [19]:
# 8. Get the name and a price of property with the highest number of reviews per month
# Try to use "collect" method to get the price first, and then use it in a "filter" call 

res = listings_with_price.select(max('price_numeric')).collect()
res

                                                                                

[Row(max(price_numeric)=80000.0)]

In [20]:
max_price = res[0][0]
max_price

80000.0

In [21]:

listings_with_price \
  .filter(
    listings_with_price.price_numeric == max_price
  ) \
.select('name', 'price') \
.show()


[Stage 27:>                                                         (0 + 1) / 1]

+--------------------+----------+
|                name|     price|
+--------------------+----------+
|Room In Zone 1 (TOB)|$80,000.00|
|Close To London B...|$80,000.00|
+--------------------+----------+



                                                                                

In [12]:
# 9. Get the number of hosts in the dataset
listings.select('host_name').distinct().count()

                                                                                

16379

In [16]:
# 10. Get listings with a first review in 2024
# Consider using the "year" function from "pyspark.sql.functions"

from pyspark.sql.functions import year

listings.filter(
    year(listings.first_review) == 2024
) \
.select('name', 'first_review', year(listings.first_review)) \
.show(10, truncate=False)

+--------------------------------------------------+------------+------------------+
|name                                              |first_review|year(first_review)|
+--------------------------------------------------+------------+------------------+
|Close to Wimbledon All England Tennis -huge double|2024-08-11  |2024              |
|one Double bed room with en-suite facilities      |2024-03-21  |2024              |
|Double Room for one -Herne Hill Zone 2/3Thameslink|2024-05-26  |2024              |
|Apartment in Southwark - Private living room      |2024-07-09  |2024              |
|Sm double room  with own bathroom                 |2024-06-04  |2024              |
|Superlux flat in Knightsbridge                    |2024-01-01  |2024              |
|Luxurious Flat in South Kensington                |2024-06-19  |2024              |
|The Pink House, Notting Hill                      |2024-07-14  |2024              |
|Superior Single Room (En-suite)                   |2024-06-21  |