In [1]:
from pyspark.sql import SparkSession, Window
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Advanced_DF_EX1")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# load data
# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")


airbnb_gsc_file_path = 'gs://data_de2024-tristan/listings.csv'
reviews_gsc_file_path = 'gs://data_de2024-tristan/reviews-2.csv'
calendar_gsc_file_path = 'gs://data_de2024-tristan/calendar.csv'

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType, LongType
from pyspark.sql.functions import col, max, min, regexp_replace, upper, count, trim, rank, sum, when, avg, year


dataSchema = StructType(
        [StructField("id", LongType(), True),
         StructField("name", StringType(), True),
         StructField("host_id", IntegerType(), True),
         StructField("host_name", StringType(), True),
         StructField("neighbourhood_group", StringType(), True),
         StructField("neighbourhood", StringType(), True),       
         StructField("latitude", FloatType(), True),
         StructField("longitude", FloatType(), True),
         StructField("room_type", StringType(), True),
         StructField("price", FloatType(), True),       
         StructField("minimum_nights", IntegerType(), True),
         StructField("number_of_reviews", IntegerType(), True),
         StructField("last_review", DateType(), True),
         StructField("reviews_per_month", FloatType(), True),       
         StructField("calculated_host_listings_count", IntegerType(), True),
         StructField("availability_365", IntegerType(), True),
         StructField("number_of_reviews_ltm", IntegerType(), True),
         StructField("license", StringType(), True)
         ])

listing_df = spark.read.schema(dataSchema).format("csv").option("header", "true") \
       .load(airbnb_gsc_file_path)\
       .withColumn('borough', upper(col('neighbourhood_group')))

listing_df.printSchema()
listing_df.show(10)

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: float (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: date (nullable = true)
 |-- reviews_per_month: float (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)
 |-- number_of_reviews_ltm: integer (nullable = true)
 |-- license: string (nullable = true)
 |-- borough: string (nullable = true)

+-------------------+--------------------+---------+--------------+-------------------+------------------+---------+----------+---------------+-----+--------------

In [3]:
listing_df = listing_df.withColumn('borough', upper(col('neighbourhood_group')))
listing_df = listing_df.drop('last_review', 'reviews_per_month', 'calculated_host_listings_count', 'neighbourhood', 'neighbourhood_group','availability_365', 'number_of_reviews_ltm', 'license')

listing_df.show()

+-------------------+--------------------+---------+--------------------+---------+----------+---------------+-----+--------------+-----------------+---------+
|                 id|                name|  host_id|           host_name| latitude| longitude|      room_type|price|minimum_nights|number_of_reviews|  borough|
+-------------------+--------------------+---------+--------------------+---------+----------+---------------+-----+--------------+-----------------+---------+
|1189243425411300671|       The Grand III| 61391963|      Stay With Vibe| 40.75025|-73.977745|Entire home/apt| 58.0|            30|                1|MANHATTAN|
| 651593916026998398|Cheerful one bedr...|136812643|               Marah| 40.62476| -73.93556|   Private room| 80.0|            30|                0| BROOKLYN|
|             310325|Large Sunny Bedro...|   745069|            Kimberly| 40.82359| -73.94601|   Private room| 75.0|            30|               31|MANHATTAN|
| 572612125615500056|Room by Sunny & B..

In [4]:
null_counts = listing_df.select(
    *[sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in listing_df.columns]
)
null_counts.show()
listing_df.count()

+---+----+-------+---------+--------+---------+---------+-----+--------------+-----------------+-------+
| id|name|host_id|host_name|latitude|longitude|room_type|price|minimum_nights|number_of_reviews|borough|
+---+----+-------+---------+--------+---------+---------+-----+--------------+-----------------+-------+
|157|  17|    306|      162|     178|      294|      193|14877|           170|              289|    157|
+---+----+-------+---------+--------+---------+---------+-----+--------------+-----------------+-------+



37698

In [5]:
listing_df_cleaned = listing_df.dropna(subset=['id', 'name', 'host_id', 'host_name', 'room_type', 'price','number_of_reviews', 'borough'])
listing_df_cleaned.count()

22680

In [6]:
listing_df_cleaned.show(5)

+-------------------+--------------------+---------+--------------+---------+----------+---------------+-----+--------------+-----------------+---------+
|                 id|                name|  host_id|     host_name| latitude| longitude|      room_type|price|minimum_nights|number_of_reviews|  borough|
+-------------------+--------------------+---------+--------------+---------+----------+---------------+-----+--------------+-----------------+---------+
|1189243425411300671|       The Grand III| 61391963|Stay With Vibe| 40.75025|-73.977745|Entire home/apt| 58.0|            30|                1|MANHATTAN|
| 651593916026998398|Cheerful one bedr...|136812643|         Marah| 40.62476| -73.93556|   Private room| 80.0|            30|                0| BROOKLYN|
|             310325|Large Sunny Bedro...|   745069|      Kimberly| 40.82359| -73.94601|   Private room| 75.0|            30|               31|MANHATTAN|
| 572612125615500056|Room by Sunny & B...|358089614|        Joshua| 40.63952

In [7]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType, LongType, BooleanType
from pyspark.sql.functions import col, max, min, regexp_replace, upper, count, trim, rank, sum, when, avg, year


dataSchema = StructType(
        [StructField("listing_id", LongType(), True),
         StructField("date", DateType(), True),
         StructField("available", StringType(), True),
         StructField("price", StringType(), True),
         StructField("adjusted_price", StringType(), True),       
         StructField("minimum_nights", IntegerType(), True),
         StructField("maximum_nights", IntegerType(), True)
         ])

df_calendar = spark.read.schema(dataSchema).format("csv").option("header", "true") \
       .load(calendar_gsc_file_path)


df_calendar.printSchema()
df_calendar.show(10)

root
 |-- listing_id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- available: string (nullable = true)
 |-- price: string (nullable = true)
 |-- adjusted_price: string (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- maximum_nights: integer (nullable = true)

+----------+----------+---------+-------+--------------+--------------+--------------+
|listing_id|      date|available|  price|adjusted_price|minimum_nights|maximum_nights|
+----------+----------+---------+-------+--------------+--------------+--------------+
|      2595|2024-09-05|        f|$225.00|          NULL|            30|          1125|
|      2595|2024-09-06|        f|$225.00|          NULL|            30|          1125|
|      2595|2024-09-07|        f|$225.00|          NULL|            30|          1125|
|      2595|2024-09-08|        f|$225.00|          NULL|            30|          1125|
|      2595|2024-09-09|        f|$225.00|          NULL|            30|          1125|
|     

In [8]:
df_calendar_cleaned = df_calendar.filter(year(col("date")) == 2025).groupBy("listing_id").agg(
    sum(when(col("available") == "t", 1).otherwise(0)).alias("availability_2025")
)

In [9]:
df_calendar_cleaned.show(10, truncate=False)

+----------+-----------------+
|listing_id|availability_2025|
+----------+-----------------+
|49048     |246              |
|385824    |247              |
|818518    |0                |
|867020    |247              |
|1115381   |0                |
|1617443   |0                |
|1826701   |246              |
|2863589   |0                |
|3156969   |247              |
|5681182   |151              |
+----------+-----------------+
only showing top 10 rows



In [10]:
df_calendar_cleaned.count()

37540

In [11]:
final_df = listing_df_cleaned.join(
    df_calendar_cleaned,
    df_calendar_cleaned["listing_id"] == listing_df_cleaned["id"],
    "inner"
).select(
    listing_df_cleaned["*"],  
    df_calendar_cleaned["availability_2025"]
)

final_df.show(10, truncate=False)


+-------+--------------------------------------------------+--------+----------------+---------+---------+---------------+-----+--------------+-----------------+---------+-----------------+
|id     |name                                              |host_id |host_name       |latitude |longitude|room_type      |price|minimum_nights|number_of_reviews|borough  |availability_2025|
+-------+--------------------------------------------------+--------+----------------+---------+---------+---------------+-----+--------------+-----------------+---------+-----------------+
|49048  |Serene Private Room w own bathroom                |35935   |Angela          |40.6829  |-73.95701|Private room   |95.0 |30            |28               |BROOKLYN |246              |
|385824 |New York City- Riverdale Modern two bedrooms unit |1931205 |Orit            |40.88095 |-73.91701|Entire home/apt|214.0|30            |146              |BRONX    |247              |
|818518 |Manhattan beautiful spacious fire escape 

In [12]:
null_counts = final_df.select(
    *[sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in final_df.columns]
)
null_counts.show()
listing_df.count()

+---+----+-------+---------+--------+---------+---------+-----+--------------+-----------------+-------+-----------------+
| id|name|host_id|host_name|latitude|longitude|room_type|price|minimum_nights|number_of_reviews|borough|availability_2025|
+---+----+-------+---------+--------+---------+---------+-----+--------------+-----------------+-------+-----------------+
|  0|   0|      0|        0|       0|        0|        0|    0|             0|                0|      0|                0|
+---+----+-------+---------+--------+---------+---------+-----+--------------+-----------------+-------+-----------------+



37698

In [13]:
window_spec_price = Window.partitionBy("borough").orderBy(col("price").desc())
window_spec_availability = Window.partitionBy("borough").orderBy(col("availability_2025").desc())
window_spec_reviews = Window.partitionBy("borough").orderBy(col("number_of_reviews").desc())

df_final = (
    final_df.withColumn("price_rank", rank().over(window_spec_price))
      .withColumn("availability_2025", rank().over(window_spec_availability))
      .withColumn("reviews_rank", rank().over(window_spec_reviews))
)

df_final.show(20)

+--------+--------------------+---------+-----------------+---------+---------+---------------+-----+--------------+-----------------+-------+-----------------+----------+------------+
|      id|                name|  host_id|        host_name| latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|borough|availability_2025|price_rank|reviews_rank|
+--------+--------------------+---------+-----------------+---------+---------+---------------+-----+--------------+-----------------+-------+-----------------+----------+------------+
| 9788114|Beautiful Grand S...|   785524|             Eric| 40.81965|-73.92911|   Private room| 99.0|             1|              524|  BRONX|              762|       387|           1|
|20480983|Spacious Top Floo...|   785524|             Eric| 40.82122|-73.92859|   Private room| 91.0|             1|              396|  BRONX|              762|       445|           2|
|  182177|A PRIVATE FLAT / ...|   873273|Christian & Carla| 40.86502|-73.85

In [None]:
from google.cloud import bigquery

# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "de_jads_temp_708179"  # use your bucket 
spark.conf.set('temporaryGcsBucket', bucket)
# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
# Saving the data to BigQuery
df_final.write.format('bigquery') \
  .option('table', 'de2024-435320.assignment_2.airbnb') \
  .mode("overwrite") \
  .save()