In [76]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Advanced_DF_EX1")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# load data
# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")


airbnb_gsc_file_path = 'gs://data_de2024_708179/AB_NYC_2019.csv'

In [77]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.sql.functions import col, max, min, regexp_replace, upper, count, trim, rank, sum, when, avg


dataSchema = StructType(
        [StructField("id", IntegerType(), True),
         StructField("name", StringType(), True),
         StructField("host_id", IntegerType(), True),
         StructField("host_name", StringType(), True),
         StructField("neighbourhood_group", StringType(), True),
         StructField("neighbourhood", StringType(), True),       
         StructField("latitude", FloatType(), True),
         StructField("longitude", FloatType(), True),
         StructField("room_type", StringType(), True),
         StructField("price", IntegerType(), True),       
         StructField("minimum_nights", IntegerType(), True),
         StructField("number_of_reviews", IntegerType(), True),
         StructField("last_review", DateType(), True),
         StructField("reviews_per_month", FloatType(), True),       
         StructField("calculated_host_listings_count", IntegerType(), True),
         StructField("availability_365", IntegerType(), True),
         ])

df.printSchema()
df.show(10)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: date (nullable = true)
 |-- reviews_per_month: float (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)
 |-- borough: string (nullable = true)

+----+--------------------+-------+-----------+-------------------+------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------+
| 

In [80]:
df = df.withColumn('borough', upper(col('neighbourhood_group')))
df = df.drop('last_review', 'reviews_per_month', 'calculated_host_listings_count', 'neighbourhood_group')

df.show()

+----+--------------------+-------+----------------+------------------+--------+---------+---------------+-----+--------------+-----------------+----------------+---------+
|  id|                name|host_id|       host_name|     neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|availability_365|  borough|
+----+--------------------+-------+----------------+------------------+--------+---------+---------------+-----+--------------+-----------------+----------------+---------+
|2539|Clean & quiet apt...|   2787|            John|        Kensington|40.64749|-73.97237|   Private room|  149|             1|                9|             365| BROOKLYN|
|2595|Skylit Midtown Ca...|   2845|        Jennifer|           Midtown|40.75362|-73.98377|Entire home/apt|  225|             1|               45|             355|MANHATTAN|
|3647|THE VILLAGE OF HA...|   4632|       Elisabeth|            Harlem|40.80902| -73.9419|   Private room|  150|             3|        

In [82]:
null_counts = df.select(
    *[sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df.columns]
)
null_counts.show()
df.count()

+---+----+-------+---------+-------------+--------+---------+---------+-----+--------------+-----------------+----------------+-------+
| id|name|host_id|host_name|neighbourhood|latitude|longitude|room_type|price|minimum_nights|number_of_reviews|availability_365|borough|
+---+----+-------+---------+-------------+--------+---------+---------+-----+--------------+-----------------+----------------+-------+
|184|  32|    350|      206|          185|     194|      343|      185|  194|           188|              342|             342|    185|
+---+----+-------+---------+-------------+--------+---------+---------+-----+--------------+-----------------+----------------+-------+



49079

In [85]:
df_cleaned = df.dropna(subset=['id', 'name', 'host_id', 'host_name', 'room_type', 'price','number_of_reviews', 'availability_365', 'borough'])
df_cleaned.count()

48683

In [88]:
df_cleaned.show(5)

+----+--------------------+-------+-----------+-------------+--------+---------+---------------+-----+--------------+-----------------+----------------+---------+
|  id|                name|host_id|  host_name|neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|availability_365|  borough|
+----+--------------------+-------+-----------+-------------+--------+---------+---------------+-----+--------------+-----------------+----------------+---------+
|2539|Clean & quiet apt...|   2787|       John|   Kensington|40.64749|-73.97237|   Private room|  149|             1|                9|             365| BROOKLYN|
|2595|Skylit Midtown Ca...|   2845|   Jennifer|      Midtown|40.75362|-73.98377|Entire home/apt|  225|             1|               45|             355|MANHATTAN|
|3647|THE VILLAGE OF HA...|   4632|  Elisabeth|       Harlem|40.80902| -73.9419|   Private room|  150|             3|                0|             365|MANHATTAN|
|3831|Cozy Entire Floo

In [92]:
window_spec_price = Window.partitionBy("borough").orderBy(col("price").desc())
window_spec_availability = Window.partitionBy("borough").orderBy(col("availability_365").desc())
window_spec_reviews = Window.partitionBy("borough").orderBy(col("number_of_reviews").desc())

df_final = (
    df_cleaned.withColumn("price_rank", rank().over(window_spec_price))
      .withColumn("availability_rank", rank().over(window_spec_availability))
      .withColumn("reviews_rank", rank().over(window_spec_reviews))
)

df_final.show(20)

+--------+--------------------+--------+--------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+----------------+--------+----------+-----------------+------------+
|      id|                name| host_id|     host_name|       neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|availability_365| borough|price_rank|availability_rank|reviews_rank|
+--------+--------------------+--------+--------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+----------------+--------+----------+-----------------+------------+
| 3474320|Private brownston...|12949460|           Asa|          Park Slope|40.67926|-73.97711|Entire home/apt|  160|             1|              488|             269|BROOKLYN|      3865|             3450|           1|
|  166172|LG Private Room/F...|  792159|         Wanda|            Bushwick|40.70283|-73.92131|   Private room|   60|       

In [11]:
from google.cloud import bigquery

# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "de_jads_temp_708179"  # use your bucket 
spark.conf.set('temporaryGcsBucket', bucket)
# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
# Saving the data to BigQuery
df_final.write.format('bigquery') \
  .option('table', 'de2024-435320.assignment_2.airbnb') \
  .mode("overwrite") \
  .save()