In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Advanced_DF_EX1")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# load data
# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

airbnb_gsc_file_path = 'gs://data_de2024_708179/AB_NYC_2019.csv'

In [9]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.sql.functions import col, max, min, regexp_replace, upper, count, trim, rank



dataSchema = StructType(
        [StructField("id", IntegerType(), True),
         StructField("name", StringType(), True),
         StructField("host_id", IntegerType(), True),
         StructField("host_name", StringType(), True),
         StructField("neighbourhood_group", StringType(), True),
         StructField("neighbourhood", StringType(), True),       
         StructField("latitude", FloatType(), True),
         StructField("longitude", FloatType(), True),
         StructField("room_type", StringType(), True),
         StructField("price", IntegerType(), True),       
         StructField("minimum_nights", IntegerType(), True),
         StructField("number_of_reviews", IntegerType(), True),
         StructField("last_review", DateType(), True),
         StructField("reviews_per_month", FloatType(), True),       
         StructField("calculated_host_listings_count", IntegerType(), True),
         StructField("availability_365", IntegerType(), True),
         ])

df = spark.read.schema(dataSchema).format("csv").option("header", "true") \
       .load(airbnb_gsc_file_path)\
       .withColumn('borough', upper(col('neighbourhood_group')))
df.printSchema()
df.show(10)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: date (nullable = true)
 |-- reviews_per_month: float (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)
 |-- borough: string (nullable = true)

+----+--------------------+-------+-----------+-------------------+------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------+
| 

In [11]:
from google.cloud import bigquery

# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "de_jads_temp_708179"  # use your bucket 
spark.conf.set('temporaryGcsBucket', bucket)
# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
# Saving the data to BigQuery
df.write.format('bigquery') \
  .option('table', 'de2024-435320.assignment_2.airbnb') \
  .mode("overwrite") \
  .save()