In [0]:
# Create the catalog, schema and volume used by this notebook if they are
# missing. Each statement uses IF NOT EXISTS, and they run parent-first:
# catalog -> schema -> volume. The results are bound to `_` and discarded.
# Plain string literals are used because the statements have no placeholders.
_ = spark.sql("CREATE CATALOG IF NOT EXISTS airbnb")
_ = spark.sql("CREATE SCHEMA IF NOT EXISTS airbnb.raw")
_ = spark.sql("CREATE VOLUME IF NOT EXISTS airbnb.raw.vol")

In [0]:
# Directory inside the airbnb.raw.vol volume where the schema contract JSON
# files are written by the last cell of this notebook.
contracts_path = "/Volumes/airbnb/raw/vol/contracts"
# Echo the setting using the real variable name so the log line can be
# searched for / copy-pasted as-is.
print(f'Contracts path : contracts_path="{contracts_path}"')

In [0]:
import json
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

In [0]:
# Schema contract for the detailed listings data: 74 fields, all nullable,
# declared in the order given below. It is serialized to
# listing_schema.json under `contracts_path` at the end of the notebook.
#
# NOTE(review): `neighbourhood_group_cleansed` and `calendar_updated` are
# typed as DoubleType -- presumably inferred from columns that were entirely
# empty in a sample; confirm against the real data before relying on them.
# NOTE(review): columns whose names suggest dates (e.g. `last_scraped`,
# `host_since`, `first_review`) are declared as plain strings here.
listing_schema = StructType(
    [
        StructField(column_name, column_type, nullable=True)
        for column_name, column_type in (
            ("listing_url", StringType()),
            ("scrape_id", LongType()),
            ("last_scraped", StringType()),
            ("source", StringType()),
            ("name", StringType()),
            ("description", StringType()),
            ("neighborhood_overview", StringType()),
            ("picture_url", StringType()),
            ("host_id", LongType()),
            ("host_url", StringType()),
            ("host_name", StringType()),
            ("host_since", StringType()),
            ("host_location", StringType()),
            ("host_about", StringType()),
            ("host_response_time", StringType()),
            ("host_response_rate", StringType()),
            ("host_acceptance_rate", StringType()),
            ("host_is_superhost", StringType()),
            ("host_thumbnail_url", StringType()),
            ("host_picture_url", StringType()),
            ("host_neighbourhood", StringType()),
            ("host_listings_count", DoubleType()),
            ("host_total_listings_count", DoubleType()),
            ("host_verifications", StringType()),
            ("host_has_profile_pic", StringType()),
            ("host_identity_verified", StringType()),
            ("neighbourhood", StringType()),
            ("neighbourhood_cleansed", StringType()),
            ("neighbourhood_group_cleansed", DoubleType()),
            ("latitude", DoubleType()),
            ("longitude", DoubleType()),
            ("property_type", StringType()),
            ("room_type", StringType()),
            ("accommodates", LongType()),
            ("bathrooms", DoubleType()),
            ("bathrooms_text", StringType()),
            ("bedrooms", DoubleType()),
            ("beds", DoubleType()),
            ("amenities", StringType()),
            ("price", StringType()),
            ("minimum_nights", LongType()),
            ("maximum_nights", LongType()),
            ("minimum_minimum_nights", LongType()),
            ("maximum_minimum_nights", LongType()),
            ("minimum_maximum_nights", LongType()),
            ("maximum_maximum_nights", LongType()),
            ("minimum_nights_avg_ntm", DoubleType()),
            ("maximum_nights_avg_ntm", DoubleType()),
            ("calendar_updated", DoubleType()),
            ("has_availability", StringType()),
            ("availability_30", LongType()),
            ("availability_60", LongType()),
            ("availability_90", LongType()),
            ("availability_365", LongType()),
            ("calendar_last_scraped", StringType()),
            ("number_of_reviews", LongType()),
            ("number_of_reviews_ltm", LongType()),
            ("number_of_reviews_l30d", LongType()),
            ("first_review", StringType()),
            ("last_review", StringType()),
            ("review_scores_rating", DoubleType()),
            ("review_scores_accuracy", DoubleType()),
            ("review_scores_cleanliness", DoubleType()),
            ("review_scores_checkin", DoubleType()),
            ("review_scores_communication", DoubleType()),
            ("review_scores_location", DoubleType()),
            ("review_scores_value", DoubleType()),
            ("license", StringType()),
            ("instant_bookable", StringType()),
            ("calculated_host_listings_count", LongType()),
            ("calculated_host_listings_count_entire_homes", LongType()),
            ("calculated_host_listings_count_private_rooms", LongType()),
            ("calculated_host_listings_count_shared_rooms", LongType()),
            ("reviews_per_month", DoubleType()),
        )
    ]
)

In [0]:
# Schema contract for the calendar data: six nullable fields in the order
# given below. `date`, `available`, `price` and `adjusted_price` are declared
# as strings; the two night limits are longs.
calendar_schema = StructType(
    [
        StructField(column_name, column_type, nullable=True)
        for column_name, column_type in (
            ("date", StringType()),
            ("available", StringType()),
            ("price", StringType()),
            ("adjusted_price", StringType()),
            ("minimum_nights", LongType()),
            ("maximum_nights", LongType()),
        )
    ]
)

In [0]:
# Schema contract for the aggregated (summary) listings data: 17 nullable
# fields in the order given below. Unlike `listing_schema`, `price` is a
# long here rather than a string.
#
# NOTE(review): `neighbourhood_group` is typed as DoubleType -- presumably
# inferred from a column that was entirely empty in a sample; confirm.
agg_listing_schema = StructType(
    [
        StructField(column_name, column_type, nullable=True)
        for column_name, column_type in (
            ("name", StringType()),
            ("host_id", LongType()),
            ("host_name", StringType()),
            ("neighbourhood_group", DoubleType()),
            ("neighbourhood", StringType()),
            ("latitude", DoubleType()),
            ("longitude", DoubleType()),
            ("room_type", StringType()),
            ("price", LongType()),
            ("minimum_nights", LongType()),
            ("number_of_reviews", LongType()),
            ("last_review", StringType()),
            ("reviews_per_month", DoubleType()),
            ("calculated_host_listings_count", LongType()),
            ("availability_365", LongType()),
            ("number_of_reviews_ltm", LongType()),
            ("license", StringType()),
        )
    ]
)

In [0]:
# Serialize each schema to JSON and write it under `contracts_path`.
# The trailing `True` is the overwrite flag of dbutils.fs.put, so re-running
# this cell replaces any previously written contract files.
dbutils.fs.put(f"{contracts_path}/listing_schema.json", listing_schema.json(), True)
dbutils.fs.put(f"{contracts_path}/calendar_schema.json", calendar_schema.json(), True)
dbutils.fs.put(f"{contracts_path}/agg_listing_schema.json", agg_listing_schema.json(), True)