In [1]:
#import necessary dependencies
from pyspark.context import SparkContext
from awsglue.context import GlueContext
import boto3
import io
import sys
import os
import re
import random
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.sql.window import Window
from awsglue.transforms import *
from awsglue.transforms import ApplyMapping
from awsglue.dynamicframe import DynamicFrame

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Trying to create a Glue session for the kernel.
Session Type: glueetl
Session ID: c0df4415-bd32-4a9f-aa29-90dbd9212d68
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session c0df4415-bd32-4a9f-aa29-90dbd9212d68 to get into ready status...
Session c0df4415-bd32-4a9f-aa29-90dbd9212d68 has been created.



In [3]:
#initialize spark context and glue context
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session




In [4]:
# create dynamicframes 
def create_dynamic_frame_from_csv(file):
    """
    This function creates a dynamic frame from a CSV file
  
    Parameter(csv file): The CSV file to create a dynamic frame
    
    Returns: The dynamic frame created from the CSV file
    """

    dynamic_frame = glueContext.create_dynamic_frame.from_options(
        format_options={"withHeader": True, "separator": ","},
        connection_type="s3",
        format="csv",
        connection_options={"paths": [file], "recurse": True},
        transformation_ctx="airbnbETLpipeline"
  )
    return dynamic_frame




In [5]:
# create dynamicframes for all the cities
dubaiData = create_dynamic_frame_from_csv('s3://airbnb-listings-data/raw-data/DubaiData.csv')
londonData  = create_dynamic_frame_from_csv('s3://airbnb-listings-data/raw-data/LondonData.csv')
miamiData = create_dynamic_frame_from_csv('s3://airbnb-listings-data/raw-data/MiamiData.csv')
newYorkCityData = create_dynamic_frame_from_csv('s3://airbnb-listings-data/raw-data/NYCData.csv')
sanFranciscoData = create_dynamic_frame_from_csv('s3://airbnb-listings-data/raw-data/SanFransiscoData.csv')
sydneyData = create_dynamic_frame_from_csv('s3://airbnb-listings-data/raw-data/SydneyData.csv')
tokyoData =  create_dynamic_frame_from_csv('s3://airbnb-listings-data/raw-data/TokyoData.csv')
torontoData  = create_dynamic_frame_from_csv('s3://airbnb-listings-data/raw-data/TorontoData.csv')
losAngelesData = create_dynamic_frame_from_csv('s3://airbnb-listings-data/raw-data/LAData.csv')




In [6]:
#convert dynamicframe to dataframe and check the schema to map the columns
dubaiData.printSchema()

root
|-- Listing Title: string
|-- Property Type: string
|-- Listing Type: string
|-- Created Date: string
|-- Last Scraped Date: string
|-- Country: string
|-- City: string
|-- Zipcode: string
|-- Currency Native: string
|-- Number of Reviews: string
|-- Bedrooms: string
|-- Bathrooms: string
|-- Max Guests: string
|-- Airbnb Superhost: string
|-- Cancellation Policy: string
|-- Cleaning Fee (USD): string
|-- Cleaning Fee (Native): string
|-- Extra People Fee (USD): string
|-- Extra People Fee(Native): string
|-- Check-in Time: string
|-- Checkout Time: string
|-- Minimum Stay: string
|-- Latitude: string
|-- Longitude: string
|-- Exact Location: string
|-- Overall Rating: string
|-- Airbnb Communication Rating: string
|-- Airbnb Accuracy Rating: string
|-- Airbnb Cleanliness Rating: string
|-- Airbnb Checkin Rating: string
|-- Airbnb Location Rating: string
|-- Airbnb Value Rating: string
|-- Amenities: string
|-- picture_url: string
|-- License: string
|-- Airbnb Property ID: string

In [12]:
# create a list of all the dynamic frames
mapped_data = [
    ("Listing Title", "string", "listingTitle", "string"),
    ("Property Type", "string", "propertyType", "string"),
    ("Listing Type", "string", "listingType", "string"),
    ("Created Date", "string", "createdDate", "string"),
    ("Last Scraped Date", "string", "lastScrapedDate", "string"),
    ("Country", "string", "country", "string"),
    ("State", "string", "state", "string"),
    ("City", "string", "city", "string"),
    ("Zipcode", "string", "zipcode", "string"),
    ("Longitude", "string", "longitude", "float"),  
    ("guest_controls", "string", "guestControls", "string"), 
    ("Airbnb Location Rating", "string", "airbnbLocationRating", "int"), 
    ("Currency Native", "string", "currencyNative", "string"),
    ("Average Daily Rate (Native)", "float", "averageDailyRateNative", "float"), 
    ("Average Daily Rate (USD)", "float", "averageDailyRateUSD", "float"), 
    ("Count Blocked Days LTM", "string", "countBlockedDaysLTM", "int"), 
    ("Bedrooms", "string", "bedrooms", "int"), 
    ("Airbnb Property ID", "string", "airbnbPropertyID", "string"), 
    ("Check-in Time", "string", "checkInTime", "string"), 
    ("Bathrooms", "string", "bathrooms", "int"), 
    ("Airbnb Communication Rating", "string", "airbnbCommunicationRating", "int"),
    ("Airbnb Host ID", "string", "airbnbHostID", "string"),
    ("Pets Allowed", "string", "petsAllowed", "string"),
    ("Extra People Fee(Native)", "string", "extraPeopleFeeNative", "float"),
    ("License", "string", "license", "string"),
    ("Instantbook Enabled", "string", "instantbookEnabled", "string"),
    ("Amenities", "string", "amenities", "string"),
    ("Overall Rating", "string", "overallRating", "float"),
    ("Airbnb Accuracy Rating", "string", "airbnbAccuracyRating", "int"),
    ("Cancellation Policy", "string", "cancellationPolicy", "string"),
    ("Cleaning Fee (USD)", "string", "cleaningFeeUSD", "float"),
    ("Listing URL", "string", "listingURL", "string"),
    ("instant_bookable", "string", "instantBookable", "string"),
    ("picture_url", "string", "pictureUrl", "string"),
    ("Latitude", "string", "latitude", "float"),
    ("Count Available Days LTM", "int", "countavailableDaysLTM", "int"),
    ("Count Reservation Days LTM", "int", "countReservationDaysLTM", "int"),
    ("Number of Reviews", "string", "numberofReviews", "int"),
    ("Checkout Time", "string", "checkoutTime", "string"),
    ("Airbnb Value Rating", "string", "airbnbValueRating", "int"),
    ("Extra People Fee (USD)", "string", "extraPeopleFeeUSD", "float"),
    ("Airbnb Checkin Rating", "string", "airbnbCheckinRating", "int"),
    ("Airbnb Superhost", "string", "airbnbSuperhost", "string"),
    ("Exact Location", "string", "exactLocation", "string"),
    ("Host Listing Count", "string", "hostListingCount", "int"),
    ("Minimum Stay", "string", "minimumStay", "int"),
    ("Occupancy Rate LTM", "string", "occupancyRateLTM", "int"),
    ("Max Guests", "string", "maxGuests", "int"),
    ("Number of Bookings LTM", "string", "numberOfBookingsLTM", "int"),
    ("Number of Bookings LTM - Number of observed month", "string", "numberOfBookingsLTMObservedMonth", "int"),
    ("Airbnb Cleanliness Rating", "string", "airbnbCleanlinessRating", "int"),
    ("Annual Revenue LTM (USD)", "string", "annualRevenueLTMUSD", "float"), 
    ("Annual Revenue LTM (Native)", "string", "annualRevenueLTMNative", "float"), 
]





In [8]:
def applymappings(dyf, mapped_data, transform_ctx_tag):
    """ 
    This function applies mappings to the dataframe

    Parameters:
    dyf: The dataframe to apply mappings to
    mapped_data: The mapping data
    transform_ctx_tag: The transformation context tag
    Returns: The mapped dataframe
    """
    mapped_dyf = ApplyMapping.apply(
        frame = dyf,
        mappings = mapped_data,
        transformation_ctx = transform_ctx_tag
    )
    
    return mapped_dyf




In [13]:
# apply mappings to all the cities
mapped_losAngelesData= applymappings(losAngelesData, mapped_data, "mappedLosAngelesData")
mapped_londonData = applymappings(londonData, mapped_data, "mappedLondonData")
mapped_newYorkCityData = applymappings(newYorkCityData, mapped_data, "mappedNewYorkCityData")
mapped_sanFranciscoData = applymappings(sanFranciscoData, mapped_data, "mappedSanFranciscoData")
mapped_sydneyData = applymappings(sydneyData, mapped_data, "mappedSydneyData")
mapped_tokyoData = applymappings(tokyoData, mapped_data, "mappedTokyoData")
mapped_torontoData = applymappings(torontoData, mapped_data, "mapped_torontoData")




In [14]:
# Apply mapping fucntions to miamiData and add column differences
mapped_data_copy_miami = mapped_data.copy()

mapped_data_copy_miami.extend([("Neighbourhood", "string", "neighbourhood", "string"), 
                           ("Metropolitan Statistical Area", "string", "metropolitan_statistical_area'", "string"),
                           ("Last Host Count Updated Date", "string", "last_host_count_updated_date", "string")])

mapped_miamiData = applymappings(miamiData, mapped_data_copy_miami, "mappedMiamiData")




In [15]:
# Apply mapping fucntions to dubaiData and add column differences
mapped_data_copy_dubai = mapped_data.copy()

mapped_data_copy_dubai.remove(("State", "string", "state", "string"))

mapped_dubaiData = applymappings(dubaiData, mapped_data_copy_dubai, 'mappedDubaiData')




In [16]:
# Convert dynamic frame to spark dataframe
losAngelesData_dyf = mapped_losAngelesData.toDF()
londonData_dyf = mapped_londonData.toDF() 
miamiData_dyf = mapped_miamiData.toDF() 
newYorkCityData_dyf = mapped_newYorkCityData.toDF() 
sanFranciscoData_dyf = mapped_sanFranciscoData.toDF() 
sydneyData_dyf = mapped_sydneyData.toDF() 
tokyoData_dyf = mapped_tokyoData.toDF()
dubaiData_dyf = mapped_dubaiData.toDF() 
torontoData_dyf = mapped_torontoData.toDF()




In [17]:
# show the first 10 rows of the dubaiData
dubaiData_dyf.show(10)

+--------------------+--------------------+------------+-----------+---------------+-------+-----+-------+---------+--------------------+--------------------+--------------+----------------------+-------------------+-------------------+--------+------------------+--------------------+---------+-------------------------+------------+-----------+--------------------+---------------+------------------+--------------------+-------------+--------------------+--------------------+--------------+--------------------+---------------+--------------------+--------+---------------------+-----------------------+---------------+------------+-----------------+-----------------+-------------------+---------------+-------------+----------------+-----------+----------------+---------+-------------------+--------------------------------+-----------------------+-------------------+----------------------+
|        listingTitle|        propertyType| listingType|createdDate|lastScrapedDate|country| city|zip

In [18]:
def get_shape(dyf):
    """
    Rturns the number of rows and columns of a dataframe
    """
    return f"rows: {dyf.count()}, columns: {len(dyf.columns)}"




In [19]:
print(f"Dubai: {get_shape(dubaiData_dyf)}")
print(f"Los Angeles: {get_shape(losAngelesData_dyf)}")
print(f"London: {get_shape(londonData_dyf)}")
print(f"Miami: {get_shape(miamiData_dyf)}")
print(f"New York City: {get_shape(newYorkCityData_dyf)}")
print(f"San Francisco: {get_shape(sanFranciscoData_dyf)}")
print(f"Sydney: {get_shape(sydneyData_dyf)}")
print(f"Tokyo: {get_shape(tokyoData_dyf)}")
print(f"Toronto: {get_shape(torontoData_dyf)}")

Dubai: rows: 13560, columns: 52
Los Angeles: rows: 12837, columns: 53
London: rows: 43580, columns: 53
Miami: rows: 10205, columns: 56
New York City: rows: 40343, columns: 53
San Francisco: rows: 3801, columns: 53
Sydney: rows: 5938, columns: 53
Tokyo: rows: 9661, columns: 53
Toronto: rows: 5900, columns: 53


In [20]:
# Find common columns across all the dataframes 
unique_columns = set(miamiData_dyf.columns).intersection(
    set(dubaiData_dyf.columns), 
    set(losAngelesData_dyf.columns),
    set(londonData_dyf.columns),
    set(newYorkCityData_dyf.columns),
    set(sanFranciscoData_dyf.columns),
    set(sydneyData_dyf.columns),
    set(tokyoData_dyf.columns),
    set(torontoData_dyf.columns)
)
print(f'{unique_columns} \n Number of columns in common: {len(unique_columns)}')

{'cleaningFeeUSD', 'bathrooms', 'occupancyRateLTM', 'guestControls', 'listingType', 'zipcode', 'averageDailyRateNative', 'bedrooms', 'airbnbCleanlinessRating', 'listingURL', 'numberOfBookingsLTMObservedMonth', 'airbnbLocationRating', 'checkInTime', 'airbnbCommunicationRating', 'amenities', 'lastScrapedDate', 'hostListingCount', 'airbnbAccuracyRating', 'country', 'countavailableDaysLTM', 'checkoutTime', 'airbnbValueRating', 'exactLocation', 'maxGuests', 'createdDate', 'averageDailyRateUSD', 'cancellationPolicy', 'airbnbHostID', 'countBlockedDaysLTM', 'propertyType', 'airbnbSuperhost', 'annualRevenueLTMNative', 'listingTitle', 'latitude', 'numberofReviews', 'city', 'petsAllowed', 'minimumStay', 'numberOfBookingsLTM', 'annualRevenueLTMUSD', 'extraPeopleFeeUSD', 'extraPeopleFeeNative', 'currencyNative', 'airbnbPropertyID', 'license', 'pictureUrl', 'longitude', 'airbnbCheckinRating', 'overallRating', 'instantbookEnabled', 'countReservationDaysLTM', 'instantBookable'} 
 Number of columns in 

In [21]:
# Find the column differences in comparison to miami dynamic frame with thighest number of columns
print(f"miami_dubai_column_differences: {set(miamiData_dyf.columns).difference(set(dubaiData_dyf.columns))}")

miami_dubai_column_differences: {'state', 'neighbourhood', "metropolitan_statistical_area'", 'last_host_count_updated_date'}


In [22]:
# Find the column differences in comparison to miami dynamic frame with highest number of columns
print(f"miami_toronto_column_differences: {set(miamiData_dyf.columns).difference(set(torontoData_dyf.columns))}")

miami_toronto_column_differences: {'neighbourhood', "metropolitan_statistical_area'", 'last_host_count_updated_date'}


In [23]:
# Find the column differences in comparison to miami dynamic frame with thighest number of columns
print(f"miami_la_column_differences: {set(miamiData_dyf.columns).difference(set(losAngelesData_dyf.columns))}")

miami_la_column_differences: {'neighbourhood', "metropolitan_statistical_area'", 'last_host_count_updated_date'}


In [24]:
print(f"miami_london_column_differences: {set(miamiData_dyf.columns).difference(set(londonData_dyf.columns))}")

miami_london_column_differences: {'neighbourhood', "metropolitan_statistical_area'", 'last_host_count_updated_date'}


In [25]:
print(f"miami_nyc_column_differences: {set(miamiData_dyf.columns).difference(set(newYorkCityData_dyf.columns))}")

miami_nyc_column_differences: {'neighbourhood', "metropolitan_statistical_area'", 'last_host_count_updated_date'}


In [26]:
print(f"miami_sanfransisco_column_differences: {set(miamiData_dyf.columns).difference(set(sanFranciscoData_dyf.columns))}")

miami_sanfransisco_column_differences: {'neighbourhood', "metropolitan_statistical_area'", 'last_host_count_updated_date'}


In [27]:
print(f"miami_sydney_column_differences: {set(miamiData_dyf.columns).difference(set(sydneyData_dyf.columns))}")

miami_sydney_column_differences: {'neighbourhood', "metropolitan_statistical_area'", 'last_host_count_updated_date'}


In [28]:
print(f"miami_tokyo_column_differences: {set(miamiData_dyf.columns).difference(set(tokyoData_dyf.columns))}")

miami_tokyo_column_differences: {'neighbourhood', "metropolitan_statistical_area'", 'last_host_count_updated_date'}


In [29]:
# Drop Neighbourhood, Last Host Count Updated Date,Metropolitan Statistical Area columns
miamiData_dyf = miamiData_dyf.drop('Neighbourhood', 'Last Host Count Updated Date','Metropolitan Statistical Area')




In [30]:
# Remove last scraped data from all the dataframes
losAngelesData_dyf = losAngelesData_dyf.drop('Last Scraped Date')
londonData_dyf = londonData_dyf.drop('Last Scraped Date')
miamiData_dyf = miamiData_dyf.drop('Last Scraped Date')
newYorkCityData_dyf = newYorkCityData_dyf.drop('Last Scraped Date')
sanFranciscoData_dyf = sanFranciscoData_dyf.drop('Last Scraped Date')
sydneyData_dyf = sydneyData_dyf.drop('Last Scraped Date')
tokyoData_dyf = tokyoData_dyf.drop('Last Scraped Date')
dubaiData_dyf = dubaiData_dyf.drop('Last Scraped Date')
torontoData_dyf = torontoData_dyf.drop('Last Scraped Date')




In [31]:
# Drop duplicates
londonData_dyf = londonData_dyf.dropDuplicates()
losAngelesData_dyf = losAngelesData_dyf.dropDuplicates()
miamiData_dyf = miamiData_dyf.dropDuplicates()
newYorkCityData_dyf = newYorkCityData_dyf.dropDuplicates()
sanFranciscoData_dyf = sanFranciscoData_dyf.dropDuplicates()
sydneyData_dyf = sydneyData_dyf.dropDuplicates()
tokyoData_dyf = tokyoData_dyf.dropDuplicates()
dubaiData_dyf = dubaiData_dyf.dropDuplicates()
torontoData_dyf = torontoData_dyf.dropDuplicates()




In [32]:
def fill_missing_values(dyf):
    """
    This function fills missing values in the dataframe

    Parameters:
    dyf: The dataframe to fill missing values

    Returns: The dataframe with missing values filled
    """
    for col_name, dtype in dyf.dtypes:
        if dtype in ["string"]:
            dyf = dyf.fillna("NA", subset=[col_name])
        elif dtype in ["float"]:
            dyf = dyf.fillna(0.0, subset=[col_name])
        elif dtype in ["int"]:
            dyf = dyf.fillna(0, subset=[col_name])
        elif dtype in ["date"]:
            dyf = dyf.fillna("NA", subset=[col_name])
        else:
            continue
    return dyf




In [33]:
# create id columns in dataframes
losAngelesData_dyf = fill_missing_values(losAngelesData_dyf)
londonData_dyf = fill_missing_values(londonData_dyf)
miamiData_dyf = fill_missing_values(miamiData_dyf)
newYorkCityData_dyf = fill_missing_values(newYorkCityData_dyf)
sanFranciscoData_dyf = fill_missing_values(sanFranciscoData_dyf)
sydneyData_dyf = fill_missing_values(sydneyData_dyf)
tokyoData_dyf = fill_missing_values(tokyoData_dyf)
dubaiData_dyf = fill_missing_values(dubaiData_dyf)
torontoData_dyf = fill_missing_values(torontoData_dyf)




In [34]:
# Define emojis in unicodes 
emoji_patterns = re.compile("(?u)["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002700-\U000027BF"  # Dingbats
        u"\U00002600-\U000026FF"  # Miscellaneous Symbols
        u"\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
        u"\U0001F100-\U0001F1FF"  # Enclosed Alphanumeric Supplement
        u"\U0001F200-\U0001F2FF"  # Enclosed Ideographic Supplement
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u" \U000025A0-\U000025FF" # Geometric Shapes
          +"]")




In [35]:
def remove_emojis(text):
    """
    This function removes emojis from text

    Parameters:
    text: The text to remove emojis from

    Returns: The text with emojis removed
    """
    if isinstance(text, str):
        return emoji_patterns.sub(' ', text)
    else:
        return text




In [36]:
# create a udyf for remove emoji
remove_emoji_udf = F.udf(remove_emojis)




In [37]:
# Function to remove emojis from all the dataframes
def remove_emojis_from(dyf):
    """
    This function removes emojis from all the columns in a dataframe

    Parameters:
    dyf: The dataframe to remove emojis from

    Returns: The dataframe with emojis removed
    """
    for col_name in dyf.columns:
        dyf = dyf.withColumn(col_name, remove_emoji_udf(F.col(col_name)))
    return dyf




In [38]:
# Remove emojis from all the dataframes
tokyoData_dyf = remove_emojis_from(tokyoData_dyf)
dubaiData_dyf = remove_emojis_from(dubaiData_dyf)
losAngelesData_dyf = remove_emojis_from(losAngelesData_dyf)
londonData_dyf = remove_emojis_from(londonData_dyf)
newYorkCityData_dyf = remove_emojis_from(newYorkCityData_dyf)
sanFranciscoData_dyf = remove_emojis_from(sanFranciscoData_dyf)
sydneyData_dyf = remove_emojis_from(sydneyData_dyf)
torontoData_dyf = remove_emojis_from(torontoData_dyf)
miamiData_dyf = remove_emojis_from(miamiData_dyf)




In [40]:
dubaiData_dyf.show(10)

+--------------------+--------------------+------------+-----------+---------------+-------+-----+-------+------------------+--------------------+--------------------+--------------+----------------------+-------------------+-------------------+--------+------------------+--------------------+---------+-------------------------+------------+-----------+--------------------+------------+------------------+--------------------+-----------------+--------------------+--------------------+--------------+--------------------+---------------+--------------------+------------------+---------------------+-----------------------+---------------+------------+-----------------+------------------+-------------------+---------------+-------------+----------------+-----------+----------------+---------+-------------------+--------------------------------+-----------------------+-------------------+----------------------+
|        listingTitle|        propertyType| listingType|createdDate|lastScrapedD

In [41]:
def standardised_columns(dyf, country, city= None, state=None, currency_native=None):
    """
    This function standardises the columns in the dataframe

    Parameters:
    dyf: The dataframe to standardise columns
    country: The country to add to the dataframe
    city: The city to add to the dataframe
    state: The state to add to the dataframe
    currency_native: The currency native to add to the dataframe

    Returns: The dataframe with standardised columns
    """ 
    dyf = dyf.withColumn('Country', F.lit(country))
    if city is not None:
        dyf = dyf.withColumn('City', F.lit(city))
    if state is not None:
        dyf = dyf.withColumn('State', F.lit(state))
    if currency_native is not None:
        dyf = dyf.withColumn('Currency Native', F.lit(None))
    return dyf




In [42]:
# Standardise columns for all the dataframes
standardised_columns(dyf=dubaiData_dyf, country = 'AE', city = 'Dubai', state = 'dubai', currency_native= 'AED' )
standardised_columns(dyf=losAngelesData_dyf, country = 'US', state = 'Los Angeles', currency_native= 'USD')
standardised_columns(dyf=londonData_dyf,  country = 'GB', state = 'UK',  currency_native= 'GBP')
standardised_columns(dyf=miamiData_dyf, country = 'US',state = 'Florida',  currency_native= 'GBP' )
standardised_columns(dyf=newYorkCityData_dyf, country = 'US',  currency_native= 'USD' )
standardised_columns(dyf=sanFranciscoData_dyf,  country = 'US', state = 'California', city = 'San Francisco', currency_native= 'USD' )
standardised_columns(dyf=sydneyData_dyf, country = 'AU', state = 'New South Wales', currency_native= 'AUD' )
standardised_columns(dyf=tokyoData_dyf, country = 'JP', state = 'Tokyo', currency_native= 'JPY' )
standardised_columns(dyf=torontoData_dyf,country = 'CA', state = 'Ontario', currency_native= 'CAD' )




In [47]:
# create new ID columns for dimension tables
def create_id_columns(dyf, dyf_name):
  """
  This function creates new ID columns for the dataframe

  Parameters:
  dyf_name: The name of the dataframe
  dyf: The dataframe to create new ID columns

  Returns: The dataframe with new ID columns
  """

  prefix = dyf_name[:3].upper()

  dyf = dyf.withColumn("monotonically_increasing_id", F.monotonically_increasing_id()+ 1)
  window_spec = Window.orderBy("monotonically_increasing_id")
  dyf = dyf.withColumn("listingId", F.concat_ws('', F.lit(prefix), F.lit('LIS' ), F.row_number().over(window_spec).cast(StringType())))
  dyf = dyf.withColumn("amenitiesId", F.concat_ws('', F.lit(prefix), F.lit('AMT'), F.row_number().over(window_spec).cast(StringType())))
  dyf = dyf.withColumn("ratingId", F.concat_ws('', F.lit(prefix), F.lit('RTE'), F.row_number().over(window_spec).cast(StringType())))
  dyf = dyf.withColumn("bookingId", F.concat_ws('', F.lit(prefix), F.lit('BKD'), F.row_number().over(window_spec).cast(StringType())))

  dyf = dyf.drop('monotonically_increasing_id"')

  return dyf




In [48]:
#create id columns in dyf dictionary
dubaiData_dyf = create_id_columns(dubaiData_dyf, 'dubaiData_dyf')
losAngelesData_dyf = create_id_columns(losAngelesData_dyf, 'losAngelesData_dyf')
londonData_dyf = create_id_columns(londonData_dyf, 'londonData_dyf')
miamiData_dyf = create_id_columns(miamiData_dyf, 'miamiData_dyf')
newYorkCityData_dyf = create_id_columns(newYorkCityData_dyf, 'newYorkCityData_dyf')
sanFranciscoData_dyf = create_id_columns(sanFranciscoData_dyf, 'sanFranciscoData_dyf')
sydneyData_dyf = create_id_columns(sydneyData_dyf, 'sydneyData_dyf')
tokyoData_dyf = create_id_columns(tokyoData_dyf, 'tokyoData_dyf')
torontoData_dyf = create_id_columns(torontoData_dyf, 'torontoData_dyf')




In [49]:
# show the first 10 rows of the torontoData_dyf
torontoData_dyf.select('listingId').show()

+---------+
|listingId|
+---------+
|  TORLIS1|
|  TORLIS2|
|  TORLIS3|
|  TORLIS4|
|  TORLIS5|
|  TORLIS6|
|  TORLIS7|
|  TORLIS8|
|  TORLIS9|
| TORLIS10|
| TORLIS11|
| TORLIS12|
| TORLIS13|
| TORLIS14|
| TORLIS15|
| TORLIS16|
| TORLIS17|
| TORLIS18|
| TORLIS19|
| TORLIS20|
+---------+
only showing top 20 rows


In [50]:
#create lisiting dimension table
listing_dim = (dubaiData_dyf
            .select('listingId','listingTitle', 'propertyType', 'listingType', 'listingURL', 'createdDate', 'currencyNative')
            .union(losAngelesData_dyf.select('listingId','listingTitle', 'propertyType', 'listingType', 'listingURL', 'createdDate', 'currencyNative'))
            .union(londonData_dyf.select('listingId','listingTitle', 'propertyType', 'listingType', 'listingURL', 'createdDate', 'currencyNative'))
            .union(miamiData_dyf.select('listingId','listingTitle', 'propertyType', 'listingType', 'listingURL', 'createdDate', 'currencyNative'))
            .union(newYorkCityData_dyf.select('listingId','listingTitle', 'propertyType', 'listingType', 'listingURL', 'createdDate', 'currencyNative'))
            .union(sanFranciscoData_dyf.select('listingId','listingTitle', 'propertyType', 'listingType', 'listingURL', 'createdDate', 'currencyNative'))
            .union(sydneyData_dyf.select('listingId','listingTitle', 'propertyType', 'listingType', 'listingURL', 'createdDate', 'currencyNative'))
            .union(tokyoData_dyf.select('listingId','listingTitle', 'propertyType', 'listingType', 'listingURL', 'createdDate', 'currencyNative'))
            .union(torontoData_dyf.select('listingId','listingTitle', 'propertyType', 'listingType', 'listingURL', 'createdDate', 'currencyNative')))





In [52]:
listing_dim.show(5)

+---------+--------------------+------------------+-----------+--------------------+-----------+--------------+
|listingId|        listingTitle|      propertyType|listingType|          listingURL|createdDate|currencyNative|
+---------+--------------------+------------------+-----------+--------------------+-----------+--------------+
|  DUBLIS1|Durrani Homes-Mod...|      Entire condo|entire_home|http://airbnb.com...| 2023-07-31|           AED|
|  DUBLIS2|1 Bed Apt in 7 by...|Entire rental unit|entire_home|http://airbnb.com...| 2023-09-12|           AED|
|  DUBLIS3|Privat Villa in A...|      Entire villa|entire_home|http://airbnb.com...| 2023-07-31|           AED|
|  DUBLIS4|OFFER! 50% OFF! G...|Entire rental unit|entire_home|http://airbnb.com...| 2023-07-31|           AED|
|  DUBLIS5|BEAUTIFUL APARTME...|      Entire condo|entire_home|http://airbnb.com...| 2023-06-15|           AED|
+---------+--------------------+------------------+-----------+--------------------+-----------+--------

In [51]:
#create host dimension table
host_dim = (dubaiData_dyf.select('airbnbHostID', 'hostListingCount')
            .union(losAngelesData_dyf.select('airbnbHostID', 'hostListingCount'))
            .union(londonData_dyf.select('airbnbHostID', 'hostListingCount'))
            .union(miamiData_dyf.select('airbnbHostID', 'hostListingCount'))
            .union(newYorkCityData_dyf.select('airbnbHostID', 'hostListingCount'))
            .union(sanFranciscoData_dyf.select('airbnbHostID', 'hostListingCount'))
            .union(sydneyData_dyf.select('airbnbHostID', 'hostListingCount'))
            .union(tokyoData_dyf.select('airbnbHostID', 'hostListingCount'))
            .union(torontoData_dyf.select('airbnbHostID', 'hostListingCount')))




In [53]:
host_dim.show()

+------------+----------------+
|airbnbHostID|hostListingCount|
+------------+----------------+
|   329871146|              31|
|   443351231|               0|
|   355726564|               4|
|   167648591|               0|
|   161350886|               1|
|    88268205|               0|
|   112150026|               0|
|   475952488|               0|
|   474084485|               0|
|   436002352|               0|
|   419719406|               0|
|   200755588|               0|
|    45708270|              90|
|   405760695|              97|
|   462017709|               0|
|   385187973|               3|
|    57112103|               0|
|   471216887|               0|
|    50966510|             162|
|   407804837|               0|
+------------+----------------+
only showing top 20 rows


In [59]:
#create location dimension table
location_dim = (londonData_dyf
                .select('airbnbPropertyID', 'country', 'state', 'city', 'zipcode', 'latitude', 'longitude', 'exactLocation')
                .union(losAngelesData_dyf.select('airbnbPropertyID', 'country', 'state', 'city', 'zipcode', 'latitude', 'longitude', 'exactLocation'))
                .union(dubaiData_dyf.select('airbnbPropertyID', 'country', 'state', 'city', 'zipcode', 'latitude', 'longitude', 'exactLocation'))
                .union(miamiData_dyf.select('airbnbPropertyID', 'country', 'state', 'city', 'zipcode', 'latitude', 'longitude', 'exactLocation'))
                .union(newYorkCityData_dyf.select('airbnbPropertyID', 'country', 'state', 'city', 'zipcode', 'latitude', 'longitude', 'exactLocation'))
                .union(sanFranciscoData_dyf.select('airbnbPropertyID', 'country', 'state', 'city', 'zipcode', 'latitude', 'longitude', 'exactLocation'))
                .union(sydneyData_dyf.select('airbnbPropertyID', 'country', 'state', 'city', 'zipcode', 'latitude', 'longitude', 'exactLocation'))
                .union(tokyoData_dyf.select('airbnbPropertyID', 'country', 'state', 'city', 'zipcode', 'latitude', 'longitude', 'exactLocation'))
                .union(torontoData_dyf.select('airbnbPropertyID', 'country', 'state', 'city', 'zipcode', 'latitude', 'longitude', 'exactLocation')))




In [60]:
location_dim.show(5)

+------------------+-------+-----+--------------+-------+------------------+--------------------+-------------+
|  airbnbPropertyID|country|state|          city|zipcode|          latitude|           longitude|exactLocation|
+------------------+-------+-----+--------------+-------+------------------+--------------------+-------------+
|769598008324805688|     GB|   UK|Greater London|  W1W 8| 51.51631164550781|-0.13911999762058258|            f|
|          44111861|     GB|   UK|Greater London|  W1W 7| 51.51892852783203| -0.1401900053024292|            f|
|          52087579|     GB|   UK|Greater London|  W1T 3|51.518558502197266|-0.14003999531269073|            f|
|          26437915|     GB|   UK|Greater London|  W1T 1|  51.5177001953125| -0.1371999979019165|            f|
|          47732474|     GB|   UK|Greater London|  W1T 1|   51.518310546875|-0.13720999658107758|            f|
+------------------+-------+-----+--------------+-------+------------------+--------------------+-------

In [61]:
#create amenities dimension table
amenities_dim = (dubaiData_dyf.select('amenitiesID', 'amenities')
            .union(losAngelesData_dyf.select('amenitiesID', 'amenities'))
            .union(londonData_dyf.select('amenitiesID', 'amenities'))
            .union(miamiData_dyf.select('amenitiesID', 'amenities'))
            .union(newYorkCityData_dyf.select('amenitiesID', 'amenities'))
            .union(sanFranciscoData_dyf.select('amenitiesID', 'amenities'))
            .union(sydneyData_dyf.select('amenitiesID', 'amenities'))
            .union(tokyoData_dyf.select('amenitiesID', 'amenities'))
               .union(torontoData_dyf.select('amenitiesID', 'amenities')))




In [62]:
amenities_dim.show()

+-----------+--------------------+
|amenitiesID|           amenities|
+-----------+--------------------+
|    DUBAMT1|['Kitchen', 'Wifi...|
|    DUBAMT2|['Kitchen', 'Wifi...|
|    DUBAMT3|['Kitchen', 'Wifi...|
|    DUBAMT4|['Kitchen', 'Wifi...|
|    DUBAMT5|['Kitchen', 'Elev...|
|    DUBAMT6|['Kitchen', 'Wifi...|
|    DUBAMT7|['Kitchen', 'Wifi...|
|    DUBAMT8|['Kitchen', 'Wifi...|
|    DUBAMT9|['Kitchen', 'Wifi...|
|   DUBAMT10|['Indoor fireplac...|
|   DUBAMT11|['Indoor fireplac...|
|   DUBAMT12|['Kitchen', 'Wifi...|
|   DUBAMT13|['Kitchen', 'Wifi...|
|   DUBAMT14|['Kitchen', 'Wifi...|
|   DUBAMT15|['Kitchen', 'Wifi...|
|   DUBAMT16|['Kitchen', 'Wifi...|
|   DUBAMT17|['Kitchen', 'Wifi...|
|   DUBAMT18|['Kitchen', 'Wifi...|
|   DUBAMT19|['Kitchen', 'Wifi...|
|   DUBAMT20|['Kitchen', 'Wifi...|
+-----------+--------------------+
only showing top 20 rows


In [63]:
#create ratings dimension table
rating_dim = (dubaiData_dyf.select('ratingID', 'overallRating', 'airbnbCommunicationRating',
                   'airbnbAccuracyRating', 'airbnbCleanlinessRating', 'airbnbCheckinRating', 'airbnbLocationRating', 'airbnbValueRating')
             .union(losAngelesData_dyf.select('ratingID', 'overallRating', 'airbnbCommunicationRating',
                   'airbnbAccuracyRating', 'airbnbCleanlinessRating', 'airbnbCheckinRating', 'airbnbLocationRating', 'airbnbValueRating'))
             .union(londonData_dyf.select('ratingID', 'overallRating', 'airbnbCommunicationRating',
                   'airbnbAccuracyRating', 'airbnbCleanlinessRating', 'airbnbCheckinRating', 'airbnbLocationRating', 'airbnbValueRating'))
             .union(miamiData_dyf.select('ratingID', 'overallRating', 'airbnbCommunicationRating',
                   'airbnbAccuracyRating', 'airbnbCleanlinessRating', 'airbnbCheckinRating', 'airbnbLocationRating', 'airbnbValueRating'))
             .union(newYorkCityData_dyf.select('ratingID', 'overallRating', 'airbnbCommunicationRating',
                   'airbnbAccuracyRating', 'airbnbCleanlinessRating', 'airbnbCheckinRating', 'airbnbLocationRating', 'airbnbValueRating'))
             .union(sanFranciscoData_dyf.select('ratingID', 'overallRating', 'airbnbCommunicationRating',
                   'airbnbAccuracyRating', 'airbnbCleanlinessRating', 'airbnbCheckinRating', 'airbnbLocationRating', 'airbnbValueRating'))
             .union(sydneyData_dyf.select('ratingID', 'overallRating', 'airbnbCommunicationRating',
                   'airbnbAccuracyRating', 'airbnbCleanlinessRating', 'airbnbCheckinRating', 'airbnbLocationRating', 'airbnbValueRating'))
             .union(tokyoData_dyf.select('ratingID', 'overallRating', 'airbnbCommunicationRating',
                   'airbnbAccuracyRating', 'airbnbCleanlinessRating', 'airbnbCheckinRating', 'airbnbLocationRating', 'airbnbValueRating'))
             .union(torontoData_dyf.select('ratingID', 'overallRating', 'airbnbCommunicationRating',
                   'airbnbAccuracyRating', 'airbnbCleanlinessRating', 'airbnbCheckinRating', 'airbnbLocationRating', 'airbnbValueRating')))





In [64]:
rating_dim.show(5)

+--------+-----------------+-------------------------+--------------------+-----------------------+-------------------+--------------------+-----------------+
|ratingID|    overallRating|airbnbCommunicationRating|airbnbAccuracyRating|airbnbCleanlinessRating|airbnbCheckinRating|airbnbLocationRating|airbnbValueRating|
+--------+-----------------+-------------------------+--------------------+-----------------------+-------------------+--------------------+-----------------+
| DUBRTE1|              5.0|                       10|                  10|                     10|                 10|                   9|                9|
| DUBRTE2|              0.0|                        0|                   0|                      0|                  0|                   0|                0|
| DUBRTE3|4.800000190734863|                        9|                   9|                      9|                  9|                   8|                9|
| DUBRTE4|              5.0|                  

In [65]:
#create booking dimension table
booking_dim = (dubaiData_dyf
               .select('bookingID', 'checkInTime', 'checkoutTime', 'minimumStay')
               .union(losAngelesData_dyf.select('bookingID', 'checkInTime', 'checkoutTime', 'minimumStay'))
               .union(londonData_dyf.select('bookingID', 'checkInTime', 'checkoutTime', 'minimumStay'))
               .union(miamiData_dyf.select('bookingID', 'checkInTime', 'checkoutTime', 'minimumStay'))
               .union(newYorkCityData_dyf.select('bookingID', 'checkInTime', 'checkoutTime', 'minimumStay'))
               .union(sanFranciscoData_dyf.select('bookingID', 'checkInTime', 'checkoutTime', 'minimumStay'))
               .union(sydneyData_dyf.select('bookingID', 'checkInTime', 'checkoutTime', 'minimumStay'))
               .union(tokyoData_dyf.select('bookingID', 'checkInTime', 'checkoutTime', 'minimumStay'))
               .union(torontoData_dyf.select('bookingID', 'checkInTime', 'checkoutTime', 'minimumStay')))




In [66]:
booking_dim.show(5)

+---------+-----------------+------------+-----------+
|bookingID|      checkInTime|checkoutTime|minimumStay|
+---------+-----------------+------------+-----------+
|  DUBBKD1|    After 3:00 PM|    12:00 PM|          1|
|  DUBBKD2|    After 3:00 PM|    11:00 AM|          1|
|  DUBBKD3|3:00 PM - 6:00 PM|    12:00 PM|          7|
|  DUBBKD4|    After 3:00 PM|    11:00 AM|          2|
|  DUBBKD5|    After 3:00 PM|    12:00 PM|          3|
+---------+-----------------+------------+-----------+
only showing top 5 rows


In [68]:
#create booking fact table
booking_fact = (dubaiData_dyf.select('bookingID', 'listingID', 'airbnbHostID', 'airbnbPropertyID', 'amenitiesID',
                        'ratingID', 'numberOfReviews', 'bedrooms', 'bathrooms', 'maxGuests', 'airbnbSuperhost',
                        'cancellationPolicy', 'cleaningFeeUSD', 'extraPeopleFeeUSD',
                        'extraPeopleFeeNative', 'instantBookable', 'petsAllowed', 'occupancyRateLTM',
                        'numberOfBookingsLTM', 'numberOfBookingsLTMObservedMonth', 'averageDailyRateUSD') 
                 .union(losAngelesData_dyf.select('bookingID', 'listingID', 'airbnbHostID', 'airbnbPropertyID', 'amenitiesID',
                        'ratingID', 'numberOfReviews', 'bedrooms', 'bathrooms', 'maxGuests', 'airbnbSuperhost',
                        'cancellationPolicy', 'cleaningFeeUSD', 'extraPeopleFeeUSD',
                        'extraPeopleFeeNative', 'instantBookable', 'petsAllowed', 'occupancyRateLTM',
                        'numberOfBookingsLTM', 'numberOfBookingsLTMObservedMonth', 'averageDailyRateUSD'))
                  .union(londonData_dyf.select('bookingID', 'listingID', 'airbnbHostID', 'airbnbPropertyID', 'amenitiesID',
                        'ratingID', 'numberOfReviews', 'bedrooms', 'bathrooms', 'maxGuests', 'airbnbSuperhost',
                        'cancellationPolicy', 'cleaningFeeUSD', 'extraPeopleFeeUSD',
                        'extraPeopleFeeNative', 'instantBookable', 'petsAllowed', 'occupancyRateLTM',
                        'numberOfBookingsLTM', 'numberOfBookingsLTMObservedMonth', 'averageDailyRateUSD'))
                  .union(miamiData_dyf.select('bookingID', 'listingID', 'airbnbHostID', 'airbnbPropertyID', 'amenitiesID',
                        'ratingID', 'numberOfReviews', 'bedrooms', 'bathrooms', 'maxGuests', 'airbnbSuperhost',
                        'cancellationPolicy', 'cleaningFeeUSD', 'extraPeopleFeeUSD',
                        'extraPeopleFeeNative', 'instantBookable', 'petsAllowed', 'occupancyRateLTM',
                        'numberOfBookingsLTM', 'numberOfBookingsLTMObservedMonth', 'averageDailyRateUSD'))
                  .union(newYorkCityData_dyf.select('bookingID', 'listingID', 'airbnbHostID', 'airbnbPropertyID', 'amenitiesID',
                        'ratingID', 'numberOfReviews', 'bedrooms', 'bathrooms', 'maxGuests', 'airbnbSuperhost',
                        'cancellationPolicy', 'cleaningFeeUSD', 'extraPeopleFeeUSD',
                        'extraPeopleFeeNative', 'instantBookable', 'petsAllowed', 'occupancyRateLTM',
                        'numberOfBookingsLTM', 'numberOfBookingsLTMObservedMonth', 'averageDailyRateUSD'))
                  .union(sanFranciscoData_dyf.select('bookingID', 'listingID', 'airbnbHostID', 'airbnbPropertyID', 'amenitiesID',
                        'ratingID', 'numberOfReviews', 'bedrooms', 'bathrooms', 'maxGuests', 'airbnbSuperhost',
                        'cancellationPolicy', 'cleaningFeeUSD', 'extraPeopleFeeUSD',
                        'extraPeopleFeeNative', 'instantBookable', 'petsAllowed', 'occupancyRateLTM',
                        'numberOfBookingsLTM', 'numberOfBookingsLTMObservedMonth', 'averageDailyRateUSD'))
                  .union(sydneyData_dyf.select('bookingID', 'listingID', 'airbnbHostID', 'airbnbPropertyID', 'amenitiesID',
                        'ratingID', 'numberOfReviews', 'bedrooms', 'bathrooms', 'maxGuests', 'airbnbSuperhost',
                        'cancellationPolicy', 'cleaningFeeUSD', 'extraPeopleFeeUSD',
                        'extraPeopleFeeNative', 'instantBookable', 'petsAllowed', 'occupancyRateLTM',
                        'numberOfBookingsLTM', 'numberOfBookingsLTMObservedMonth', 'averageDailyRateUSD'))
                  .union(tokyoData_dyf.select('bookingID', 'listingID', 'airbnbHostID', 'airbnbPropertyID', 'amenitiesID',
                        'ratingID', 'numberOfReviews', 'bedrooms', 'bathrooms', 'maxGuests', 'airbnbSuperhost',
                        'cancellationPolicy', 'cleaningFeeUSD', 'extraPeopleFeeUSD',
                        'extraPeopleFeeNative', 'instantBookable', 'petsAllowed', 'occupancyRateLTM',
                        'numberOfBookingsLTM', 'numberOfBookingsLTMObservedMonth', 'averageDailyRateUSD'))
                  .union(torontoData_dyf.select('bookingID', 'listingID', 'airbnbHostID', 'airbnbPropertyID', 'amenitiesID',
                        'ratingID', 'numberOfReviews', 'bedrooms', 'bathrooms', 'maxGuests', 'airbnbSuperhost',
                        'cancellationPolicy', 'cleaningFeeUSD', 'extraPeopleFeeUSD',
                        'extraPeopleFeeNative', 'instantBookable', 'petsAllowed', 'occupancyRateLTM',
                        'numberOfBookingsLTM', 'numberOfBookingsLTMObservedMonth', 'averageDailyRateUSD')))




In [69]:
booking_fact.show(5)

+---------+---------+------------+------------------+-----------+--------+---------------+--------+---------+---------+---------------+--------------------+--------------+------------------+--------------------+---------------+-----------+----------------+-------------------+--------------------------------+-------------------+
|bookingID|listingID|airbnbHostID|  airbnbPropertyID|amenitiesID|ratingID|numberOfReviews|bedrooms|bathrooms|maxGuests|airbnbSuperhost|  cancellationPolicy|cleaningFeeUSD| extraPeopleFeeUSD|extraPeopleFeeNative|instantBookable|petsAllowed|occupancyRateLTM|numberOfBookingsLTM|numberOfBookingsLTMObservedMonth|averageDailyRateUSD|
+---------+---------+------------+------------------+-----------+--------+---------------+--------+---------+---------+---------------+--------------------+--------------+------------------+--------------------+---------------+-----------+----------------+-------------------+--------------------------------+-------------------+
|  DUBBKD1

In [70]:
#convert spark dataframe to dynamicframe
listing_dim_dyf = DynamicFrame.fromDF(listing_dim, glueContext, 'dynamic_frame')
location_dim_dyf = DynamicFrame.fromDF(location_dim, glueContext, 'dynamic_frame')
host_dim_dyf = DynamicFrame.fromDF(host_dim, glueContext, 'dynamic_frame')
amenities_dim_dyf = DynamicFrame.fromDF(amenities_dim, glueContext, 'dynamic_frame')
rating_dim_dyf = DynamicFrame.fromDF(rating_dim, glueContext, 'dynamic_frame')
booking_dim_dyf = DynamicFrame.fromDF(booking_dim, glueContext, 'dynamic_frame')
booking_fact_dyf = DynamicFrame.fromDF(booking_fact, glueContext, 'dynamic_frame')




In [80]:
#write dynamicframes to s3
def write_to_s3(dyf, dyf_name):
    """
    This function writes a dynamic frame to s3
  
    Parameters:
    dyf: The dynamic frame to write to s3
    dyf_name: The name of the dynamic frame
    
    Returns: None
    """
    sink = glueContext.getSink(
                path=f"s3://airbnb-listings-data/transformed-data/{dyf_name}.csv",
                connection_type="s3",
                updateBehavior="UPDATE_IN_DATABASE",
                partitionKeys=[],
                compression="gzip",
                enableUpdateCatalog=True,
                transformation_ctx=f"{dyf_name}_csv_sink")
    sink.setCatalogInfo(
                        catalogDatabase="airbnb_catalog_db",
                        catalogTableName=f"{dyf_name}_csv")
    sink.setFormat("csv")
    sink.writeFrame(dyf)
    print(f"Data written to {dyf_name}")




In [81]:
# write all the dynamicframes to s3
write_to_s3(listing_dim_dyf, 'listing-dim')
write_to_s3(location_dim_dyf, 'location-dim')
write_to_s3(host_dim_dyf, 'host-dim')
write_to_s3(amenities_dim_dyf, 'amenities-dim')
write_to_s3(rating_dim_dyf, 'rating-dim')
write_to_s3(booking_dim_dyf, 'booking-dim')
write_to_s3(booking_fact_dyf, 'booking-fact')

Data written to listing-dim
Data written to location-dim
Data written to host-dim
Data written to amenities-dim
Data written to rating-dim
Data written to booking-dim
Data written to booking-fact


In [82]:
spark.stop()


