# Transform logic for Airbnb Listings

## Initialize Spark Session

In [12]:
import sys
from pyspark.sql import SparkSession,  DataFrame, functions as F
from pyspark.sql.types import DoubleType

spark = SparkSession.builder.appName("clean_listings").getOrCreate()

## Define constants

In [13]:
S3_RAW_FILE = "s3://datalake/raw/kaggle_airbnb/listings.csv"
COLUMN_KEYWORDS_TO_EXCLUDE='url;scrape;license'.split(';')

## Read raw Listings CSV file

In [14]:
df = spark.read\
        .option("sep",",")\
        .option("inferSchema", "true")\
        .option("header", "true")\
        .option("multiline","true")\
        .option("quote", '"')\
        .option("escape", "\\")\
        .option("escape", '"')\
        .option("encoding", "UTF-8")\
        .option("ignoreLeadingWhiteSpace", "true")\
        .option("ignoreTrailingWhiteSpace", "true")\
        .csv(S3_RAW_FILE)
df.limit(5).toPandas().head()

                                                                                

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",...,10.0,f,,WASHINGTON,f,strict,t,t,6,1.48
2,3308979,https://www.airbnb.com/rooms/3308979,20160104002432,2016-01-04,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,...,10.0,f,,WASHINGTON,f,strict,f,f,2,1.15
3,7421966,https://www.airbnb.com/rooms/7421966,20160104002432,2016-01-04,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,...,,f,,WASHINGTON,f,flexible,f,f,1,
4,278830,https://www.airbnb.com/rooms/278830,20160104002432,2016-01-04,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,...,9.0,f,,WASHINGTON,f,strict,f,f,1,0.89


## Transform

### Define Functions used in DAG

In [76]:
def drop_unneeded_columns(df: DataFrame, words: list) -> DataFrame:
    columns_to_drop = [col for col in df.columns if any(word in col for word in words)]
    return df.drop(*columns_to_drop)

def transform_columns_old(df):
    # Convert boolean columns
    boolean_cols = [col_name for col_name in df.columns
                        if df.select(col_name).filter(df[col_name].isin(['t', 'f'])).count()
                        == df.select(col_name).filter(f'NOT {col_name} IS NULL').count()]
    
    for bc in boolean_cols:
        df = df.withColumn(bc, F.when(F.col(bc) == 't', True).otherwise(False))

    # Standardize price fields
    # Find columns with non-null values all starting in $ 
    # and remove non-numeric characters from price (e.g., '$', ',')
    price_cols = [col_name for col_name in df.columns
                  if df.filter((~F.col(col_name).rlike('^\\$')) & (~F.isnull(F.col(col_name)))).count() == 0]
    for pc in price_cols:
        df = df.withColumn(pc, F.regexp_replace(F.col(pc), "[$,]", "").cast(DoubleType()))  

    return df

    
def transform_columns(df):
    # Identify boolean columns
    boolean_cols = [
        col_name for col_name in df.columns 
        if df.filter(~(F.col(col_name).isin('t', 'f') | F.col(col_name).isNull())).count() == 0
    ]

    # Identify price columns 
    price_cols = [
        col_name for col_name in df.columns 
        if df.filter(~F.col(col_name).rlike('^\\$')).filter(F.col(col_name).isNotNull()).count() == 0
    ]     

    # Transform columns 
    transformed_cols = [
        F.when(F.col(col_name) == 't', True).otherwise(False).alias(col_name) if col_name in boolean_cols else
        F.regexp_replace(F.col(col_name), "[$,]", "").cast(DoubleType()).alias(col_name) if col_name in price_cols else
        F.col(col_name)  # Default to original column
        for col_name in df.columns
    ]

    # Return transformed DataFrame
    return df.select(*transformed_cols)

### Drop columns containing specified keywords

In [77]:
%%time
df_trimmed = df.transform(drop_unneeded_columns, COLUMN_KEYWORDS_TO_EXCLUDE)

CPU times: user 7.13 ms, sys: 1.78 ms, total: 8.92 ms
Wall time: 30.9 ms


### Clean remaining columns

In [78]:
%%time
df_clean = df_trimmed.transform(transform_columns_old)

                                                                                

CPU times: user 294 ms, sys: 77.8 ms, total: 372 ms
Wall time: 1min 1s


In [79]:
%%time
df_clean = df_trimmed.transform(transform_columns)

                                                                                

CPU times: user 180 ms, sys: 60.6 ms, total: 241 ms
Wall time: 36.5 s


### Display Schema

In [68]:
df_clean.dtypes

[('id', 'int'),
 ('name', 'string'),
 ('summary', 'string'),
 ('space', 'string'),
 ('description', 'string'),
 ('experiences_offered', 'string'),
 ('neighborhood_overview', 'string'),
 ('notes', 'string'),
 ('transit', 'string'),
 ('host_id', 'int'),
 ('host_name', 'string'),
 ('host_since', 'date'),
 ('host_location', 'string'),
 ('host_about', 'string'),
 ('host_response_time', 'string'),
 ('host_response_rate', 'string'),
 ('host_acceptance_rate', 'string'),
 ('host_is_superhost', 'boolean'),
 ('host_neighbourhood', 'string'),
 ('host_listings_count', 'int'),
 ('host_total_listings_count', 'int'),
 ('host_verifications', 'string'),
 ('host_has_profile_pic', 'boolean'),
 ('host_identity_verified', 'boolean'),
 ('street', 'string'),
 ('neighbourhood', 'string'),
 ('neighbourhood_cleansed', 'string'),
 ('neighbourhood_group_cleansed', 'string'),
 ('city', 'string'),
 ('state', 'string'),
 ('zipcode', 'string'),
 ('market', 'string'),
 ('smart_location', 'string'),
 ('country_code', 's

### Display cleaned data

In [8]:
df_clean.limit(5).toPandas().head()

Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,host_id,...,review_scores_communication,review_scores_location,review_scores_value,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,,,956883,...,10.0,9.0,10.0,WASHINGTON,False,moderate,False,False,2,4.07
1,953595,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",What's up with the free pillows? Our home was...,"Convenient bus stops are just down the block, ...",5177328,...,10.0,10.0,10.0,WASHINGTON,False,strict,True,True,6,1.48
2,3308979,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,Our house is located just 5 short blocks to To...,A bus stop is just 2 blocks away. Easy bus a...,16708587,...,10.0,10.0,10.0,WASHINGTON,False,strict,False,False,2,1.15
3,7421966,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,,,9851441,...,,,,WASHINGTON,False,flexible,False,False,1,
4,278830,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,Belltown,The nearest public transit bus (D Line) is 2 b...,1452570,...,10.0,9.0,9.0,WASHINGTON,False,strict,False,False,1,0.89


### Write some of the data to local drive for further review

In [9]:
df_clean.limit(20).write.option('header', True).option("multiline","true").option("escape", '"').mode('overwrite').csv(f'./output/clean_listings_10')

                                                                                

In [10]:
spark.stop()