### PySpark Module 4 Assignment

New York City Airbnb Open Data: This dataset is publicly available. This data file
includes all needed information to find out more about hosts, geographical availability, and
necessary metrics to make predictions and draw conclusions.
https://www.kaggle.com/datasets/arianazmoudeh/airbnbopendata/


In [1]:
# import pandas as pd

# df = pd.read_csv('../data/Airbnb_Open_Data.csv')
# df.info()

In [2]:
import findspark
findspark.find()
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder\
    .appName('AirBnb NewYork Insights')\
    .master('local[2]')\
    .getOrCreate()

In [3]:
spark

In [4]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,FloatType,DateType,BooleanType,LongType,CharType,ShortType

# schema = StructType([
#     StructField('id',IntegerType(),nullable=False),
#     StructField('name',StringType()),
#     StructField('host_id',StringType()),
#     StructField('host_identity_verified',StringType()),
#     StructField('host_name',StringType()),
#     StructField('neighborhood group',StringType()),
#     StructField('neighborhood',StringType()),
#     StructField('lat',FloatType(),False),
#     StructField('long',FloatType(),False),
#     StructField('country',StringType()),
#     StructField('country_code',StringType()),
#     StructField('instant_bookable',BooleanType()),
#     StructField('cancellation_policy',StringType()),
#     StructField('room',StringType()),
#     StructField('construction_year',StringType()),
#     StructField('price',StringType()),
#     StructField('service_fee',StringType()),
#     StructField('minimum_nights',IntegerType()),
#     StructField('number_of_reviews',IntegerType()),
#     StructField('last_review',DateType()),
#     StructField('reviews_per_month',StringType()),
#     StructField('review_rate_number',StringType()),
#     StructField('calculated_host_listings_count',StringType()),
#     StructField('availability_365',StringType()),
#     StructField('house_rules',StringType()),
#     StructField('license',StringType()),
# ])

### Using dataset, we will perform the following tasks:


In [5]:
# 1. Load the dataset
df = spark.read\
    .option('header',True)\
    .format('csv')\
    .load('../data/Airbnb_Open_Data.csv')

# 2. Print the first 10 rows
df.show(10)

+-------+--------------------+-----------+----------------------+---------+-------------------+------------------+--------+---------+-------------+------------+----------------+-------------------+---------------+-----------------+-------+-----------+--------------+-----------------+-----------+-----------------+------------------+------------------------------+----------------+--------------------+-------+
|     id|                NAME|    host id|host_identity_verified|host name|neighbourhood group|     neighbourhood|     lat|     long|      country|country code|instant_bookable|cancellation_policy|      room type|Construction year|  price|service fee|minimum nights|number of reviews|last review|reviews per month|review rate number|calculated host listings count|availability 365|         house_rules|license|
+-------+--------------------+-----------+----------------------+---------+-------------------+------------------+--------+---------+-------------+------------+----------------+-

In [6]:
from pyspark.sql.functions import col,regexp_replace

rdd = df\
    .withColumn('id',col('id').cast(IntegerType()))\
    .withColumn('name',col('name').cast(StringType()))\
    .withColumnRenamed('host id','host_id')\
    .withColumn('host_id',col('host_id').cast(LongType()))\
    .withColumn('host_identity_verified',col('host_identity_verified'))\
    .withColumnRenamed('host name','host_name')\
    .withColumn('host_name',col('host_name'))\
    .withColumnRenamed('neighbourhood group','neighborhood_group')\
    .withColumnRenamed('lat','latitude')\
    .withColumn('latitude',col('latitude').cast(FloatType()))\
    .withColumnRenamed('long','longitude')\
    .withColumn('longitude',col('longitude').cast(FloatType()))\
    .withColumn('country',col('country').cast(StringType()))\
    .withColumnRenamed('country code','country_code')\
    .withColumn('country_code',col('country_code').cast(CharType(3)))\
    .withColumn('instant_bookable',col('instant_bookable').cast(BooleanType()))\
    .withColumn('cancellation_policy',col('cancellation_policy').cast(StringType()))\
    .withColumnRenamed('room type','room_type')\
    .withColumn('room_type',col('room_type').cast(StringType()))\
    .withColumnRenamed('Construction year','construction_year')\
    .withColumn('construction_year',col('construction_year').cast(ShortType()))\
    .withColumn('price',regexp_replace(col('price'),r'[\D]','').cast(IntegerType()))\
    .withColumnRenamed('service fee','service_fee')\
    .withColumn('service_fee',regexp_replace(col('service_fee'),r'[\D]','').cast(IntegerType()))\
    .withColumnRenamed('minimum nights','minimum_nights')\
    .withColumn('minimum_nights',col('minimum_nights').cast(ShortType()))\
    .withColumnRenamed('number of reviews','number_of_reviews')\
    .withColumn('number_of_reviews',col('number_of_reviews').cast(IntegerType()))\
    .withColumnRenamed('last review','last_review')\
    .withColumnRenamed('reviews per month','reviews_per_month')\
    .withColumn('reviews_per_month',col('reviews_per_month').cast(FloatType()))\
    .withColumnRenamed('review rate number','review_rate_number')\
    .withColumn('review_rate_number',col('review_rate_number').cast(ShortType()))\
    .withColumnRenamed('calculated host listings count','calculated_host_listings_count')\
    .withColumn('calculated_host_listings_count',col('calculated_host_listings_count').cast(ShortType()))\
    .withColumnRenamed('availability 365','availability_365')\
    .withColumn('availability_365',col('availability_365').cast(ShortType()))\
    .withColumn('house_rules',col('house_rules').cast(StringType()))\
    .withColumn('license',col('license').cast(StringType()))\


rdd.show(10)

+-------+--------------------+-----------+----------------------+---------+------------------+------------------+--------+---------+-------------+------------+----------------+-------------------+---------------+-----------------+-----+-----------+--------------+-----------------+-----------+-----------------+------------------+------------------------------+----------------+--------------------+-------+
|     id|                name|    host_id|host_identity_verified|host_name|neighborhood_group|     neighbourhood|latitude|longitude|      country|country_code|instant_bookable|cancellation_policy|      room_type|construction_year|price|service_fee|minimum_nights|number_of_reviews|last_review|reviews_per_month|review_rate_number|calculated_host_listings_count|availability_365|         house_rules|license|
+-------+--------------------+-----------+----------------------+---------+------------------+------------------+--------+---------+-------------+------------+----------------+--------

In [7]:
# 3. Find the total number of private rooms in the ‘room_type’ column
no_of_private_rooms = rdd.filter(rdd['room_type'] == 'Private room').count()
f'No of private rooms: {no_of_private_rooms}'

'No of private rooms: 46351'

In [8]:
# 4. Find the max, min, and average of the price column
import pyspark.sql.functions as f
price_stat=rdd.agg(
    f.max("price").alias('max_price'),
    f.min('price').alias('min_price'),
    f.avg('price').alias('avg_price')
)
print(price_stat.dtypes)
price_stat.show()

[('max_price', 'int'), ('min_price', 'int'), ('avg_price', 'double')]
+---------+---------+-----------------+
|max_price|min_price|        avg_price|
+---------+---------+-----------------+
|     2022|       10|623.8110062339997|
+---------+---------+-----------------+



In [9]:
# 5. Find the number of rooms available for booking for less than 200 days a year (use the ‘availability_365’ column)
no_of_rooms_available=rdd\
    .where('availability_365 < 200 and room_type="Private room"')\
    .count()
f'No. of rooms available for booking for less than 200 days a year: {no_of_rooms_available}'

'No. of rooms available for booking for less than 200 days a year: 31347'

In [10]:
# 6. Find 10 host places that have the most number of reviews
top_10_records = rdd.orderBy(col("number_of_reviews").desc()).limit(10)
top_10_records.show()

+--------+--------------------+-----------+----------------------+------------+------------------+------------------+--------+---------+-------------+------------+----------------+-------------------+---------------+-----------------+-----+-----------+--------------+-----------------+-----------+-----------------+------------------+------------------------------+----------------+-----------+-------+
|      id|                name|    host_id|host_identity_verified|   host_name|neighborhood_group|     neighbourhood|latitude|longitude|      country|country_code|instant_bookable|cancellation_policy|      room_type|construction_year|price|service_fee|minimum_nights|number_of_reviews|last_review|reviews_per_month|review_rate_number|calculated_host_listings_count|availability_365|house_rules|license|
+--------+--------------------+-----------+----------------------+------------+------------------+------------------+--------+---------+-------------+------------+----------------+--------------

In [11]:
#Alternate method
rdd.createOrReplaceTempView("AirBnbNewYork")
top_10_records = spark.sql('select * from AirBnbNewYork order by number_of_reviews desc limit 10')
# ANSI sql does not support TOP keyword
top_10_records.show()

+--------+--------------------+-----------+----------------------+------------+------------------+------------------+--------+---------+-------------+------------+----------------+-------------------+---------------+-----------------+-----+-----------+--------------+-----------------+-----------+-----------------+------------------+------------------------------+----------------+-----------+-------+
|      id|                name|    host_id|host_identity_verified|   host_name|neighborhood_group|     neighbourhood|latitude|longitude|      country|country_code|instant_bookable|cancellation_policy|      room_type|construction_year|price|service_fee|minimum_nights|number_of_reviews|last_review|reviews_per_month|review_rate_number|calculated_host_listings_count|availability_365|house_rules|license|
+--------+--------------------+-----------+----------------------+------------+------------------+------------------+--------+---------+-------------+------------+----------------+--------------

In [12]:
# spark.stop()