## AirB&B Exploratory Data Analysis

In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, DecimalType

sc = SparkContext('local')
spark = SparkSession(sc)

In [2]:

format = StructType().add("neighbourhood", "string").add("room_type", "string").add("price", "integer").add("minimum_nights", "integer").add("availability_365", "integer")
df_Airbnb = spark.read.csv('/home/jovyan/work/MontrealAirBnB.csv', schema=format, header="true")

# Preliminary exploration
We are aiming to get:

    total number of offers,
    average price of all offers
    average minimum nights

In [3]:
n_count = df_Airbnb.groupBy('neighbourhood').count().toDF('neighbourhood','count')

In [4]:
n_count

DataFrame[neighbourhood: string, count: bigint]

In [5]:
n_avg_price = df_Airbnb.groupBy('neighbourhood').mean('price').toDF('neighbourhood','avg_price')
n_avg_price = n_avg_price.withColumn('average_price' , n_avg_price["avg_price"].cast(DecimalType(4,2)))
n_avg_price = n_avg_price.drop('avg_price')

In [6]:
n_avg_price.collect()

[Row(neighbourhood='Saint-Léonard', average_price=Decimal('89.51')),
 Row(neighbourhood='Rivière-des-Prairies-Pointe-aux-Trembles', average_price=None),
 Row(neighbourhood='Hampstead', average_price=None),
 Row(neighbourhood='Montréal-Est', average_price=Decimal('33.60')),
 Row(neighbourhood='Dorval', average_price=None),
 Row(neighbourhood='Le Plateau-Mont-Royal', average_price=None),
 Row(neighbourhood='Villeray-Saint-Michel-Parc-Extension', average_price=Decimal('75.45')),
 Row(neighbourhood='Côte-des-Neiges-Notre-Dame-de-Grâce', average_price=Decimal('88.69')),
 Row(neighbourhood='Outremont', average_price=None),
 Row(neighbourhood='Saint-Laurent', average_price=Decimal('92.17')),
 Row(neighbourhood='Dollard-des-Ormeaux', average_price=Decimal('98.88')),
 Row(neighbourhood='Ahuntsic-Cartierville', average_price=Decimal('78.86')),
 Row(neighbourhood="L'Île-Bizard-Sainte-Geneviève", average_price=None),
 Row(neighbourhood='Verdun', average_price=Decimal('80.32')),
 Row(neighbourhood=

In [7]:
n_avg_min_stay = df_Airbnb.groupBy('neighbourhood').mean('minimum_nights').toDF('neighbourhood','avg_min_stay')
n_avg_min_stay = n_avg_min_stay.withColumn('average_min_stay' , n_avg_min_stay["avg_min_stay"].cast(DecimalType(4,2)))
n_avg_min_stay = n_avg_min_stay.drop('avg_min_stay')

In [8]:
n_avg_min_stay.show()

+--------------------+----------------+
|       neighbourhood|average_min_stay|
+--------------------+----------------+
|       Saint-Léonard|           14.75|
|Rivière-des-Prair...|           13.24|
|           Hampstead|           10.13|
|        Montréal-Est|            6.80|
|              Dorval|            7.46|
|Le Plateau-Mont-R...|            6.57|
|Villeray-Saint-Mi...|            7.15|
|Côte-des-Neiges-N...|            9.16|
|           Outremont|           20.58|
|       Saint-Laurent|           12.29|
| Dollard-des-Ormeaux|            3.65|
|Ahuntsic-Cartierv...|            6.69|
|L'Île-Bizard-Sain...|            2.95|
|              Verdun|            7.20|
| Pierrefonds-Roxboro|           11.45|
|           Westmount|            8.10|
|               Anjou|           11.11|
|            Kirkland|            3.00|
|Mercier-Hochelaga...|            8.11|
|       Pointe-Claire|            3.64|
+--------------------+----------------+
only showing top 20 rows



In [9]:
df_Airbnb_p = n_count.join(n_avg_price, ["neighbourhood"])
df_Airbnb_p = df_Airbnb_p.join(n_avg_min_stay,["neighbourhood"])

In [10]:
df_Airbnb_p.show()

+--------------------+-----+-------------+----------------+
|       neighbourhood|count|average_price|average_min_stay|
+--------------------+-----+-------------+----------------+
|       Saint-Léonard|   55|        89.51|           14.75|
|Rivière-des-Prair...|   68|         null|           13.24|
|           Hampstead|   23|         null|           10.13|
|        Montréal-Est|    5|        33.60|            6.80|
|              Dorval|   63|         null|            7.46|
|Le Plateau-Mont-R...| 5616|         null|            6.57|
|Villeray-Saint-Mi...| 1081|        75.45|            7.15|
|Côte-des-Neiges-N...| 1284|        88.69|            9.16|
|           Outremont|  271|         null|           20.58|
|       Saint-Laurent|  173|        92.17|           12.29|
| Dollard-des-Ormeaux|   43|        98.88|            3.65|
|Ahuntsic-Cartierv...|  310|        78.86|            6.69|
|L'Île-Bizard-Sain...|   20|         null|            2.95|
|              Verdun|  473|        80.3

In [11]:
n_home = df_Airbnb.stat.crosstab("neighbourhood","room_type")