## AirB&B Exploratory Data Analysis

In [1]:
from pyspark import SparkContext, SparkConf
import operator

conf = SparkConf().setAppName('AirBnB_EDA')
conf = conf.setMaster('spark://master:7077')
#sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [2]:
# read csv file as rdd
airbnb_txt=sc.textFile("MontrealAirBnB.csv", minPartitions=2)
#remove header from rdd
tagsheader = airbnb_txt.first() 
header = sc.parallelize([tagsheader])
airbnb_rdd = airbnb_txt.subtract(header)

In [3]:
airbnb_rdd.take(20)

['Le Sud-Ouest,Entire home/apt,100,1,291',
 'Mercier-Hochelaga-Maisonneuve,Entire home/apt,37,30,255',
 'Côte-des-Neiges-Notre-Dame-de-Grâce,Private room,65,3,309',
 'Le Plateau-Mont-Royal,Entire home/apt,130,4,364',
 'Rosemont-La Petite-Patrie,Private room,80,3,75',
 'Ville-Marie,Entire home/apt,85,1,204',
 'Ville-Marie,Entire home/apt,125,1,44',
 'Mercier-Hochelaga-Maisonneuve,Entire home/apt,27,30,255',
 'Mercier-Hochelaga-Maisonneuve,Entire home/apt,27,30,255',
 'Le Plateau-Mont-Royal,Entire home/apt,100,3,265',
 'Le Plateau-Mont-Royal,Entire home/apt,100,3,265',
 'Côte-des-Neiges-Notre-Dame-de-Grâce,Entire home/apt,155,16,0',
 'Le Plateau-Mont-Royal,Private room,35,2,276',
 'Ville-Marie,Shared room,21,1,56',
 'Le Plateau-Mont-Royal,Entire home/apt,80,360,365',
 'Le Plateau-Mont-Royal,Private room,48,30,277',
 'Le Plateau-Mont-Royal,Entire home/apt,120,3,325',
 'Le Plateau-Mont-Royal,Private room,80,5,239',
 'Le Sud-Ouest,Entire home/apt,65,35,249',
 'Outremont,Entire home/apt,116,

In [13]:
airbnb_row = airbnb_rdd.map(lambda x: x.split(','))
airbnb_row.take(20)

[['Le Sud-Ouest', 'Entire home/apt', '100', '1', '291'],
 ['Mercier-Hochelaga-Maisonneuve', 'Entire home/apt', '37', '30', '255'],
 ['Côte-des-Neiges-Notre-Dame-de-Grâce', 'Private room', '65', '3', '309'],
 ['Le Plateau-Mont-Royal', 'Entire home/apt', '130', '4', '364'],
 ['Rosemont-La Petite-Patrie', 'Private room', '80', '3', '75'],
 ['Ville-Marie', 'Entire home/apt', '85', '1', '204'],
 ['Ville-Marie', 'Entire home/apt', '125', '1', '44'],
 ['Mercier-Hochelaga-Maisonneuve', 'Entire home/apt', '27', '30', '255'],
 ['Mercier-Hochelaga-Maisonneuve', 'Entire home/apt', '27', '30', '255'],
 ['Le Plateau-Mont-Royal', 'Entire home/apt', '100', '3', '265'],
 ['Le Plateau-Mont-Royal', 'Entire home/apt', '100', '3', '265'],
 ['Côte-des-Neiges-Notre-Dame-de-Grâce', 'Entire home/apt', '155', '16', '0'],
 ['Le Plateau-Mont-Royal', 'Private room', '35', '2', '276'],
 ['Ville-Marie', 'Shared room', '21', '1', '56'],
 ['Le Plateau-Mont-Royal', 'Entire home/apt', '80', '360', '365'],
 ['Le Plateau-

# Preliminary exploration
We are aiming to get:

    total number of offers,
    average price of all offers

In [5]:
neighbourhood_lst = airbnb_row.map(lambda x: (x[0], 1))

In [14]:
neighbourhood_lst.take(20)

[('Le Sud-Ouest', 1),
 ('Mercier-Hochelaga-Maisonneuve', 1),
 ('Côte-des-Neiges-Notre-Dame-de-Grâce', 1),
 ('Le Plateau-Mont-Royal', 1),
 ('Rosemont-La Petite-Patrie', 1),
 ('Ville-Marie', 1),
 ('Ville-Marie', 1),
 ('Mercier-Hochelaga-Maisonneuve', 1),
 ('Mercier-Hochelaga-Maisonneuve', 1),
 ('Le Plateau-Mont-Royal', 1),
 ('Le Plateau-Mont-Royal', 1),
 ('Côte-des-Neiges-Notre-Dame-de-Grâce', 1),
 ('Le Plateau-Mont-Royal', 1),
 ('Ville-Marie', 1),
 ('Le Plateau-Mont-Royal', 1),
 ('Le Plateau-Mont-Royal', 1),
 ('Le Plateau-Mont-Royal', 1),
 ('Le Plateau-Mont-Royal', 1),
 ('Le Sud-Ouest', 1),
 ('Outremont', 1)]

In [7]:
# calculate count by neighbourhood
n_count = neighbourhood_lst.reduceByKey(lambda x,y: x+y)

In [15]:
n_count.take(20)

[('Côte-des-Neiges-Notre-Dame-de-Grâce', 1284),
 ('Rosemont-La Petite-Patrie', 1961),
 ('Ville-Marie', 5208),
 ('Lachine', 111),
 ('Dorval', 63),
 ('Verdun', 473),
 ("L'Île-Bizard-Sainte-Geneviève", 20),
 ('Le Sud-Ouest', 1127),
 ('Mercier-Hochelaga-Maisonneuve', 925),
 ('Outremont', 271),
 ('Ahuntsic-Cartierville', 310),
 ('Westmount', 106),
 ('Hampstead', 23),
 ('Pierrefonds-Roxboro', 53),
 ('Montréal-Nord', 37),
 ('Dollard-des-Ormeaux', 43),
 ('Saint-Léonard', 55),
 ("Baie-d'Urfé", 7),
 ('Sainte-Anne-de-Bellevue', 4),
 ('Beaconsfield', 15)]

In [16]:
# calculate sum(price) per neighbourhood
price_lst = airbnb_row.map(lambda x: (x[0], int(x[2])))
price_lst_sum = price_lst.reduceByKey(operator.add)
price_lst_sum.take(20)

[('Côte-des-Neiges-Notre-Dame-de-Grâce', 113884),
 ('Rosemont-La Petite-Patrie', 173766),
 ('Ville-Marie', 701284),
 ('Lachine', 10500),
 ('Dorval', 7935),
 ('Verdun', 37993),
 ("L'Île-Bizard-Sainte-Geneviève", 3383),
 ('Le Sud-Ouest', 122099),
 ('Mercier-Hochelaga-Maisonneuve', 70272),
 ('Outremont', 29286),
 ('Ahuntsic-Cartierville', 24448),
 ('Westmount', 13592),
 ('Hampstead', 4716),
 ('Pierrefonds-Roxboro', 5671),
 ('Montréal-Nord', 2311),
 ('Dollard-des-Ormeaux', 4252),
 ('Saint-Léonard', 4923),
 ("Baie-d'Urfé", 2684),
 ('Sainte-Anne-de-Bellevue', 292),
 ('Beaconsfield', 887)]

In [10]:
#inner join total per neighbourhood and average by neighbourhood lists
count_price_lst = n_count.join(price_lst_sum)

In [17]:
count_price_lst.take(20)

[('Côte-des-Neiges-Notre-Dame-de-Grâce', (1284, 113884)),
 ('Rosemont-La Petite-Patrie', (1961, 173766)),
 ('Ville-Marie', (5208, 701284)),
 ('Lachine', (111, 10500)),
 ('Dorval', (63, 7935)),
 ('Verdun', (473, 37993)),
 ("L'Île-Bizard-Sainte-Geneviève", (20, 3383)),
 ('Le Sud-Ouest', (1127, 122099)),
 ('Mercier-Hochelaga-Maisonneuve', (925, 70272)),
 ('Outremont', (271, 29286)),
 ('Ahuntsic-Cartierville', (310, 24448)),
 ('Westmount', (106, 13592)),
 ('Hampstead', (23, 4716)),
 ('Pierrefonds-Roxboro', (53, 5671)),
 ('Montréal-Nord', (37, 2311)),
 ('Dollard-des-Ormeaux', (43, 4252)),
 ('Saint-Léonard', (55, 4923)),
 ("Baie-d'Urfé", (7, 2684)),
 ('Sainte-Anne-de-Bellevue', (4, 292)),
 ('Beaconsfield', (15, 887))]

In [18]:
#calculate average price per neighbourhood
avg_price_per_neighbourhood = count_price_lst.map(lambda x: (x[0], (x[1][1]/x[1][0])))
avg_price_per_neighbourhood.take(20)

[('Côte-des-Neiges-Notre-Dame-de-Grâce', 88.69470404984423),
 ('Rosemont-La Petite-Patrie', 88.61091279959204),
 ('Ville-Marie', 134.65514592933948),
 ('Lachine', 94.5945945945946),
 ('Dorval', 125.95238095238095),
 ('Verdun', 80.32346723044398),
 ("L'Île-Bizard-Sainte-Geneviève", 169.15),
 ('Le Sud-Ouest', 108.33984028393967),
 ('Mercier-Hochelaga-Maisonneuve', 75.96972972972974),
 ('Outremont', 108.06642066420665),
 ('Ahuntsic-Cartierville', 78.86451612903225),
 ('Westmount', 128.22641509433961),
 ('Hampstead', 205.04347826086956),
 ('Pierrefonds-Roxboro', 107.0),
 ('Montréal-Nord', 62.45945945945946),
 ('Dollard-des-Ormeaux', 98.88372093023256),
 ('Saint-Léonard', 89.50909090909092),
 ("Baie-d'Urfé", 383.42857142857144),
 ('Sainte-Anne-de-Bellevue', 73.0),
 ('Beaconsfield', 59.13333333333333)]