# OpenLab 2.1 Clustering

- In this OpenLab, we apply the k-means algorithm to a dataset of Airbnb listings.
- The dataset is a semicolon-separated file with the columns `listing_url`, `price`, `latitude` and `longitude`.


Create two cluster visualizations, varying k from 3 to 5, for the following two sets of features:
1. geospatial **only** (latitude, longitude);
2. geospatial **and** price (latitude, longitude, price);



In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.mllib.stat import Statistics
import matplotlib.pyplot as plt
import numpy as np


In [2]:
# Build the Spark configuration for a local run that uses every available core.
conf = SparkConf().setAppName("pre-processing").setMaster("local[*]")
# getOrCreate() returns the already-running context when this cell is executed
# again in the same kernel; calling SparkContext(conf=conf) a second time would
# raise an error because only one SparkContext may be active at once.
sc = SparkContext.getOrCreate(conf=conf)

# Numeric suffix of the kddrtserver host that serves the HDFS dataset.
server = "11"
rdd = sc.textFile("hdfs://kddrtserver{0}.isti.cnr.it:9000/hpsa/datasets/venice_airbnb_clustering.csv".format(server))
# The count includes the header line of the CSV file.
print(rdd.count())
print(rdd.take(2))

151
['listing_url;price;latitude;longitude', 'https://www.airbnb.com/rooms/26620094;$73.00;45.49566;12.249410000000001']


In [3]:
# Peek at the header row plus the first three listings to see the raw line format.
rdd.take(4)

['listing_url;price;latitude;longitude',
 'https://www.airbnb.com/rooms/26620094;$73.00;45.49566;12.249410000000001',
 'https://www.airbnb.com/rooms/6917108;$240.00;45.437540000000006;12.3286',
 'https://www.airbnb.com/rooms/31884427;$340.00;45.442370000000004;12.33495']

### Pre-Processing

In [11]:
# Quick check that a plain decimal string is accepted by float().
t = "1000.00"
float(t)

1000.0

In [9]:
# Break every raw line into its ';'-separated fields and show the first two rows.
rdd_tuples = rdd.map(lambda line: line.split(";"))
rdd_tuples.take(2)

[['listing_url', 'price', 'latitude', 'longitude'],
 ['https://www.airbnb.com/rooms/26620094',
  '$73.00',
  '45.49566',
  '12.249410000000001']]

In [12]:
def _to_record(fields):
    """Convert the split string fields of one listing into typed values.

    Returns a tuple (listing_url, price, latitude, longitude) where the last
    three items are floats.
    """
    # Prices look like "$73.00"; drop the currency sign and any commas
    # before converting.
    price = float(fields[1].replace("$", "").replace(",", ""))
    latitude = float(fields[2])
    longitude = float(fields[3])
    return (fields[0], price, latitude, longitude)


split_lines = rdd.map(lambda line: line.split(";"))

# The first row holds the column names, so it is removed before parsing.
header = split_lines.first()
data_rows = split_lines.filter(lambda fields: fields != header)
rdd_tuples = data_rows.map(_to_record)


rdd_tuples.take(2)


[('https://www.airbnb.com/rooms/26620094', 73.0, 45.49566, 12.249410000000001),
 ('https://www.airbnb.com/rooms/6917108', 240.0, 45.437540000000006, 12.3286)]

In [36]:
# !pip3 install --user folium

In [16]:
import folium

# Marker colour used for each cluster id (0-7) when drawing the map.
color_dict = dict(enumerate(
    ["blue", "red", "green", "purple", "orange", "gray", "black", "white"]
))

def plot_map(cluster_data):
    """Draw one folium marker per clustered listing and return the map.

    Parameters
    ----------
    cluster_data : iterable
        Items of the form ``(cluster_id, record)`` where ``record`` is indexed
        as ``(listing_url, price, latitude, longitude)``.

    Returns
    -------
    folium.Map
        Map with one marker per item; the marker colour is looked up in
        ``color_dict`` by cluster id.
    """
    # Fixed starting view for the map.
    m = folium.Map([45.42978, 12.35715], zoom_start = 11)

    # Iterate directly instead of indexing, so any iterable (not only a list)
    # is accepted and an empty input simply yields an empty map.
    for entry in cluster_data:
        cluster_id = entry[0]
        listing = entry[1]
        place_url = listing[0]
        price = listing[1]
        lat_lon = (listing[2], listing[3])
        # Cluster ids without a palette entry (e.g. k > 8) fall back to a
        # neutral colour instead of raising KeyError.
        marker_color = color_dict.get(cluster_id, "lightgray")
        folium.Marker(
            lat_lon,
            popup="Cluster:{0}\n{1}\nPrice:{2}".format(cluster_id, place_url, price),
            icon=folium.Icon(color=marker_color, icon='info-sign'),
        ).add_to(m)
    return m

### 1. Geospatial only (latitude, longitude)

In [31]:
from pyspark.mllib.clustering import KMeans

# Record layout: 'listing_url', 'price', 'latitude', 'longitude'
# select the features: (latitude, longitude)
rdd_geo_feat = rdd_tuples.map(lambda t: (t[2], t[3]))

# train
# - `runs` is no longer passed: it has had no effect since Spark 2.0 and newer
#   PySpark releases do not accept the argument.
# - `seed` makes the random initialisation repeatable across re-runs.
# NOTE(review): the exercise statement asks for k between 3 and 5; k=6 is kept
# from the original run - confirm the intended value.
model = KMeans.train(rdd_geo_feat, k=6, initializationMode="random", seed=42)


In [32]:
# Display the fitted centroids, one coordinate array per cluster.
model.centers

[array([45.43412732, 12.32901732]),
 array([45.42934   , 12.35736905]),
 array([45.44374762, 12.3252981 ]),
 array([45.48871429, 12.23320143]),
 array([45.43873333, 12.34199222]),
 array([45.38047, 12.34502])]

In [33]:
# Label every listing with the id of its nearest centroid, using only
# latitude and longitude as features.
# Record layout: 'listing_url', 'price', 'latitude', 'longitude'
labelled_geo_rdd = rdd_tuples.map(
    lambda listing: (model.predict((listing[2], listing[3])), listing)
)
cluster_data = labelled_geo_rdd.collect()

plot_map(cluster_data)

### 2. Geospatial plus price (latitude, longitude, price)


In [37]:
# select the features: (price, latitude, longitude)
# NOTE(review): the features are used unscaled; the sample prices are in the
# tens to hundreds while coordinates differ by fractions of a degree, so the
# clustering is presumably dominated by price - confirm whether the features
# should be standardised first.
rdd_geo_price_feat = rdd_tuples.map(lambda t: (t[1], t[2], t[3]))

# train
# - `runs` is no longer passed: it has had no effect since Spark 2.0 and newer
#   PySpark releases do not accept the argument.
# - `seed` makes the random initialisation repeatable across re-runs.
model = KMeans.train(rdd_geo_price_feat, k=3, initializationMode="random", seed=42)

In [38]:
# Label every listing with the id of its nearest centroid, using
# (price, latitude, longitude) as features.
labelled_price_rdd = rdd_tuples.map(
    lambda listing: (model.predict((listing[1], listing[2], listing[3])), listing)
)
cluster_data = labelled_price_rdd.collect()

# visualization
plot_map(cluster_data)