## Initialization

In [1]:
# import findspark
import findspark
findspark.init()

In [2]:
# import SparkSession
from pyspark.sql import SparkSession

In [3]:
# create session
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()

In [4]:
print(spark)

<pyspark.sql.session.SparkSession object at 0x000000AAFCE7AEB8>


In [5]:
# read dataset
df = spark.read.csv("new_york_tree_census_1995.csv", header=True, inferSchema=True)

In [6]:
# find out attributes
df.head()

Row(recordid=433600, address='73-031 57 AV', house_number='73-031', street='57 AV', zip_original=11378, cb_original=405, site='Front', species='QUPA', diameter=6, status='Good', wires='Yes', sidewalk_condition='Good', support_structure='None', borough='Queens', x=1015198.8, y=204725.3752, longitude=-73.888337, latitude=40.728546, cb_new=405, zip_new=11378, censustract_2010='49302', censusblock_2010='2000', nta_2010='QN30', segmentid=74525, spc_common='OAK PIN', spc_latin='QUERCUS PALUSTRIS', location='(40.728546 -73.888337)')

In [7]:
# find out each schema
df.schema

StructType(List(StructField(recordid,IntegerType,true),StructField(address,StringType,true),StructField(house_number,StringType,true),StructField(street,StringType,true),StructField(zip_original,IntegerType,true),StructField(cb_original,IntegerType,true),StructField(site,StringType,true),StructField(species,StringType,true),StructField(diameter,IntegerType,true),StructField(status,StringType,true),StructField(wires,StringType,true),StructField(sidewalk_condition,StringType,true),StructField(support_structure,StringType,true),StructField(borough,StringType,true),StructField(x,DoubleType,true),StructField(y,DoubleType,true),StructField(longitude,DoubleType,true),StructField(latitude,DoubleType,true),StructField(cb_new,IntegerType,true),StructField(zip_new,IntegerType,true),StructField(censustract_2010,StringType,true),StructField(censusblock_2010,StringType,true),StructField(nta_2010,StringType,true),StructField(segmentid,IntegerType,true),StructField(spc_common,StringType,true),StructFi

In [8]:
# create alias table to work on as 'trees'
df.createOrReplaceTempView("trees")

## Clustering and Visualization

### 1. Show the address of tree and its species that touches wires (power line) and has diameter >= 25 along with its location (in latitude, longitude)

In [9]:
query1 = spark.sql("SELECT DISTINCT address, species, latitude, longitude\
                    FROM trees\
                    WHERE wires <> 'None' AND diameter >= 25\
                    ORDER BY address")
query1.show()

+------------------+-------+---------+----------+
|           address|species| latitude| longitude|
+------------------+-------+---------+----------+
|    1 ASPINWALL ST|   PLAC|40.508272| -74.24948|
|     1 BELMONT TER|  ACSA1| 40.64403|-74.081925|
|     1 CARTERET ST|   PLAC|40.508581| -74.24847|
|   1 CONYINGHAM AV|   PLAC|40.637347|-74.103132|
|        1 E 233 ST|   QUPA|40.895179|-73.879776|
|       1 HOWARD CT|   TICO|40.641171|-74.113487|
|      1 POILLON AV|  ACSA1|40.536939|-74.179641|
|       1 SEWARD PL|   ACRU|40.613763|-74.135066|
|        1 SMITH CT|   ACRU|40.615023|-74.132941|
|1-016 PARSONS BLVD|  ACSA1| 40.79595|-73.827447|
|          10 77 ST|   PLAC| 40.63219|-74.037495|
|      10 AUBURN AV|  ACSA1|40.611484|-74.143056|
|     10 COLLEGE AV|   QUPA| 40.62224|-74.121088|
|   10 ELIZABETH AV|   TICO|40.641606|-74.112483|
|      10 FLOWER AV|  ACSA1| 40.50683|-74.224136|
|    10 GLENWOOD AV|  ACSA1|40.619095|-74.099161|
|   10 GRASSMERE DR|  ACSA1|40.601857|-74.084278|


In [10]:
# find out the length of query records
query1.count()

15982

In [11]:
# converting latitude and longitude into vector
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["latitude", "longitude"],
    outputCol='features')

query1 = assembler.transform(query1)
query1.show()

+------------------+-------+---------+----------+--------------------+
|           address|species| latitude| longitude|            features|
+------------------+-------+---------+----------+--------------------+
|    1 ASPINWALL ST|   PLAC|40.508272| -74.24948|[40.508272,-74.24...|
|     1 BELMONT TER|  ACSA1| 40.64403|-74.081925|[40.64403,-74.081...|
|     1 CARTERET ST|   PLAC|40.508581| -74.24847|[40.508581,-74.24...|
|   1 CONYINGHAM AV|   PLAC|40.637347|-74.103132|[40.637347,-74.10...|
|        1 E 233 ST|   QUPA|40.895179|-73.879776|[40.895179,-73.87...|
|       1 HOWARD CT|   TICO|40.641171|-74.113487|[40.641171,-74.11...|
|      1 POILLON AV|  ACSA1|40.536939|-74.179641|[40.536939,-74.17...|
|       1 SEWARD PL|   ACRU|40.613763|-74.135066|[40.613763,-74.13...|
|        1 SMITH CT|   ACRU|40.615023|-74.132941|[40.615023,-74.13...|
|1-016 PARSONS BLVD|  ACSA1| 40.79595|-73.827447|[40.79595,-73.827...|
|          10 77 ST|   PLAC| 40.63219|-74.037495|[40.63219,-74.037...|
|     

In [12]:
# train model to cluster
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

kmeans = KMeans().setK(5).setSeed(1)
model = kmeans.fit(query1)

In [13]:
# predict and show result
predictions = model.transform(query1)
predictions.show(5)

+---------------+-------+---------+----------+--------------------+----------+
|        address|species| latitude| longitude|            features|prediction|
+---------------+-------+---------+----------+--------------------+----------+
| 1 ASPINWALL ST|   PLAC|40.508272| -74.24948|[40.508272,-74.24...|         3|
|  1 BELMONT TER|  ACSA1| 40.64403|-74.081925|[40.64403,-74.081...|         3|
|  1 CARTERET ST|   PLAC|40.508581| -74.24847|[40.508581,-74.24...|         3|
|1 CONYINGHAM AV|   PLAC|40.637347|-74.103132|[40.637347,-74.10...|         3|
|     1 E 233 ST|   QUPA|40.895179|-73.879776|[40.895179,-73.87...|         0|
+---------------+-------+---------+----------+--------------------+----------+
only showing top 5 rows



In [14]:
# evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.6431373308883238


In [15]:
# shows the result
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[ 40.78884611 -73.84890249]
[1. 1.]
[ 40.70260655 -73.79337388]
[ 40.59979726 -74.12059094]
[ 40.62616738 -73.95302096]


In [16]:
# visualization using pixiedust
import pixiedust

Pixiedust database opened successfully


In [None]:
display(predictions)

### Visualization Result

Showing first 100 rows in clusters:

![Clustering](img/cluster-1.jpg)