In [1]:
# !pip install graphframes
import os

from pyspark import SparkContext,SQLContext
from graphframes import *
# Location of the GraphFrames jar. Set the GRAPHFRAMES_JAR environment variable to
# point at your own copy; the fallback is the original author's local Spark install.
GRAPHFRAMES_JAR = os.environ.get(
    'GRAPHFRAMES_JAR',
    '/Users/neerajpadarthi/Neeraj/spark-2.3.0-bin-hadoop2.7/jars/graphframes-0.7.0-spark2.3-s_2.11.jar',
)

# Reuse the already-running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()
# Register the jar as a Python dependency of this SparkContext.
sc.addPyFile(GRAPHFRAMES_JAR)
# Entry point used below to read the CSV files into DataFrames.
sqlcontext = SQLContext(sc)

In [2]:
# Load both bike-share CSV files, taking column names from the header row.
# No schema is supplied, so the columns are read as strings.
station = sqlcontext.read.csv('201508_station_data.csv', header=True)
trip = sqlcontext.read.csv('201508_trip_data.csv', header=True)

In [3]:
# Preview the station table: first 20 rows, long cell values truncated.
station.show(n=20, truncate=True)

+----------+--------------------+---------+-----------+---------+------------+------------+
|station_id|                name|      lat|       long|dockcount|    landmark|installation|
+----------+--------------------+---------+-----------+---------+------------+------------+
|         2|San Jose Diridon ...|37.329732|-121.901782|       27|    San Jose|    8/6/2013|
|         3|San Jose Civic Ce...|37.330698|-121.888979|       15|    San Jose|    8/5/2013|
|         4|Santa Clara at Al...|37.333988|-121.894902|       11|    San Jose|    8/6/2013|
|         5|    Adobe on Almaden|37.331415|  -121.8932|       19|    San Jose|    8/5/2013|
|         6|    San Pedro Square|37.336721|-121.894074|       15|    San Jose|    8/7/2013|
|         7|Paseo de San Antonio|37.333798|-121.886943|       15|    San Jose|    8/7/2013|
|         8| San Salvador at 1st|37.330165|-121.885831|       15|    San Jose|    8/5/2013|
|         9|           Japantown|37.348742|-121.894715|       15|    San Jose|  

In [4]:
# station.printSchema()  # uncomment to print the column names and types

In [5]:
# Preview the trip table: first 20 rows, long cell values truncated.
trip.show(n=20, truncate=True)

+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
|Trip ID|Duration|     Start Date|       Start Station|Start Terminal|       End Date|         End Station|End Terminal|Bike #|Subscriber Type|Zip Code|
+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
| 913460|     765|8/31/2015 23:26|Harry Bridges Pla...|            50|8/31/2015 23:39|San Francisco Cal...|          70|   288|     Subscriber|    2139|
| 913459|    1036|8/31/2015 23:11|San Antonio Shopp...|            31|8/31/2015 23:28|Mountain View Cit...|          27|    35|     Subscriber|   95032|
| 913455|     307|8/31/2015 23:13|      Post at Kearny|            47|8/31/2015 23:18|   2nd at South Park|          64|   468|     Subscriber|   94107|
| 913454|     409|8/31/2015 23:10|  San Jose City Hall|            10|8/31/2015 23

In [6]:
# station

In [7]:
# Graph vertices: one row per distinct station name, exposed under the column
# name "id" that GraphFrame expects for vertices.
# NOTE(review): the trip file appears to spell some stations differently from the
# station file (e.g. "Post at Kearny" in trips vs "Post at Kearney" here), so such
# vertices may end up without any edges - confirm and normalise the names if needed.
stationvertices = station.select(station["name"].alias("id")).distinct()
stationvertices.show(3)

+--------------------+
|                  id|
+--------------------+
|       2nd at Folsom|
|California Ave Ca...|
|Washington at Kea...|
+--------------------+
only showing top 3 rows



In [8]:
# Graph edges: one row per (start station, end station) pair, with "count"
# holding the number of trip rows recorded for that pair.
tripedges = (
    trip
    .select(
        trip["Start Station"].alias("src"),
        trip["End Station"].alias("dst"),
    )
    .groupBy("src", "dst")
    .count()
)
tripedges.show(3)

+--------------------+--------------------+-----+
|                 src|                 dst|count|
+--------------------+--------------------+-----+
|       2nd at Folsom|San Francisco Cal...|  179|
| Golden Gate at Polk|   Steuart at Market|   40|
|Grant Avenue at C...|Mechanics Plaza (...|   32|
+--------------------+--------------------+-----+
only showing top 3 rows



In [9]:
# Assemble the station graph: vertices keyed by "id", edges by "src" -> "dst".
g = GraphFrame(v=stationvertices, e=tripedges)

In [10]:
# Bare expression: the notebook echoes the GraphFrame repr (vertex and edge schemas) as the cell output.
g

GraphFrame(v:[id: string], e:[src: string, dst: string ... 1 more field])

In [36]:
# Triangle count per station vertex, returned as a DataFrame with "count" and "id".
triangle_counts = g.triangleCount()
triangle_counts.show()

+-----+--------------------+
|count|                  id|
+-----+--------------------+
|  496|       2nd at Folsom|
|   23|California Ave Ca...|
|    0|Washington at Kea...|
|  496|Powell at Post (U...|
|  496| Golden Gate at Polk|
|  496|Yerba Buena Cente...|
|  496|   Market at Sansome|
|   90|         MLK Library|
|  496|     Spear at Folsom|
|   77|           Japantown|
|  496|Commercial at Mon...|
|   81|Paseo de San Antonio|
|   23|Rengstorff Avenue...|
|   61| San Salvador at 1st|
|  496|     Townsend at 7th|
|  496|Civic Center BART...|
|   41|         Ryland Park|
|   90|San Jose Diridon ...|
|   63|San Jose Civic Ce...|
|    0|     Post at Kearney|
+-----+--------------------+
only showing top 20 rows



In [39]:
# For every station, the hop distance to each of the two landmark stations it can
# reach (an empty map means neither landmark is reachable).
landmark_stations = ["2nd at Folsom", "Japantown"]
landmark_distances = g.shortestPaths(landmarks=landmark_stations)
landmark_distances.show()

+--------------------+--------------------+
|                  id|           distances|
+--------------------+--------------------+
|       2nd at Folsom|[2nd at Folsom -> 0]|
|      Market at 10th|[2nd at Folsom -> 1]|
|California Ave Ca...|                  []|
|Washington at Kea...|                  []|
|Redwood City Publ...|                  []|
|Powell at Post (U...|[2nd at Folsom -> 1]|
| Golden Gate at Polk|[2nd at Folsom -> 2]|
|    Adobe on Almaden|    [Japantown -> 2]|
|Broadway St at Ba...|[2nd at Folsom -> 1]|
|Yerba Buena Cente...|[2nd at Folsom -> 1]|
|     Beale at Market|[2nd at Folsom -> 1]|
|   Market at Sansome|[2nd at Folsom -> 1]|
|         MLK Library|    [Japantown -> 1]|
|     Spear at Folsom|[2nd at Folsom -> 1]|
|       5th at Howard|[2nd at Folsom -> 1]|
|           Japantown|    [Japantown -> 0]|
|Commercial at Mon...|[2nd at Folsom -> 1]|
|    San Pedro Square|    [Japantown -> 1]|
|Paseo de San Antonio|    [Japantown -> 1]|
|Redwood City Medi...|          

In [40]:
# Run PageRank with a tolerance of 0.01 and a reset probability of 0.15.
# The result is a graph whose vertices carry "pagerank" and whose edges carry "weight".
results = g.pageRank(tol=0.01, resetProbability=0.15)

# PageRank score per station.
ranked_vertices = results.vertices.select("id", "pagerank")
ranked_vertices.show()

# Edge weight per station pair.
weighted_edges = results.edges.select("src", "dst", "weight")
weighted_edges.show()

+--------------------+-------------------+
|                  id|           pagerank|
+--------------------+-------------------+
|       2nd at Folsom| 0.9824888917828893|
|      Market at 10th|  1.038130411138078|
|California Ave Ca...| 1.1172141560493858|
|Washington at Kea...|0.16391984022625658|
|Redwood City Publ...| 0.6728086774388867|
|Powell at Post (U...| 1.0381304111380778|
| Golden Gate at Polk| 1.0381304111380778|
|    Adobe on Almaden| 0.9037422230312734|
|Broadway St at Ba...| 1.0115490961700713|
|Yerba Buena Cente...|  1.038130411138078|
|     Beale at Market|  1.038130411138078|
|   Market at Sansome|  1.038130411138078|
|         MLK Library| 1.0850875307932695|
|     Spear at Folsom|  1.038130411138078|
|       5th at Howard|  1.038130411138078|
|           Japantown| 1.1277529076126742|
|Commercial at Mon...|  1.038130411138078|
|    San Pedro Square|  1.262557303509268|
|Paseo de San Antonio| 1.1971448900978932|
|Redwood City Medi...|0.40228085743593783|
+----------

In [41]:
# Persist the vertex DataFrame as parquet. "overwrite" keeps the cell re-runnable:
# the default save mode raises an error when the 'vertices' path already exists.
# Note that any previous contents of 'vertices' are replaced.
g.vertices.write.mode("overwrite").parquet('vertices')

In [42]:
# Persist the edge DataFrame as parquet. "overwrite" keeps the cell re-runnable:
# the default save mode raises an error when the 'edges' path already exists.
# Note that any previous contents of 'edges' are replaced.
g.edges.write.mode("overwrite").parquet('edges')

In [43]:
# Label propagation, capped at 5 iterations; each station receives a "label".
result = g.labelPropagation(maxIter=5)
station_labels = result.select("id", "label")
station_labels.show()

+--------------------+-------------+
|                  id|        label|
+--------------------+-------------+
|       2nd at Folsom|            0|
|      Market at 10th|            0|
|California Ave Ca...|1649267441664|
|Washington at Kea...|  17179869184|
|Redwood City Publ...| 730144440320|
|Powell at Post (U...|            0|
| Golden Gate at Polk|            0|
|    Adobe on Almaden| 257698037761|
|Broadway St at Ba...|            0|
|Yerba Buena Cente...|            0|
|     Beale at Market|            0|
|   Market at Sansome|            0|
|         MLK Library| 257698037761|
|     Spear at Folsom|            0|
|       5th at Howard|            0|
|           Japantown| 257698037761|
|Commercial at Mon...|            0|
|    San Pedro Square| 257698037761|
|Paseo de San Antonio| 257698037761|
|Redwood City Medi...| 730144440320|
+--------------------+-------------+
only showing top 20 rows



In [12]:
# Breadth-first search from the "Japantown" vertex to the "MLK Library" vertex.
# Each result row lists the start vertex, the edge(s) traversed, and the end vertex.
paths = g.bfs(fromExpr="id = 'Japantown'", toExpr="id = 'MLK Library'")
paths.show()

+-----------+--------------------+-------------+
|       from|                  e0|           to|
+-----------+--------------------+-------------+
|[Japantown]|[Japantown, MLK L...|[MLK Library]|
+-----------+--------------------+-------------+

