In [2]:
from graphframes import *
from pyspark import *
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import SparkContext
from graphframes import *



In [3]:
#to inite new spark session instance
spark = SparkSession.builder.appName("graphProcessing").config("spark.master", "local").getOrCreate()

In [6]:
#Part(1)(1): To import the dataset as a csv file and create data frames directly on import 

station_data_RDD = spark.read.csv("input/201508_station_data.csv", header=True, inferSchema=True)
trip_data_RDD = spark.read.csv("input/201508_trip_data.csv", header=True, inferSchema=True)

#To Print RDDs Schema:
station_data_RDD.printSchema()
trip_data_RDD.printSchema()

root
 |-- station_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- dockcount: integer (nullable = true)
 |-- landmark: string (nullable = true)
 |-- installation: string (nullable = true)

root
 |-- Trip ID: integer (nullable = true)
 |-- Duration: integer (nullable = true)
 |-- Start Date: string (nullable = true)
 |-- Start Station: string (nullable = true)
 |-- Start Terminal: integer (nullable = true)
 |-- End Date: string (nullable = true)
 |-- End Station: string (nullable = true)
 |-- End Terminal: integer (nullable = true)
 |-- Bike #: integer (nullable = true)
 |-- Subscriber Type: string (nullable = true)
 |-- Zip Code: string (nullable = true)



In [11]:
#To show first 5 records of each RDD
station_data_RDD.show(5)


+----------+--------------------+---------+-----------+---------+--------+------------+
|station_id|                name|      lat|       long|dockcount|landmark|installation|
+----------+--------------------+---------+-----------+---------+--------+------------+
|         2|San Jose Diridon ...|37.329732|-121.901782|       27|San Jose|    8/6/2013|
|         3|San Jose Civic Ce...|37.330698|-121.888979|       15|San Jose|    8/5/2013|
|         4|Santa Clara at Al...|37.333988|-121.894902|       11|San Jose|    8/6/2013|
|         5|    Adobe on Almaden|37.331415|  -121.8932|       19|San Jose|    8/5/2013|
|         6|    San Pedro Square|37.336721|-121.894074|       15|San Jose|    8/7/2013|
+----------+--------------------+---------+-----------+---------+--------+------------+
only showing top 5 rows



In [12]:
#To show first 5 records of each RDD
trip_data_RDD.show(5)

+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
|Trip ID|Duration|     Start Date|       Start Station|Start Terminal|       End Date|         End Station|End Terminal|Bike #|Subscriber Type|Zip Code|
+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+------+---------------+--------+
| 913460|     765|8/31/2015 23:26|Harry Bridges Pla...|            50|8/31/2015 23:39|San Francisco Cal...|          70|   288|     Subscriber|    2139|
| 913459|    1036|8/31/2015 23:11|San Antonio Shopp...|            31|8/31/2015 23:28|Mountain View Cit...|          27|    35|     Subscriber|   95032|
| 913455|     307|8/31/2015 23:13|      Post at Kearny|            47|8/31/2015 23:18|   2nd at South Park|          64|   468|     Subscriber|   94107|
| 913454|     409|8/31/2015 23:10|  San Jose City Hall|            10|8/31/2015 23

In [13]:
#To create temporary view of RDDs
station_data_RDD.createOrReplaceTempView("stations")
trip_data_RDD.createOrReplaceTempView("trips")


In [47]:
#Part(1)(2) Concatenate chunks into list & convert to Data Frame
geo_info = spark.sql("SELECT (lat, long) as geo_data FROM stations")
geo_info.collect()
geo_info.show()
station_data_RDD.show()

station_data_RDD2 = station_data_RDD.withColumn("geo_info", geo_info("geo_data"))

station_data_RDD2.show()

+--------------------+
|            geo_data|
+--------------------+
|[37.329732, -121....|
|[37.330698, -121....|
|[37.333988, -121....|
|[37.331415, -121....|
|[37.336721, -121....|
|[37.333798, -121....|
|[37.330165, -121....|
|[37.348742, -121....|
|[37.337391, -121....|
|[37.335885, -121....|
|[37.332808, -121....|
|[37.339301, -121....|
|[37.332692, -121....|
|[37.333955, -121....|
|[37.481758, -122....|
|[37.486078, -122....|
|[37.487616, -122....|
|[37.484219, -122....|
|[37.48537, -122.2...|
|[37.487682, -122....|
+--------------------+
only showing top 20 rows

+----------+--------------------+---------+-----------+---------+------------+------------+
|station_id|                name|      lat|       long|dockcount|    landmark|installation|
+----------+--------------------+---------+-----------+---------+------------+------------+
|         2|San Jose Diridon ...|37.329732|-121.901782|       27|    San Jose|    8/6/2013|
|         3|San Jose Civic Ce...|37.330698|-121.888979

TypeError: 'DataFrame' object is not callable

In [49]:
#Part(1)(3): Remove duplicates
# Remove duplicate entries from stations RDD
station_data_RDD_NoDUP = station_data_RDD.dropDuplicates()

# Count the number of rows
print("There were {} rows before removing duplicates, and {} rows after removing duplicates".format(station_data_RDD.count(), station_data_RDD_NoDUP.count()))


# Remove duplicate entries from trips RDD
trip_data_RDD_NoDUP = trip_data_RDD.dropDuplicates()

# Count the number of rows
print("There were {} rows before removing duplicates, and {} rows after removing duplicates".format(trip_data_RDD.count(), trip_data_RDD_NoDUP.count()))


There were 70 rows before removing duplicates, and 70 rows after removing duplicates
There were 51927 rows before removing duplicates, and 51927 rows after removing duplicates


In [51]:
#Part(1)(4):Name Columns
station_data_RDD = station_data_RDD.withColumnRenamed("landmark", "town")
trip_data_RDD = trip_data_RDD.withColumnRenamed("Bike #", "Bike_ID")

In [52]:
#Part(1)(5): Output Data Frame
station_data_RDD.show(5)
trip_data_RDD.show(5)

+----------+--------------------+---------+-----------+---------+--------+------------+
|station_id|                name|      lat|       long|dockcount|    town|installation|
+----------+--------------------+---------+-----------+---------+--------+------------+
|         2|San Jose Diridon ...|37.329732|-121.901782|       27|San Jose|    8/6/2013|
|         3|San Jose Civic Ce...|37.330698|-121.888979|       15|San Jose|    8/5/2013|
|         4|Santa Clara at Al...|37.333988|-121.894902|       11|San Jose|    8/6/2013|
|         5|    Adobe on Almaden|37.331415|  -121.8932|       19|San Jose|    8/5/2013|
|         6|    San Pedro Square|37.336721|-121.894074|       15|San Jose|    8/7/2013|
+----------+--------------------+---------+-----------+---------+--------+------------+
only showing top 5 rows

+-------+--------+---------------+--------------------+--------------+---------------+--------------------+------------+-------+---------------+--------+
|Trip ID|Duration|     Start 

In [None]:
#Part(1)(6):Create vertices


In [None]:
#Part(1)(7):Create vertices


In [None]:
#Part(1)(8):Create vertices
