In [1]:
#read csv file containing subset data (csv file uploaded into databricks as Community5.csv)
from pyspark.sql.types import *
# Explicit schema for the CSV: four integer-coded columns followed by the two
# string location columns. Supplying it means Spark does not infer the types.
schema = (
    StructType()
    .add("Month", IntegerType())
    .add("Weekday", IntegerType())
    .add("Time_period", IntegerType())
    .add("Fare", IntegerType())
    .add("Pickup", StringType())
    .add("Dropoff", StringType())
)

# The first line of the file is a header row, so it is skipped rather than parsed as data.
data = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/FileStore/tables/Community5.csv")
)
data.show(10)

In [3]:
# Keep only the high-fare trips, which are the rows whose Fare code equals 2.
data = data.filter(data["Fare"] == 2)

In [4]:
# Let Month variable start from 0 to be consistent with other variables
from pyspark.sql import SQLContext
# Expose the current DataFrame to Spark SQL under the name "data".
# createOrReplaceTempView replaces registerTempTable, which has been deprecated
# since Spark 2.0.
data.createOrReplaceTempView("data")

# Subtract 1 from Month and pass every other column through unchanged.
# The query goes through the same `spark` session that loaded the CSV.
# NOTE: this cell reassigns `data`, so running it more than once subtracts 1
# from Month again each time -- run it exactly once after the filter cell.
data = spark.sql("SELECT Month-1 AS Month, Weekday, Time_period, Fare, Pickup, Dropoff FROM data")
data.show(10)

In [5]:
#create one-hot vector from categorical variables
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer 
from pyspark.ml.feature import OneHotEncoder 
from pyspark.ml.feature import VectorAssembler

# Step 1: string-index the two location columns (data.columns[4:6], i.e. Pickup
# and Dropoff). Each indexer is fitted here, so the pipeline below only has to
# apply the already-fitted models; results land in "<column>String".
indexer = []
for column in data.columns[4:6]:
    string_indexer = StringIndexer(inputCol=column, outputCol=column + "String")
    indexer.append(string_indexer.fit(data))
pipeline = Pipeline(stages=indexer)
data2 = pipeline.fit(data).transform(data).select(
    "Month",
    "Weekday",
    "Time_period",
    "Fare",
    "PickupString",
    "DropoffString",
)

# Step 2: one-hot encode every column of data2 into "<column>_ohe".
# dropLast=False keeps a slot for every category instead of dropping the last one.
encoder = []
for column in data2.columns:
    encoder.append(
        OneHotEncoder(inputCol=column, outputCol=column + "_ohe", dropLast=False)
    )
pipeline = Pipeline(stages=encoder)
data3 = pipeline.fit(data2).transform(data2)

# Step 3: keep only the encoded columns and concatenate them, in this order,
# into the single "features" vector column used for clustering.
ohe_columns = [
    "Month_ohe",
    "Weekday_ohe",
    "Time_period_ohe",
    "Fare_ohe",
    "PickupString_ohe",
    "DropoffString_ohe",
]
data3b = data3.select(*ohe_columns)

va = VectorAssembler(inputCols=ohe_columns, outputCol="features")
data4 = va.transform(data3b)
data4.show(5)

In [6]:
# get sum of squares distance to centroid for different k to plot 'elbow' plot
from pyspark.ml.clustering import KMeans

# Fit one KMeans model per candidate k (2..50) on the assembled feature vectors
# and print one "k,cost" line per model, ready to paste into an elbow plot.
# NOTE(review): no seed is passed to KMeans, so the costs may not be exactly
# reproducible between sessions -- confirm whether that matters here.
for i in range(2, 51):
    kmeans = KMeans(k=i, maxIter=100)
    clus = kmeans.fit(data4)
    # computeCost() was deprecated in Spark 2.4 and removed in 3.0. On newer
    # runtimes the same within-cluster sum of squared distances for the
    # training data is available as summary.trainingCost.
    if hasattr(clus, "computeCost"):
        kvalue = clus.computeCost(data4)
    else:
        kvalue = clus.summary.trainingCost
    # Function-call form of print works on both Python 2 and Python 3; the
    # original print statement is a SyntaxError on Python 3.
    print(str(i) + "," + str(kvalue))

In [8]:
# get cluster centers for optimal K
import pandas as pd
from pyspark.sql import SQLContext
# Refit with the k chosen from the elbow plot (15) and read back the centroids.
kmeans = KMeans(maxIter=100, k=15)
clus = kmeans.fit(data4)
centers = clus.clusterCenters()

# Go through pandas (one row per centroid) to build a Spark SQL DataFrame,
# which the notebook's `display` can render and export as a table.
centersdf = pd.DataFrame(centers)
sqlCtx = SQLContext(sc)
centers_table = sqlCtx.createDataFrame(centersdf)
display(centers_table)