## K-Means Clustering of Stack Overflow
#### Import pyspark libraries
#### Import R Answers csv

In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [3]:
%fs ls /FileStore/tables/R_Answers.csv

In [4]:
# Load the Stack Overflow R answers from the R_Answers_csv table.
data = spark.sql("SELECT * FROM R_Answers_csv")

# Discard every row that contains a null, then preview the first five rows.
data1 = data.dropna(how="any")
data1.show(n=5)

In [5]:
# Print the column names and types of the null-free answers DataFrame.
data1.printSchema()

In [6]:
# data2 holds the question tags read from the R_Tags_csv table;
# preview its first five rows.
data2 = spark.sql("SELECT * FROM R_Tags_csv")
data2.show(n=5)

In [7]:
# Attach tags to answers: a tag row is paired with every answer whose
# ParentId equals the tag row's Id (inner join, unmatched rows are dropped).
data3 = data2.join(data1, on=data2.Id == data1.ParentId, how="inner")
data3.show(n=5)

## StringIndexer

In [9]:
from pyspark.ml.feature import StringIndexer

# Encode the IsAcceptedAnswer column as a numeric index so that it can be
# fed to the VectorAssembler alongside the other numeric columns.
indexer = StringIndexer(
    inputCol="IsAcceptedAnswer",
    outputCol="IsAcceptedAnswer_Indexed",
).fit(data3)
indexed_data1 = indexer.transform(data3)

# Display the indexed rows without the Id column(s).
indexed_data1.drop("Id").show()

In [10]:
# Pack the numeric columns and the indexed acceptance flag into the single
# vector column ("features") that KMeans consumes.
# NOTE(review): OwnerUserId and ParentId look like identifiers rather than
# measurements; K-Means is distance based, so confirm they are intended
# as clustering features.
assembler = VectorAssembler(
    inputCols=["OwnerUserId", "ParentId", "Score", "IsAcceptedAnswer_Indexed"],
    outputCol="features",
)
train = assembler.transform(indexed_data1)

# Number of clusters; the per-cluster inspection cell at the end of the
# notebook iterates over range(knum).
knum = 2
# predictionCol is set explicitly to "prediction" because the later cells
# select, group and filter on that column name. The seed is fixed at 0.
kmeans = KMeans(
    featuresCol=assembler.getOutputCol(),
    predictionCol="prediction",
    k=knum,
    seed=0,
)
model = kmeans.fit(train)
# print() call form: valid on Python 3 (the bare print statement is a
# SyntaxError there) and consistent with the other cells of this notebook.
print("Model Created!")

## Visualize Predictions
### Is Accepted Answer

In [12]:
# The fitted model is applied to the very DataFrame it was trained on;
# no train/test split is made for this clustering.
predictions = model.transform(train)

# Show how many rows were assigned to each cluster, ordered by cluster id.
(
    predictions
    .groupBy("prediction")
    .count()
    .orderBy("prediction")
    .show()
)

In [13]:
# Preview the assembled feature vector next to its assigned cluster
# for the first five rows.
predictions.select("features", "prediction").show(5)

In [14]:
# Same preview as above, widened to the first 100 rows.
predictions.select("features", "prediction").show(100)

## ClusteringEvaluator
### Import from PySpark

In [16]:
# ClusteringEvaluator only exists from Spark 2.3.0 onwards. Probe for it
# instead of hard-coding the flag, so that the fallback branch below is
# actually reachable on older clusters rather than failing on the import.
try:
    from pyspark.ml.evaluation import ClusteringEvaluator
except ImportError:
    ClusteringEvaluator = None
IS_SPARK230 = ClusteringEvaluator is not None

if IS_SPARK230:
    # Evaluate the clustering with the Silhouette score. The default
    # evaluator reads the "features" and "prediction" columns, which is
    # exactly what the KMeans cell produced, so no parameters are set.
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    # A value close to 1 means rows sit well inside their own cluster.
    print("Silhouette with squared euclidean distance = " + str(silhouette))
else:
    # Older Spark: evaluate the clustering with the Within Set Sum of
    # Squared Errors instead.
    wssse = model.computeCost(train)
    print("Within Set Sum of Squared Errors = " + str(wssse))

In [17]:
# Print the coordinates of every fitted cluster center, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

In [18]:
# Look at the features of each cluster.

# Maps the cluster id as a string ("0" .. str(knum - 1)) to a DataFrame
# holding the rows that the model assigned to that cluster.
customerCluster = {}
for i in range(knum):
    # Filter on the prediction column first (with a Column expression rather
    # than a string-built SQL fragment), then keep only the columns of
    # interest; the projection itself does not carry "prediction".
    tmp = predictions.where(predictions["prediction"] == i).select(
        "Tag", "OwnerUserId", "CreationDate", "ParentId", "Score",
        "IsAcceptedAnswer_Indexed", "Body",
    )
    customerCluster[str(i)] = tmp
    # print() call form: valid on Python 3 and consistent with the other cells.
    print("Cluster" + str(i))
    tmp.show(5)