# Data Analysis

In [1]:
from __future__ import print_function
from pyspark.sql.functions import count, countDistinct
from pyspark.sql.functions import col
from pyspark.sql.functions import explode, expr
from pyspark.sql.functions import split
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark import SparkContext
from pyspark.sql import SQLContext
import os

spark = SparkSession.builder.getOrCreate()


In [2]:
#Load tweets as Dataframe to Spark
tweetsDF = spark.read.json("file:///Users/Laith/Downloads/features_tweets.json")

#Print the schema/data structure in a tree format
#tweetsDF.printSchema()

In [6]:
#cleanTweetsDF = cleanTweetsDF.where("friends_count != 0")
#cleanTweetsDF = cleanTweetsDF.where("followers_count != 0")
#cleanTweetsDF = cleanTweetsDF.where("friends_count < 10000")
#cleanTweetsDF = cleanTweetsDF.where("friends_count > 20")
#cleanTweetsDF = cleanTweetsDF.where("followers_count < 400000")
#cleanTweetsDF = cleanTweetsDF.where("followers_count < 400000")
#cleanTweetsDF.count()

6636

In [8]:
assembler = VectorAssembler(
    inputCols=["followers_count", "friends_count"],
    outputCol="features")
output = assembler.transform(tweetsDF)
print("Assembled columns 'followers_count', 'friends_count' to vector column 'features'")
featuresData = output.select("features", "userId")
featuresData.show(20)

Assembled columns 'followers_count', 'friends_count' to vector column 'features'
+---------------+-------------------+
|       features|             userId|
+---------------+-------------------+
|[2387.0,4996.0]|          351883595|
|  [326.0,403.0]| 744537718043086849|
|[3613.0,2396.0]|          112500296|
|[2221.0,4477.0]|1185604978964127756|
| [493.0,2061.0]|           57025876|
| [2832.0,785.0]|          106469050|
|[1808.0,1341.0]|         3511592775|
| [410.0,1106.0]| 773870985229590530|
|[4207.0,1042.0]|           40071024|
|[1203.0,2496.0]|          274021172|
|[4432.0,2269.0]|          363867255|
| [601.0,1444.0]|         2702818325|
|  [222.0,164.0]|           19535313|
|  [120.0,257.0]|          585658692|
|[7131.0,1022.0]|         2542945434|
|[1264.0,1393.0]|         1448731784|
| [470.0,1506.0]|         2197417861|
|  [668.0,251.0]| 752687748239200256|
|   [76.0,290.0]| 880568786654568448|
|  [595.0,458.0]|          835592406|
+---------------+-------------------+
only sh

In [4]:
#tweetsDF.count()

# Clustering

In [9]:
#Visualize tweeps based on followers and friends count, and Location/UserId
# Trains a k-means model.
k = 5
kmeans = KMeans().setK(k).setSeed(1)

model = kmeans.fit(featuresData)

#model2 = kmeans.fit(featuresL)

# Make predictions
predictions = model.transform(featuresData)
#predictions2 = model.transform(featuresL)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))



# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)



Silhouette with squared euclidean distance = 0.9640725407085363
Cluster Centers: 
[1585.13520954 1588.34184766]
[82509.81578947 11693.47368421]
[564248.   2572.]
[200248.5   41712.25]
[27632.26923077  9418.23076923]


In [6]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np

In [10]:
featuresData.printSchema()
featuresData.count()
predictions.printSchema()
predictions.count()

root
 |-- features: vector (nullable = true)
 |-- userId: long (nullable = true)

root
 |-- features: vector (nullable = true)
 |-- userId: long (nullable = true)
 |-- prediction: integer (nullable = false)



6737

In [11]:
#add to original data
tweetsDF = tweetsDF.select("userId", "followers_count", "friends_count")
dfPred = predictions.join(tweetsDF, 'userId')
dfPred.printSchema()
dfPred.show(4, False)
dfPred.count()

#dfPred.groupBy('prediction').count().sort(col("count").desc()).show()
#predictions.printSchema()
#predictions.show(4, False)
#Select and convert to panda
#pdDFpred = dfPred.select("userId", "location", "prediction", "followers_count", "friends_count").toPandas().set_index('userId')
#pdDFpred.count()

root
 |-- userId: long (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: integer (nullable = false)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)

+------------------+---------------+----------+---------------+-------------+
|userId            |features       |prediction|followers_count|friends_count|
+------------------+---------------+----------+---------------+-------------+
|351883595         |[2387.0,4996.0]|0         |2387           |4996         |
|744537718043086849|[326.0,403.0]  |0         |326            |403          |
|112500296         |[3613.0,2396.0]|0         |3613           |2396         |
|112500296         |[3613.0,2396.0]|0         |3613           |2396         |
+------------------+---------------+----------+---------------+-------------+
only showing top 4 rows



20199

In [None]:
#visualize clusters of users
fig, ax = plt.subplots()
scatter = ax.scatter(dfPred.followers_count, dfPred.friends_count, c=dfPred.prediction, s=50)

legend = ax.legend(*scatter.legend_elements(),
                    loc="upper right", title="Clusters")
ax.add_artist(legend)

#Draw
for center in centers:
    ax.scatter(center[0], center[1], c='black', s=200, alpha=0.5)

plt.title("Clustering Results")
plt.xlabel("Followers Count")
plt.ylabel("Friends Count")

plt.show()
