In [15]:
import os, sys
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [7]:
import pyspark
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("ETL").getOrCreate()

In [17]:
spark = SparkSession.builder\
                .master("local[*]")\
                .appName("ETL")\
                .config("spark.executor.logs.rolling.time.interval", "daily")\
                .getOrCreate()

In [50]:
datasetDir = "hdfs://localhost:9000/"

In [51]:
file = datasetDir + "data/higgs-social_network.edgelist.gz"

In [52]:
datasetDir

'hdfs://localhost:9000/'

In [53]:
schema = StructType([StructField("follower", IntegerType()), StructField("followed", IntegerType())])

In [54]:
socialDF = spark.read.csv(path=file, sep=" ", schema=schema)

In [55]:
file = datasetDir + "data/higgs-retweet_network.edgelist.gz"
schema = StructType([StructField("tweeter", IntegerType()), StructField("tweeted", IntegerType()), StructField("occur", IntegerType())])
retweetDF = spark.read.csv(path=file, sep=" ", schema=schema)

In [56]:
file = datasetDir + "data/higgs-reply_network.edgelist.gz"
schema = StructType([StructField("replier", IntegerType()), StructField("replied", IntegerType()), StructField("occur", IntegerType())])
replyDF = spark.read.csv(path=file, sep=" ", schema=schema)

In [57]:
file = datasetDir + "data/higgs-mention_network.edgelist.gz"
schema = StructType([StructField("mentioner", IntegerType()), StructField("mentioned", IntegerType()), StructField("occur", IntegerType())])
mentionDF = spark.read.csv(path=file, sep=" ", schema=schema)

In [58]:
file = datasetDir + "data/higgs-activity_time.txt.gz"
schema = StructType([StructField("userA", IntegerType()), \
                     StructField("userB", IntegerType()), \
                     StructField("timestamp", IntegerType()), \
                    StructField("interaction", StringType())])
                    #Interaction can be: RT (retweet), MT (mention) or RE (reply)
activityDF = spark.read.csv(path=file, sep=" ", schema=schema)


# Convert CSV's dataframes to Apache Parquet files

In [59]:
socialDF.write.save(datasetDir + "data/higgs-social_network.parquet")
retweetDF.write.save(datasetDir + "data/higgs-retweet_network.parquet")
replyDF.write.save(datasetDir + "data/higgs-reply_network.parquet")
mentionDF.write.save(datasetdir + "data/higgs-mention_network.parquet")
activityDF.write.save(datasetDir + "data/higgs-activity_time.parquet")



# Load the parquet files into new dataframes

In [60]:
socialDFpq = spark.read.load(datasetDir + "data/higgs-social_network.parquet")
retweetDFpq = spark.read.load(datasetDir + "data/higgs-retweet_network.parquet")
replyDFpq = spark.read.load(datasetDir + "data/higgs-reply_network.parquet")
mentionDFpq = spark.read.load(datasetDir + "data/higgs-mention_network.parquet")
activityDFpq = spark.read.load(datasetDir + "data/higgs-activity_time.parquet")

In [61]:
socialDFpq.printSchema()
socialDFpq.schema

root
 |-- follower: integer (nullable = true)
 |-- followed: integer (nullable = true)



StructType(List(StructField(follower,IntegerType,true),StructField(followed,IntegerType,true)))

In [62]:
socialDFpq.show(3)
retweetDFpq.show(3)
replyDFpq.show(3)
mentionDFpq.show(3)
activityDFpq.show(3)

+--------+--------+
|follower|followed|
+--------+--------+
|       1|       2|
|       1|       3|
|       1|       4|
+--------+--------+
only showing top 3 rows

+-------+-------+-----+
|tweeter|tweeted|occur|
+-------+-------+-----+
| 298960| 105232|    1|
|  95688|   3393|    1|
| 353237|  62217|    1|
+-------+-------+-----+
only showing top 3 rows

+-------+-------+-----+
|replier|replied|occur|
+-------+-------+-----+
| 161345|   8614|    1|
| 428368|  11792|    1|
|  77904|  10701|    1|
+-------+-------+-----+
only showing top 3 rows

+---------+---------+-----+
|mentioner|mentioned|occur|
+---------+---------+-----+
|   316609|     5011|    1|
|   439696|    12389|    1|
|    60059|     6929|    1|
+---------+---------+-----+
only showing top 3 rows

+------+------+----------+-----------+
| userA| userB| timestamp|interaction|
+------+------+----------+-----------+
|223789|213163|1341100972|         MT|
|223789|213163|1341100972|         RE|
|376989| 50329|1341101181|       

# Spark SQL using DataFrames API

In [64]:
#USERS who have most followers
socialDFpq.groupBy("followed").agg(count("followed").alias("follower")).orderBy(desc("follower")).show(5)

+--------+--------+
|followed|follower|
+--------+--------+
|    1503|   51386|
|     206|   48414|
|      88|   45221|
|     138|   44188|
|    1062|   40120|
+--------+--------+
only showing top 5 rows



In [66]:
#users who have most mentions
mentionDF.groupBy("mentioned").agg(count("occur").alias("mentiones")).orderBy(desc("mentiones")).show(5)

+---------+---------+
|mentioned|mentiones|
+---------+---------+
|       88|    11953|
|      677|     3906|
|     2417|     2533|
|    59195|     1601|
|     3998|     1587|
+---------+---------+
only showing top 5 rows



In [67]:
# Of the top 5 followed users, how many mentions has each one?

# top_f contains "top 5 users who have most followers"
top_f = socialDFpq.groupBy("followed").agg(count("follower").alias("followers")).orderBy(desc("followers")).limit(5)

top_f.join(mentionDFpq, top_f.followed == mentionDFpq.mentioned)\
    .groupBy(top_f.followed, top_f.followers)\
        .agg(sum(mentionDFpq.occur).alias("mentions"))\
    .orderBy(desc("followers")).show()

+--------+---------+--------+
|followed|followers|mentions|
+--------+---------+--------+
|    1503|    51386|     150|
|     206|    48414|     397|
|      88|    45221|   15687|
|     138|    44188|     347|
|    1062|    40120|      84|
+--------+---------+--------+



# Spark SQL using SQL language

In [68]:
# create temporary views so we can use SQL statements
socialDFpq.createOrReplaceTempView("social")
retweetDFpq.createOrReplaceTempView("retweet")
replyDFpq.createOrReplaceTempView("reply")
mentionDFpq.createOrReplaceTempView("mention")
activityDFpq.createOrReplaceTempView("activity")

In [69]:
# Users who have most followers
spark.sql("select followed, count(follower) as followers from social group by followed order by followers desc").show(5)

# Users who have most mentions
spark.sql("select mentioned, count(occur) as mentions from mention group by mentioned order by mentions desc").show(5)

+--------+---------+
|followed|followers|
+--------+---------+
|    1503|    51386|
|     206|    48414|
|      88|    45221|
|     138|    44188|
|    1062|    40120|
+--------+---------+
only showing top 5 rows

+---------+--------+
|mentioned|mentions|
+---------+--------+
|       88|   11953|
|      677|    3906|
|     2417|    2533|
|    59195|    1601|
|     3998|    1587|
+---------+--------+
only showing top 5 rows



In [70]:


# Of the top 5 followed users, how many mentions has each one?
spark.sql("""
select 5_top_f.followed, 5_top_f.followers, sum(m.occur) as mentions
    from 
        -- subquery that contains top 5 of followed users
        (select followed, count(follower) as followers from social group by followed order by followers desc limit 5) 5_top_f, 
        mention as m
    where 5_top_f.followed = m.mentioned
    group by 5_top_f.followed, followers
    order by followers desc
        """).show()



+--------+---------+--------+
|followed|followers|mentions|
+--------+---------+--------+
|    1503|    51386|     150|
|     206|    48414|     397|
|      88|    45221|   15687|
|     138|    44188|     347|
|    1062|    40120|      84|
+--------+---------+--------+



# Performance testing

# GZIP Compressed CSV file vs Parquet file

In [71]:
%%time
# GZIP Compressed CSV
socialDF.groupBy("followed").agg(count("followed").alias("followers")).orderBy(desc("followers")).show(5)


+--------+---------+
|followed|followers|
+--------+---------+
|    1503|    51386|
|     206|    48414|
|      88|    45221|
|     138|    44188|
|    1062|    40120|
+--------+---------+
only showing top 5 rows

CPU times: user 0 ns, sys: 6.81 ms, total: 6.81 ms
Wall time: 19.3 s


In [72]:
%%time
# Parquet file
socialDFpq.groupBy("followed").agg(count("followed").alias("followers")).orderBy(desc("followers")).show(5)


+--------+---------+
|followed|followers|
+--------+---------+
|    1503|    51386|
|     206|    48414|
|      88|    45221|
|     138|    44188|
|    1062|    40120|
+--------+---------+
only showing top 5 rows

CPU times: user 5.3 ms, sys: 0 ns, total: 5.3 ms
Wall time: 5.38 s


# Cached DF vs not cached DF

This time we will cache the 2 previous dataframes (socialDF and socialDFpq) and see how faster is.

In [73]:
# cache dataframes
socialDF.cache()
socialDFpq.cache()

# remove from cache
#socialDF.unpersist()
#socialDFpq.unpersist()

DataFrame[follower: int, followed: int]

In [74]:
%%time
# GZIP Compressed CSV (dataframe cached)
socialDF.groupBy("followed").agg(count("followed").alias("followers")).orderBy(desc("followers")).show(5)

+--------+---------+
|followed|followers|
+--------+---------+
|    1503|    51386|
|     206|    48414|
|      88|    45221|
|     138|    44188|
|    1062|    40120|
+--------+---------+
only showing top 5 rows

CPU times: user 9.29 ms, sys: 778 µs, total: 10.1 ms
Wall time: 33.8 s


In [75]:
%%time
# Parquet file (dataframe cached)
socialDFpq.groupBy("followed").agg(count("followed").alias("followers")).orderBy(desc("followers")).show(5)


+--------+---------+
|followed|followers|
+--------+---------+
|    1503|    51386|
|     206|    48414|
|      88|    45221|
|     138|    44188|
|    1062|    40120|
+--------+---------+
only showing top 5 rows

CPU times: user 8.12 ms, sys: 680 µs, total: 8.8 ms
Wall time: 13.1 s
