In [0]:
# Evaluate `spark` as the cell's last expression so the notebook echoes it.
# This is the object every later cell goes through (`spark.read`, `spark.sql`).
# NOTE(review): presumably the SparkSession pre-created by the notebook runtime — confirm.
spark

In [0]:
# Load the CSV hosted at `filepath` into a PySpark DataFrame: user_logs
filepath = "s3://full-stack-bigdata-datasets/Big_Data/youtube_playlog.csv"

# Declare the schema explicitly instead of relying on `inferSchema`.
# Inference makes Spark scan the whole file once just to guess the column
# types before the real read; with ~25.7M rows that extra pass is costly.
# The types below are exactly what inference reports for this file
# (timestamp: integer, user: integer, song: string).
# Column names are back-quoted so they are always parsed as identifiers.
# Caveat: with a fixed schema, a value that does not parse as INT is read as
# null (default PERMISSIVE mode) rather than widening the column type.
user_logs_schema = "`timestamp` INT, `user` INT, `song` STRING"

user_logs = (
    spark.read.format("csv")
    .option("header", "true")  # first line holds the column names
    .schema(user_logs_schema)
    .load(filepath)
)

In [0]:
# Preview the first rows of the play log; the arguments spell out the
# defaults of `show` (20 rows, long cell values truncated).
user_logs.show(n=20, truncate=True)

+----------+----+-----------+
| timestamp|user|       song|
+----------+----+-----------+
|1392387533|   0|t1l8Z6gLPzo|
|1392387538|   1|t1l8Z6gLPzo|
|1392387556|   2|t1l8Z6gLPzo|
|1392387561|   3|we5gzZq5Avg|
|1392387566|   4|we5gzZq5Avg|
|1392387566|   5|we5gzZq5Avg|
|1392387574|   6|49esza4eiK4|
|1392387579|   2|BoO6LfR7ca0|
|1392387583|   7|DaH4W1rY9us|
|1392387584|   2|BoO6LfR7ca0|
|1392387590|   8|BoO6LfR7ca0|
|1392387590|   9|PAAUqBghiVo|
|1392387592|   3|7CkBU80AsVg|
|1392387593|  10|yaaZO-X4LIk|
|1392387595|  11|yaaZO-X4LIk|
|1392387607|   7|lddHsCBdQu8|
|1392387612|  12|mhsf7K6h7SY|
|1392387612|  13|mhsf7K6h7SY|
|1392387621|   3|3go6xFb0FrU|
|1392387624|  14|L4OZxO1TnuE|
+----------+----+-----------+
only showing top 20 rows



In [0]:
# Print the column names, data types and nullability of `user_logs` as a tree.
user_logs.printSchema()

root
 |-- timestamp: integer (nullable = true)
 |-- user: integer (nullable = true)
 |-- song: string (nullable = true)



In [0]:
# `describe()` returns a new DataFrame of summary statistics. Used as a bare
# expression, the notebook only echoes that DataFrame's repr (column names and
# types), not the statistics themselves — an action such as `show()` or
# `toPandas()` is needed to see the values.
user_logs.describe()

Out[5]: DataFrame[summary: string, timestamp: string, user: string, song: string]

In [0]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# The result is only five rows, but computing it scans the entire play log.
# It is marked for caching so that the actions run on `user_describe`
# (`toPandas()` here, `display(...)` in the next cell) reuse the computed
# rows instead of each re-scanning the full dataset.
user_describe = user_logs.describe().cache()

# Collect the small summary to the driver as a pandas DataFrame for display.
user_describe.toPandas()

Unnamed: 0,summary,timestamp,user,song
0,count,25739537.0,25739537.0,25739537
1,mean,1442700656.1045842,12697.352275450798,2.532571778181818E8
2,stddev,34432848.72371195,13094.065905828476,8.334645614940468E8
3,min,-139955897.0,0.0,---AtpxbkaE
4,max,1554321113.0,45903.0,zzzcFgRMY6c


In [0]:
# Alternative to `toPandas()`: hand the summary DataFrame to the
# notebook-provided `display` helper to render it as a table.
display(user_describe)

summary,timestamp,user,song
count,25739537.0,25739537.0,25739537
mean,1442700656.1045842,12697.352275450798,2.532571778181818E8
stddev,34432848.72371195,13094.065905828476,8.334645614940468E8
min,-139955897.0,0.0,---AtpxbkaE
max,1554321113.0,45903.0,zzzcFgRMY6c


In [0]:
# Expose `user_logs` to Spark SQL under the name `user_logs_table`
# (replacing any temp view already registered with that name).
user_logs.createOrReplaceTempView("user_logs_table")

In [0]:
# SQL version: total number of rows in the play log.
row_count_df = spark.sql("""SELECT COUNT(*) 
                      FROM user_logs_table""")
display(row_count_df)

count(1)
25739537


In [0]:
# DataFrame API equivalent of `SELECT COUNT(*)`: number of rows in `user_logs`.
user_logs.count()

Out[10]: 25739537

In [0]:
# SQL version: project only the `user` column and preview it.
users_sql_df = spark.sql("SELECT user FROM user_logs_table")
users_sql_df.show()

+----+
|user|
+----+
|   0|
|   1|
|   2|
|   3|
|   4|
|   5|
|   6|
|   2|
|   7|
|   2|
|   8|
|   9|
|   3|
|  10|
|  11|
|   7|
|  12|
|  13|
|   3|
|  14|
+----+
only showing top 20 rows



In [0]:
# DataFrame API version: project only the `user` column and preview it.
users_df = user_logs.select(user_logs["user"])
users_df.show()

+----+
|user|
+----+
|   0|
|   1|
|   2|
|   3|
|   4|
|   5|
|   6|
|   2|
|   7|
|   2|
|   8|
|   9|
|   3|
|  10|
|  11|
|   7|
|  12|
|  13|
|   3|
|  14|
+----+
only showing top 20 rows



In [0]:
# SQL version: one row per distinct user id.
distinct_users_sql_df = spark.sql("SELECT DISTINCT(user) FROM user_logs_table")
distinct_users_sql_df.show()

+----+
|user|
+----+
|  12|
|   1|
|  13|
|   6|
|  16|
|   3|
|  20|
|   5|
|  19|
|  15|
|   9|
|  17|
|   4|
|   8|
|   7|
|  10|
|  11|
|  14|
|   2|
|   0|
+----+
only showing top 20 rows



In [0]:
# DataFrame API version: one row per distinct user id.
distinct_users_df = user_logs.select("user").distinct()
distinct_users_df.show()

+----+
|user|
+----+
|  12|
|   1|
|  13|
|   6|
|  16|
|   3|
|  20|
|   5|
|  19|
|  15|
|   9|
|  17|
|   4|
|   8|
|   7|
|  10|
|  11|
|  14|
|   2|
|   0|
+----+
only showing top 20 rows



In [0]:
# SQL version: distinct user ids, with the output column renamed to
# `distinct_user`.
distinct_user_alias_sql_df = spark.sql(
    "SELECT DISTINCT(user) AS distinct_user FROM user_logs_table"
)
display(distinct_user_alias_sql_df)

distinct_user
148
463
471
496
833
1088
1238
1342
1580
1591


In [0]:
# DataFrame API version: alias the column while selecting it, then
# de-duplicate.
# Alternative solution: select first, de-duplicate, then rename with
#   user_logs.select('user').distinct().withColumnRenamed('user', 'distinct_user').show()
user_as_distinct = user_logs["user"].alias("distinct_user")
user_logs.select(user_as_distinct).distinct().show()

+-------------+
|distinct_user|
+-------------+
|           12|
|            1|
|           13|
|            6|
|           16|
|            3|
|           20|
|            5|
|           19|
|           15|
|            9|
|           17|
|            4|
|            8|
|            7|
|           10|
|           11|
|           14|
|            2|
|            0|
+-------------+
only showing top 20 rows



In [0]:
# SQL version: how many different users appear in the play log.
total_distinct_user_df = spark.sql("""
    SELECT COUNT(DISTINCT(user)) AS total_distinct_user
    FROM user_logs_table""")
total_distinct_user_df.show()


+-------------------+
|total_distinct_user|
+-------------------+
|              45904|
+-------------------+



In [0]:
# DataFrame API version: how many different users appear in the play log.
unique_users = user_logs.select("user").distinct()
unique_users.count()

Out[18]: 45904

In [0]:
# SQL version: how many different songs appear in the play log.
total_distinct_song_df = spark.sql("""
    SELECT COUNT(DISTINCT(song)) AS total_distinct_song
    FROM user_logs_table
""")
total_distinct_song_df.show()

+-------------------+
|total_distinct_song|
+-------------------+
|             631348|
+-------------------+



In [0]:
# DataFrame API version: how many different songs appear in the play log.
unique_songs = user_logs.select("song").distinct()
unique_songs.count()

Out[20]: 631348