In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql.types import TimestampType, DateType
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [2]:
def create_spark_session():
    """Build (or fetch) the SparkSession used throughout this notebook.

    The session is configured with the ``org.apache.hadoop:hadoop-aws:2.7.0``
    package via ``spark.jars.packages``. ``getOrCreate`` is used, so calling
    this more than once hands back the already-running session.

    Returns:
        The SparkSession produced by the builder.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()

In [3]:
# Load AWS credentials from the local `dl.cfg` file and export them as
# environment variables for this process.
config = configparser.ConfigParser()
# Use a context manager so the file handle is closed as soon as parsing is
# done; a missing `dl.cfg` still raises here instead of failing later.
with open('dl.cfg') as config_file:
    config.read_file(config_file)

os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [4]:
# Session shared by every cell below.
spark = create_spark_session()

# Local data locations: `output_data` is the directory the parquet tables are
# read back from in the following cells.
input_data = "./data/"
output_data = "./data/sparkify/"

In [8]:
# Read the song table back from its parquet directory, then show its schema
# and row count as a quick sanity check of the written output.
song_output = spark.read.parquet(f"{output_data}song_table/")

song_output.printSchema()
song_output.count()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)



71

In [None]:
# Same check as the earlier song-table cell: reload the parquet directory,
# print the schema and return the number of rows.
song_table_path = output_data + "song_table/"
song_output = spark.read.parquet(song_table_path)

song_output.printSchema()
song_output.count()

In [22]:
# Names of the parquet tables under `output_data`; each name is also the
# notebook-level variable the loaded DataFrame is bound to.
list_variables = ["song_table", "artist_table", 'time_table', 'user_table', 'songplays_table']

for val in list_variables:
    print(f"READING {val}")
    table_df = spark.read.parquet(output_data + val)
    # Later cells use the tables by bare name (e.g. `songplays_table`), so bind
    # each DataFrame as a global directly instead of generating source code
    # and running it through exec().
    globals()[val] = table_df
    print(f"SCHEMA {val}")
    table_df.printSchema()
    print(f"table {val} has {table_df.count()} rows \n")

READING song_table
SCHEMA song_table
root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)

table song_table has 71 rows 

READING artist_table
SCHEMA artist_table
root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

table artist_table has 69 rows 

READING time_table
SCHEMA time_table
root
 |-- ts: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

table time_table has 6820 rows 

READING user_table
SCHEMA user_table
root
 

In [32]:
# Distinct users per week: attach the time attributes to every songplay via
# the shared `ts` value, then count distinct `user_id`s per `week`.
plays_with_time = songplays_table.join(
    time_table,
    on=[songplays_table.ts == time_table.ts],
    how="left",  # keep songplays even when no time row matches
)
weekly_users = plays_with_time.groupBy("week").agg(
    F.countDistinct("user_id").alias("count_user")
)
weekly_users.show()

+----+----------+
|week|count_user|
+----+----------+
|  44|        41|
|  47|        74|
|  48|        62|
|  45|        69|
|  46|        60|
+----+----------+



In [37]:
# Number of songplays per (gender, level), using the user table's `level`.
# The count is computed with agg(...).alias(...) so the result column is
# actually named `number_songplays`; calling `.alias()` on the DataFrame
# returned by `.count()` only aliases the DataFrame and leaves the column
# named `count`.
(
    songplays_table
    .join(
        user_table,
        on=[songplays_table.user_id == user_table.user_id],
        how="left",
    )
    .groupBy("gender", user_table.level)
    .agg(F.count("*").alias("number_songplays"))
    .show()
)

+------+-----+-----+
|gender|level|count|
+------+-----+-----+
|     M| free|  585|
|     F| free|  530|
|     F| paid| 4357|
|     M| paid| 1348|
+------+-----+-----+



In [45]:
# Distinct users per location, one column per week: join songplays to the
# time table on `ts`, group by `location`, and pivot on `week`.
plays_by_time = songplays_table.join(
    time_table,
    on=[songplays_table.ts == time_table.ts],
    how="left",
)
users_by_location_week = (
    plays_by_time
    .groupBy("location")
    .pivot("week")
    .agg(F.countDistinct("user_id").alias("number_user"))
)
users_by_location_week.show()

+--------------------+----+----+----+----+----+
|            location|  44|  45|  46|  47|  48|
+--------------------+----+----+----+----+----+
|        Richmond, VA|null|   1|null|   1|   1|
|San Diego-Carlsba...|null|   1|null|   1|null|
|La Crosse-Onalask...|   1|   1|   1|   1|   1|
|New Haven-Milford...|   1|   1|   1|   1|   1|
|Birmingham-Hoover...|   1|   1|   1|   1|   1|
|Indianapolis-Carm...|   1|null|   1|   1|   1|
|Seattle-Tacoma-Be...|null|null|   1|   1|   1|
|       Palestine, TX|null|   1|null|   1|   1|
|      Santa Rosa, CA|   1|   1|   1|   1|   1|
|   Winston-Salem, NC|   1|   1|   1|   1|null|
|Los Angeles-Long ...|null|   1|   1|   3|null|
|Minneapolis-St. P...|null|   1|   1|null|   1|
|San Francisco-Oak...|   1|   1|   1|   1|   2|
|Phoenix-Mesa-Scot...|   1|   1|   1|   1|   1|
|Augusta-Richmond ...|null|null|   1|   1|   1|
|Miami-Fort Lauder...|null|   1|null|null|null|
|San Jose-Sunnyval...|   1|   2|   2|   2|   2|
|       Yuba City, CA|   1|   1|   1|   