#### Spark & Hadoop Operations

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
# Assemble the Spark configuration in a single builder expression.
# NOTE(review): the master is 'local', so the spark.executor.* values below
# may have no effect in this mode — confirm against the intended deployment.
conf = (
    SparkConf()
    .setAppName("ml_sparkSession")
    .setMaster('local')
    .setAll([
        ("spark.executor.memory", "4g"),
        ("spark.executor.cores", "2"),
        ("spark.sql.shuffle.partitions", "4"),
        ("spark.executor.instances", "2"),
    ])
)
# Obtain the session for this notebook (an existing one is reused if present).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [13]:
# Root HDFS directory that holds the silver-layer parquet tables read below.
SILVER_DATA_DIR = "hdfs://namenode:9000/datalake/silver_data"

# Read the four silver tables.
silver_artist = spark.read.parquet(SILVER_DATA_DIR + "/silver_artist")
silver_album = spark.read.parquet(SILVER_DATA_DIR + "/silver_album")
silver_track = spark.read.parquet(SILVER_DATA_DIR + "/silver_track")
silver_track_feature = spark.read.parquet(SILVER_DATA_DIR + "/silver_track_feature")

# Give the artist's generic 'id' / 'name' columns artist-specific names so they
# do not collide with the album columns after the join.
silver_artist = (
    silver_artist
    .withColumnRenamed('id', 'artist_id')
    .withColumnRenamed('name', 'artist_name')
)

# Drop the album columns that are not needed, then prefix the remaining generic
# names. 'popularity', 'url' and 'link_image' are dropped here, so the columns
# of those names selected into the summary below can only come from
# silver_artist. (A former rename of 'popularity' to 'album_popularity' was
# removed: the column no longer exists after the drop, so it was a no-op.)
silver_album = (
    silver_album
    .drop('artist', 'label', 'popularity', 'copyrights', 'url', 'link_image')
    .withColumnRenamed('id', 'album_id')
    .withColumnRenamed('name', 'album_name')
    .withColumnRenamed('type', 'album_type')
)

# Summary album table: artists inner-joined to their albums on 'artist_id',
# projected to the columns of interest in a fixed order.
summary_album_table = (
    silver_artist
    .join(silver_album, on='artist_id', how='inner')
    .select(
        'artist_id', 'artist_name', 'album_id', 'album_name',
        'genres', 'followers', 'popularity', 'link_image', 'url', 'album_type',
        'total_tracks', 'release_date', 'release_date_precision',
    )
)

# Summary track-feature table: keep only the identifying track columns, then
# left-join the audio features on 'id' so tracks without features are kept.
silver_track = silver_track.select('album_id', 'album_name', 'id', 'name', 'preview')
summary_feature_table = (
    silver_track
    .join(silver_track_feature, on='id', how='left')
    .withColumnRenamed('id', 'track_id')
)

In [None]:
# Preview both summary tables, album summary first, using show()'s defaults.
for _summary_table in (summary_album_table, summary_feature_table):
    _summary_table.show()

[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+
|            track_id|            album_id|          album_name|                name|             preview|danceability|energy| key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+
|0006Rv1e2Xfh6Qooy...|1BD29pKydSXe1EsHF...|Colin Frake On Fi...|           Nightwood|https://p.scdn.co...|       0.295| 0.498|   2|   -9.19|   0|     0.0301|       0.795|           0.944|   0.107| 0.0445| 89.048|             3|
|0007AYhg2UQbEm88m...|32RJzqlapfiU0fr2l...|             E.D.G.E|Mandarin Oranges ...|htt

                                                                                