In [1]:
import h5py
import sys
import os
import glob
import csv
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from hdfs import InsecureClient
import numpy as np
import tables

Go through the base path and find all the .h5 files that are present there then save them to a csv file in order to load them later into spark

TODO: this actually needs to be reworked to create a table based on the hdfs file directory instead of local, however it doesn't matter as long as we don't change the files just run the code and pretend it does the right thing for now

In [2]:
basedir = '../MillionSongSubset/data'
os.listdir(basedir)

['A', 'B', 'files.csv']

In [3]:
ext='.h5'
all_files = []
for root, dirs, files in os.walk(basedir):
    files = glob.glob(os.path.join(root,'*'+ext))
    all_files.append(files)

flat_list = [item for sublist in all_files for item in sublist]

In [4]:
file = '/files.csv'
with open(basedir + file, 'w', newline='') as myfile:
    for line in flat_list:
        myfile.write(line)
        myfile.write('\n')

Set up the spark application

In [5]:
# calculate the number of partitions we want for the rdds
num_nodes = 3
num_rep = 2 * num_nodes

conf = (SparkConf()
   .setMaster("spark://192.168.2.110:7077")
   .setAppName("Group14")
   .set("spark.executor.cores", 2)
   .set("spark.pytspark.python","python3"))

sc = SparkContext(conf = conf)

In [6]:
sqlContext = SQLContext(sc)

code to convert filenames to actual file content

In [7]:
# good methods to explore the h5 inread: .keys(), .attrs.items() see documentation of h5py

def get_title(file, idx=0):
    return file['metadata']['songs']['title'][idx].decode("utf-8")

def get_artist_name(file, idx=0):
    return file['metadata']['songs']['artist_name'][idx].decode("utf-8")


def get_artist_familiarity(h5,songidx=0):
    """
    Get artist familiarity from a HDF5 song file, by default the first song in it
    """
    return str(h5['metadata']['songs']['artist_familiarity'][songidx]).encode('utf-8', 'ignore').decode("utf-8")

def get_artist_hotttnesss(h5,songidx=0):
    """
    Get artist hotttnesss from a HDF5 song file, by default the first song in it
    """
    return str(h5['metadata']['songs']['artist_hotttnesss'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_artist_id(h5,songidx=0):
    """
    Get artist id from a HDF5 song file, by default the first song in it
    """
    return h5['metadata']['songs']['artist_id'][songidx].decode("utf-8")

def get_artist_mbid(h5,songidx=0):
    """
    Get artist musibrainz id from a HDF5 song file, by default the first song in it
    """
    return h5['metadata']['songs']['artist_mbid'][songidx].decode("utf-8")

def get_artist_playmeid(h5,songidx=0):
    """
    Get artist playme id from a HDF5 song file, by default the first song in it
    """
    return str(h5['metadata']['songs']['artist_playmeid'][songidx]).encode('utf-8','ignore').decode("utf-8")

def get_artist_7digitalid(h5,songidx=0):
    """
    Get artist 7digital id from a HDF5 song file, by default the first song in it
    """
    return str(h5['metadata']['songs']['artist_7digitalid'][songidx]).encode('utf-8','ignore').decode("utf-8")

def get_artist_latitude(h5,songidx=0):
    """
    Get artist latitude from a HDF5 song file, by default the first song in it
    """
    return str(h5['metadata']['songs']['artist_latitude'][songidx]).encode('utf-8','ignore').decode("utf-8")

def get_artist_longitude(h5,songidx=0):
    """
    Get artist longitude from a HDF5 song file, by default the first song in it
    """
    return str(h5['metadata']['songs']['artist_longitude'][songidx]).encode('utf-8','ignore').decode("utf-8")

def get_artist_location(h5,songidx=0):
    """
    Get artist location from a HDF5 song file, by default the first song in it
    """
    return h5['metadata']['songs']['artist_location'][songidx].decode("utf-8")


def get_release(h5,songidx=0):
    """
    Get release from a HDF5 song file, by default the first song in it
    """
    return h5['metadata']['songs']['release'][songidx].decode("utf-8")

def get_release_7digitalid(h5,songidx=0):
    """
    Get release 7digital id from a HDF5 song file, by default the first song in it
    """
    return str(h5['metadata']['songs']['release_7digitalid'][songidx]).encode('utf-8','ignore').decode("utf-8")

def get_song_id(h5,songidx=0):
    """
    Get song id from a HDF5 song file, by default the first song in it
    """
    return h5['metadata']['songs']['song_id'][songidx].decode("utf-8")

def get_song_hotttnesss(h5,songidx=0):
    """
    Get song hotttnesss from a HDF5 song file, by default the first song in it
    """
    return str(h5['metadata']['songs']['song_hotttnesss'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_track_7digitalid(h5,songidx=0):
    """
    Get track 7digital id from a HDF5 song file, by default the first song in it
    """
    return str(h5['metadata']['songs']['track_7digitalid'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_analysis_sample_rate(h5,songidx=0):
    """
    Get analysis sample rate from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['analysis_sample_rate'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_audio_md5(h5,songidx=0):
    """
    Get audio MD5 from a HDF5 song file, by default the first song in it
    """
    return h5['analysis']['songs']['audio_md5'][songidx].decode("utf-8")


def get_danceability(h5,songidx=0):
    """
    Get danceability from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['danceability'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_duration(h5,songidx=0):
    """
    Get duration from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['duration'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_end_of_fade_in(h5,songidx=0):
    """
    Get end of fade in from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['end_of_fade_in'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_energy(h5,songidx=0):
    """
    Get energy from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['energy'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_key(h5,songidx=0):
    """
    Get key from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['key'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_key_confidence(h5,songidx=0):
    """
    Get key confidence from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['key_confidence'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_loudness(h5,songidx=0):
    """
    Get loudness from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['loudness'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_mode(h5,songidx=0):
    """
    Get mode from a HDF5 song file, by default the first song ifn it
    """
    return str(h5['analysis']['songs']['mode'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_mode_confidence(h5,songidx=0):
    """
    Get mode confidence from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['mode_confidence'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_start_of_fade_out(h5,songidx=0):
    """
    Get start of fade out from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['start_of_fade_out'][songidx]).encode('utf-8','ignore').decode("utf-8")


def get_tempo(h5,songidx=0):
    """
    Get tempo from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['tempo'][songidx]).encode('utf-8','ignore').decode("utf-8")

def get_time_signature(h5,songidx=0):
    """
    Get signature from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['time_signature'][songidx]).encode('utf-8','ignore').decode("utf-8")

def get_time_signature_confidence(h5,songidx=0):
    """
    Get signature confidence from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['time_signature_confidence'][songidx]).encode('utf-8','ignore').decode("utf-8")

def get_track_id(h5,songidx=0):
    """
    Get track id from a HDF5 song file, by default the first song in it
    """
    return str(h5['analysis']['songs']['track_id'][songidx]).encode('utf-8','ignore').decode("utf-8")

def get_year(h5,songidx=0):
    """
    Get release year from a HDF5 song file, by default the first song in it
    """
    return str(h5['musicbrainz']['songs']['year'][songidx]).encode('utf-8','ignore').decode("utf-8")

In [10]:
def get_similar_artists(h5,songidx=0):
    """
    Get similar artists array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.metadata.songs.nrows == songidx + 1:
        return h5.root.metadata.similar_artists[h5.root.metadata.songs.cols.idx_similar_artists[songidx]:]
    return h5.root.metadata.similar_artists[h5.root.metadata.songs.cols.idx_similar_artists[songidx]:
                                            h5.root.metadata.songs.cols.idx_similar_artists[songidx+1]]

def get_artist_terms(h5,songidx=0):
    """
    Get artist terms array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.metadata.songs.nrows == songidx + 1:
        return h5.root.metadata.artist_terms[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:]
    return h5.root.metadata.artist_terms[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:
                                            h5.root.metadata.songs.cols.idx_artist_terms[songidx+1]]

def get_artist_terms_freq(h5,songidx=0):
    """
    Get artist terms array frequencies. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.metadata.songs.nrows == songidx + 1:
        return h5.root.metadata.artist_terms_freq[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:]
    return h5.root.metadata.artist_terms_freq[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:
                                              h5.root.metadata.songs.cols.idx_artist_terms[songidx+1]]

def get_artist_terms_weight(h5,songidx=0):
    """
    Get artist terms array frequencies. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.metadata.songs.nrows == songidx + 1:
        return h5.root.metadata.artist_terms_weight[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:]
    return h5.root.metadata.artist_terms_weight[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:
                                                h5.root.metadata.songs.cols.idx_artist_terms[songidx+1]]


In [11]:
# import hdf5_getters
import tables
import requests
import io

# right now only mapping for the title needs to be changed to extract all the other features
def f(x):
    # set up the url where the data is stored (yes it's ugly)
    base_hdfs = '/LDSA/data'
    url = 'http://192.168.2.110:50070/webhdfs/v1'
    op = '?op=OPEN'
    full_path = url + base_hdfs + x[2:] + op
    # read in song file from hdfs api and then extract the wanted attributes
    with h5py.File(io.BytesIO(requests.get(url = full_path).content), 'r') as f:
        song = {}
        song['artist_name'] = get_artist_name(f)
        song['title'] = get_title(f)
        song['artist_familiarity'] = get_artist_familiarity(f)
        song['artist_hotttnesss'] = get_artist_hotttnesss(f)
        song['artist_id'] = get_artist_id(f)
        song['artist_mbid'] = get_artist_mbid(f)
        song['artist_playmeid'] = get_artist_playmeid(f)
        song['artist_7digitalid'] = get_artist_7digitalid(f)
        song['artist_latitude'] = get_artist_latitude(f)
        song['artist_longitude'] = get_artist_longitude(f)
        song['artist_location'] = get_artist_location(f)
        song['release'] = get_release(f)
        song['release_7digitalid'] = get_release_7digitalid(f)
        song['song_id'] = get_song_id(f)
        song['song_hotnesss'] = get_song_hotttnesss(f)
        song['track_7digitalid'] = get_track_7digitalid(f)
        song['analysis_sample_rate'] = get_analysis_sample_rate(f)
        song['audio_md5'] = get_audio_md5(f)
        song['danceability'] = get_danceability(f)
        song['duration'] = get_duration(f)
        song['end_of_fade_in'] = get_end_of_fade_in(f)
        song['energy'] = get_energy(f)
        song['key'] = get_key(f)
        song['key_confidence'] = get_key_confidence(f)
        song['loudness'] = get_loudness(f)
        song['mode'] = get_mode(f)
        song['mode_confidence'] = get_mode_confidence(f)
        song['start_of_fade_out'] = get_start_of_fade_out(f)
        song['tempo'] = get_tempo(f)
        song['time_signature'] = get_time_signature(f)
        song['time_signature_confidence'] = get_time_signature_confidence(f)
        song['track_id'] = get_track_id(f)
        song['year'] = get_year(f)
        
    return song



# right now only mapping for the title needs to be changed to extract all the other features
def f2(x):
    # set up the url where the data is stored (yes it's ugly)
    base_hdfs = '/LDSA/data'
    url = 'http://192.168.2.110:50070/webhdfs/v1'
    op = '?op=OPEN'
    full_path = url + base_hdfs + x[2:] + op
    # read in song file from hdfs api and then extract the wanted attributes
    with h5py.File(io.BytesIO(requests.get(url = full_path).content), 'r') as f:
        song = []
        
        song.append(get_artist_name(f))
        song.append(get_title(f))
        song.append(get_artist_familiarity(f))
        song.append(get_artist_hotttnesss(f))
        song.append(get_artist_id(f))
        song.append(get_artist_mbid(f))
        song.append(get_artist_playmeid(f))
        song.append(get_artist_7digitalid(f))
        song.append(get_artist_latitude(f))
        song.append(get_artist_longitude(f))
        song.append(get_artist_location(f))
        song.append(get_release(f))
        song.append(get_release_7digitalid(f))
        song.append(get_song_id(f))
        song.append(get_song_hotttnesss(f))
        song.append(get_track_7digitalid(f))
        song.append(get_analysis_sample_rate(f))
        song.append(get_audio_md5(f))
        song.append(get_danceability(f))
        song.append(get_duration(f))
        song.append(get_end_of_fade_in(f))
        song.append(get_energy(f))
        song.append(get_key(f))
        song.append(get_key_confidence(f))
        song.append(get_loudness(f))
        song.append(get_mode(f))
        song.append(get_mode_confidence(f))
        song.append(get_start_of_fade_out(f))
        song.append(get_tempo(f))
        song.append(get_time_signature(f))
        song.append(get_time_signature_confidence(f))
        song.append(get_track_id(f))
        song.append(get_year(f))
        
    return song

In [12]:
# todo: set the number of partitions here corectly as calculated above in order to maintain speed
file_paths = sc.textFile(basedir + file, minPartitions=12).map(lambda x: f2(x))

In [13]:
file_paths.take(1)

[['Avril',
  'The Matter With You (Part I_ Ii_ Iii)',
  '0.8971780384699787',
  '0.43823441386890283',
  'ARBXIDR1187B9AB77E',
  'b7a4eab2-75c8-44ef-aa2d-06a6b3a36165',
  '-1',
  '24165',
  'nan',
  'nan',
  '',
  "Now It's Spring",
  '279570',
  'SOMSKPE12A6D4FA09D',
  'nan',
  '3113250',
  '22050',
  'a87a246e15cdf35ae052672770a8dd67',
  '0.0',
  '651.59791',
  '0.131',
  '0.0',
  '4',
  '0.354',
  '-10.718',
  '0',
  '0.427',
  '606.064',
  '86.589',
  '4',
  '0.961',
  "b'TRAEELO128F425BD8F'",
  '2000']]

convert the rdd (api version 1) to a dataframe (api version 2)

In [12]:
columns = ['artist_name', 'title', 'artist_familiarity', 
             'artist_hotttnesss', 'artist_id', 'artist_mbid', 
             'artist_playmeid', 'artist_7digitalid', 'artist_latitude', 
             'artist_longitude', 'artist_location', 'release', 
             'release_7digitalid', 
             'song_id', 'song_hotnesss', 'track_7digitalid', 
             'analysis_sample_rate', 
             'audio_md5', 'danceability', 'duration', 'end_of_fade_in', 
             'energy', 'key', 
             'key_confidence', 'loudness', 'mode', 'mode_confidence', 
             'start_of_fade_out', 
             'tempo', 'time_signature', 
             'time_signature_confidence', 'track_id', 'year']
df = file_paths.toDF(columns)
df.printSchema()

root
 |-- artist_name: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_familiarity: string (nullable = true)
 |-- artist_hotttnesss: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- artist_mbid: string (nullable = true)
 |-- artist_playmeid: string (nullable = true)
 |-- artist_7digitalid: string (nullable = true)
 |-- artist_latitude: string (nullable = true)
 |-- artist_longitude: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- release: string (nullable = true)
 |-- release_7digitalid: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- song_hotnesss: string (nullable = true)
 |-- track_7digitalid: string (nullable = true)
 |-- analysis_sample_rate: string (nullable = true)
 |-- audio_md5: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- end_of_fade_in: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: st

In [13]:
# change types
from pyspark.sql import types 
['BinaryType', 'BooleanType', 'ByteType', 'DateType', 
          'DecimalType', 'DoubleType', 'FloatType', 'IntegerType', 
           'LongType', 'ShortType', 'StringType', 'TimestampType']
changedTypedf = df.withColumn("year", df["year"].cast("Integer"))\
                .withColumn("track_id", df["track_id"].cast("Integer"))\
                .withColumn("artist_id", df["artist_id"].cast("Integer"))\
                .withColumn("song_id", df["song_id"].cast("Integer"))\
                .withColumn("duration", df["duration"].cast("Float"))\
                .withColumn("danceability", df["danceability"].cast("Float"))\
                .withColumn("end_of_fade_in", df["end_of_fade_in"].cast("Float"))\
                .withColumn("energy", df["energy"].cast("Float"))\
                .withColumn("key_confidence", df["key_confidence"].cast("Float"))\
                .withColumn("mode_confidence", df["mode_confidence"].cast("Float"))\
                .withColumn("release_7digitalid", df["release_7digitalid"].cast("Integer"))\
                .withColumn("song_hotnesss", df["song_hotnesss"].cast("Float"))\
                .withColumn("start_of_fade_out", df["start_of_fade_out"].cast("Float"))\
                .withColumn("loudness", df["loudness"].cast("Float"))\
                .withColumn("tempo", df["tempo"].cast("Float"))

start with the data processing

In [14]:
changedTypedf.printSchema()

root
 |-- artist_name: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_familiarity: string (nullable = true)
 |-- artist_hotttnesss: string (nullable = true)
 |-- artist_id: integer (nullable = true)
 |-- artist_mbid: string (nullable = true)
 |-- artist_playmeid: string (nullable = true)
 |-- artist_7digitalid: string (nullable = true)
 |-- artist_latitude: string (nullable = true)
 |-- artist_longitude: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- release: string (nullable = true)
 |-- release_7digitalid: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- song_hotnesss: float (nullable = true)
 |-- track_7digitalid: string (nullable = true)
 |-- analysis_sample_rate: string (nullable = true)
 |-- audio_md5: string (nullable = true)
 |-- danceability: float (nullable = true)
 |-- duration: float (nullable = true)
 |-- end_of_fade_in: float (nullable = true)
 |-- energy: float (nullable = true)
 |-- key: stri

start with some very basic metrics over the dataset:

In [None]:
changedTypedf.take(1)

count the number of different artists in the dataset and count the numbers of songs release every year

In [None]:
changedTypedf.groupBy('artist_name')\
             .count()\
             .show()

In [None]:
changedTypedf.groupBy("year")\
             .count()\
             .show()

Let's get a bit more advanced: Calculate the average dancibility of the songs of an artist

In [48]:
from pyspark.sql.functions import col, avg
from pyspark.sql import functions as F
from pyspark.sql.functions import *

left = changedTypedf.select("artist_name").distinct().filter(changedTypedf['year'] > 2000)

right = changedTypedf.groupBy("artist_name")\
            .agg(avg(col("loudness"))\
            .alias("avg_loudness"))\
            .orderBy("avg_loudness", ascending=False)

In [49]:
left.join(right, left.artist_name == right.artist_name)\
            .select(right["artist_name"], "avg_loudness")\
            .orderBy("avg_loudness", ascending=False)\
            .collect()


[Row(artist_name='Brian Tyler', avg_loudness=-16.9064998626709),
 Row(artist_name='Cirrus', avg_loudness=-6.488000154495239),
 Row(artist_name='Faith Yang', avg_loudness=-6.2230000495910645),
 Row(artist_name='Goldfish', avg_loudness=-7.789499878883362),
 Row(artist_name='Ms. Dynamite / Kymani Marley', avg_loudness=-11.223999977111816),
 Row(artist_name='Rhian Sheehan', avg_loudness=-14.762500047683716),
 Row(artist_name='Sean Callery', avg_loudness=-29.33099937438965),
 Row(artist_name='Silverstein', avg_loudness=-4.7363333106040955),
 Row(artist_name='This Is Hell', avg_loudness=-5.493499994277954),
 Row(artist_name='Tim Hughes', avg_loudness=-7.204999923706055),
 Row(artist_name='Belle & Sebastian', avg_loudness=-7.915999889373779),
 Row(artist_name='Ben Jelen', avg_loudness=-9.035499811172485),
 Row(artist_name='Noisettes', avg_loudness=-3.4240000247955322),
 Row(artist_name='Britney Spears feat. Pharrell Williams', avg_loudness=-4.947000026702881),
 Row(artist_name='Cama', avg_lou

In [15]:
changedTypedf.take(1)

[Row(artist_name='Avril', title='The Matter With You (Part I_ Ii_ Iii)', artist_familiarity='0.8971780384699787', artist_hotttnesss='0.43823441386890283', artist_id=None, artist_mbid='b7a4eab2-75c8-44ef-aa2d-06a6b3a36165', artist_playmeid='-1', artist_7digitalid='24165', artist_latitude='nan', artist_longitude='nan', artist_location='', release="Now It's Spring", release_7digitalid=279570, song_id=None, song_hotnesss=None, track_7digitalid='3113250', analysis_sample_rate='22050', audio_md5='a87a246e15cdf35ae052672770a8dd67', danceability=0.0, duration=651.597900390625, end_of_fade_in=0.13099999725818634, energy=0.0, key='4', key_confidence=0.3540000021457672, loudness=-10.718000411987305, mode='0', mode_confidence=0.4269999861717224, start_of_fade_out=606.0640258789062, tempo=86.58899688720703, time_signature='4', time_signature_confidence='0.961', track_id=None, year=2000)]

In [None]:
sc.stop()