In [1]:
"""
Thierry Bertin-Mahieux (2010) Columbia University
tb2332@columbia.edu
This code contains a set of getters functions to access the fields
from an HDF5 song file (regular file with one song or
aggregate / summary file with many songs)
This is part of the Million Song Dataset project from
LabROSA (Columbia University) and The Echo Nest.
Copyright 2010, Thierry Bertin-Mahieux
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""



import tables
import numpy as np


def open_h5_file_read(h5filename):
    """
    Open an existing H5 in read mode.
    Same function as in hdf5_utils, here so we avoid one import
    """
    return tables.open_file(h5filename, mode='r')


def get_num_songs(h5):
    """
    Return the number of songs contained in this h5 file, i.e. the number of rows
    for all basic informations like name, artist, ...
    """
    return h5.root.metadata.songs.nrows

def get_artist_familiarity(h5,songidx=0):
    """
    Get artist familiarity from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.artist_familiarity[songidx]

def get_artist_hotttnesss(h5,songidx=0):
    """
    Get artist hotttnesss from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.artist_hotttnesss[songidx]

def get_artist_id(h5,songidx=0):
    """
    Get artist id from a HDF5 song file, by default the first song in it
    """
    return (h5.root.metadata.songs.cols.artist_id[songidx]).decode('utf-8')

def get_artist_mbid(h5,songidx=0):
    """
    Get artist musibrainz id from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.artist_mbid[songidx].decode('utf-8')

def get_artist_playmeid(h5,songidx=0):
    """
    Get artist playme id from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.artist_playmeid[songidx]

def get_artist_7digitalid(h5,songidx=0):
    """
    Get artist 7digital id from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.artist_7digitalid[songidx]

def get_artist_latitude(h5,songidx=0):
    """
    Get artist latitude from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.artist_latitude[songidx]

def get_artist_longitude(h5,songidx=0):
    """
    Get artist longitude from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.artist_longitude[songidx]

def get_artist_location(h5,songidx=0):
    """
    Get artist location from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.artist_location[songidx].decode('utf-8')

def get_artist_name(h5,songidx=0):
    """
    Get artist name from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.artist_name[songidx].decode('utf-8')

def get_release(h5,songidx=0):
    """
    Get release from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.release[songidx].decode('utf-8')

def get_release_7digitalid(h5,songidx=0):
    """
    Get release 7digital id from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.release_7digitalid[songidx]

def get_song_id(h5,songidx=0):
    """
    Get song id from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.song_id[songidx].decode('utf-8')

def get_song_hotttnesss(h5,songidx=0):
    """
    Get song hotttnesss from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.song_hotttnesss[songidx]

def get_title(h5,songidx=0):
    """
    Get title from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.title[songidx].decode('utf-8')

def get_track_7digitalid(h5,songidx=0):
    """
    Get track 7digital id from a HDF5 song file, by default the first song in it
    """
    return h5.root.metadata.songs.cols.track_7digitalid[songidx]

def get_similar_artists(h5,songidx=0):
    """
    Get similar artists array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.metadata.songs.nrows == songidx + 1:
        return np.char.decode(h5.root.metadata.similar_artists[h5.root.metadata.songs.cols.idx_similar_artists[songidx]:])
    return np.char.decode(h5.root.metadata.similar_artists[h5.root.metadata.songs.cols.idx_similar_artists[songidx]:
                                            h5.root.metadata.songs.cols.idx_similar_artists[songidx+1]])

def get_artist_terms(h5,songidx=0):
    """
    Get artist terms array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.metadata.songs.nrows == songidx + 1:
        return np.char.decode(h5.root.metadata.artist_terms[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:])
    return np.char.decode(h5.root.metadata.artist_terms[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:
                                            h5.root.metadata.songs.cols.idx_artist_terms[songidx+1]])

def get_artist_terms_freq(h5,songidx=0):
    """
    Get artist terms array frequencies. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.metadata.songs.nrows == songidx + 1:
        return h5.root.metadata.artist_terms_freq[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:]
    return h5.root.metadata.artist_terms_freq[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:
                                              h5.root.metadata.songs.cols.idx_artist_terms[songidx+1]]

def get_artist_terms_weight(h5,songidx=0):
    """
    Get artist terms array frequencies. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.metadata.songs.nrows == songidx + 1:
        return h5.root.metadata.artist_terms_weight[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:]
    return h5.root.metadata.artist_terms_weight[h5.root.metadata.songs.cols.idx_artist_terms[songidx]:
                                                h5.root.metadata.songs.cols.idx_artist_terms[songidx+1]]

def get_analysis_sample_rate(h5,songidx=0):
    """
    Get analysis sample rate from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.analysis_sample_rate[songidx]

def get_audio_md5(h5,songidx=0):
    """
    Get audio MD5 from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.audio_md5[songidx].decode('utf-8')

def get_danceability(h5,songidx=0):
    """
    Get danceability from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.danceability[songidx]

def get_duration(h5,songidx=0):
    """
    Get duration from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.duration[songidx]

def get_end_of_fade_in(h5,songidx=0):
    """
    Get end of fade in from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.end_of_fade_in[songidx]

def get_energy(h5,songidx=0):
    """
    Get energy from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.energy[songidx]

def get_key(h5,songidx=0):
    """
    Get key from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.key[songidx]

def get_key_confidence(h5,songidx=0):
    """
    Get key confidence from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.key_confidence[songidx]

def get_loudness(h5,songidx=0):
    """
    Get loudness from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.loudness[songidx]

def get_mode(h5,songidx=0):
    """
    Get mode from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.mode[songidx]

def get_mode_confidence(h5,songidx=0):
    """
    Get mode confidence from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.mode_confidence[songidx]

def get_start_of_fade_out(h5,songidx=0):
    """
    Get start of fade out from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.start_of_fade_out[songidx]

def get_tempo(h5,songidx=0):
    """
    Get tempo from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.tempo[songidx]

def get_time_signature(h5,songidx=0):
    """
    Get signature from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.time_signature[songidx]

def get_time_signature_confidence(h5,songidx=0):
    """
    Get signature confidence from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.time_signature_confidence[songidx]

def get_track_id(h5,songidx=0):
    """
    Get track id from a HDF5 song file, by default the first song in it
    """
    return h5.root.analysis.songs.cols.track_id[songidx].decode('utf-8')

def get_segments_start(h5,songidx=0):
    """
    Get segments start array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.segments_start[h5.root.analysis.songs.cols.idx_segments_start[songidx]:]
    return h5.root.analysis.segments_start[h5.root.analysis.songs.cols.idx_segments_start[songidx]:
                                           h5.root.analysis.songs.cols.idx_segments_start[songidx+1]]
    
def get_segments_confidence(h5,songidx=0):
    """
    Get segments confidence array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.segments_confidence[h5.root.analysis.songs.cols.idx_segments_confidence[songidx]:]
    return h5.root.analysis.segments_confidence[h5.root.analysis.songs.cols.idx_segments_confidence[songidx]:
                                                h5.root.analysis.songs.cols.idx_segments_confidence[songidx+1]]

def get_segments_pitches(h5,songidx=0):
    """
    Get segments pitches array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.segments_pitches[h5.root.analysis.songs.cols.idx_segments_pitches[songidx]:,:]
    return h5.root.analysis.segments_pitches[h5.root.analysis.songs.cols.idx_segments_pitches[songidx]:
                                             h5.root.analysis.songs.cols.idx_segments_pitches[songidx+1],:]

def get_segments_timbre(h5,songidx=0):
    """
    Get segments timbre array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.segments_timbre[h5.root.analysis.songs.cols.idx_segments_timbre[songidx]:,:]
    return h5.root.analysis.segments_timbre[h5.root.analysis.songs.cols.idx_segments_timbre[songidx]:
                                            h5.root.analysis.songs.cols.idx_segments_timbre[songidx+1],:]

def get_segments_loudness_max(h5,songidx=0):
    """
    Get segments loudness max array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.segments_loudness_max[h5.root.analysis.songs.cols.idx_segments_loudness_max[songidx]:]
    return h5.root.analysis.segments_loudness_max[h5.root.analysis.songs.cols.idx_segments_loudness_max[songidx]:
                                                  h5.root.analysis.songs.cols.idx_segments_loudness_max[songidx+1]]

def get_segments_loudness_max_time(h5,songidx=0):
    """
    Get segments loudness max time array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.segments_loudness_max_time[h5.root.analysis.songs.cols.idx_segments_loudness_max_time[songidx]:]
    return h5.root.analysis.segments_loudness_max_time[h5.root.analysis.songs.cols.idx_segments_loudness_max_time[songidx]:
                                                       h5.root.analysis.songs.cols.idx_segments_loudness_max_time[songidx+1]]

def get_segments_loudness_start(h5,songidx=0):
    """
    Get segments loudness start array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.segments_loudness_start[h5.root.analysis.songs.cols.idx_segments_loudness_start[songidx]:]
    return h5.root.analysis.segments_loudness_start[h5.root.analysis.songs.cols.idx_segments_loudness_start[songidx]:
                                                    h5.root.analysis.songs.cols.idx_segments_loudness_start[songidx+1]]

def get_sections_start(h5,songidx=0):
    """
    Get sections start array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.sections_start[h5.root.analysis.songs.cols.idx_sections_start[songidx]:]
    return h5.root.analysis.sections_start[h5.root.analysis.songs.cols.idx_sections_start[songidx]:
                                           h5.root.analysis.songs.cols.idx_sections_start[songidx+1]]

def get_sections_confidence(h5,songidx=0):
    """
    Get sections confidence array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.sections_confidence[h5.root.analysis.songs.cols.idx_sections_confidence[songidx]:]
    return h5.root.analysis.sections_confidence[h5.root.analysis.songs.cols.idx_sections_confidence[songidx]:
                                                h5.root.analysis.songs.cols.idx_sections_confidence[songidx+1]]

def get_beats_start(h5,songidx=0):
    """
    Get beats start array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.beats_start[h5.root.analysis.songs.cols.idx_beats_start[songidx]:]
    return h5.root.analysis.beats_start[h5.root.analysis.songs.cols.idx_beats_start[songidx]:
                                        h5.root.analysis.songs.cols.idx_beats_start[songidx+1]]

def get_beats_confidence(h5,songidx=0):
    """
    Get beats confidence array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.beats_confidence[h5.root.analysis.songs.cols.idx_beats_confidence[songidx]:]
    return h5.root.analysis.beats_confidence[h5.root.analysis.songs.cols.idx_beats_confidence[songidx]:
                                             h5.root.analysis.songs.cols.idx_beats_confidence[songidx+1]]

def get_bars_start(h5,songidx=0):
    """
    Get bars start array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.bars_start[h5.root.analysis.songs.cols.idx_bars_start[songidx]:]
    return h5.root.analysis.bars_start[h5.root.analysis.songs.cols.idx_bars_start[songidx]:
                                       h5.root.analysis.songs.cols.idx_bars_start[songidx+1]]

def get_bars_confidence(h5,songidx=0):
    """
    Get bars start array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.bars_confidence[h5.root.analysis.songs.cols.idx_bars_confidence[songidx]:]
    return h5.root.analysis.bars_confidence[h5.root.analysis.songs.cols.idx_bars_confidence[songidx]:
                                            h5.root.analysis.songs.cols.idx_bars_confidence[songidx+1]]

def get_tatums_start(h5,songidx=0):
    """
    Get tatums start array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.tatums_start[h5.root.analysis.songs.cols.idx_tatums_start[songidx]:]
    return h5.root.analysis.tatums_start[h5.root.analysis.songs.cols.idx_tatums_start[songidx]:
                                         h5.root.analysis.songs.cols.idx_tatums_start[songidx+1]]

def get_tatums_confidence(h5,songidx=0):
    """
    Get tatums confidence array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.analysis.songs.nrows == songidx + 1:
        return h5.root.analysis.tatums_confidence[h5.root.analysis.songs.cols.idx_tatums_confidence[songidx]:]
    return h5.root.analysis.tatums_confidence[h5.root.analysis.songs.cols.idx_tatums_confidence[songidx]:
                                              h5.root.analysis.songs.cols.idx_tatums_confidence[songidx+1]]

def get_artist_mbtags(h5,songidx=0):
    """
    Get artist musicbrainz tag array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.musicbrainz.songs.nrows == songidx + 1:
        return np.char.decode(h5.root.musicbrainz.artist_mbtags[h5.root.musicbrainz.songs.cols.idx_artist_mbtags[songidx]:])
    return np.char.decode(h5.root.musicbrainz.artist_mbtags[h5.root.metadata.songs.cols.idx_artist_mbtags[songidx]:
                                             h5.root.metadata.songs.cols.idx_artist_mbtags[songidx+1]])

def get_artist_mbtags_count(h5,songidx=0):
    """
    Get artist musicbrainz tag count array. Takes care of the proper indexing if we are in aggregate
    file. By default, return the array for the first song in the h5 file.
    To get a regular numpy ndarray, cast the result to: numpy.array( )
    """
    if h5.root.musicbrainz.songs.nrows == songidx + 1:
        return h5.root.musicbrainz.artist_mbtags_count[h5.root.musicbrainz.songs.cols.idx_artist_mbtags[songidx]:]
    return h5.root.musicbrainz.artist_mbtags_count[h5.root.metadata.songs.cols.idx_artist_mbtags[songidx]:
                                                   h5.root.metadata.songs.cols.idx_artist_mbtags[songidx+1]]

def get_year(h5,songidx=0):
    """
    Get release year from a HDF5 song file, by default the first song in it
    """
    return h5.root.musicbrainz.songs.cols.year[songidx]

In [2]:
def get_total_features(path):
    
    #Load the features name from the metadata into a list so that we don't have to insert them manually

    h5_summary = open_h5_file_read(path)

    metadata = h5_summary.get_node('/metadata/songs/').colnames
    metadata.remove('genre')
    metadata.remove('analyzer_version')
    metadata = [w.replace('idx_', '') for w in metadata]

    analysis = h5_summary.get_node('/analysis/songs/').colnames
    analysis = [w.replace('idx_', '') for w in analysis]

    musicbrainz = h5_summary.get_node('/musicbrainz/songs/').colnames
    musicbrainz = [w.replace('idx_', '') for w in musicbrainz]

    total_features = np.array(metadata + analysis + musicbrainz).ravel()

    total_features = np.append(total_features, ['artist_terms_freq', 'artist_terms_weight', 'artist_mbtags_count'])


    total_features = np.sort(total_features)

    return total_features

In [3]:
def process_one_file(path, categories):
    h5file = open_h5_file_read(path)
    datapoint = {}
    for cat in categories:
        datapoint[cat] = globals()["get_"+cat](h5file)
    h5file.close()
    return datapoint

In [4]:
from tqdm.notebook import tqdm
import pandas as pd

def load_song_data(file_paths, parallel=4):
    #assert half in [1,2], 'half must be one or two'
    categories = get_total_features(file_paths[0])
    data = []
    #file_paths = glob.glob(path+'/'+letter+'/'+regex_half[half]+'/*/*.h5')

    for p in tqdm(file_paths):
        data.append(process_one_file(p, categories))

    df = pd.DataFrame(data)
    return df

In [14]:
import glob
original_path = "C:/Users/maxpr/Downloads/millionsongsubset"
result = glob.glob(original_path + '/**/*.h5', recursive=True)

In [114]:
df = load_song_data([f.replace("\\","/") for f in result], parallel=0)

  0%|          | 0/10000 [00:00<?, ?it/s]

  df['artist_mbtags'] = df['artist_mbtags'].apply(lambda x : x if x != [] else [''])
  df['artist_mbtags'] = df['artist_mbtags'].apply(lambda x : x if x != [] else [''])


In [119]:
df.to_parquet('test.parquet', engine='pyarrow')

In [120]:
df

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_mbtags,...,start_of_fade_out,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,title,track_7digitalid,track_id,year
0,22050,165270,0.581794,0.401998,ARD7TVE1187B99BFB1,,California - LA,,e77e51a5-4761-45b3-9847-2051f811e366,[],...,218.932,"[0.779, 0.734, 0.674, 0.637, 0.597, 0.532, 0.4...","[0.28519, 0.58521, 0.89422, 1.19196, 1.49119, ...",92.198,4,0.778,I Didn't Mean To,3401791,TRAAAAW128F429D538,0
1,22050,1998,0.630630,0.417500,ARMJAGH1187FB546F3,35.14968,"Memphis, TN",-90.04892,1c78ab62-db33-4433-8d0b-7c8dcf1849c2,[classic pop and rock],...,137.915,"[0.969, 0.929, 0.897, 0.871, 0.856, 0.845, 0.8...","[0.20627, 0.45904, 0.71054, 0.96834, 1.21836, ...",121.274,4,0.384,Soul Deep,3400270,TRAAABD128F429CF47,1969
2,22050,290021,0.487357,0.343428,ARKRRTF1187B9984DA,,,,7a273984-edd9-4451-9c4d-39b38f05ebcd,[],...,172.304,"[0.482, 0.676, 0.627, 0.549, 0.279, 0.264, 0.2...","[0.42132, 0.73152, 1.06609, 1.39732, 1.72854, ...",100.070,1,0.000,Amor De Cabaret,5703798,TRAAADZ128F9348C2E,0
3,22050,19072,0.630382,0.454231,AR7G5I41187FB4CE6C,,"London, England",,e188a520-9cb7-4f73-a3d7-2f70c6538e92,"[uk, british, english]",...,217.124,"[0.601, 0.556, 0.523, 0.49, 0.466, 0.44, 0.428...","[0.56254, 0.81002, 1.05749, 1.30621, 1.55494, ...",119.293,4,0.000,Something Girls,3226795,TRAAAEF128F4273421,1982
4,22050,30973,0.651046,0.401724,ARXR32B1187FB57099,,,,c6903a2e-063c-4f91-a284-17b8f421be7b,[],...,198.699,"[1.0, 0.98, 0.932, 0.87, 0.82, 0.793, 0.768, 0...","[0.13576, 0.36918, 0.59914, 0.83141, 1.06368, ...",129.738,4,0.562,Face the Ashes,6795666,TRAAAFD128F92F423A,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,22050,52613,0.722571,0.499826,AR4C6V01187FB3BAF4,39.55792,Portugal,-7.84481,386b19c6-a3d2-414a-bef2-c178572b1cbe,"[black metal, portuguese, gothic metal, metal]",...,386.194,"[0.224, 0.198, 0.173, 0.151, 0.137, 0.129, 0.1...","[0.22262, 0.44279, 0.65975, 0.87991, 1.09901, ...",140.185,4,0.099,The Hanged Man,7677054,TRBIJMU12903CF892B,1998
9996,22050,7247,0.511663,0.409779,AR9JLBU1187B9AAEC4,-33.96243,"Port Elizabeth, South Africa",25.62326,aab7a1d7-5461-4874-8162-bc127b91da59,[south african],...,163.463,"[0.604, 0.592, 0.456, 0.362, 0.289, 0.235, 0.2...","[0.07692, 0.47498, 0.87108, 1.25931, 1.63576, ...",77.072,3,0.597,The Wonderful World Of The Young,442366,TRBIJNF128F14815A7,1998
9997,22050,7562,0.433508,0.289903,ARS1DCR1187B9A4A56,,,,0159799c-ef7f-4c37-a011-25b6572c0f62,[],...,186.015,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.27924, 0.53335, 0.78745, 1.04154, 1.29562, ...",118.123,4,0.205,Sentimental Man,3884209,TRBIJNK128F93093EC,0
9998,22050,68313,0.334457,0.216829,ARAGMIV11F4C843F78,,,,,[],...,300.826,"[0.421, 0.356, 0.301, 0.246, 0.203, 0.154, 0.1...","[0.28192, 0.50923, 0.73103, 0.95833, 1.18674, ...",137.663,4,0.000,Zydeco In D-Minor,904098,TRBIJRN128F425F3DD,0


In [66]:
df

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_mbtags,...,start_of_fade_out,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,title,track_7digitalid,track_id,year
0,22050,165270,0.581794,0.401998,ARD7TVE1187B99BFB1,,California - LA,,e77e51a5-4761-45b3-9847-2051f811e366,[],...,218.932,"[0.779, 0.734, 0.674, 0.637, 0.597, 0.532, 0.4...","[0.28519, 0.58521, 0.89422, 1.19196, 1.49119, ...",92.198,4,0.778,I Didn't Mean To,3401791,TRAAAAW128F429D538,0
1,22050,1998,0.63063,0.4175,ARMJAGH1187FB546F3,35.14968,"Memphis, TN",-90.04892,1c78ab62-db33-4433-8d0b-7c8dcf1849c2,['classic pop and rock'],...,137.915,"[0.969, 0.929, 0.897, 0.871, 0.856, 0.845, 0.8...","[0.20627, 0.45904, 0.71054, 0.96834, 1.21836, ...",121.274,4,0.384,Soul Deep,3400270,TRAAABD128F429CF47,1969
2,22050,290021,0.487357,0.343428,ARKRRTF1187B9984DA,,,,7a273984-edd9-4451-9c4d-39b38f05ebcd,[],...,172.304,"[0.482, 0.676, 0.627, 0.549, 0.279, 0.264, 0.2...","[0.42132, 0.73152, 1.06609, 1.39732, 1.72854, ...",100.07,1,0.0,Amor De Cabaret,5703798,TRAAADZ128F9348C2E,0
3,22050,19072,0.630382,0.454231,AR7G5I41187FB4CE6C,,"London, England",,e188a520-9cb7-4f73-a3d7-2f70c6538e92,['uk' 'british' 'english'],...,217.124,"[0.601, 0.556, 0.523, 0.49, 0.466, 0.44, 0.428...","[0.56254, 0.81002, 1.05749, 1.30621, 1.55494, ...",119.293,4,0.0,Something Girls,3226795,TRAAAEF128F4273421,1982
4,22050,30973,0.651046,0.401724,ARXR32B1187FB57099,,,,c6903a2e-063c-4f91-a284-17b8f421be7b,[],...,198.699,"[1.0, 0.98, 0.932, 0.87, 0.82, 0.793, 0.768, 0...","[0.13576, 0.36918, 0.59914, 0.83141, 1.06368, ...",129.738,4,0.562,Face the Ashes,6795666,TRAAAFD128F92F423A,2007
5,22050,432935,0.535293,0.385471,ARKFYS91187B98E58F,,,,79c403f9-5467-4f23-8426-9ca3fc60a115,[],...,254.27,"[0.136, 0.127, 0.113, 0.112, 0.104, 0.09, 0.07...","[0.53929, 0.74856, 0.95987, 1.17118, 1.38249, ...",147.782,3,0.454,The Moon And I (Ordinary Day Album Version),444964,TRAAAMO128F1481E7F,0
6,22050,17970,0.556496,0.261941,ARD0S291187B9B7BF5,,Ohio,,56503d6d-094e-4c28-ae3d-04cc748ade5b,[],...,114.782,"[0.467, 0.474, 0.528, 0.541, 0.507, 0.482, 0.3...","[0.05611, 0.27253, 0.48785, 0.70535, 0.92722, ...",111.787,1,0.0,Keepin It Real (Skit),276593,TRAAAMQ128F1460CD3,0
7,22050,21128,0.801136,0.605507,AR10USD1187B99F3F1,,"Burlington, Ontario, Canada",,d89de379-665d-425c-b2e9-41b95d1edb36,[],...,181.023,"[0.292, 0.284, 0.282, 0.274, 0.27, 0.237, 0.21...","[0.36129, 0.65428, 0.94433, 1.24174, 1.53768, ...",101.43,3,0.408,Drop of Rain,90004,TRAAAPK128E0786D96,0
8,22050,276891,0.426668,0.332276,AR8ZCNI1187B9A069B,,,,19d232b9-b4d7-4dc8-b259-bf65efb655b1,[],...,258.99,"[0.121, 0.124, 0.126, 0.128, 0.13, 0.131, 0.13...","[1.22595, 1.39961, 1.57241, 1.74174, 1.91886, ...",86.643,4,0.487,Pink World,3996579,TRAAARJ128F9320760,1984
9,22050,242273,0.550514,0.422706,ARNTLGG11E2835DDB9,,,,4d96f7d0-2f0e-4e92-ba70-a405f96f8cec,[],...,261.747,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.09933, 0.36057, 0.62445, 0.88967, 1.15423, ...",114.041,4,0.878,Insatiable (Instrumental Version),7684249,TRAAAVG12903CFA543,0


In [67]:
df['artist_terms_freq'] = df['artist_terms_freq'].to_dict()
df['artist_terms_weight'] =  df['artist_terms_weight'].to_dict()
df['bars_confidence'] = df['bars_confidence'].to_dict()

In [69]:
df['artist_terms_weight']

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
Name: artist_terms_weight, dtype: int64

In [62]:
df['artist_mbtags_count'] = df['artist_mbtags_count'].astype(str).tolist()

In [60]:
df['artist_terms_freq'].to_dict()

{0: array([1.        , 0.77613623, 0.72966979, 0.68301072, 0.73013328,
        0.6715377 , 0.80834839, 0.63365545, 0.72966979, 0.5663102 ,
        0.57592584, 0.76900909, 0.58208775, 0.47493557, 0.67402276,
        0.61640511, 0.64964936, 0.51614094, 0.6000771 , 0.51980619,
        0.59353997, 0.56511801, 0.55205485, 0.53260669, 0.53958053,
        0.50931501, 0.45188402, 0.4484606 , 0.43320144, 0.4322592 ,
        0.42699224, 0.40636693, 0.36872193, 0.35580011, 0.33148334,
        0.28803951, 0.3212017 ]),
 1: array([1.        , 0.89319999, 0.78606029, 0.74638538, 0.76959371,
        0.86287996, 0.84396311, 0.80926862, 0.76959371, 0.76959371,
        0.91182678, 0.65015172, 0.76959371, 0.76959371, 0.80319885,
        0.59682398, 0.62623369, 0.56622481, 0.63564988, 0.53145068,
        0.53925634, 0.6718294 , 0.56760867, 0.64202042, 0.57531714,
        0.57830665, 0.58026237, 0.50317926, 0.49775864, 0.49154685,
        0.56355296, 0.50884487, 0.4721209 , 0.53094369, 0.46943746,
        

In [58]:
df['artist_terms_freq'].iloc[3]

array([0.98858386, 0.96725046, 0.82060405, 1.        , 0.77455844,
       0.75252382, 0.75252382, 0.71485539, 0.71485539, 0.75252382,
       0.71485539, 0.8342615 , 0.76490016, 0.81222688, 0.87216654,
       0.83474625, 0.71485539, 0.75252382, 0.75252382, 0.82849873,
       0.75252382, 0.60296874, 0.75252382, 0.73352282, 0.71485539,
       0.62436906, 0.57580031, 0.71485539, 0.66667331, 0.68135412,
       0.71485539, 0.57874549, 0.63456263, 0.5509555 , 0.69394164,
       0.64617951, 0.56252181, 0.61771564, 0.59034949, 0.56047439,
       0.66157194, 0.61925231, 0.59076729])