_Imports_

In [None]:
import os
import re
import json
import pandas as pd
from tqdm import tqdm
from downloader import SpotiDownloader

_Read your data_

In [None]:
# Load every JSON export found in ./data/ into one flat list of play records.
data = []
folder_path = "./data/"
# Sorted so the load order (and therefore any later "keep first" de-duplication)
# is reproducible; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    # Skip anything that is not a JSON file (sub-folders, .DS_Store, a README, ...),
    # which would otherwise make json.load() fail and abort the whole load.
    if not (filename.lower().endswith(".json") and os.path.isfile(file_path)):
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        data.extend(json.load(file))

df = pd.DataFrame(data)

# keep only important columns
columns_to_keep = ['ms_played', 'master_metadata_track_name', 'master_metadata_album_artist_name',
                   'spotify_track_uri']
# .copy() makes an independent frame, so the column assignments below never
# write into a slice of the full frame (avoids SettingWithCopyWarning).
df = df[columns_to_keep].copy()

# convert milliseconds to minutes
df['ms_played'] = (df['ms_played'] / 60000).round(4)

# rename the columns to shorter names (same order as columns_to_keep)
df.columns = ["mins_played", "song", "artist", "uri"]

# remove the non-unique "spotify:track:" part from the URI
df["uri"] = df["uri"].str.replace("spotify:track:", "", regex=False)

_Preprocessing_

In [None]:
# columns that together identify one song
group_keys = ['song', 'artist']

# total minutes played per (song, artist) pair
df_unique = df.groupby(group_keys, as_index=False)['mins_played'].sum()

# one uri per (song, artist) pair: the first one encountered is kept
df_uri = df[group_keys + ['uri']].drop_duplicates(subset=group_keys)

# attach the uri to the aggregated play time
df_unique = df_unique.merge(df_uri, how='left', on=group_keys)

# order from the most listened song to the least listened one
df = df_unique.sort_values('mins_played', ascending=False)

_Drop rarely played songs_

In [None]:
# keep only the songs with at least 10 minutes of total play time,
# then renumber the remaining rows starting from 0
played_enough = df['mins_played'].ge(10)
df = df.loc[played_enough].reset_index(drop=True)

In [None]:
# a function to clean song names from unusual characters
# characters stripped from names: \ / * ? : " < > |
_FORBIDDEN_CHARS = re.compile(r'[\\/*?:"<>|]')


def sanitize_filename(name):
    """Return *name* with every \\ / * ? : " < > | character removed."""
    cleaned = _FORBIDDEN_CHARS.sub("", name)
    return cleaned

In [None]:
# create folders to store songs and their covers if they do not exist
# (os.makedirs is needed here: os.mkdir() has no exist_ok parameter and
# raises TypeError when it is passed one)
os.makedirs("./songs", exist_ok=True)
os.makedirs("./covers", exist_ok=True)

# initialize an object SpotiDownloader
downloader = SpotiDownloader()

# iterate over your listened songs and download each song and its cover;
# try/finally guarantees the downloader is closed even if the loop is
# interrupted (e.g. with Ctrl+C) or fails unexpectedly
try:
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # raw values first, so the error message below always has something
        # to show even when sanitizing itself is what failed
        song, artist = row['song'], row['artist']
        try:
            song = sanitize_filename(song.lower())
            artist = sanitize_filename(artist.lower())
            downloader.download_song(name=song, author=artist, uri=row['uri'])
            downloader.download_cover(name=song, author=artist, uri=row['uri'])
        except Exception as exc:
            # best effort: report the failure with its cause and move on to the
            # next song; KeyboardInterrupt/SystemExit are deliberately not caught
            print(f"An error occurred while downloading {song} by {artist}! ({exc!r})")
            continue
finally:
    downloader.close()