In [0]:
# Load the supplemental album list: the first CSV row supplies the column
# names and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/workspace/music/albums-supplemental/SupplementalAlbums1.csv")
)
# Show the first row as a quick sanity check of the load.
df.head()

In [0]:
# Read the Spotify API credentials from the "Spotify" Databricks secret scope
# rather than hard-coding them in the notebook.
CLIENT_ID, CLIENT_SECRET = (
    dbutils.secrets.get(scope="Spotify", key=secret_key)
    for secret_key in ("client-id", "client-secret")
)

In [0]:
import requests
import urllib.parse

def get_token():
    """Request a Spotify access token using the client-credentials grant.

    Posts ``grant_type=client_credentials`` to the Spotify accounts service,
    authenticating with the module-level ``CLIENT_ID`` / ``CLIENT_SECRET``.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        requests.HTTPError: if the token endpoint answers with a 4xx/5xx status.
        requests.RequestException: on connection failure or timeout.
    """
    url = "https://accounts.spotify.com/api/token"
    response = requests.post(
        url,
        data={"grant_type": "client_credentials"},
        auth=(CLIENT_ID, CLIENT_SECRET),
        timeout=30,  # never hang the notebook on an unresponsive endpoint
    )
    # Surface bad credentials / service errors as the real HTTP error instead
    # of an opaque KeyError on the missing "access_token" field below.
    response.raise_for_status()
    return response.json()["access_token"]

def get_album_tracks(album_num, artist_name, album_name, year, token=None):
    """Look up an album on Spotify and return its tracks as a Spark DataFrame.

    Searches Spotify for the album, asks the user to pick one when several
    candidates come back, then downloads the complete track list.

    Args:
        album_num: Identifier stored in the ``album_num`` column of every row.
        artist_name: Artist name used in the search query.
        album_name: Album name used in the search query.
        year: Accepted for interface compatibility; not used by the lookup.
        token: Spotify bearer token sent in the ``Authorization`` header.

    Returns:
        A Spark DataFrame with columns ``album_num``, ``track_number``,
        ``name`` and ``duration_ms``, or ``None`` when no album matched, the
        user answered ``0`` ("None of the below"), or the album has no tracks.

    Raises:
        requests.HTTPError: if Spotify answers with a 4xx/5xx status
            (for example an expired token).
        requests.RequestException: on connection failure or timeout.
    """
    headers = {"Authorization": f"Bearer {token}"}
    # NOTE(review): Spotify's documented search examples separate field
    # filters with a space, not a comma -- confirm this query form is intended.
    query = f"artist:{artist_name},album:{album_name}"
    encoded_query = urllib.parse.quote_plus(query)

    # Search album
    search_url = f"https://api.spotify.com/v1/search?q={encoded_query}&type=album&limit=7"
    print(search_url)
    search_response = requests.get(search_url, headers=headers, timeout=30)
    # Report API failures as HTTP errors rather than a KeyError on "albums".
    search_response.raise_for_status()
    candidates = search_response.json()["albums"]["items"]

    if not candidates:
        print (f"No albums found for {album_name} by {artist_name}.")
        return None

    if len(candidates) > 1:
        print(f"Multiple albums found for {album_name} by {artist_name}.")
        # Print each possibility and ask for user input
        print(f"0. None of the below")
        for i, item in enumerate(candidates, 1):
            print(f"{i}. {item['name']} ({item['release_date']})")
        # Re-prompt until the answer is a listed number: non-numeric input
        # used to crash, and negative numbers silently selected an album
        # counted from the end of the list.
        while True:
            raw_choice = input("Enter the number of the correct album: ")
            try:
                choice = int(raw_choice)
            except ValueError:
                choice = -1
            if 0 <= choice <= len(candidates):
                break
            print(f"Please enter a number between 0 and {len(candidates)}.")
        if choice == 0:
            return None
        album = candidates[choice - 1]
    else:
        album = candidates[0]

    album_id = album["id"]
    release_date = album["release_date"]
    release_date_precision = album["release_date_precision"]
    print(f"{release_date} {release_date_precision}")

    # Get tracks. The endpoint is paginated, so request the largest page size
    # and follow the "next" link until it is null; a single default-sized
    # request silently truncates longer albums.
    tracks_url = f"https://api.spotify.com/v1/albums/{album_id}/tracks?limit=50"
    tracks_info = []
    while tracks_url:
        tracks_response = requests.get(tracks_url, headers=headers, timeout=30)
        tracks_response.raise_for_status()
        tracks_page = tracks_response.json()
        tracks_info.extend(
            (album_num, track["track_number"], track["name"], track["duration_ms"])
            for track in tracks_page["items"]
        )
        tracks_url = tracks_page.get("next")

    if not tracks_info:
        # createDataFrame cannot infer a schema from an empty list.
        print(f"No tracks found for {album_name} by {artist_name}.")
        return None

    tracks_df = spark.createDataFrame(tracks_info, ["album_num", "track_number", "name", "duration_ms"])
    return tracks_df

In [0]:
# Fetch the track list for each of the first 100 albums in `df` and write it
# to the volume as "<album number>-<artist>-<album>.csv". Albums whose output
# already exists are skipped, so the cell can be re-run to resume.
token = get_token()
volume_path = "/Volumes/workspace/music/albums-supplemental"

# Album numbers are 3000 + the 1-based position in the list.
for index, (artist, album, year) in enumerate(df.take(100), start=1):
    album_num = 3000 + index
    # A "/" in a name would split the output into nested directories, so
    # replace it. str() also covers values inferSchema typed as non-strings.
    safe_artist = str(artist).replace("/", "_")
    safe_album = str(album).replace("/", "_")
    file_name = f"{album_num:04d}-{safe_artist}-{safe_album}.csv"
    file_path = f"{volume_path}/{file_name}"

    # Check to see if the file exists in the tracks volume already.
    # dbutils.fs.ls raises when the path does not exist; that is the signal
    # to go ahead and fetch.
    try:
        dbutils.fs.ls(file_path)
        print(f"Skipping {file_name}...")
        continue
    except Exception:
        pass  # File does not exist, proceed

    tracks_df = get_album_tracks(album_num, artist, album, year, token)
    # Compare against None explicitly: get_album_tracks returns None when
    # there is nothing to write, and a DataFrame has no meaningful truth value.
    if tracks_df is None:
        continue

    print(tracks_df.head(3))
    print()

    # Write the tracks to the tracks volume
    tracks_df.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save(file_path)


In [0]:
import os

volume_path = "/Volumes/workspace/music/albums-1001-tracks"

# List all files in the volume path
files = dbutils.fs.ls(volume_path)

for file_info in files:
    if not file_info.isDir():
        file_path = file_info.path

        # Rename "<name>.temp.csv" files to "<name>.csv"
        if file_path.endswith(".temp.csv"):
            # Strip only the trailing suffix; str.replace would rewrite every
            # ".temp.csv" occurrence anywhere in the path.
            new_file_path = file_path[: -len(".temp.csv")] + ".csv"
            dbutils.fs.mv(file_path, new_file_path)

In [0]:
import os

volume_path = "/Volumes/workspace/music/albums-1001-tracks"

# Recursively delete every directory in the volume whose name starts with "002".
files = dbutils.fs.ls(volume_path)

for entry in files:
    # Plain files are never touched by this cleanup.
    if not entry.isDir():
        continue

    folder_path = entry.path
    # Directory paths may carry a trailing slash; drop it before taking the name.
    folder_name = os.path.basename(folder_path.rstrip('/'))
    if not folder_name.startswith("002"):
        continue

    print(f"Deleting directory: {folder_path}")
    dbutils.fs.rm(folder_path, True)

In [0]:
import os

volume_path = "/Volumes/workspace/music/albums-1001-tracks"

# Rename every directory in the volume whose name ends with ".csv" so that
# the suffix is removed.
files = dbutils.fs.ls(volume_path)

for entry in files:
    if not entry.isDir():
        continue

    folder_path = entry.path
    # Split the slash-trimmed path into its parent directory and final component.
    parent_dir, folder_name = os.path.split(folder_path.rstrip('/'))
    if not folder_name.endswith(".csv"):
        continue

    # Drop the four-character ".csv" suffix and rebuild the path next to the original.
    new_folder_path = os.path.join(parent_dir, folder_name[:-4])
    print(f"Renaming directory: {folder_path} to {new_folder_path}")
    dbutils.fs.mv(folder_path, new_folder_path, recurse=True)

In [0]:
import os
# Replace each Spark output directory in the volume with a single CSV file:
# re-read the directory, write it as one partition into "<name>.temp", copy
# the resulting part file to "<name>.csv", then remove both directories.
volume_path = "/Volumes/workspace/music/albums-supplemental"

files = dbutils.fs.ls(volume_path)

for file_info in files:
    if file_info.isDir():
        folder_path = file_info.path
        folder_name = folder_path.rstrip('/').split('/')[-1]

        print(f"Processing directory: {folder_path}")
        # Use a dedicated name so the notebook-level `df` (the album list the
        # fetch loop iterates over) is not overwritten by this cell.
        # NOTE(review): inferSchema re-types columns before they are written
        # back, which may reformat values (e.g. leading zeros) -- confirm
        # this round-trip is acceptable.
        folder_df = spark.read.csv(folder_path, header=True, inferSchema=True)
        temp_csv_path = f"{os.path.dirname(folder_path.rstrip('/'))}/{folder_name}.temp"
        # Drop the 5-character ".temp" suffix again and append ".csv".
        single_csv_path = temp_csv_path[:-5] + ".csv"

        # Write Spark DataFrame directly to DBFS as a single CSV file
        folder_df.coalesce(1).write.option("header", True).mode("overwrite").csv(temp_csv_path)

        # Find the data file by name rather than by a fixed position in the
        # listing: the number and order of Spark's bookkeeping files is not
        # guaranteed, and copying the wrong entry would be followed by
        # deleting the source directory.
        part_files = [
            entry for entry in dbutils.fs.ls(temp_csv_path)
            if entry.name.startswith("part-") and entry.name.endswith(".csv")
        ]
        if not part_files:
            print(f"No CSV part file written for {folder_path}; leaving it in place.")
            dbutils.fs.rm(temp_csv_path, True)
            continue
        temporary_csv = os.path.join(temp_csv_path, part_files[0].name)

        dbutils.fs.cp(temporary_csv, single_csv_path)
        print(f"Written single CSV file: {single_csv_path}")

        print(f"Deleting directory: {temp_csv_path}")
        dbutils.fs.rm(temp_csv_path, True)
        print(f"Deleting directory: {folder_path}")
        dbutils.fs.rm(folder_path, True)

In [0]:
    # Move every file whose name ends in ".csv" from the supplemental volume
    # into the supplemental tracks volume.
    source_base_path = "/Volumes/workspace/music/albums-supplemental/"
    dest_base_path = "/Volumes/workspace/music/albums-supplemental-tracks/"

    # List the source directory itself. Listing the notebook-level
    # `volume_path` made the result depend on whichever cell last assigned it.
    # NOTE(review): this filter also matches SupplementalAlbums1.csv, the album
    # list read from this directory in the first cell -- confirm that moving
    # it is intended.
    for file_name in [file.name for file in dbutils.fs.ls(source_base_path) if file.name.endswith('.csv')]:
        source_path = source_base_path + file_name
        dest_path = dest_base_path + file_name
        dbutils.fs.mv(source_path, dest_path)

In [0]:
import os

volume_path = "/Volumes/workspace/music/albums-supplemental-tracks"

# Collapse a doubled ".csv.csv" extension to a single ".csv" on every file
# in the volume.
files = dbutils.fs.ls(volume_path)

for entry in files:
    # Directories are left untouched.
    if entry.isDir():
        continue

    file_path = entry.path
    if not file_path.endswith(".csv.csv"):
        continue

    # Removing the last four characters drops exactly one ".csv".
    new_file_path = file_path[:-4]
    print(f"Renaming file: {file_path} to {new_file_path}")
    dbutils.fs.mv(file_path, new_file_path)