In [47]:
import requests
import json
from typing import Any
import logging
import sys
import pathlib

from ytmusicapi.parsers import songs

In [48]:
def set_logger(name: str) -> logging.Logger:
    """
    Return the logger called ``name``, set to INFO level and writing to stdout.

    ``logging.getLogger`` returns the same object for the same name, so calling
    this function again (for example by re-running the notebook cell) would
    otherwise attach one more stdout handler each time and every message would
    be printed once per handler. The handler is therefore only attached when
    the logger does not already have one bound to the current ``sys.stdout``.

    :param name: name of the logger to fetch or create
    :return: the configured logger
    """
    logger: logging.Logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    stdout_handler_present: bool = any(
        isinstance(handler, logging.StreamHandler) and getattr(handler, "stream", None) is sys.stdout
        for handler in logger.handlers
    )
    if not stdout_handler_present:
        logger.addHandler(logging.StreamHandler(sys.stdout))
    return logger

def get_dir_absolute_path(dir_name: str) -> pathlib.Path:
    """
    Locate a directory called ``dir_name`` somewhere above the working directory.

    Every ancestor of the current working directory is searched recursively,
    nearest ancestor first, and the first match that is a directory wins.
    The working directory is assumed to be the "src" folder.

    :param dir_name: name (glob pattern) of the directory to look for
    :return: the first matching directory, or an empty ``pathlib.Path()`` when nothing matches
    """
    # Lazy generator: the search stops as soon as the first directory is found.
    matching_dirs = (
        candidate
        for ancestor in pathlib.Path.cwd().parents
        for candidate in ancestor.rglob(dir_name)
        if candidate.is_dir()
    )
    return next(matching_dirs, pathlib.Path())

def get_access_token() -> str:
    """
    Retrieve the spotify access token from spotify api endpoint.

    The client credentials are read from the ``SPOTIFY_CLIENT_ID`` and
    ``SPOTIFY_CLIENT_SECRET`` environment variables so that no secret is
    stored in the notebook.

    :return: the access token, or an empty string when the credentials are
        not configured or the endpoint does not answer with HTTP 200
    :raises requests.RequestException: when the request itself fails (connection error, timeout)
    """
    import os  # local import: `os` is not imported before this cell runs

    # NOTE(review): credentials that were ever committed in source should be rotated.
    client_id: str = os.environ.get("SPOTIFY_CLIENT_ID", "")
    client_secret: str = os.environ.get("SPOTIFY_CLIENT_SECRET", "")
    if not client_id or not client_secret:
        logging.getLogger(__name__).error(
            "SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET must be set to request an access token"
        )
        return ''

    response: requests.Response = requests.post(
        url="https://accounts.spotify.com/api/token",
        # A dict body is form-encoded by requests, which also escapes special characters.
        data={
            "grant_type": "client_credentials",
            "client_id": client_id,
            "client_secret": client_secret,
        },
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        timeout=10,  # seconds; without it a stalled connection blocks forever
    )
    if response.status_code == 200:
        return response.json()["access_token"]

    return ''

def get_playlist_info(playlist_id: str, access_token: str, offset: int = 0, get_total: bool = True) -> dict[str, Any]:
    """
    Fetch one page of tracks of a playlist from the spotify api.

    :param playlist_id: id of the playlist to read
    :param access_token: bearer token sent in the Authorization header
    :param offset: index of the first track of the requested page
    :param get_total: when False the ``total`` field is removed from the requested fields
    :return: the decoded JSON response, or an empty dict when the status code is not 200
    :raises requests.RequestException: when the request itself fails (connection error, timeout)
    """
    logger.info(f"Fetching tracks from playlist id: {playlist_id} starting from the {offset}th song")
    url: str = f"https://api.spotify.com/v1/playlists/{playlist_id}/tracks?market=IT&fields=total,items(track(id, uri, name, album(name, release_date, release_date_precision), artists(id, name)))&offset={offset}"
    if not get_total:
        url = url.replace("fields=total,", "fields=")

    response: requests.Response = requests.get(
        url=url,
        headers={"Authorization": f"Bearer {access_token}"},
        timeout=10,  # seconds; without it a stalled connection blocks forever
    )
    if response.status_code == 200:
        return response.json()

    # Reported through the shared logger, like every other message in this notebook.
    logger.error(f"Error getting playlist, status code: {response.status_code} - {response.text}")
    return {}

def get_playlist_tracks_data(playlist_id: str, access_token: str) -> list[dict[str, Any]]:
    """
    Return every track item of a playlist, from the local cache when available.

    The cache is ``<raw dir>/json/<playlist_id>.json``. When it does not exist
    the playlist is downloaded page by page through ``get_playlist_info`` until
    the number of fetched items reaches the total reported by the first page.

    :param playlist_id: id of the playlist to read
    :param access_token: bearer token forwarded to ``get_playlist_info``
    :return: the list of track items; it may be incomplete (or empty) when a
        page could not be fetched, in which case a warning is logged
    """
    json_output_dir: pathlib.Path = get_dir_absolute_path("raw")
    playlist_filename: pathlib.Path = json_output_dir / "json" / f"{playlist_id}.json"

    if playlist_filename.exists():
        logger.info("Playlist data already downloaded")
        # Context manager so the file handle is closed deterministically.
        with playlist_filename.open("r", encoding="utf-8") as cached_file:
            return json.load(cached_file)

    offset: int = 0
    num_fetched_songs: int = 0
    total_songs_in_playlist: int = 0

    playlist_info: list[dict[str, Any]] = []

    logger.info(f"Fetching playlist tracks for {playlist_id}, current offset: {offset}, total songs: {total_songs_in_playlist}, fetching total {not bool(total_songs_in_playlist)}, fetched {num_fetched_songs} songs")
    while True:
        # The total is only requested until it is known.
        playlist_batch_info: dict[str, Any] = get_playlist_info(
            playlist_id,
            access_token,
            offset=offset,
            get_total=not bool(total_songs_in_playlist),
        )
        batch_items: list[dict[str, Any]] = playlist_batch_info.get("items", [])
        playlist_info.extend(batch_items)

        if total_songs_in_playlist == 0:
            total_songs_in_playlist = playlist_batch_info.get("total", 0)
        num_fetched_songs += len(batch_items)
        offset += len(batch_items)
        logger.info(f"Fetched {num_fetched_songs}, total {total_songs_in_playlist}, offset {offset}, length {len(batch_items)}")
        if num_fetched_songs >= total_songs_in_playlist:
            break
        if not batch_items:
            # An empty page (for instance {} after an HTTP error) makes no progress:
            # stop instead of requesting the same offset forever.
            logger.warning(f"Empty page at offset {offset}, stopping with {num_fetched_songs} of {total_songs_in_playlist} songs")
            break

    return playlist_info
    
    
def save_playlist_tracks_data(tracks_data: list[dict[str, Any]], playlist_id: str) -> bool:
    """
    Write the playlist tracks to ``<raw dir>/json/<playlist_id>.json`` unless that file already exists.

    The data is first written to a sibling ``.part`` file and renamed once the
    dump succeeded, so a failed dump never leaves a truncated file that a later
    ``exists()`` check would mistake for a complete cache.

    :param tracks_data: track items to serialise as JSON
    :param playlist_id: id of the playlist, used as the file name
    :return: True when the file already existed or was written, False when writing failed
    """
    json_output_dir: pathlib.Path = get_dir_absolute_path("raw")
    playlist_filename: pathlib.Path = json_output_dir / "json" / f"{playlist_id}.json"

    if playlist_filename.exists():
        return True

    partial_filename: pathlib.Path = playlist_filename.with_name(f"{playlist_filename.name}.part")
    try:
        playlist_filename.parent.mkdir(parents=True, exist_ok=True)
        # Context manager so the handle is flushed and closed before the rename.
        with partial_filename.open("w", encoding="utf-8") as output_file:
            json.dump(tracks_data, output_file)
        partial_filename.replace(playlist_filename)
        return True
    except Exception as e:
        logger.error(e)
        return False

In [49]:
# Module-level logger; the helper functions defined above reference it as the global `logger`.
logger: logging.Logger = set_logger("spotify api get data")
# Spotify playlist ids available to this notebook; PLAYLIST_ID picks the one processed by this run.
PLAYLISTS: list[str] = ["31LTVcI9mBggtLVlYRStnJ", "7CqTvaywsSnZHdwujtkfqp"]
PLAYLIST_ID: str = PLAYLISTS[1]
# Request an access token, then load the playlist tracks (from the JSON cache when it exists, otherwise from the API).
access_token: str = get_access_token()
playlist_info: list[dict[str, Any]] = get_playlist_tracks_data(PLAYLIST_ID, access_token)
# Persist the tracks to the JSON cache; the returned bool is the value displayed by the cell.
save_playlist_tracks_data(playlist_info, PLAYLIST_ID)

Playlist data already downloaded
Playlist data already downloaded
Playlist data already downloaded
Playlist data already downloaded


True

## Download songs from Spotify/YouTube

In [50]:
# !spotdl download https://open.spotify.com/playlist/31LTVcI9mBggtLVlYRStnJ --threads 12

In [51]:
import yt_dlp
from youtube_search import YoutubeSearch
import pathlib
from datetime import datetime
import shutil

# Base options handed to yt_dlp.YoutubeDL for every download; download_song_from_youtube
# merges them with a per-track 'outtmpl'. The values mix bool, str and list, hence dict[str, Any].
# NOTE(review): 'embedthumbnail' looks like the command-line flag name; the Python API
# presumably needs 'writethumbnail' plus an 'EmbedThumbnail' postprocessor instead — confirm
# against the yt-dlp documentation.
youtube_dl_options: dict[str, Any] = {
    "quiet": True,
    'format': 'bestaudio/best',
    'embedthumbnail': True,
    # Post-processing chain: extract the audio as mp3 (preferredquality '192'), then write metadata.
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }, {
        'key': 'FFmpegMetadata',
    }]
}

def get_song_realease_date_folder(release_date: str, precision: str) -> pathlib.Path:
    """
    Turn a release date into a relative folder path with one level per date component.

    The date is parsed with the format matching ``precision`` and re-rendered
    with ``/`` separators, e.g. ``("2020-5", "month")`` gives ``2020/05``.

    :param release_date: release date string formatted according to ``precision``
    :param precision: one of ``"year"``, ``"month"`` or ``"day"``
    :return: relative path such as ``2020``, ``2020/05`` or ``2020/05/17``
    :raises KeyError: when ``precision`` is not one of the supported values
    :raises ValueError: when ``release_date`` does not match the expected format
    """
    # precision -> (format used to parse the input, format used to build the folder)
    formats_by_precision: dict[str, tuple[str, str]] = {
        "year": ("%Y", "%Y"),
        "month": ("%Y-%m", "%Y/%m"),
        "day": ("%Y-%m-%d", "%Y/%m/%d"),
    }
    parse_format, folder_format = formats_by_precision[precision]
    parsed_date: datetime = datetime.strptime(release_date, parse_format)

    return pathlib.Path(parsed_date.strftime(folder_format))

def download_song_from_youtube(song_to_search: str, track: dict[str, Any]) -> bool:
    """
    Search a song on youtube and download its audio as ``<songs dir>/<release date folder>/<track id>.mp3``.

    The youtube search is attempted up to three times. When no result is found
    the track is skipped instead of handing an empty URL to the downloader.

    :param song_to_search: text used as the youtube search query
    :param track: playlist item; reads ``track["track"]["id"]`` and the album
        ``release_date`` / ``release_date_precision`` fields
    :return: True when the mp3 already exists or the download succeeded, False otherwise
    """
    release_path: pathlib.Path = get_song_realease_date_folder(
        release_date=track["track"]["album"]["release_date"],
        precision=track["track"]["album"]["release_date_precision"],
    )
    track_id: str = track["track"]["id"]
    output_path: pathlib.Path = get_dir_absolute_path("songs") / release_path / track_id

    # create dirs if they do not exist
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if output_path.with_suffix(".mp3").exists():
        logger.debug(f"Skipping {track_id}. Already downloaded. output_path {output_path}")
        return True
    logger.info(f"To download: {track_id}")

    best_url: str = ""
    for attempts_left in range(2, -1, -1):
        search_results = YoutubeSearch(song_to_search, max_results=1).to_dict()
        # An empty result list or a result without 'url_suffix' both count as "nothing found".
        url_suffix = search_results[0].get("url_suffix") if search_results else None
        if url_suffix:
            best_url = f"https://www.youtube.com{url_suffix}"
            break
        logger.debug(f"No valid URLs found for {song_to_search}, trying again ({attempts_left} attempts left).")

    if not best_url:
        logger.warning(f"No valid URLs found for {song_to_search}, skipping track.")
        return False

    # Download the audio; the post-processors configured in youtube_dl_options run afterwards.
    try:
        with yt_dlp.YoutubeDL(youtube_dl_options | {'outtmpl': str(output_path)}) as ydl:
            ydl.extract_info(best_url, download=True)
        return True
    except Exception as e:
        logger.error(e)
        return False

def get_string_to_search_in_youtube(track: dict[str, Any]) -> str:
    """
    Build the youtube search query for a playlist item.

    :param track: playlist item holding the song under the ``"track"`` key
    :return: the artist names joined by spaces, then `` - `` and the song name
    :raises KeyError: when the song has no ``"artists"`` entry
    """
    song_details = track.get("track", {})
    artist_names = (artist.get("name") for artist in song_details["artists"])
    joined_artists: str = " ".join(artist_names)
    return f"{joined_artists} - {song_details.get('name')}"


def move_songs_to_release_date_partition(song: dict[str, Any]) -> None:
    """
    Move ``<songs dir>/<track id>.mp3`` into its release date sub-folder.

    Nothing happens when the source file is missing, when the release date
    folder cannot be computed (the reason is logged) or when the destination
    file already exists.

    :param song: playlist item; reads ``song["track"]["id"]`` and the album
        ``release_date`` / ``release_date_precision`` fields
    """
    songs_path: pathlib.Path = get_dir_absolute_path("songs")
    track_id: str = song["track"]["id"]
    original_song_path: pathlib.Path = (songs_path / track_id).with_suffix(".mp3")

    if not original_song_path.exists():
        return

    try:
        release_path: pathlib.Path = get_song_realease_date_folder(
            release_date=song["track"]["album"]["release_date"],
            precision=song["track"]["album"]["release_date_precision"],
        )
        output_path: pathlib.Path = (songs_path / release_path / track_id).with_suffix(".mp3")
    except Exception as e:
        # Still best-effort, but the reason for leaving the song in place is recorded.
        logger.warning(f"Cannot compute release date folder for {track_id}: {e!r}")
        return

    if output_path.exists():
        return

    logger.debug(f"moving {track_id}")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.move(original_song_path, output_path)

In [52]:
from tqdm.contrib.concurrent import process_map
import os

# Apply move_songs_to_release_date_partition to every item of playlist_info through
# process_map, with one worker per CPU reported by os.cpu_count() and one item per chunk.
# The mapped function returns None, so the results collected in `r` carry no information.
# NOTE(review): the mapped function is defined in the notebook itself; with a "spawn"
# multiprocessing start method worker processes may not be able to import it — confirm on the target platform.
r = process_map(move_songs_to_release_date_partition, playlist_info, max_workers=os.cpu_count(), chunksize=1)

  0%|          | 0/992 [00:00<?, ?it/s]

In [53]:
from tqdm.contrib.concurrent import process_map
import os

def download_song_multiprocessing(track: dict[str, dict[str, Any]]) -> bool:
    """
    Single-argument wrapper around the download so it can be mapped over playlist items.

    :param track: playlist item to download
    :return: the result of ``download_song_from_youtube`` for this item
    """
    return download_song_from_youtube(get_string_to_search_in_youtube(track), track)


# Download every item of playlist_info through process_map, one worker per CPU reported by
# os.cpu_count() and one item per chunk; `r` collects the bool returned for each item.
r = process_map(download_song_multiprocessing, playlist_info, max_workers=os.cpu_count(), chunksize=1)

  0%|          | 0/992 [00:00<?, ?it/s]

To download: 3Fcfwhm8oRrBvBZ8KGhtea
To download: 3Fcfwhm8oRrBvBZ8KGhtea
To download: 3Fcfwhm8oRrBvBZ8KGhtea
To download: 3Fcfwhm8oRrBvBZ8KGhtea
                                                        