In [1]:
import json
import itertools
import os
import random
from pathlib import Path
from time import sleep

import numpy as np
import pandas as pd
from dotenv import load_dotenv

from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials

diff_dataset = False

data_path = Path("../../data").resolve()
os.makedirs(data_path, exist_ok=True)

load_dotenv()
spotify = Spotify(client_credentials_manager=SpotifyClientCredentials(),
                  requests_timeout=10, retries=3)

In [2]:
def fetch_spotify_chart(filepath, num_samples=1000, seed=42):
    # currently: batshit crazy 4.3 GB spotify top200 chart CSV from kaggle: https://www.kaggle.com/general/232036
    # @TODO: series of fetching for random date from 1/1/2017 to present day.
    # url = "https://spotifycharts.com/regional/global/weekly/2018-10-12--2018-10-19/download"
    random.seed(seed)
    n = sum(1 for line in open(filepath)) - 1  # number of records in file (excludes header)
    s = num_samples  # desired sample size
    skip = sorted(random.sample(range(1, n + 1), n - s))  # the 0-indexed header will not be included in the skip list
    df = pd.read_csv(filepath, skiprows=skip)
    df["id"] = df["url"].apply(lambda url: url.split("/")[-1])
    return df


spotify_chart_df = fetch_spotify_chart("/Users/pez/Downloads/charts.csv", num_samples=15000, seed=42)

In [3]:
def extract_track_ids(df, id_key="id"):
    return list(df[(~df[id_key].isnull())]
                [id_key].to_dict().values())

# track ids from spotify top 200 daily charts selected at random
spotify_chart_track_ids = extract_track_ids(spotify_chart_df, "id")

# track ids from grammy nominations dataset
recording_nominations_df = pd.read_parquet(data_path / "raw/recording_nominations.pq")
recording_nominations_track_ids = extract_track_ids(recording_nominations_df, "spotify_track_id")

# track ids from chart2000 dataset
song_chart_df = pd.read_parquet(data_path / "raw/song_chart_by_month.pq")
song_chart_track_ids = extract_track_ids(song_chart_df, "spotify_track_id")

# track ids from random table
random_tracks_df = pd.read_csv(data_path / "random_track_ids.csv")
random_track_ids = extract_track_ids(random_tracks_df, "track_id")

# combine track ids and deduplicate
track_ids = list(set(
    recording_nominations_track_ids
    + song_chart_track_ids
    + spotify_chart_track_ids
    + random_track_ids
))

print(f"unique tracks: {len(track_ids)}")
print(f"spotify chart tracks: {len(set(spotify_chart_track_ids))}")
print(f"grammy tracks: {len(set(recording_nominations_track_ids))}")
print(f"chart2000 tracks: {len(set(song_chart_track_ids))}")
print(f"random tracks: {len(set(random_track_ids))}")

# how many new unique tracks have we added with additional random tracks?
track_ids_ = list(set(
    recording_nominations_track_ids
    + song_chart_track_ids
    + spotify_chart_track_ids
))
len(set(track_ids_) & set(random_track_ids))

unique tracks: 16157
spotify chart tracks: 8063
grammy tracks: 535
chart2000 tracks: 3313
random tracks: 4872


54

In [4]:
def fetch_bulk_track_features(track_ids):
    fetched_tracks = []
    for i in range(0, len(track_ids), 50):
        chunk = track_ids[i:i + 50]
        if i % 500 == 0:
            print(f"fetching track metadata: {i} ({i / len(track_ids) * 100:.2f})")
        tracks = spotify.tracks(chunk)["tracks"]
        sleep(.50)
        track_audio_features = spotify.audio_features(chunk)
        sleep(.50)
        for track, audio_features in zip(tracks, track_audio_features):
            track_ = {
                "id": track["id"],
                "name": track["name"],
                "album": track["album"]["name"],
                "album_id": track["album"]["id"],
                "artists": json.dumps([a["name"] for a in track["artists"]]),
                "artist_ids": json.dumps([a["id"] for a in track["artists"]]),
                "track_number": track["track_number"],
                "disc_number": track["disc_number"],
                "explicit": track["explicit"],
                "duration_ms": track["duration_ms"],
                "year": int(track["album"]["release_date"][0:4]),
                "release_date": track["album"]["release_date"],
                "popularity": track["popularity"],
                "audio_features": 0,
                "isrc": track.get("external_ids", {}).get("isrc")
            }
            if audio_features is not None:
                track_.update({
                    "audio_features": 1,
                    "danceability": audio_features["danceability"],
                    "energy": audio_features["energy"],
                    "key": audio_features["key"],
                    "loudness": audio_features["loudness"],
                    "mode": audio_features["mode"],
                    "speechiness": audio_features["speechiness"],
                    "acousticness": audio_features["acousticness"],
                    "instrumentalness": audio_features["instrumentalness"],
                    "liveness": audio_features["liveness"],
                    "valence": audio_features["valence"],
                    "tempo": audio_features["tempo"],
                    "time_signature": audio_features["time_signature"],
                })
            fetched_tracks.append(track_)
    return pd.DataFrame(fetched_tracks)


tracks_df = fetch_bulk_track_features(track_ids)
print(tracks_df.shape)
tracks_df.head()

fetching track metadata: 0 (0.00)
fetching track metadata: 500 (3.09)
fetching track metadata: 1000 (6.19)
fetching track metadata: 1500 (9.28)
fetching track metadata: 2000 (12.38)
fetching track metadata: 2500 (15.47)
fetching track metadata: 3000 (18.57)
fetching track metadata: 3500 (21.66)
fetching track metadata: 4000 (24.76)
fetching track metadata: 4500 (27.85)
fetching track metadata: 5000 (30.95)
fetching track metadata: 5500 (34.04)
fetching track metadata: 6000 (37.14)
fetching track metadata: 6500 (40.23)
fetching track metadata: 7000 (43.32)
fetching track metadata: 7500 (46.42)
fetching track metadata: 8000 (49.51)
fetching track metadata: 8500 (52.61)
fetching track metadata: 9000 (55.70)
fetching track metadata: 9500 (58.80)
fetching track metadata: 10000 (61.89)
fetching track metadata: 10500 (64.99)
fetching track metadata: 11000 (68.08)
fetching track metadata: 11500 (71.18)
fetching track metadata: 12000 (74.27)
fetching track metadata: 12500 (77.37)
fetching track

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,duration_ms,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6stxV2bl7xSiPUornxwyNR,Khapta,En esprit,4YUEcV66XwylhSNJI51ABq,"[""Heuss L'enfoir\u00e9"", ""Sofiane""]","[""3YwqjMyrRfuixi2pbgTGCE"", ""6qFt3TjvxMt77YGskt...",6,1,True,162906,...,4.0,-2.836,0.0,0.105,0.00937,0.0,0.0598,0.817,125.009,4.0
1,62Pp6exz0ywSlBqvNqiY6Z,Anissa,Anissa,5xPYfWKHgxj7jARUJza5eM,"[""Wejdene""]","[""1SxuyHZnLUFyFHGzdGaxZk""]",1,1,False,175752,...,9.0,-8.554,1.0,0.0646,0.693,0.0017,0.108,0.582,100.037,4.0
2,4BS9MBPYKNFEvaC5S7I3fN,Lovesick (feat. Felix Snow),Lovesick (feat. Felix Snow),0QyeR5V23AwRHSgJr1jOvi,"[""Caroline Pennell"", ""Felix Snow""]","[""0cZPTEmf3mlwj5kjVXR4po"", ""1qskAseW7apRxJBpYA...",1,1,False,234693,...,9.0,-7.095,0.0,0.0312,0.116,0.000774,0.238,0.346,101.011,4.0
3,09PGubKAMryhOWv1LHpCYz,"fuck, i'm lonely (with Anne-Marie)",~how i'm feeling~,6EgJXcGqaUvgZIF9bqPXfP,"[""Lauv"", ""Anne-Marie""]","[""5JZ7CnR6gTvEMKX4g70Amv"", ""1zNqDE7qDGCsyzJwoh...",2,1,True,198973,...,9.0,-6.46,1.0,0.0683,0.482,0.0,0.0618,0.678,95.016,4.0
4,7IerPeiRJsJfVaPW73GmbS,Bao Tiền Một Mớ Bình Yên,Bao Tiền Một Mớ Bình Yên,3BMJwayvFzLSqYbImGfMnU,"[""14 Casper"", ""Bon""]","[""0zo8kCJVktn7oPnzpkbC2p"", ""6hnfLIkvDl6pjlAe1Y...",1,1,False,300000,...,7.0,-10.202,1.0,0.0505,0.575,0.0,0.0902,0.0935,173.916,3.0


In [5]:
def with_track_charting_summary(df, song_chart_df):
    song_chart_df = song_chart_df.rename(columns={
        "spotify_artist_id": "artist_id",
        "spotify_track_id": "id"
    })
    song_chart_df["chart_months"] = 1
    song_chart_df = (
        song_chart_df
            .sort_values(by=["id"])
            .groupby(["id"])
            .aggregate({
                "chart_months": "sum",
                "indicativerevenue": "sum",
                "position": ["min", "mean", "median"],
            })
            .reset_index()
            .set_axis(["id", "track_chart_months", "track_chart_indicativerevenue", "track_chart_peak", "track_chart_mean", "track_chart_median"], axis=1)
    )

    df = pd.merge(df, song_chart_df, on="id", how="left")
    df["track_chart_months"] = df["track_chart_months"].fillna(0)
    df["track_chart_indicativerevenue"] = df["track_chart_indicativerevenue"].fillna(0)
    return df


drop_columns = [col for col in tracks_df.columns if col.startswith("track_chart_")]
tracks_df = tracks_df.drop(columns=drop_columns)

song_chart_df = pd.read_parquet(data_path / "raw/song_chart_by_month.pq")
tracks_df = with_track_charting_summary(tracks_df, song_chart_df)
print(tracks_df.shape)
tracks_df.head()

(16157, 32)


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,duration_ms,...,instrumentalness,liveness,valence,tempo,time_signature,track_chart_months,track_chart_indicativerevenue,track_chart_peak,track_chart_mean,track_chart_median
0,6stxV2bl7xSiPUornxwyNR,Khapta,En esprit,4YUEcV66XwylhSNJI51ABq,"[""Heuss L'enfoir\u00e9"", ""Sofiane""]","[""3YwqjMyrRfuixi2pbgTGCE"", ""6qFt3TjvxMt77YGskt...",6,1,True,162906,...,0.0,0.0598,0.817,125.009,4.0,0.0,0.0,,,
1,62Pp6exz0ywSlBqvNqiY6Z,Anissa,Anissa,5xPYfWKHgxj7jARUJza5eM,"[""Wejdene""]","[""1SxuyHZnLUFyFHGzdGaxZk""]",1,1,False,175752,...,0.0017,0.108,0.582,100.037,4.0,0.0,0.0,,,
2,4BS9MBPYKNFEvaC5S7I3fN,Lovesick (feat. Felix Snow),Lovesick (feat. Felix Snow),0QyeR5V23AwRHSgJr1jOvi,"[""Caroline Pennell"", ""Felix Snow""]","[""0cZPTEmf3mlwj5kjVXR4po"", ""1qskAseW7apRxJBpYA...",1,1,False,234693,...,0.000774,0.238,0.346,101.011,4.0,0.0,0.0,,,
3,09PGubKAMryhOWv1LHpCYz,"fuck, i'm lonely (with Anne-Marie)",~how i'm feeling~,6EgJXcGqaUvgZIF9bqPXfP,"[""Lauv"", ""Anne-Marie""]","[""5JZ7CnR6gTvEMKX4g70Amv"", ""1zNqDE7qDGCsyzJwoh...",2,1,True,198973,...,0.0,0.0618,0.678,95.016,4.0,0.0,0.0,,,
4,7IerPeiRJsJfVaPW73GmbS,Bao Tiền Một Mớ Bình Yên,Bao Tiền Một Mớ Bình Yên,3BMJwayvFzLSqYbImGfMnU,"[""14 Casper"", ""Bon""]","[""0zo8kCJVktn7oPnzpkbC2p"", ""6hnfLIkvDl6pjlAe1Y...",1,1,False,300000,...,0.0,0.0902,0.0935,173.916,3.0,0.0,0.0,,,


In [6]:
def with_recording_nominations(df, recording_nominations_df):
    recording_nominations_df = (
        recording_nominations_df[[
                "award_category",
                "award_nominee",
                "award_winner",
                "spotify_track_id",
            ]]
            .groupby(["spotify_track_id"])
            .aggregate({
                "award_category": "max",
                "award_nominee": "sum",
                "award_winner": "sum",
            })
            .reset_index()
            .drop_duplicates(subset=["spotify_track_id"])
            .rename(columns={
                "spotify_track_id": "id",
                "award_category": "recording_award_category",
                "award_nominee": "recording_award_nominee",
                "award_winner": "recording_award_winner",
            })
    )
    df = pd.merge(df, recording_nominations_df, on="id", how="outer")
    df["recording_award_nominee"] = df["recording_award_nominee"].fillna(0)
    df["recording_award_winner"] = df["recording_award_winner"].fillna(0)
    return df


drop_columns = [col for col in tracks_df.columns if col.startswith("recording_award_")]
tracks_df = tracks_df.drop(columns=drop_columns)

recording_nominations_df = pd.read_parquet(data_path / "raw/recording_nominations.pq")
tracks_df = with_recording_nominations(tracks_df, recording_nominations_df)
print(tracks_df.shape)
tracks_df.head()

(16157, 35)


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,duration_ms,...,tempo,time_signature,track_chart_months,track_chart_indicativerevenue,track_chart_peak,track_chart_mean,track_chart_median,recording_award_category,recording_award_nominee,recording_award_winner
0,6stxV2bl7xSiPUornxwyNR,Khapta,En esprit,4YUEcV66XwylhSNJI51ABq,"[""Heuss L'enfoir\u00e9"", ""Sofiane""]","[""3YwqjMyrRfuixi2pbgTGCE"", ""6qFt3TjvxMt77YGskt...",6,1,True,162906,...,125.009,4.0,0.0,0.0,,,,,0.0,0.0
1,62Pp6exz0ywSlBqvNqiY6Z,Anissa,Anissa,5xPYfWKHgxj7jARUJza5eM,"[""Wejdene""]","[""1SxuyHZnLUFyFHGzdGaxZk""]",1,1,False,175752,...,100.037,4.0,0.0,0.0,,,,,0.0,0.0
2,4BS9MBPYKNFEvaC5S7I3fN,Lovesick (feat. Felix Snow),Lovesick (feat. Felix Snow),0QyeR5V23AwRHSgJr1jOvi,"[""Caroline Pennell"", ""Felix Snow""]","[""0cZPTEmf3mlwj5kjVXR4po"", ""1qskAseW7apRxJBpYA...",1,1,False,234693,...,101.011,4.0,0.0,0.0,,,,,0.0,0.0
3,09PGubKAMryhOWv1LHpCYz,"fuck, i'm lonely (with Anne-Marie)",~how i'm feeling~,6EgJXcGqaUvgZIF9bqPXfP,"[""Lauv"", ""Anne-Marie""]","[""5JZ7CnR6gTvEMKX4g70Amv"", ""1zNqDE7qDGCsyzJwoh...",2,1,True,198973,...,95.016,4.0,0.0,0.0,,,,,0.0,0.0
4,7IerPeiRJsJfVaPW73GmbS,Bao Tiền Một Mớ Bình Yên,Bao Tiền Một Mớ Bình Yên,3BMJwayvFzLSqYbImGfMnU,"[""14 Casper"", ""Bon""]","[""0zo8kCJVktn7oPnzpkbC2p"", ""6hnfLIkvDl6pjlAe1Y...",1,1,False,300000,...,173.916,3.0,0.0,0.0,,,,,0.0,0.0


In [7]:
def with_album_nominations(df, album_nominations_df):
    album_nominations_df = (
        album_nominations_df[[
            "award_category",
            "award_nominee",
            "award_winner",
            "spotify_album_id",
        ]]
        .groupby(["spotify_album_id"])
        .aggregate({
            "award_category": "max",
            "award_nominee": "sum",
            "award_winner": "sum",
        })
        .reset_index()
        .drop_duplicates(subset=["spotify_album_id"])
        .rename(columns={
            "spotify_album_id": "album_id",
            "award_category": "album_award_category",
            "award_nominee": "album_award_nominee",
            "award_winner": "album_award_winner",
        })
    )
    return (
        pd.merge(df, album_nominations_df, on="album_id",  how="left")
    )


album_nominations_df = pd.read_parquet(data_path / "raw/album_nominations.pq")
tracks_df = with_album_nominations(tracks_df, album_nominations_df)
print(tracks_df.shape)
tracks_df.head()

(16157, 38)


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,duration_ms,...,track_chart_indicativerevenue,track_chart_peak,track_chart_mean,track_chart_median,recording_award_category,recording_award_nominee,recording_award_winner,album_award_category,album_award_nominee,album_award_winner
0,6stxV2bl7xSiPUornxwyNR,Khapta,En esprit,4YUEcV66XwylhSNJI51ABq,"[""Heuss L'enfoir\u00e9"", ""Sofiane""]","[""3YwqjMyrRfuixi2pbgTGCE"", ""6qFt3TjvxMt77YGskt...",6,1,True,162906,...,0.0,,,,,0.0,0.0,,,
1,62Pp6exz0ywSlBqvNqiY6Z,Anissa,Anissa,5xPYfWKHgxj7jARUJza5eM,"[""Wejdene""]","[""1SxuyHZnLUFyFHGzdGaxZk""]",1,1,False,175752,...,0.0,,,,,0.0,0.0,,,
2,4BS9MBPYKNFEvaC5S7I3fN,Lovesick (feat. Felix Snow),Lovesick (feat. Felix Snow),0QyeR5V23AwRHSgJr1jOvi,"[""Caroline Pennell"", ""Felix Snow""]","[""0cZPTEmf3mlwj5kjVXR4po"", ""1qskAseW7apRxJBpYA...",1,1,False,234693,...,0.0,,,,,0.0,0.0,,,
3,09PGubKAMryhOWv1LHpCYz,"fuck, i'm lonely (with Anne-Marie)",~how i'm feeling~,6EgJXcGqaUvgZIF9bqPXfP,"[""Lauv"", ""Anne-Marie""]","[""5JZ7CnR6gTvEMKX4g70Amv"", ""1zNqDE7qDGCsyzJwoh...",2,1,True,198973,...,0.0,,,,,0.0,0.0,,,
4,7IerPeiRJsJfVaPW73GmbS,Bao Tiền Một Mớ Bình Yên,Bao Tiền Một Mớ Bình Yên,3BMJwayvFzLSqYbImGfMnU,"[""14 Casper"", ""Bon""]","[""0zo8kCJVktn7oPnzpkbC2p"", ""6hnfLIkvDl6pjlAe1Y...",1,1,False,300000,...,0.0,,,,,0.0,0.0,,,


In [8]:
tracks_df["num_artists"] = tracks_df["artist_ids"].apply(json.loads).apply(len)

In [9]:
def lookup(artist_df, artist_ids):
    artist_ids = json.loads(artist_ids.replace("'", "\""))
    artists_ = []
    for artist_id in artist_ids:
        matches = artist_df[artist_df["id"] == artist_id]
        if len(matches) > 0:
            artists_.append(matches.to_dict('records')[0])
    return artists_


def with_artist_summary(df, artist_df):
    df = df.copy()
    # df["num_artists"] = df.apply(lambda r: len(df["artist_ids"]), axis=1)
    df["artists"] = df.apply(lambda r: lookup(artist_df, r["artist_ids"]), axis=1)
    df["artist_matched"] = df.apply(lambda r: len(r["artists"]) > 0, axis=1)
    df["artist_names"] = df.apply(lambda r: [a["name"] for a in r["artists"]], axis=1)
    df["artist_names"] = df.apply(lambda r: ", ".join(r["artist_names"]), axis=1)
    df["artist_genres"] = df.apply(lambda r: [a["genres"] for a in r["artists"]], axis=1)
    df["artist_genres"] = df.apply(lambda r: list(itertools.chain(*r["artist_genres"])), axis=1)
    df["artist_genres"] = df.apply(lambda r: r["artist_genres"] if r["artist_genres"] else np.nan, axis=1)
    # df["artist_genres"] = df.apply(lambda r: "; ".join(list(itertools.chain(*r["artist_genres"]))), axis=1)
    df["artist_popularity"] = df.apply(lambda r: [a["popularity"] for a in r["artists"]], axis=1)
    df["artist_popularity"] = df.apply(lambda r: max(r["artist_popularity"]) if r["artist_popularity"] else 0, axis=1)
    df["artist_first_release"] = df.apply(lambda r: [a["first_release"] for a in r["artists"]], axis=1)
    df["artist_first_release"] = df.apply(lambda r: min(r["artist_first_release"]) if r["artist_first_release"] else np.nan, axis=1)
    # df["artist_last_release"] = df.apply(lambda r: [a["last_release"] for a in r["artists"]], axis=1)
    # df[f"artist_last_release"] = df.apply(lambda r: max(df["artist_last_release"])
    df["artist_lifetime_releases"] = df.apply(lambda r: [a["total_releases"] for a in r["artists"]], axis=1)
    df["artist_lifetime_releases"] = df.apply(lambda r: sum(m for m in r["artist_lifetime_releases"] if m is not None), axis=1).fillna(0)
    df["artist_gender"] = df.apply(lambda r: [a["gender"] for a in r["artists"]], axis=1)
    #df["artist_gender"] = df.apply(lambda r: r["artist_gender"][0] if r["artist_gender"] else np.nan, axis=1)
    df["artist_country"] = df.apply(lambda r:[a["country"] for a in r["artists"]], axis=1)
    df["artist_country"] = df.apply(lambda r: r["artist_country"][0] if r["artist_country"] else np.nan, axis=1)
    df["artist_hometown"] = df.apply(lambda r: [a["hometown"] for a in r["artists"]], axis=1)
    df["artist_hometown"] = df.apply(lambda r: r["artist_hometown"][0] if r["artist_hometown"] else np.nan, axis=1)
    df["artist_first_win"] = df.apply(lambda r: [a["first_win"] for a in r["artists"]], axis=1)
    df["artist_first_win"] = df.apply(lambda r: min(r["artist_first_win"]) if r["artist_first_win"] else np.nan, axis=1)
    df["artist_first_nomination"] = df.apply(lambda r: [a["first_nomination"] for a in r["artists"]], axis=1)
    df["artist_first_nomination"] = df.apply(lambda r: min(r["artist_first_nomination"]) if r["artist_first_nomination"] else np.nan, axis=1)
    # df["artist_last_win"] = df.apply(lambda r: [a["last_win"] for a in r["artists"]], axis=1)
    # df["artist_last_win"] = df.apply(lambda r: max(df["artist_last_win"])
    # df["artist_last_nomination"] = df.apply(lambda r: [a["last_nomination"] for a in r["artists"]], axis=1)
    # df["artist_last_nomination"] = df.apply(lambda r: max(df["artist_last_nomination"])
    df["artist_lifetime_wins"] = df.apply(lambda r: [a["total_wins"] for a in r["artists"]], axis=1)
    df["artist_lifetime_wins"] = df.apply(lambda r: sum(m for m in r["artist_lifetime_wins"] if m is not None), axis=1).fillna(0)
    df["artist_lifetime_nominations"] = df.apply(lambda r: [a["total_nominations"] for a in r["artists"]], axis=1)
    df["artist_lifetime_nominations"] = df.apply(lambda r: sum(m for m in r["artist_lifetime_nominations"] if m is not None), axis=1).fillna(0)
    df["artist_lifetime_chart_months"] = df.apply(lambda r: [a["chart_months"] for a in r["artists"]], axis=1)
    df["artist_lifetime_chart_months"] = df.apply(lambda r: sum(r["artist_lifetime_chart_months"]), axis=1).fillna(0)
    df["artist_lifetime_revenue"] = df.apply(lambda r: [a["total_revenue"] for a in r["artists"]], axis=1)
    df["artist_lifetime_revenue"] = df.apply(lambda r: sum(m for m in r["artist_lifetime_revenue"] if m is not None), axis=1).fillna(0)
    df["artist_first_chart_year"] = df.apply(lambda r: [a["first_chart_year"] for a in r["artists"]], axis=1)
    df["artist_first_chart_year"] = df.apply(lambda r: min(r["artist_first_chart_year"]) if r["artist_first_chart_year"] else np.nan, axis=1)
    # df["artist_last_chart_year"] = df.apply(lambda r: [a["last_chart_year"] for a in r["artists"]], axis=1)
    # df["artist_last_chart_year"] = df.apply(lambda r: max(df["artist_last_chart_year"])
    df["artist_chart_tracks"] = df.apply(lambda r: [a["chart_tracks"] for a in r["artists"]], axis=1)
    df["artist_chart_tracks"] = df.apply(lambda r: max(r["artist_chart_tracks"]) if r["artist_chart_tracks"] else 0, axis=1).fillna(0)
    df["artist_chart_peak"] = df.apply(lambda r: [a["chart_peak"] for a in r["artists"]], axis=1)
    df["artist_chart_peak"] = df.apply(lambda r: min(r["artist_chart_peak"]) if r["artist_chart_peak"] else 0, axis=1)
    df["artist_nominations"] = df.apply(lambda r: [a.get(f"grammy_nominated_{r['year'] - 1}") for a in r["artists"]], axis=1)
    df["artist_nominations"] = df.apply(lambda r: sum(m for m in r["artist_nominations"] if m is not None), axis=1).fillna(0)
    df["artist_wins"] = df.apply(lambda r: [a.get(f"grammy_won_{r['year'] - 1}") for a in r["artists"]], axis=1)
    df["artist_wins"] = df.apply(lambda r: sum(m for m in r["artist_wins"] if m is not None), axis=1).fillna(0)
    df["artist_releases"] = df.apply(lambda r: [a.get(f"releases_{r['year'] - 1}") for a in r["artists"]], axis=1)
    df["artist_releases"] = df.apply(lambda r: sum(m for m in r["artist_releases"] if m is not None), axis=1).fillna(0)
    df["artist_chart_months"] = df.apply(lambda r: [a.get(f"chart_months_{r['year'] - 1}") for a in r["artists"]], axis=1)
    df["artist_chart_months"] = df.apply(lambda r: sum(m for m in r["artist_chart_months"] if m is not None), axis=1).fillna(0)
    df["artist_chart_months_recently"] = df.apply(lambda r: [a.get(f"chart_months_recently_{r['year'] - 1}") for a in r["artists"]], axis=1)
    df["artist_chart_months_recently"] = df.apply(lambda r: sum(m for m in r["artist_chart_months_recently"] if m is not None), axis=1).fillna(0)
    df["artist_releases_recently"] = df.apply(lambda r:  [a.get(f"releases_{r['year'] - 1}", 0) - a.get(f"releases_{r['year'] - 4}") for a in r["artists"]], axis=1)
    df["artist_releases_recently"] = df.apply(lambda r: sum(m for m in r["artist_releases_recently"] if m is not None), axis=1).fillna(0)
    df["artists"] = df["artist_names"]
    return df


artist_df = pd.read_parquet(data_path / "artist_summary.pq")
tracks_df = with_artist_summary(tracks_df, artist_df)
print(tracks_df.shape)
print(tracks_df[tracks_df["artist_matched"] == 0].shape)
tracks_df.head()

(16157, 63)
(826, 63)


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,duration_ms,...,artist_lifetime_revenue,artist_first_chart_year,artist_chart_tracks,artist_chart_peak,artist_nominations,artist_wins,artist_releases,artist_chart_months,artist_chart_months_recently,artist_releases_recently
0,6stxV2bl7xSiPUornxwyNR,Khapta,En esprit,4YUEcV66XwylhSNJI51ABq,"Heuss L'enfoiré, Sofiane","[""3YwqjMyrRfuixi2pbgTGCE"", ""6qFt3TjvxMt77YGskt...",6,1,True,162906,...,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,0.0,36.0
1,62Pp6exz0ywSlBqvNqiY6Z,Anissa,Anissa,5xPYfWKHgxj7jARUJza5eM,Wejdene,"[""1SxuyHZnLUFyFHGzdGaxZk""]",1,1,False,175752,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,4BS9MBPYKNFEvaC5S7I3fN,Lovesick (feat. Felix Snow),Lovesick (feat. Felix Snow),0QyeR5V23AwRHSgJr1jOvi,Caroline Pennell,"[""0cZPTEmf3mlwj5kjVXR4po"", ""1qskAseW7apRxJBpYA...",1,1,False,234693,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,8.0
3,09PGubKAMryhOWv1LHpCYz,"fuck, i'm lonely (with Anne-Marie)",~how i'm feeling~,6EgJXcGqaUvgZIF9bqPXfP,"Lauv, Anne-Marie","[""5JZ7CnR6gTvEMKX4g70Amv"", ""1zNqDE7qDGCsyzJwoh...",2,1,True,198973,...,731.846,0.0,1.0,0.0,0.0,0.0,32.0,1.0,1.0,26.0
4,7IerPeiRJsJfVaPW73GmbS,Bao Tiền Một Mớ Bình Yên,Bao Tiền Một Mớ Bình Yên,3BMJwayvFzLSqYbImGfMnU,"14 Casper, Bon","[""0zo8kCJVktn7oPnzpkbC2p"", ""6hnfLIkvDl6pjlAe1Y...",1,1,False,300000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [10]:
def resolve_genre(genres, genre_weights):
    genre = sorted(genres, key=genre_weights.get)[0]\
        if genres is not None and isinstance(genres, list) and len(genres) > 0\
        else "other"
    # print(f"{genres} => {genre}")
    return genre


def with_genres_resolved(tracks_df, artist_df):
    genre_weights = artist_df["genres"].explode().value_counts().to_dict()
    genre_weight_values = list(genre_weights.values())
    genre_weight_cutoff = np.average(genre_weight_values) + 1.5 * np.std(genre_weight_values)
    genre_weights = dict((k, v if v > genre_weight_cutoff else 0) for k, v in genre_weights.items())
    tracks_df["genre"] = tracks_df["artist_genres"].apply(lambda genres: resolve_genre(genres, genre_weights))
    return tracks_df


# tracks_df = tracks_df.drop(columns=["genre"])

artist_df = pd.read_parquet(data_path / "artist_summary.pq")
tracks_df = with_genres_resolved(tracks_df, artist_df)
print(tracks_df[tracks_df["artist_matched"] == 0].shape)
print(tracks_df.shape)
tracks_df.head()

(16157, 64)


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,duration_ms,...,artist_first_chart_year,artist_chart_tracks,artist_chart_peak,artist_nominations,artist_wins,artist_releases,artist_chart_months,artist_chart_months_recently,artist_releases_recently,genre
0,6stxV2bl7xSiPUornxwyNR,Khapta,En esprit,4YUEcV66XwylhSNJI51ABq,"Heuss L'enfoiré, Sofiane","[""3YwqjMyrRfuixi2pbgTGCE"", ""6qFt3TjvxMt77YGskt...",6,1,True,162906,...,0.0,0.0,0.0,0.0,0.0,39.0,0.0,0.0,36.0,rap francais
1,62Pp6exz0ywSlBqvNqiY6Z,Anissa,Anissa,5xPYfWKHgxj7jARUJza5eM,Wejdene,"[""1SxuyHZnLUFyFHGzdGaxZk""]",1,1,False,175752,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,francoton
2,4BS9MBPYKNFEvaC5S7I3fN,Lovesick (feat. Felix Snow),Lovesick (feat. Felix Snow),0QyeR5V23AwRHSgJr1jOvi,Caroline Pennell,"[""0cZPTEmf3mlwj5kjVXR4po"", ""1qskAseW7apRxJBpYA...",1,1,False,234693,...,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,8.0,indie poptimism
3,09PGubKAMryhOWv1LHpCYz,"fuck, i'm lonely (with Anne-Marie)",~how i'm feeling~,6EgJXcGqaUvgZIF9bqPXfP,"Lauv, Anne-Marie","[""5JZ7CnR6gTvEMKX4g70Amv"", ""1zNqDE7qDGCsyzJwoh...",2,1,True,198973,...,0.0,1.0,0.0,0.0,0.0,32.0,1.0,1.0,26.0,uk pop
4,7IerPeiRJsJfVaPW73GmbS,Bao Tiền Một Mớ Bình Yên,Bao Tiền Một Mớ Bình Yên,3BMJwayvFzLSqYbImGfMnU,"14 Casper, Bon","[""0zo8kCJVktn7oPnzpkbC2p"", ""6hnfLIkvDl6pjlAe1Y...",1,1,False,300000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,viet lo-fi


In [11]:
def with_labels(df, labels_df):
    labels_df = labels_df.drop_duplicates(["album_id"])
    # labels_df = labels_df.drop(columns=["id", "artist_ids"])
    labels_df["label_matched"] = 1
    df = pd.merge(df, labels_df, on="album_id", how="left")
    return df
    df["label_matched"] = df["label_matched"].fillna(0)
    # df["major_label"] = df["major_label"].fillna(0)
    return df


labels_df = pd.read_parquet(data_path / "labels/albums_with_annotated_labels.pq")
tracks_df = with_labels(tracks_df, labels_df)
print(tracks_df.shape)
print(tracks_df[tracks_df["label_matched"] == 0].shape)
tracks_df.head()

(16157, 67)
(0, 67)


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,duration_ms,...,artist_nominations,artist_wins,artist_releases,artist_chart_months,artist_chart_months_recently,artist_releases_recently,genre,label_name,major_label,label_matched
0,6stxV2bl7xSiPUornxwyNR,Khapta,En esprit,4YUEcV66XwylhSNJI51ABq,"Heuss L'enfoiré, Sofiane","[""3YwqjMyrRfuixi2pbgTGCE"", ""6qFt3TjvxMt77YGskt...",6,1,True,162906,...,0.0,0.0,39.0,0.0,0.0,36.0,rap francais,,1.0,1.0
1,62Pp6exz0ywSlBqvNqiY6Z,Anissa,Anissa,5xPYfWKHgxj7jARUJza5eM,Wejdene,"[""1SxuyHZnLUFyFHGzdGaxZk""]",1,1,False,175752,...,0.0,0.0,1.0,0.0,0.0,1.0,francoton,,1.0,1.0
2,4BS9MBPYKNFEvaC5S7I3fN,Lovesick (feat. Felix Snow),Lovesick (feat. Felix Snow),0QyeR5V23AwRHSgJr1jOvi,Caroline Pennell,"[""0cZPTEmf3mlwj5kjVXR4po"", ""1qskAseW7apRxJBpYA...",1,1,False,234693,...,0.0,0.0,8.0,0.0,0.0,8.0,indie poptimism,,,
3,09PGubKAMryhOWv1LHpCYz,"fuck, i'm lonely (with Anne-Marie)",~how i'm feeling~,6EgJXcGqaUvgZIF9bqPXfP,"Lauv, Anne-Marie","[""5JZ7CnR6gTvEMKX4g70Amv"", ""1zNqDE7qDGCsyzJwoh...",2,1,True,198973,...,0.0,0.0,32.0,1.0,1.0,26.0,uk pop,[no label],0.0,1.0
4,7IerPeiRJsJfVaPW73GmbS,Bao Tiền Một Mớ Bình Yên,Bao Tiền Một Mớ Bình Yên,3BMJwayvFzLSqYbImGfMnU,"14 Casper, Bon","[""0zo8kCJVktn7oPnzpkbC2p"", ""6hnfLIkvDl6pjlAe1Y...",1,1,False,300000,...,0.0,0.0,1.0,0.0,0.0,1.0,viet lo-fi,Viết Tân Studio,0.0,1.0


In [14]:
tracks_df = tracks_df.rename(columns={"chart_months": "track_chart_months"})
tracks_df = tracks_df[tracks_df["year"] >= 2000]
# features
# - chart_status  => [0: never charts, 1: has charted, 2: charted top 10]
# - award_status  => [0: never nominated, 1: nominated, 2: won]
# - artist_status => [0: never nominated, 1: previously nominated, 2: previous winner]

def chart_status(r):
    # should chart status account for album or artist charting?
    return 2 if r["track_chart_peak"] <= 25\
        else 1 if r["track_chart_peak"] > 0\
        else 0


def award_status(r):
    # should award status account for album or artist winning?
    return 2 if r["recording_award_winner"] >= 1\
        else 1 if r["recording_award_nominee"] >= 1\
        else 0


tracks_df["chart_status"] = tracks_df.apply(chart_status, axis=1)
tracks_df["award_status"] =  tracks_df.apply(award_status, axis=1)
# df["artist_status"] = df.apply(artist_status, axis=1)

print(tracks_df.shape)
tracks_df.head()

(14996, 69)


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,duration_ms,...,artist_releases,artist_chart_months,artist_chart_months_recently,artist_releases_recently,genre,label_name,major_label,label_matched,chart_status,award_status
0,6stxV2bl7xSiPUornxwyNR,Khapta,En esprit,4YUEcV66XwylhSNJI51ABq,"Heuss L'enfoiré, Sofiane","[""3YwqjMyrRfuixi2pbgTGCE"", ""6qFt3TjvxMt77YGskt...",6,1,True,162906,...,39.0,0.0,0.0,36.0,rap francais,,1.0,1.0,0,0
1,62Pp6exz0ywSlBqvNqiY6Z,Anissa,Anissa,5xPYfWKHgxj7jARUJza5eM,Wejdene,"[""1SxuyHZnLUFyFHGzdGaxZk""]",1,1,False,175752,...,1.0,0.0,0.0,1.0,francoton,,1.0,1.0,0,0
2,4BS9MBPYKNFEvaC5S7I3fN,Lovesick (feat. Felix Snow),Lovesick (feat. Felix Snow),0QyeR5V23AwRHSgJr1jOvi,Caroline Pennell,"[""0cZPTEmf3mlwj5kjVXR4po"", ""1qskAseW7apRxJBpYA...",1,1,False,234693,...,8.0,0.0,0.0,8.0,indie poptimism,,,,0,0
3,09PGubKAMryhOWv1LHpCYz,"fuck, i'm lonely (with Anne-Marie)",~how i'm feeling~,6EgJXcGqaUvgZIF9bqPXfP,"Lauv, Anne-Marie","[""5JZ7CnR6gTvEMKX4g70Amv"", ""1zNqDE7qDGCsyzJwoh...",2,1,True,198973,...,32.0,1.0,1.0,26.0,uk pop,[no label],0.0,1.0,0,0
4,7IerPeiRJsJfVaPW73GmbS,Bao Tiền Một Mớ Bình Yên,Bao Tiền Một Mớ Bình Yên,3BMJwayvFzLSqYbImGfMnU,"14 Casper, Bon","[""0zo8kCJVktn7oPnzpkbC2p"", ""6hnfLIkvDl6pjlAe1Y...",1,1,False,300000,...,1.0,0.0,0.0,1.0,viet lo-fi,Viết Tân Studio,0.0,1.0,0,0


In [17]:
print(tracks_df[tracks_df["artist_matched"] == 0].shape)
print(tracks_df[tracks_df["label_matched"] != 1].shape)

(138, 69)
(8639, 69)


In [18]:
print(f"no. tracks: {len(tracks_df)}")
print(f"no. chart2000 songs: {len(song_chart_df)}")
print(f"no. spotify chart songs: {len(spotify_chart_df)}")
print(f"no. grammy songs: {len(recording_nominations_df)}")
print(f"no. tracks w/out features: {(tracks_df['audio_features'] == 0).sum()}")
# print(f"no. track features w/out nominations: {tracks_df['spotify_track_id'].isnull().sum()}")
print(f"no. tracks w/out chart position: {len(tracks_df[(tracks_df['track_chart_months'].isnull()) & (tracks_df['recording_award_nominee'] > 0)])}")
print(f"no. charted songs w/ nominations: {len(tracks_df[(~tracks_df['track_chart_months'].isnull()) & (tracks_df['recording_award_nominee'] > 0)])}")
print(f"no. songs w/out nomination or chart: {len(tracks_df[(~tracks_df['track_chart_months'].isnull()) & (tracks_df['recording_award_nominee'] == 0)])}")

tracks_df.to_csv(data_path / "tracks.csv.gz", compression="gzip", index=False)
tracks_df.to_parquet(data_path / "tracks.pq", index=False)

tracks_df.groupby(["chart_status", "award_status"]).count()

no. tracks: 14996
no. chart2000 songs: 12900
no. spotify chart songs: 15000
no. grammy songs: 580
no. tracks w/out features: 1
no. tracks w/out chart position: 0
no. charted songs w/ nominations: 457
no. songs w/out nomination or chart: 14539


Unnamed: 0_level_0,Unnamed: 1_level_0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,duration_ms,...,artist_nominations,artist_wins,artist_releases,artist_chart_months,artist_chart_months_recently,artist_releases_recently,genre,label_name,major_label,label_matched
chart_status,award_status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,11555,11555,11555,11555,11555,11555,11555,11555,11555,11555,...,11555,11555,11555,11555,11555,11555,11555,5030,6357,6357
0,1,211,211,211,211,211,211,211,211,211,211,...,211,211,211,211,211,211,211,0,0,0
0,2,35,35,35,35,35,35,35,35,35,35,...,35,35,35,35,35,35,35,0,0,0
1,0,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,...,1464,1464,1464,1464,1464,1464,1464,0,0,0
1,1,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,0,0,0
1,2,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,0,0,0
2,0,1520,1520,1520,1520,1520,1520,1520,1520,1520,1520,...,1520,1520,1520,1520,1520,1520,1520,0,0,0
2,1,137,137,137,137,137,137,137,137,137,137,...,137,137,137,137,137,137,137,0,0,0
2,2,27,27,27,27,27,27,27,27,27,27,...,27,27,27,27,27,27,27,0,0,0


In [32]:
tracks_df[["track_chart_months", "recording_award_nominee"]][(~tracks_df['track_chart_months'].isnull()) & (tracks_df['recording_award_nominee'] == 0)]

Unnamed: 0,track_chart_months,recording_award_nominee
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
...,...,...
16152,0.0,0.0
16153,0.0,0.0
16154,0.0,0.0
16155,0.0,0.0


In [None]:
track_features_full_df = df[[
    'id',
    'name',
    'album',
    'album_id',
    'artists',
    'artist_ids',
    'track_number',
    'disc_number',
    'explicit',
    'duration_ms',
    'year',
    'release_date',
    'popularity',
    'isrc',
    # spotify audio features
    'audio_features', # has audio features
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
    # artist
    'artist_genre',
    'artist_gender',
    'artist_country',
    'artist_hometown',
    'artist_begin',
    'artist_total_releases'
    # charts2000
    'months', # num months on charts
    'indicativerevenue',
    'position_min',
    'position_mean',
    'position_median',
    # grammy award result for this song
    'recording_award_category',
    'recording_award_nominee',
    'recording_award_winner',
    # grammy award result for this song
    'album_award_category',
    'album_award_nominee',
    'album_award_winner',
    # grammy nomination history for artist
    'artist_total_nominations',
    'artist_total_wins',
    'artist_first_nomination',
    'artist_first_win',
    'artist_previous_nominations',
    'artist_previous_wins',
    # quick categories
    'chart_status',
    'award_status',
    'artist_status'
]]

track_features_full_df.to_csv("../../data/02-track_features_balanced_wide.csv.gz", compression="gzip", index=False)
track_features_full_df.to_parquet("../../data/02-track_features_balanced_wide.pq", index=False)

track_features_slim_df = df[[
    'id',
    'name',
    'album',
    'album_id',
    'artists',
    'artist_ids',
    'track_number',
    'disc_number',
    'explicit',
    'duration_ms',
    'year',
    'release_date',
    'popularity',
    # spotify audio features
    'audio_features', # has audio features
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
    # artist
    'artist_genre',
    'artist_gender',
    'artist_country',
    'artist_hometown',
    'artist_begin',
    'artist_total_releases',
    # quick categories
    'chart_status',
    'award_status',
    'artist_status'
]]

track_features_slim_df.to_csv("../../data/02-track_features_balanced.csv.gz", compression="gzip", index=False)
track_features_slim_df.to_parquet("../../data/02-track_features_balanced.pq", index=False)

In [14]:
track_df = pd.read_parquet(data_path / "tracks.pq")
print(track_df[track_df["artist_matched"] == 0].shape)
print(track_df[track_df["label_matched"] == 0].shape)

(138, 69)
(4768, 69)


In [35]:
tracks_df = pd.read_parquet(data_path / "tracks.pq")
print(tracks_df[tracks_df["artist_matched"] == 0].shape)
print(tracks_df[tracks_df["label_matched"] == 0].shape)

(138, 69)
(4768, 69)
