In [4]:
import json
import itertools
import numpy as np
import os
from time import sleep
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from spotipy import Spotify, SpotifyClientCredentials

from util import mbz

load_dotenv()

mbz.set_useragent("music-mining-labels-parents", "0.3.1", "pezon@uchicago.edu")
spotify = Spotify(client_credentials_manager=SpotifyClientCredentials(),
                  requests_timeout=10, retries=3)

diff_dataset = True
data_path = Path("../../data").resolve()
label_path = Path("../../data/labels").resolve()
os.makedirs(label_path, exist_ok=True)

In [5]:
track_df = pd.read_parquet(data_path / "tracks.pq")
if diff_dataset:
    track_df = track_df[track_df["label_matched"] != 1]

album_df = (
    track_df[["album_id", "album", "artists"]]
        .drop_duplicates(["album_id"])
        .rename(columns={"album_id": "id"})
)

print(album_df.shape)
album_df.head()

(6503, 3)


Unnamed: 0,id,album,artists
2,0QyeR5V23AwRHSgJr1jOvi,Lovesick (feat. Felix Snow),Caroline Pennell
5,6kZ42qRrzov54LcAk4onW9,Red (Taylor's Version),Taylor Swift
7,4nNZ5UJCzhlfJbip0SDLI1,Portraits,Maribou State
8,2lZttozROJsM5KbD5gqSet,Downtown Church,"Patty Griffin, Emmylou Harris"
10,2upw5IrzeqKApIQZyx5o6r,Beam Me Up Scotty,"Nicki Minaj, Drake, Lil Wayne"


In [7]:
def search_label(album_name, artist_name):
    releases = mbz.search_releases(f"album: {album_name} artist: {artist_name}")
    #print(album_name, artist_name)
    sleep(1)
    if "release-list" not in releases:
        return
    for i in range(0, 3):
        try:
            release = releases["release-list"][i]
        except IndexError as err:
            break
        if "label-info-list" not in release:
            break
        for label in release["label-info-list"]:
            if "label" in label:
                yield label["label"]
            # break


def with_labels(df, album_key="album"):
    """
    Add Spotify ids and metadata to dataframe with artist data
    As ids and metadata are added to multiple rows at a time.
    Make sure to skip rows that already have id and metadata.
    """
    df["labels"] = np.nan

    for index, record in df.iterrows():
        # search for album on mbz
        # artists = json.loads(record["artists"])
        artists = record["artists"].split(", ")
        labels = search_label(record["album"], artists[0])
        df.loc[(df["id"] == record["id"]), "labels"] = json.dumps(list(labels))

        if index % 25 == 0:
            total_albums = len(df)
            null_albums = df["labels"].isnull().sum()
            print(f"status: {100 - null_albums / total_albums * 100:.2f} "
                  f"({total_albums - null_albums} / {null_albums})")
    return df


album_df_with_labels = with_labels(album_df)
#album_df_with_labels.to_parquet(label_path / "albums_with_labels.pq", index=False)
print(album_df_with_labels.shape)
album_df_with_labels.head()

status: 0.17 (11 / 6492)
status: 0.58 (38 / 6465)
status: 1.29 (84 / 6419)
status: 1.52 (99 / 6404)
status: 1.78 (116 / 6387)
status: 2.35 (153 / 6350)
status: 2.55 (166 / 6337)
status: 2.78 (181 / 6322)
status: 2.98 (194 / 6309)
status: 3.17 (206 / 6297)
status: 3.40 (221 / 6282)
status: 4.11 (267 / 6236)
status: 5.00 (325 / 6178)
status: 5.37 (349 / 6154)
status: 5.58 (363 / 6140)
status: 6.49 (422 / 6081)
status: 7.17 (466 / 6037)
status: 7.35 (478 / 6025)
status: 8.80 (572 / 5931)
status: 8.95 (582 / 5921)
status: 9.63 (626 / 5877)
status: 9.80 (637 / 5866)
status: 9.98 (649 / 5854)
status: 10.80 (702 / 5801)
status: 11.24 (731 / 5772)
status: 11.46 (745 / 5758)
status: 11.67 (759 / 5744)
status: 11.86 (771 / 5732)
status: 12.06 (784 / 5719)
status: 12.66 (823 / 5680)
status: 12.79 (832 / 5671)
status: 13.15 (855 / 5648)
status: 13.35 (868 / 5635)
status: 13.55 (881 / 5622)
status: 13.81 (898 / 5605)
status: 14.21 (924 / 5579)
status: 14.42 (938 / 5565)
status: 14.67 (954 / 5549)
s

Unnamed: 0,id,album,artists,labels
2,0QyeR5V23AwRHSgJr1jOvi,Lovesick (feat. Felix Snow),Caroline Pennell,"[{""id"": ""909a6c83-c3cf-449c-a284-60f913b96b40""..."
5,6kZ42qRrzov54LcAk4onW9,Red (Taylor's Version),Taylor Swift,"[{""id"": ""1a917e6f-54f5-4964-bebf-5d4e2442ceb4""..."
7,4nNZ5UJCzhlfJbip0SDLI1,Portraits,Maribou State,"[{""id"": ""9b27b429-18bf-4e8f-b9d0-750e78964e0d""..."
8,2lZttozROJsM5KbD5gqSet,Downtown Church,"Patty Griffin, Emmylou Harris","[{""id"": ""5ecd57fa-e156-453e-94c9-3c8f1832b3ec""..."
10,2upw5IrzeqKApIQZyx5o6r,Beam Me Up Scotty,"Nicki Minaj, Drake, Lil Wayne","[{""id"": ""e0ecd909-0477-485f-80dc-3c27ea4837ca""..."


In [8]:
def is_major_label(labels):
    return any([
        1 if "Sony" in l["name"]
             or "Universal" in l["name"]
             or "Warner" in l["name"]
             or "EMI" in l["name"]
             or "Disney" in l["name"]
             or "Capitol" in l["name"]
             or "Atlantic" in l["name"]
             or "Interscope" in l["name"]
             or "RCA" in l["name"]
             or "MCA" in l["name"]
             or "UMG" in l["name"]
             or "Deutsche Grammophon" in l["name"]
        else 0
        for l in labels
    ])


def fetch_parent_label(label_id):
    label = mbz.get_label_by_id(label_id, includes=["label-rels"])["label"]
    sleep(1)
    if "label-relation-list" not in label:
        return []
    labels_ = []
    for ll in label["label-relation-list"]:
        if ll["direction"] == "forward": #\
            #or ll["label"].get("type") != "Holding":
            continue
        labels_.append({
            "id": ll["label"]["id"],
            "name": ll["label"]["name"],
            "rel_type": ll.get("type"),
            "label_type": ll["label"].get("type"),
        })
    return labels_


def with_parent_labels(df):
    df["parent_labels"] = np.nan
    df["parent_holding"] = np.nan
    df["parent_distrib"] = np.nan

    for index, record in df.iterrows():
        parent_labels = []
        parent_holding = []
        parent_distrib = []
        major_holding = False
        major_distrib = False
        label = record["labels"]
        if not isinstance(label, dict):
            continue
        if "id" in label:
            parent_labels += fetch_parent_label(record["label_id"])
            for parent_label in parent_labels:
                if parent_label["label_type"] == "Holding":
                    parent_holding.append(parent_label)
                if parent_label["label_type"] == "Distributor":
                    parent_distrib.append(parent_label)
            if is_major_label(parent_holding):
                major_holding = True
            if is_major_label(parent_distrib):
                major_distrib = True

        df.loc[(df["label_id"] == record["label_id"]), "parent_labels"] = json.dumps(list(parent_labels))
        df.loc[(df["label_id"] == record["label_id"]), "parent_holding"] = json.dumps(list(parent_holding))
        df.loc[(df["label_id"] == record["label_id"]), "parent_distrib"] = json.dumps(list(parent_distrib))
        df.loc[(df["label_id"] == record["label_id"]), "major_holding"] = major_holding
        df.loc[(df["label_id"] == record["label_id"]), "major_distrib"] = major_distrib
        df.loc[(df["label_id"] == record["label_id"]), "label_name"] = label["name"]
        if len(parent_holding) > 0:
            df.loc[(df["label_id"] == record["label_id"]), "parent_holding"] = parent_holding[0]["name"]
        else:
            df.loc[(df["label_id"] == record["label_id"]), "parent_holding"] = np.nan
        if len(parent_distrib) > 0:
            df.loc[(df["label_id"] == record["label_id"]), "parent_distrib"] = parent_distrib[0]["name"]
        else:
            df.loc[(df["label_id"] == record["label_id"]), "parent_holding"] = np.nan
        df.loc[(df["label_id"] == record["label_id"]), "major_label"] = int(major_holding or major_distrib or is_major_label([label]))

        total_albums = len(df)
        null_albums = df["parent_labels"].isnull().sum()
        print(f"status: {100 - null_albums / total_albums * 100:.2f} "
              f"({total_albums - null_albums} / {total_albums})")

    return df

#albums_with_labels_df = pd.read_parquet(label_path / "albums_with_labels.pq")
album_df_with_labels["labels"] = album_df_with_labels["labels"].apply(json.loads)
album_df_with_labels = album_df_with_labels.explode("labels")
album_df_with_labels["label_id"] = album_df_with_labels["labels"].apply(lambda l: l.get("id") if isinstance(l, dict) else None)
labels_df = album_df_with_labels[["label_id", "labels"]].drop_duplicates("label_id")
print(labels_df.shape)
labels_df = with_parent_labels(labels_df)
# labels_df.to_parquet(label_path / "labels.pq", index=False)
print(labels_df.shape)
labels_df.head()

(3578, 2)
status: 0.03 (1 / 3578)
status: 0.06 (2 / 3578)
status: 0.08 (3 / 3578)
status: 0.11 (4 / 3578)
status: 0.14 (5 / 3578)
status: 0.17 (6 / 3578)
status: 0.20 (7 / 3578)
status: 0.22 (8 / 3578)
status: 0.25 (9 / 3578)
status: 0.28 (10 / 3578)
status: 0.31 (11 / 3578)
status: 0.34 (12 / 3578)
status: 0.36 (13 / 3578)
status: 0.39 (14 / 3578)
status: 0.42 (15 / 3578)
status: 0.45 (16 / 3578)
status: 0.48 (17 / 3578)
status: 0.50 (18 / 3578)
status: 0.53 (19 / 3578)
status: 0.56 (20 / 3578)
status: 0.59 (21 / 3578)
status: 0.61 (22 / 3578)
status: 0.64 (23 / 3578)
status: 0.67 (24 / 3578)
status: 0.70 (25 / 3578)
status: 0.73 (26 / 3578)
status: 0.75 (27 / 3578)
status: 0.78 (28 / 3578)
status: 0.81 (29 / 3578)
status: 0.84 (30 / 3578)
status: 0.87 (31 / 3578)
status: 0.89 (32 / 3578)
status: 0.92 (33 / 3578)
status: 0.95 (34 / 3578)
status: 0.98 (35 / 3578)
status: 1.01 (36 / 3578)
status: 1.03 (37 / 3578)
status: 1.06 (38 / 3578)
status: 1.09 (39 / 3578)
status: 1.12 (40 / 3578)

Unnamed: 0,label_id,labels,parent_labels,parent_holding,parent_distrib,major_holding,major_distrib,label_name,major_label
2,909a6c83-c3cf-449c-a284-60f913b96b40,"{'id': '909a6c83-c3cf-449c-a284-60f913b96b40',...","[{""id"": ""3ccd1270-99d3-4b83-a311-6c60e127c866""...",,[],False,False,[PIAS] America,0.0
2,5a584032-dcef-41bb-9f8b-19540116fb1c,"{'id': '5a584032-dcef-41bb-9f8b-19540116fb1c',...","[{""id"": ""840502e1-7792-464c-9924-74de621432dd""...",,[],True,False,Deutsche Grammophon,1.0
5,1a917e6f-54f5-4964-bebf-5d4e2442ceb4,"{'id': '1a917e6f-54f5-4964-bebf-5d4e2442ceb4',...","[{""id"": ""beac7743-7296-47ce-b0eb-58b6d63841b4""...",,Universal Music Distribution,False,True,Big Machine Records,1.0
7,9b27b429-18bf-4e8f-b9d0-750e78964e0d,"{'id': '9b27b429-18bf-4e8f-b9d0-750e78964e0d',...","[{""id"": ""e6622117-b54b-43c1-9ac2-5343002af419""...",,[],False,False,fabric,0.0
8,5ecd57fa-e156-453e-94c9-3c8f1832b3ec,"{'id': '5ecd57fa-e156-453e-94c9-3c8f1832b3ec',...",[],,[],False,False,Thirty Tigers,0.0


In [None]:
def with_annotated_labels(album_df, label_df):
    album_df["labels"] = album_df["labels"].apply(json.loads)
    album_df = album_df.explode("labels")
    album_df["label_id"] = album_df["labels"].apply(lambda l: l.get("id") if isinstance(l, dict) else None)
    df = pd.merge(album_df[["id", "label_id"]], label_df, on="label_id", how="left")
    album_null = {
        "id": None,
        "label_id": np.nan,
        "parent_holding": np.nan,
        "parent_distrib": np.nan,
        "major_holding": 0,
        "major_distrib": 0,
        "major_label": 0,
    }
    albums = []
    album = album_null.copy()
    for index, record in df.iterrows():
        if record["id"] != album["id"]:
            if album["id"] is not None:
                albums.append(album)
            album = record.copy()
        if record["major_distrib"]:
            album["major_label"] = 1
            album["parent_distrib"] = record["parent_distrib"]
            album["major_distrib"] = record["major_distrib"]
            album["label_name"] = record["label_name"]
            album["label_id"] = record["label_id"]
        if record["major_holding"]:
            album["major_label"] = 1
            album["parent_holding"] = record["parent_holding"]
            album["major_holding"] = record["major_holding"]
            album["label_name"] = record["label_name"]
            album["label_id"] = record["label_id"]
        if record["major_label"]:
            album["major_label"] = 1
            album["label_name"] = record["label_name"]
            album["label_name"] = record["label_name"]
            album["label_id"] = record["label_id"]
    df = pd.DataFrame(albums)[["id", "label_name", "major_label"]].drop_duplicates("id")
    df["major_label"] = df["major_label"].fillna(0)
    df = df.rename(columns={"id": "album_id"})
    return df


#labels_df = pd.read_parquet(label_path / "labels.pq")
albums_with_labels_df = pd.read_parquet(label_path / "albums_with_labels.pq")
albums_with_annotated_labels_df = with_annotated_labels(albums_with_labels_df, labels_df)
albums_with_annotated_labels_df.to_parquet(label_path / "albums_with_annotated_labels.pq", index=False)
print(albums_with_annotated_labels_df.shape)
albums_with_annotated_labels_df.head()

In [45]:
albums_with_annotated_labels_df[albums_with_annotated_labels_df["major_label"] == 1]

Unnamed: 0,album_id,label_name,major_label
0,3JE1v19SrHzxmZAbf8iHkZ,Deutsche Grammophon,1.0
4,7o6j8wph7fvEcAL67jLVGN,,1.0
5,3tBkjgxDqAwss76O1YHsSY,Interscope Records,1.0
12,0n4P6BsuT61HgsKExU0i1R,RCA Victor,1.0
18,6pFZVwBPXj3m4dyaKcnVev,Sony Music | Latin,1.0
...,...,...,...
14464,13bddABWHCEtgRwGWmEiAT,Atlantic Records Russia,1.0
14471,2kBN5t0JhiAamaDgFGK9Tu,American Recordings,1.0
14474,6Beyik0c73t1KWk0aXp1cW,,1.0
14475,6Bks244AVHynPDeT2t0jOb,,1.0
