In [1]:
import json
import itertools
import numpy as np
import os
from time import sleep
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from spotipy import Spotify, SpotifyClientCredentials

from util import mbz

load_dotenv()

mbz.set_useragent("music-mining-labels-parents", "0.3.1", "pezon@uchicago.edu")
spotify = Spotify(client_credentials_manager=SpotifyClientCredentials(),
                  requests_timeout=10, retries=3)

diff_dataset = True
data_path = Path("../../data").resolve()
label_path = Path("../../data/labels").resolve()
os.makedirs(label_path, exist_ok=True)

In [53]:
track_df = pd.read_parquet(data_path / "tracks.pq")
if diff_dataset:
    track_df = track_df[track_df["labels_matched"] == 0]

album_df = (
    track_df[["album_id", "album", "artists"]]
        .drop_duplicates(["album_id"])
        .rename(columns={"album_id": "id"})
)

print(album_df.shape)
album_df.head()

(6077, 3)


Unnamed: 0,id,album,artists
3,3JE1v19SrHzxmZAbf8iHkZ,BO4L,"[""KESI"", ""Hennedub""]"
5,7o6j8wph7fvEcAL67jLVGN,I'll Be There,"[""Jess Glynne""]"
7,3tBkjgxDqAwss76O1YHsSY,Lose You To Love Me,"[""Selena Gomez""]"
8,7JtT7OyWM8BnIS5FXXPMKg,Emmanuel,"[""Anuel AA"", ""Bad Bunny""]"
9,0n4P6BsuT61HgsKExU0i1R,Jeannine,"[""Lomepal"", ""Rom\u00e9o Elvis""]"


In [61]:
def search_label(album_name, artist_name):
    releases = mbz.search_releases(f"album: {album_name} artist: {artist_name}")
    #print(album_name, artist_name)
    sleep(1)
    if "release-list" not in releases:
        return
    for i in range(0, 3):
        try:
            release = releases["release-list"][i]
        except IndexError as err:
            break
        if "label-info-list" not in release:
            break
        for label in release["label-info-list"]:
            if "label" in label:
                yield label["label"]
            # break


def with_labels(df, album_key="album"):
    """
    Add Spotify ids and metadata to dataframe with artist data
    As ids and metadata are added to multiple rows at a time.
    Make sure to skip rows that already have id and metadata.
    """
    df["labels"] = np.nan

    for index, record in df.iterrows():
        # search for album on mbz
        artists = json.loads(record["artists"])
        labels = search_label(record["album"], artists[0])
        df.loc[(df["id"] == record["id"]), "labels"] = json.dumps(list(labels))

        if index % 25 == 0:
            total_albums = len(df)
            null_albums = df["labels"].isnull().sum()
            print(f"status: {100 - null_albums / total_albums * 100:.2f} "
                  f"({total_albums - null_albums} / {null_albums})")
    return df


album_df_with_labels = with_labels(album_df)
album_df_with_labels.to_parquet(label_path / "albums_with_labels.pq", index=False)
print(album_df_with_labels.shape)
album_df_with_labels.head()

status: 0.77 (47 / 6030)
status: 1.66 (101 / 5976)
status: 1.96 (119 / 5958)
status: 2.42 (147 / 5930)
status: 2.73 (166 / 5911)
status: 3.06 (186 / 5891)
status: 3.26 (198 / 5879)
status: 3.51 (213 / 5864)
status: 3.75 (228 / 5849)
status: 3.97 (241 / 5836)
status: 4.16 (253 / 5824)
status: 4.69 (285 / 5792)
status: 5.20 (316 / 5761)
status: 5.92 (360 / 5717)
status: 6.15 (374 / 5703)
status: 6.40 (389 / 5688)
status: 6.65 (404 / 5673)
status: 6.85 (416 / 5661)
status: 7.06 (429 / 5648)
status: 7.27 (442 / 5635)
status: 7.80 (474 / 5603)
status: 8.28 (503 / 5574)
status: 8.72 (530 / 5547)
status: 9.94 (604 / 5473)
status: 10.93 (664 / 5413)
status: 11.14 (677 / 5400)
status: 11.65 (708 / 5369)
status: 11.90 (723 / 5354)
status: 12.54 (762 / 5315)
status: 12.69 (771 / 5306)
status: 12.90 (784 / 5293)
status: 13.31 (809 / 5268)
status: 13.72 (834 / 5243)
status: 13.92 (846 / 5231)
status: 14.20 (863 / 5214)
status: 14.46 (879 / 5198)
status: 14.68 (892 / 5185)
status: 15.24 (926 / 5151)

TypeError: __cinit__() got an unexpected keyword argument 'keep_index'

In [2]:
def is_major_label(labels):
    return any([
        1 if "Sony" in l["name"]
             or "Universal" in l["name"]
             or "Warner" in l["name"]
             or "EMI" in l["name"]
             or "Disney" in l["name"]
             or "Capitol" in l["name"]
             or "Atlantic" in l["name"]
             or "Interscope" in l["name"]
             or "RCA" in l["name"]
             or "MCA" in l["name"]
             or "UMG" in l["name"]
             or "Deutsche Grammophon" in l["name"]
        else 0
        for l in labels
    ])


def fetch_parent_label(label_id):
    label = mbz.get_label_by_id(label_id, includes=["label-rels"])["label"]
    sleep(1)
    if "label-relation-list" not in label:
        return []
    labels_ = []
    for ll in label["label-relation-list"]:
        if ll["direction"] == "forward": #\
            #or ll["label"].get("type") != "Holding":
            continue
        labels_.append({
            "id": ll["label"]["id"],
            "name": ll["label"]["name"],
            "rel_type": ll.get("type"),
            "label_type": ll["label"].get("type"),
        })
    return labels_


def with_parent_labels(df):
    df["parent_labels"] = np.nan
    df["parent_holding"] = np.nan
    df["parent_distrib"] = np.nan

    for index, record in df.iterrows():
        parent_labels = []
        parent_holding = []
        parent_distrib = []
        major_holding = False
        major_distrib = False
        label = record["labels"]
        if not isinstance(label, dict):
            continue
        if "id" in label:
            parent_labels += fetch_parent_label(record["label_id"])
            for parent_label in parent_labels:
                if parent_label["label_type"] == "Holding":
                    parent_holding.append(parent_label)
                if parent_label["label_type"] == "Distributor":
                    parent_distrib.append(parent_label)
            if is_major_label(parent_holding):
                major_holding = True
            if is_major_label(parent_distrib):
                major_distrib = True

        df.loc[(df["label_id"] == record["label_id"]), "parent_labels"] = json.dumps(list(parent_labels))
        df.loc[(df["label_id"] == record["label_id"]), "parent_holding"] = json.dumps(list(parent_holding))
        df.loc[(df["label_id"] == record["label_id"]), "parent_distrib"] = json.dumps(list(parent_distrib))
        df.loc[(df["label_id"] == record["label_id"]), "major_holding"] = major_holding
        df.loc[(df["label_id"] == record["label_id"]), "major_distrib"] = major_distrib
        df.loc[(df["label_id"] == record["label_id"]), "label_name"] = label["name"]
        if len(parent_holding) > 0:
            df.loc[(df["label_id"] == record["label_id"]), "parent_holding"] = parent_holding[0]["name"]
        else:
            df.loc[(df["label_id"] == record["label_id"]), "parent_holding"] = np.nan
        if len(parent_distrib) > 0:
            df.loc[(df["label_id"] == record["label_id"]), "parent_distrib"] = parent_distrib[0]["name"]
        else:
            df.loc[(df["label_id"] == record["label_id"]), "parent_holding"] = np.nan
        df.loc[(df["label_id"] == record["label_id"]), "major_label"] = int(major_holding or major_distrib or is_major_label([label]))

        total_albums = len(df)
        null_albums = df["parent_labels"].isnull().sum()
        print(f"status: {100 - null_albums / total_albums * 100:.2f} "
              f"({total_albums - null_albums} / {total_albums})")

    return df

albums_with_labels_df = pd.read_parquet(label_path / "albums_with_labels.pq")
albums_with_labels_df["labels"] = albums_with_labels_df["labels"].apply(json.loads)
albums_with_labels_df = albums_with_labels_df.explode("labels")
albums_with_labels_df["label_id"] = albums_with_labels_df["labels"].apply(lambda l: l.get("id") if isinstance(l, dict) else None)
labels_df = albums_with_labels_df[["label_id", "labels"]].drop_duplicates("label_id")
print(labels_df.shape)
labels_df = with_parent_labels(labels_df)
labels_df.to_parquet(label_path / "labels.pq", index=False)
print(labels_df.shape)
labels_df.head()

status: 0.03 (1 / 3788)
status: 0.05 (2 / 3788)
status: 0.08 (3 / 3788)
status: 0.11 (4 / 3788)
status: 0.13 (5 / 3788)
status: 0.16 (6 / 3788)
status: 0.18 (7 / 3788)
status: 0.21 (8 / 3788)
status: 0.24 (9 / 3788)
status: 0.26 (10 / 3788)
status: 0.29 (11 / 3788)
status: 0.32 (12 / 3788)
status: 0.34 (13 / 3788)
status: 0.37 (14 / 3788)
status: 0.40 (15 / 3788)
status: 0.42 (16 / 3788)
status: 0.45 (17 / 3788)
status: 0.48 (18 / 3788)
status: 0.50 (19 / 3788)
status: 0.53 (20 / 3788)
status: 0.55 (21 / 3788)
status: 0.58 (22 / 3788)
status: 0.61 (23 / 3788)
status: 0.63 (24 / 3788)
status: 0.66 (25 / 3788)
status: 0.69 (26 / 3788)
status: 0.71 (27 / 3788)
status: 0.74 (28 / 3788)
status: 0.77 (29 / 3788)
status: 0.79 (30 / 3788)
status: 0.82 (31 / 3788)
status: 0.84 (32 / 3788)
status: 0.87 (33 / 3788)
status: 0.90 (34 / 3788)
status: 0.92 (35 / 3788)
status: 0.95 (36 / 3788)
status: 0.98 (37 / 3788)
status: 1.00 (38 / 3788)
status: 1.03 (39 / 3788)
status: 1.06 (40 / 3788)
status: 1

KeyError: "['album_id'] not in index"

In [44]:
def with_annotated_labels(album_df, label_df):
    album_df["labels"] = album_df["labels"].apply(json.loads)
    album_df = album_df.explode("labels")
    album_df["label_id"] = album_df["labels"].apply(lambda l: l.get("id") if isinstance(l, dict) else None)
    df = pd.merge(album_df[["id", "label_id"]], label_df, on="label_id", how="left")
    album_null = {
        "id": None,
        "label_id": np.nan,
        "parent_holding": np.nan,
        "parent_distrib": np.nan,
        "major_holding": 0,
        "major_distrib": 0,
        "major_label": 0,
    }
    albums = []
    album = album_null.copy()
    for index, record in df.iterrows():
        if record["id"] != album["id"]:
            if album["id"] is not None:
                albums.append(album)
            album = record.copy()
        if record["major_distrib"]:
            album["major_label"] = 1
            album["parent_distrib"] = record["parent_distrib"]
            album["major_distrib"] = record["major_distrib"]
            album["label_name"] = record["label_name"]
            album["label_id"] = record["label_id"]
        if record["major_holding"]:
            album["major_label"] = 1
            album["parent_holding"] = record["parent_holding"]
            album["major_holding"] = record["major_holding"]
            album["label_name"] = record["label_name"]
            album["label_id"] = record["label_id"]
        if record["major_label"]:
            album["major_label"] = 1
            album["label_name"] = record["label_name"]
            album["label_name"] = record["label_name"]
            album["label_id"] = record["label_id"]
    df = pd.DataFrame(albums)[["id", "label_name", "major_label"]].drop_duplicates("id")
    df["major_label"] = df["major_label"].fillna(0)
    df = df.rename(columns={"id": "album_id"})
    return df


labels_df = pd.read_parquet(label_path / "labels.pq")
albums_with_labels_df = pd.read_parquet(label_path / "albums_with_labels.pq")
albums_with_annotated_labels_df = with_annotated_labels(albums_with_labels_df, labels_df)
albums_with_annotated_labels_df.to_parquet(label_path / "albums_with_annotated_labels.pq", index=False)
print(albums_with_annotated_labels_df.shape)
albums_with_annotated_labels_df.head()

(6076, 3)


Unnamed: 0,album_id,label_name,major_label
0,3JE1v19SrHzxmZAbf8iHkZ,Deutsche Grammophon,1.0
4,7o6j8wph7fvEcAL67jLVGN,,1.0
5,3tBkjgxDqAwss76O1YHsSY,Interscope Records,1.0
9,7JtT7OyWM8BnIS5FXXPMKg,Pain dans le Cul,0.0
12,0n4P6BsuT61HgsKExU0i1R,RCA Victor,1.0


In [45]:
albums_with_annotated_labels_df[albums_with_annotated_labels_df["major_label"] == 1]

Unnamed: 0,album_id,label_name,major_label
0,3JE1v19SrHzxmZAbf8iHkZ,Deutsche Grammophon,1.0
4,7o6j8wph7fvEcAL67jLVGN,,1.0
5,3tBkjgxDqAwss76O1YHsSY,Interscope Records,1.0
12,0n4P6BsuT61HgsKExU0i1R,RCA Victor,1.0
18,6pFZVwBPXj3m4dyaKcnVev,Sony Music | Latin,1.0
...,...,...,...
14464,13bddABWHCEtgRwGWmEiAT,Atlantic Records Russia,1.0
14471,2kBN5t0JhiAamaDgFGK9Tu,American Recordings,1.0
14474,6Beyik0c73t1KWk0aXp1cW,,1.0
14475,6Bks244AVHynPDeT2t0jOb,,1.0
