In [70]:
import itertools
import json
import os
from pathlib import Path
from time import time, sleep

import pandas as pd
import numpy as np
from slugify import slugify
from util import mbz, spotify, filter_by_artist

diff_dataset = False

data_path = Path("../../data").resolve()
os.makedirs(data_path, exist_ok=True)

mbz.set_useragent("music-mining-artists", 0.3, "pezon@uchicago.edu")

In [71]:
def extract_artist_ids(df, artist_ids_key="artist_ids"):
    return df[artist_ids_key].apply(json.loads).explode(artist_ids_key)


df = pd.read_parquet(data_path / "tracks.pq")
if diff_dataset:
    df = df[df["artist_matched"] == 0]

artist_ids = list(set(extract_artist_ids(df).tolist()))
len(artist_ids)

6914

In [73]:
def fetch_spotify_metadata(artist_ids):
    artists = []
    for i in range(0, len(artist_ids), 50):
        chunk = artist_ids[i:i + 50]
        artists.append(
            {
                "id": artist["id"],
                "name": artist["name"],
                "popularity": artist["popularity"],
                "genres": artist["genres"],
            }
            for artist in spotify.artists(chunk)["artists"]
            if "Karaoke" not in artist["name"]
        )
        sleep(0.25)

        # logging status to console
        total_artists = len(artist_ids)
        fetched_artists = len(artists)
        if fetched_artists % 500 == 0:
            print(f"status: {fetched_artists / total_artists * 100:.2f} "
                  f"({fetched_artists} / {total_artists})")
    return pd.DataFrame(itertools.chain(*artists))


artist_df = fetch_spotify_metadata(artist_ids)
print(artist_df.shape)
artist_df.head()

(6870, 4)


Unnamed: 0,id,name,popularity,genres
0,40Ojab0UtVQFjA76qXr8Ot,Apparat,55,"[ambient pop, electronica, glitch, microhouse,..."
1,4cdse9xf7xbAOParaySF6u,Holy Grail,17,[]
2,3iri9nBFs9e4wN7PLIetAw,Garrett Nash,70,"[electropop, pop, pop rap, social media pop]"
3,12CpR4SNDzVIlDoPSeNFeW,Shashaa Tirupati,71,[filmi]
4,1oZePnbK6Qe8NewfhLhJWF,Cromo,37,"[italian hip hop, rap genovese]"


In [74]:
def fetch_release_years(artist_id):
    albums_resp = spotify.artist_albums(artist_id, limit=50)
    sleep(0.20)
    album_chunks = albums_resp["items"]
    album_years = []
    album_names = []
    for i in range(0, albums_resp["total"], 50):
        album_chunks += spotify.artist_albums(artist_id, limit=50, offset=(i + 1) * 50)["items"]
        sleep(0.20)
        for album in itertools.chain(album_chunks):
            if album["album_type"] == "compilation"\
                    or "Deluxe" in album["name"]\
                    or "Remaster" in album["name"]\
                    or "Karaoke" in album["name"]\
                    or "Acappella" in album["name"]\
                    or "Version" in album["name"]\
                    or ("Remix" in album["name"] and album["album_type"] == "single")\
                    or slugify(album["name"]) in album_names:
                continue
            album_names.append(slugify(album["name"]))
            album_years.append(album["release_date"][0:4])
    return album_years


def with_release_years(artist_df):
    artist_df["release_years"] = artist_df["id"].apply(fetch_release_years)
    return artist_df


artist_df = with_release_years(artist_df)
print(artist_df.shape)
artist_df.head()

KeyboardInterrupt: 

In [None]:
def with_release_year_stats(artist_df):
    df_ = artist_df.copy()
    df_["first_release"] = df_["release_years"].apply(lambda years: min(years) if len(years) > 0 else None)
    df_["last_release"] = df_["release_years"].apply(lambda years: max(years) if len(years) > 0 else None)
    df_["total_releases"] = df_["release_years"].apply(lambda years: len(years) if len(years) > 0 else None)
    return df_


artist_df = with_release_year_stats(artist_df)
print(artist_df.shape)
artist_df.head()

In [None]:
def with_release_years_csum_columns(artist_df):
    df_ = artist_df.explode("release_years")
    df_ = df_[["id", "release_years"]]
    df_["csum"] = 1
    df_ = (
        df_.groupby(["id", "release_years"]).sum()
            .groupby(level=0).cumsum()
            .reset_index()
    )
    start_year = max(1975, int(min(df_["release_years"])))
    end_year = int(max(df_["release_years"]))
    df_ = (
        df_.pivot(index="id", columns="release_years", values="csum")
            .fillna(0)
            .reset_index()
            .rename_axis(None, axis=1)
            #.set_index("id")
    )
    columns = ["id"]
    for year in range(start_year, end_year):
        columns.append(str(f"releases_{year}"))
        if str(year) not in df_.columns:
            df_[str(year)] = 0
        if str(year + 1) not in df_.columns:
            df_[str(year + 1)] = 0
        df_.loc[df_[str(year + 1)] == 0, str(year + 1)] = df_[str(year)]
        df_[f"releases_{year}"] = df_[str(year)]
    df_ = df_.fillna(0)
    df_ = pd.merge(artist_df, df_[columns], on="id", how="left")
    return df_

drop_columns = [col for col in artist_df.columns if col.startswith("releases_")]
artist_df = artist_df.drop(columns=drop_columns)
artist_df = with_release_years_csum_columns(artist_df)
print(artist_df.shape)
artist_df.head()

In [None]:
def get_artist_bio(artist_name):
    artists = mbz.search_artists(artist_name)
    sleep(1)
    if artists["artist-count"] > 0:
        artist = artists["artist-list"][0]
        artist_tags = artist.get("tag-list")
        return {
            "gender": artist.get("gender"),
            "country": artist.get("country"),
            "hometown": artist.get("begin-area", {}).get("name"),
        }
    else:
        return {}


def with_artist_bio(df, artist_key="name"):
    """
    Add Spotify ids and metadata to dataframe with artist data
    As ids and metadata are added to multiple rows at a time.
    Make sure to skip rows that already have id and metadata.
    """
    for index, record in df.iterrows():
        artist = record[artist_key]
        filter_by_artist_ = filter_by_artist(artist_key=artist_key)

        # check if artist id is already present
        if "hometown" in df\
                and len(df[(filter_by_artist_(df, artist))
                           & (df["hometown"].isna())]) == 0:
            continue

        # search for artist on mbz
        mbz_artist = get_artist_bio(artist)
        if mbz_artist is None:
            continue

        # add id and metadata to dataframe
        df.loc[filter_by_artist_(df, artist),
               "gender"] = mbz_artist.get("gender")
        df.loc[filter_by_artist_(df, artist),
               "country"] = mbz_artist.get("country")
        df.loc[filter_by_artist_(df, artist),
               "hometown"] = mbz_artist.get("hometown")
        df["hometown"] = df["hometown"].replace("None", np.NaN)

        if index % 500 == 0:
            total_artists = len(df)
            null_artists = df["hometown"].isnull().sum()
            print(f"status: {100 - null_artists / total_artists * 100:.2f} "
                  f"({total_artists - null_artists} / {total_artists})")
    return df


artist_df = with_artist_bio(artist_df)
print(artist_df.shape)
artist_df.head()

In [31]:
def summarize_artist_nominations(df):
    df = df[~df["spotify_artist_id"].isna()]

    df[f"grammy_nominated_1994"] = 0
    df[f"grammy_won_1994"] = 0
    for year in range(1995, 2022):
        df[f"grammy_nominated_{year}"] = ((df["award_year"] == year) & (df["award_nominee"] == 1)).astype(int)
        df[f"grammy_nominated_{year}"] = df[f"grammy_nominated_{year - 1}"] + df[f"grammy_nominated_{year}"]
        df[f"grammy_won_{year}"] = ((df["award_year"] == year) & (df["award_winner"] == 1)).astype(int)
        df[f"grammy_won_{year}"] = df[f"grammy_won_{year - 1}"] + df[f"grammy_won_{year}"]

    artist_first_win = (
        df[df["award_winner"] == 1]
            .groupby("artist_mbid")
            .min("award_year")
            .reset_index()
            .rename(columns={"award_year": "first_win"})
        [["artist_mbid", "first_win"]]
    )

    artist_last_win = (
        df[df["award_winner"] == 1]
            .groupby("artist_mbid")
            .max("award_year")
            .reset_index()
            .rename(columns={"award_year": "last_win"})
        [["artist_mbid", "last_win"]]
    )

    artist_first_nomination = (
        df.groupby("artist_mbid")
            .min("award_year")
            .reset_index()
            .rename(columns={"award_year": "first_nomination"})
        [["artist_mbid", "first_nomination"]]
    )

    artist_last_nomination = (
        df.groupby("artist_mbid")
            .min("award_year")
            .reset_index()
            .rename(columns={"award_year": "last_nomination"})
        [["artist_mbid", "last_nomination"]]
    )

    df["total_nominations"] = df["award_nominee"]
    df["total_wins"] = df["award_winner"]
    df = (
        df.groupby(["artist_mbid", "spotify_artist_id", "artist_name"])
            .agg("sum")
            .reset_index()
            .drop(columns=["award_year", "release_year", "award_nominee", "award_winner",
                           "spotify_track_popularity", "artist_name"])
    )

    df = pd.merge(df, artist_first_win, on="artist_mbid", how="outer")
    df = pd.merge(df, artist_last_win, on="artist_mbid", how="outer")
    df = pd.merge(df, artist_first_nomination, on="artist_mbid", how="outer")
    df = pd.merge(df, artist_last_nomination, on="artist_mbid", how="outer")
    df = df.drop_duplicates("artist_mbid")
    df = df.drop(columns=["artist_mbid"])
    return df.fillna(0)


def with_grammy_nomination_csum_columns(artist_df, artist_nominations_df):
    artist_nominations_summary_df = summarize_artist_nominations(artist_nominations_df)
    df_ = pd.merge(artist_df, artist_nominations_summary_df, left_on="id", right_on="spotify_artist_id", how="left")
    df_ = df_.drop(columns=["spotify_artist_id"])
    return df_


artist_nominations_df = pd.read_parquet("../../data/raw/artist_nominations.pq")
artist_df = with_grammy_nomination_csum_columns(artist_df, artist_nominations_df)
print(artist_df.shape)
artist_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"grammy_nominated_1994"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"grammy_won_1994"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"grammy_nominated_{year}"] = ((df["award_year"] == year) & (df["award_nominee"] == 1)).astype(int)
A value is trying to be set on a copy of a

Unnamed: 0,id,name,popularity,genres,release_years,first_release,last_release,total_releases,releases_0,releases_1,...,grammy_nominated_2020,grammy_won_2020,grammy_nominated_2021,grammy_won_2021,total_nominations,total_wins,first_win,last_win,first_nomination,last_nomination
0,0gGd4WhPXBSgDX6fdOHcOw,Rhymastic,54,"[v-pop, vietnamese hip hop]","[2021, 2021, 2021, 2021, 2021, 2021, 2020, 202...",2013,2022,35.0,0.0,0.0,...,,,,,,,,,,
1,7MBNcpWHn8RTZFUQv5fNPp,Alessio La Profunda Melodia,49,[],"[2021, 2019, 2021, 2020, 2019, 2019, 2018, 201...",2016,2021,24.0,0.0,0.0,...,,,,,,,,,,
2,40Ojab0UtVQFjA76qXr8Ot,Apparat,55,"[ambient pop, electronica, glitch, microhouse,...","[2020, 2020, 2020, 2020, 2019, 2015, 2013, 201...",2001,2020,49.0,0.0,0.0,...,,,,,,,,,,
3,5zVu34ozw0BzXIcNHqkO0u,Liana Flores,65,[indie pop],"[2020, 2019, 2018]",2018,2020,3.0,0.0,0.0,...,,,,,,,,,,
4,2HGPd1n5wmvFCJccRPRJ9k,Herrat,41,"[finnish dance pop, finnish hip hop, finnish pop]","[2021, 2021, 2020, 2020, 2020, 2020, 2020, 201...",2017,2021,16.0,0.0,0.0,...,,,,,,,,,,


In [102]:
def with_chart_stats(artist_df, song_chart_df):
    song_chart_df["year"] = song_chart_df["month"].apply(lambda month: month.split(" ")[-1])
    # track number of months artist has tracks in charts
    song_chart_df["chart_months"] = 1
    song_chart_df = (
        song_chart_df
            .groupby(["spotify_artist_id"])
            .aggregate({
                "chart_months": "sum",
                "indicativerevenue": "sum",
                "year": ["min", "max"],
                "spotify_track_id": "nunique",
                "position": "min",
            })
            .reset_index()
            .set_axis(["id", "chart_months", "total_revenue", "first_chart_year",
                       "last_chart_year", "chart_tracks", "chart_peak"], axis=1)
    )
    df = pd.merge(artist_df, song_chart_df, on="id", how="left").fillna(0)
    return df


artist_df = (
    pd.read_parquet("../../data/artist_summary.pq")
        .drop(columns=["chart_months", "total_revenue", "first_chart_year", "last_chart_year", "chart_tracks", "chart_peak"])
)
song_chart_summary_df = pd.read_parquet(data_path / "raw/song_chart_by_month.pq")
artist_df = with_chart_stats(artist_df, song_chart_summary_df)
print(artist_df.shape)
artist_df.head()

(6140, 243)


Unnamed: 0,id,name,popularity,genres,release_years,first_release,last_release,total_releases,releases_1900,releases_1901,...,chart_months_2019,chart_months_recently_2019,chart_months_2020,chart_months_recently_2020,chart_months,total_revenue,first_chart_year,last_chart_year,chart_tracks,chart_peak
0,1Yfe3ONJlioHys7jwHdfVm,Lomepal,71,"[french hip hop, rap conscient]","[2019, 2019, 2018, 2017, 2015, 2014, 2013, 201...",2011,2021,46.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
1,6OqQbeNe7Pi7Zi7SNTi9ko,Billy Gilman,37,"[country, queer country]","[2016, 2006, 2005, 2003, 2001, 2000, 2000, 202...",2000,2020,21.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1292.924,2000,2000,1.0,48.0
2,0aqgzJNXZRtRY2tacDnses,Van She Tech,11,[],"[2021, 2019, 2021, 2009, 2008, 2008, 2008, 200...",2007,2021,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
3,1Hsdzj7Dlq2I7tHP7501T4,Niall Horan,81,"[dance pop, pop, post-teen pop]","[2020, 2019, 2017, 2021, 2021, 2021, 2021, 202...",2016,2022,23.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,10510.417,2016,2017,2.0,19.0
4,5NtLSCw6j3lTSOUjHnZASC,"Juvenile, Wacko, & Skip",29,[],"[2013, 2004]",2004,2013,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2437.735,2004,2004,1.0,43.0


In [104]:
artist_df[artist_df["chart_months"] > 0]

Unnamed: 0,id,name,popularity,genres,release_years,first_release,last_release,total_releases,releases_1900,releases_1901,...,chart_months_2019,chart_months_recently_2019,chart_months_2020,chart_months_recently_2020,chart_months,total_revenue,first_chart_year,last_chart_year,chart_tracks,chart_peak
1,6OqQbeNe7Pi7Zi7SNTi9ko,Billy Gilman,37,"[country, queer country]","[2016, 2006, 2005, 2003, 2001, 2000, 2000, 202...",2000,2020,21.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1292.924,2000,2000,1.0,48.0
3,1Hsdzj7Dlq2I7tHP7501T4,Niall Horan,81,"[dance pop, pop, post-teen pop]","[2020, 2019, 2017, 2021, 2021, 2021, 2021, 202...",2016,2022,23.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,10510.417,2016,2017,2.0,19.0
4,5NtLSCw6j3lTSOUjHnZASC,"Juvenile, Wacko, & Skip",29,[],"[2013, 2004]",2004,2013,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2437.735,2004,2004,1.0,43.0
8,790FomKkXshlbRYZFtlgla,KAROL G,88,"[latin, reggaeton, reggaeton colombiano, trap ...","[2021, 2019, 2017, 2022, 2021, 2021, 2021, 202...",2014,2022,53.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,3840.116,2020,2020,1.0,33.0
12,6MPCFvOQv5cIGfw3jODMF0,Internet Money,80,"[rap, trap]","[2020, 2020, 2022, 2022, 2021, 2021, 2021, 202...",2019,2022,26.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.0,8167.606,2020,2021,1.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,7JfutRemdlJGyQsIZ7wWQ9,Workout Buddy,32,[workout product],"[2022, 2022, 2022, 2022, 2022, 2022, 2021, 202...",2020,2022,99.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.0,6248.887,2013,2013,1.0,9.0
3957,1dfeR4HaWDbWqFHLkxsg1d,Queen,88,"[classic rock, glam rock, rock]","[2018, 2016, 2016, 2015, 2014, 2014, 2012, 200...",1973,2020,31.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,760.980,2018,2018,1.0,48.0
4912,7CyeXFnOrfC1N6z4naIpgo,The Ronettes,68,"[adult standards, brill building pop, classic ...","[2021, 2020, 2016, 2013, 1964, 1964, 2017, 2016]",1964,2021,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1022.560,2020,2020,1.0,36.0
4937,5Z1CCuBsyhEHngq3U5IraY,Westlife,74,"[boy band, dance pop, europop]","[2019, 2010, 2009, 2007, 2006, 2005, 2004, 200...",1999,2021,24.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,4739.251,2000,2000,1.0,29.0


In [85]:
def with_chart_months_csum_columns(artist_df, song_chart_df):
    song_chart_df["year"] = song_chart_df["month"].apply(lambda month: month.split(" ")[-1]).astype(int)
    song_chart_df["chart_months"] = 1
    df_ = (
        song_chart_df
            .sort_values(by=["spotify_artist_id", "spotify_track_id", "year"])
            .groupby(["spotify_artist_id", "year"]).sum()
            .groupby(level=0)
            .aggregate({
                "chart_months": "cumsum",
                "indicativerevenue": "cumsum",
            })
            .reset_index()
            .pivot(index="spotify_artist_id", columns="year", values="chart_months")
            .fillna(0)
            .reset_index()
            .rename_axis(None, axis=1)
            #.set_index("id")
    )
    columns = ["id"]
    df_["1999"] = 0
    df_["1998"] = 0
    df_["1997"] = 0
    for year in range(2000, 2021):
        columns.append(str(f"chart_months_{year}"))
        columns.append(str(f"chart_months_recently_{year}"))
        if str(year) not in df_.columns:
            df_[str(year)] = 0
        if str(year + 1) not in df_.columns:
            df_[str(year + 1)] = 0
        df_.loc[df_[str(year + 1)] == 0, str(year + 1)] = df_[str(year)]
        df_[f"chart_months_{year}"] = df_[str(year)]
        df_[f"chart_months_recently_{year}"] = df_[str(year)] - df_[str(year - 3)]
    df_ = df_.rename(columns={"spotify_artist_id": "id"}).fillna(0)
    df_ = pd.merge(artist_df, df_[columns], on="id", how="left")
    return df_


artist_df = artist_df.drop(
    columns=[col for col in artist_df.columns
             if col.startswith("chart_months_")])
song_chart_df = pd.read_parquet(data_path / "raw/song_chart_by_month.pq")
artist_df = with_chart_months_csum_columns(artist_df, song_chart_df)
print(artist_df.shape)
artist_df.head()

(6140, 243)


Unnamed: 0,id,name,popularity,genres,release_years,first_release,last_release,total_releases,releases_1900,releases_1901,...,chart_months_2016,chart_months_recently_2016,chart_months_2017,chart_months_recently_2017,chart_months_2018,chart_months_recently_2018,chart_months_2019,chart_months_recently_2019,chart_months_2020,chart_months_recently_2020
0,1Yfe3ONJlioHys7jwHdfVm,Lomepal,71,"[french hip hop, rap conscient]","[2019, 2019, 2018, 2017, 2015, 2014, 2013, 201...",2011,2021,46.0,0.0,0.0,...,,,,,,,,,,
1,6OqQbeNe7Pi7Zi7SNTi9ko,Billy Gilman,37,"[country, queer country]","[2016, 2006, 2005, 2003, 2001, 2000, 2000, 202...",2000,2020,21.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0aqgzJNXZRtRY2tacDnses,Van She Tech,11,[],"[2021, 2019, 2021, 2009, 2008, 2008, 2008, 200...",2007,2021,9.0,0.0,0.0,...,,,,,,,,,,
3,1Hsdzj7Dlq2I7tHP7501T4,Niall Horan,81,"[dance pop, pop, post-teen pop]","[2020, 2019, 2017, 2021, 2021, 2021, 2021, 202...",2016,2022,23.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5NtLSCw6j3lTSOUjHnZASC,"Juvenile, Wacko, & Skip",29,[],"[2013, 2004]",2004,2013,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
artist_df = artist_df.drop_duplicates(["id"])
artist_df["first_release"] = artist_df["first_release"].astype(int)
artist_df["last_release"] = artist_df["last_release"].astype(int)
artist_df["gender"] = artist_df["gender"].astype(str)
artist_df["country"] = artist_df["country"].astype(str)
artist_df["hometown"] = artist_df["hometown"].astype(str)
artist_df["first_chart_year"] = artist_df["first_chart_year"].astype(int)
artist_df["last_chart_year"] = artist_df["last_chart_year"].astype(int)

print(diff_dataset)
if diff_dataset:
    artist_df_0 = pd.read_parquet(data_path / "artist_summary.pq")
    artist_df_concat = pd.concat([artist_df_0, artist_df]).drop_duplicates(["id"])
    artist_df_concat.to_csv(data_path / "artist_summary.csv.gz")
    artist_df_concat.to_parquet(data_path / "artist_summary.pq")
else:
    artist_df.to_csv(data_path / "artist_summary.csv.gz")
    artist_df.to_parquet(data_path / "artist_summary.pq")

False


In [87]:
artist_df_0.shape

(2399, 243)

In [106]:
artist_df.shape

(6140, 243)

In [107]:
artist_df_concat.shape

(6140, 243)

In [108]:
artist_df_concat.drop_duplicates(["id"]).shape

(6140, 243)