In [1]:
import re
import os
from pathlib import Path
from time import sleep

import pandas as pd
from thefuzz import fuzz
from requests.exceptions import ReadTimeout

from util import spotify, resolve_names, filter_by_track

diff_dataset = True
data_path = Path("../../data").resolve()
os.makedirs(data_path, exist_ok=True)



In [2]:
def fetch_song_chart_by_month():
    """Chart 2000 - 2021: the top 50 chart for every month from Jan 2000 to Jun 2021
    Note: song and albums ranked weekly by indicative revenue globally over a certain period. Indicative Revenue is a model for global sales volume, adjusted for inflation and currency exchange, and using IFPI estimated annual revenue for the recorded music industry in each country.
    #songs_chart_by_month = pd.read_csv("https://chart2000.com/data/chart2000-songmonth-0-3-0063.csv")
    """
    return pd.read_csv("/Users/pez/Downloads/chart2000-songmonth-0-3-0063.csv")


songs_chart_by_month = fetch_song_chart_by_month()
song_artist_df = songs_chart_by_month[["song", "artist"]].drop_duplicates()

status: 0.23 (8 / 3454
status: 0.90 (31 / 3454
status: 1.13 (39 / 3454
status: 1.88 (65 / 3454
status: 2.20 (76 / 3454
status: 2.29 (79 / 3454
status: 2.69 (93 / 3454
status: 2.81 (97 / 3454
status: 3.13 (108 / 3454
status: 3.91 (135 / 3454
status: 4.20 (145 / 3454
status: 4.46 (154 / 3454
status: 4.86 (168 / 3454
status: 5.30 (183 / 3454
status: 5.44 (188 / 3454
status: 5.79 (200 / 3454
status: 5.82 (201 / 3454
status: 6.14 (212 / 3454
status: 6.46 (223 / 3454
status: 6.72 (232 / 3454
status: 6.89 (238 / 3454
status: 7.12 (246 / 3454
status: 7.30 (252 / 3454
status: 7.50 (259 / 3454
status: 8.54 (295 / 3454
status: 8.71 (301 / 3454
status: 9.03 (312 / 3454
status: 9.09 (314 / 3454
status: 9.29 (321 / 3454
status: 9.58 (331 / 3454
status: 9.99 (345 / 3454
status: 10.63 (367 / 3454
status: 10.83 (374 / 3454
status: 11.12 (384 / 3454
status: 11.61 (401 / 3454
status: 11.96 (413 / 3454
status: 12.28 (424 / 3454
status: 12.48 (431 / 3454
status: 12.57 (434 / 3454
status: 12.88 (445 / 3454


In [84]:
def best_match(result, artist, track):
    artist = artist.replace(".", "")
    track = track.replace(".", "")
    for t in result["tracks"]["items"]:
        t_artist = t["artists"][0]["name"].replace(".", "")
        t_track = t["name"].replace(".", "")
        if "karaoke" in t_artist.lower() or "karaoke" in t_track.lower():
            continue
        if "version" in t_track.lower() and not "version" in track.lower():
            continue
        artist_fuzz_score = fuzz.token_set_ratio(artist, t_artist)
        track_fuzz_score = fuzz.token_set_ratio(track, t_track)
        compound_fuzz_score = fuzz.token_set_ratio(f"{artist} {track}", f"{t_artist} {t_track}")
        compound_ratio_fuzz_score = fuzz.ratio(f"{artist} {track}", f"{t_artist} {t_track}")
        print(artist, t_artist, artist_fuzz_score, track, t_track, track_fuzz_score, compound_fuzz_score, compound_ratio_fuzz_score)
        if (artist_fuzz_score > 85 and track_fuzz_score > 85) or compound_fuzz_score > 90 or compound_ratio_fuzz_score > 90:
            return t
    return None


def fetch_track(artist, track):
    artists = [artist]
    if "," in artist:
        artists += artist.split(",")
    if artist.startswith("The "):
        artists.append(artist[4:])
    for artist in artists:
        if " & " in artist:
            artists += artist.split("&")
        try:
            # print(artist, track)
            artist_ = artist
            track_ =  re.sub("[\(\[].*?[\)\]]", "", track)
            result = spotify.search(f"track:{track} artist:{artist}")
            sleep(.25)
            best_match_ = best_match(result, artist, track)
            if result["tracks"]["total"] == 0 or not best_match_:
                artist_, _ = resolve_names(artist, track)
                result = spotify.search(f"track:{track} artist:{artist_}")
                best_match_ = best_match(result, artist, track)
                sleep(.25)
            if result["tracks"]["total"] == 0 or not best_match_:
                result = spotify.search(f"{artist} {track}")
                best_match_ = best_match(result, artist, track)
                sleep(.25)
            if result["tracks"]["total"] == 0 or not best_match_:
                result = spotify.search(f"{artist_} {track}")
                best_match_ = best_match(result, artist, track)
                sleep(.25)
            if result["tracks"]["total"] == 0 or not best_match_:
                result = spotify.search(f"{artist} {track_}")
                best_match_ = best_match(result, artist, track)
                sleep(.25)
            if result["tracks"]["total"] == 0 or not best_match_:
                artist_ = re.sub("'.*?'", "", artist)
                result = spotify.search(f"{artist_} {track}")
                best_match_ = best_match(result, artist_, track)
                sleep(.25)
            if result["tracks"]["total"] == 0 or not best_match_:
                continue
        except ReadTimeout:
            print("spotify time out.")
            continue
        if best_match_:
            return best_match_
    return None


def with_spotify_track_metadata(df, artist_key="artist", track_key="song"):
    """
    Add Spotify ids and metadata to chart songs.
    Songs that have charted are often on the chart the following week.
    As ids and metadata are added to multiple rows at a time.
    Make sure to skip rows that already have id and metadata.
    """
    for index, record in df.iterrows():
        artist, track = record[artist_key], record[track_key]
        # logging.info(f"artist: {artist}, track: {track}")
        filter_by_track_ = filter_by_track(artist_key=artist_key,
                                           track_key=track_key)

        # check if track id is already present
        if "spotify_track_id" in df
                and len(filter_by_track_(df, artist, track)
                        & (df["spotify_track_id"].isna())) == 0:
            # logging.info("skipped: metadata exists")
            continue

        # search for song on spotify
        spotify_track = fetch_track(artist, track)
        if spotify_track is None:
            # logging.info(f"skipped: no track found: {artist} - {track}")
            continue

        # add id and metadata to dataframe
        df.loc[filter_by_track_(df, artist, track),
               "spotify_track_id"] = spotify_track["id"]
        df.loc[filter_by_track_(df, artist, track),
               "spotify_artist_id"] = spotify_track["artists"][0]["id"]
        df.loc[filter_by_track_(df, artist, track),
               "spotify_album_id"] = spotify_track["album"]["id"]
        df.loc[filter_by_track_(df, artist, track),
               "spotify_track_popularity"] = spotify_track["popularity"]
        df.loc[filter_by_track_(df, artist, track),
               "spotify_track_explicit"] = spotify_track["explicit"]
        df.loc[filter_by_track_(df, artist, track),
               "spotify_track_duration_ms"] = spotify_track["duration_ms"]
        df.loc[filter_by_track_(df, artist, track),
               "spotify_track_album_release_date"] =\
            spotify_track["album"]["release_date"]
        df.loc[filter_by_track_(df, artist, track),
               "spotify_track_album_release_date_precision"] =\
            spotify_track["album"]["release_date_precision"]

        # don't trigger spotify rate-limit
        sleep(1)
        if index % 10 == 0:
            total_tracks = len(df)
            null_tracks = df["spotify_track_id"].isnull().sum()
            print(f"status: {100 - null_tracks / total_tracks * 100:.2f} "
                  f"({total_tracks - null_tracks} / {total_tracks}")
    return df


# test...
# fetch_track("Rob Thomas & Santana", "Smooth")
# fetch_track("Rob Thomas Santana", "Smooth")
# fetch_track("Eiffel 65", "Blue (Da Ba Dee)")
# fetch_track("Missy Elliott", "Hot Boyz")
# fetch_track("P Diddy & R Kelly", "Satisfy You")
# fetch_track("Will Smith", "Will 2K")
# fetch_track("Ja Rule, Lil' Mo & Vita", "Put It On Me")
# fetch_track("Joe Thomas Mystikal", "Stutter")
# fetch_track("Ricky Martin & Christina Aguilera", "Nobody Wants To Be Lonely")
# fetch_track("ATC", "Around The World (La La La La La)")
fetch_track("T-Pain & Mike Jones", "I'm In Love with A Stripper")

T-Pain & Mike Jones Hei$enberg 30 I'm In Love with A Stripper Roller Coaster 29 23 11
T-Pain & Mike Jones Drew DeLeon 21 I'm In Love with A Stripper Tina 13 16 10
T-Pain & Mike Jones Jyourz 17 I'm In Love with A Stripper Move 19 14 17
T-Pain & Mike Jones Queso Tone 22 I'm In Love with A Stripper Dance Like You Work at the Club 41 37 31
T-Pain & Mike Jones Queso Tone 22 I'm In Love with A Stripper Dance Like You Work at the Club 41 37 31
T-Pain & Mike Jones Queso Tone 22 I'm In Love with A Stripper Dance Like You Work at the Club 41 37 31
T-Pain & Mike Jones Queso Tone 22 I'm In Love with A Stripper Dance Like You Work at the Club 41 37 31
T-Pain & Mike Jones Queso Tone 22 I'm In Love with A Stripper Dance Like You Work at the Club 41 37 31
T-Pain & Mike Jones Hei$enberg 30 I'm In Love with A Stripper Roller Coaster - Radio Edit 31 22 14
T-Pain  Hei$enberg 12 I'm In Love with A Stripper Roller Coaster 29 27 13
T-Pain  Drew DeLeon 24 I'm In Love with A Stripper Tina 13 20 12
T-Pain  Jyou

In [None]:
song_artist_df = with_spotify_track_metadata(song_artist_df)

In [101]:
pd.merge(songs_chart_by_month, song_artist_df, left_on=["song", "artist"], right_on=["song", "artist"], how="left")

Unnamed: 0,month,position,artist,song,indicativerevenue,us,uk,de,fr,ca,...,spotify_track_album_release_date_x,spotify_track_album_release_date_precision_x,spotify_track_id_y,spotify_artist_id_y,spotify_album_id_y,spotify_track_popularity_y,spotify_track_explicit_y,spotify_track_duration_ms_y,spotify_track_album_release_date_y,spotify_track_album_release_date_precision_y
0,Jan 2000,1,Rob Thomas & Santana,Smooth,3911.953,1,-,44,-,-,...,2005-03-01,day,4qUfTdPwpeTKwck2ptbj4h,6GI52t8N5F02MxU0g5U69P,4gZuhUyDg1VRwosegw9pU2,26.0,False,233400.0,2003-12-15,day
1,Jan 2000,2,Christina Aguilera,What A Girl Wants,3787.015,1,81,18,-,1,...,1999,year,5bGmuxShUba9maPswDnhCs,1l7ZsJRRS8wlW3WfJfPfNS,6fpPZS13ImRVpr7Tqs6yP9,67.0,False,215800.0,1999,year
2,Jan 2000,3,Savage Garden,I Knew I Loved You,3693.844,1,56,68,43,1,...,1999-11-09,day,6nozDLxeL0TE4MS9GqYU1v,3NRFinRTEqUCfaTTZmk8ek,3w69KK7uEA8OsKuW3OeQcC,73.0,False,250360.0,1999-11-09,day
3,Jan 2000,4,Celine Dion,That's The Way It Is,3341.488,8,20,9,10,-,...,1999,year,5s4catxeZsaWFnOrvrXZHf,4S9EykWXhStSc15wEx8QFK,51DfHvunyUysUZoBJEwbF8,69.0,False,241373.0,1999,year
4,Jan 2000,5,Eiffel 65,Blue (Da Ba Dee),3340.978,6,28,22,10,1,...,2011-01-05,day,2yAVzRiEQooPEJ9SYx11L3,64rxQRJsLgZwHHyWKB8fiF,54vbD17F1t5q3yHkj1cX37,79.0,False,283747.0,2011-01-05,day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12895,Jun 2021,46,Ariana Grande,POV,849.983,33,-,-,-,-,...,2020-10-30,day,3UoULw70kMsiVXxW0L3A33,66CXWjxzNUsdJxJ2JdwvnR,3euz4vS7ezKGnNSwgyvKcd,82.0,False,201882.0,2020-10-30,day
12896,Jun 2021,47,Dierks Bentley,Gone,843.855,32,-,-,-,85,...,2020-10-22,day,7IhfjcaXuXCq20ZiAA7igK,7x8nK0m0cP2ksQf0mjWdPS,7zEA1JpBgLz8Zj6cGcIvIV,69.0,False,205960.0,2020-10-22,day
12897,Jun 2021,48,Olivia Rodrigo,"Jealousy, Jealousy",838.863,24,-,-,-,21,...,2021-05-21,day,0MMyJUC3WNnFS1lit5pTjk,1McMsnEElThX1knmY4oliG,6s84u2TUpR3wdUv4NgKA2j,88.0,False,173160.0,2021-05-21,day
12898,Jun 2021,49,Olivia Rodrigo,"1 Step Forward, 3 Steps Back",837.277,19,-,-,-,17,...,2021-05-21,day,4wcBRRpIfesgcyUtis7PEg,1McMsnEElThX1knmY4oliG,6s84u2TUpR3wdUv4NgKA2j,82.0,True,163586.0,2021-05-21,day


In [104]:
songs_chart_by_month

Unnamed: 0,month,position,artist,song,indicativerevenue,us,uk,de,fr,ca,au,spotify_track_id,spotify_artist_id,spotify_album_id,spotify_track_popularity,spotify_track_explicit,spotify_track_duration_ms,spotify_track_album_release_date,spotify_track_album_release_date_precision
0,Jan 2000,1,Rob Thomas & Santana,Smooth,3911.953,1,-,44,-,-,5,5IALWUYK0zDSEmZgb4ICvc,7BB7DAcx48BfzNf34VMr7y,6rkqnXiK12wQ8UthL6A52E,1.0,False,244924.0,2005-03-01,day
1,Jan 2000,2,Christina Aguilera,What A Girl Wants,3787.015,1,81,18,-,1,9,5bGmuxShUba9maPswDnhCs,1l7ZsJRRS8wlW3WfJfPfNS,6fpPZS13ImRVpr7Tqs6yP9,65.0,False,215800.0,1999,year
2,Jan 2000,3,Savage Garden,I Knew I Loved You,3693.844,1,56,68,43,1,21,6nozDLxeL0TE4MS9GqYU1v,3NRFinRTEqUCfaTTZmk8ek,3w69KK7uEA8OsKuW3OeQcC,71.0,False,250360.0,1999-11-09,day
3,Jan 2000,4,Celine Dion,That's The Way It Is,3341.488,8,20,9,10,-,14,5s4catxeZsaWFnOrvrXZHf,4S9EykWXhStSc15wEx8QFK,51DfHvunyUysUZoBJEwbF8,67.0,False,241373.0,1999,year
4,Jan 2000,5,Eiffel 65,Blue (Da Ba Dee),3340.978,6,28,22,10,1,1,2yAVzRiEQooPEJ9SYx11L3,64rxQRJsLgZwHHyWKB8fiF,54vbD17F1t5q3yHkj1cX37,77.0,False,283747.0,2011-01-05,day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12895,Jun 2021,46,Ariana Grande,POV,849.983,33,-,-,-,-,-,3UoULw70kMsiVXxW0L3A33,66CXWjxzNUsdJxJ2JdwvnR,3euz4vS7ezKGnNSwgyvKcd,81.0,False,201882.0,2020-10-30,day
12896,Jun 2021,47,Dierks Bentley,Gone,843.855,32,-,-,-,85,-,7IhfjcaXuXCq20ZiAA7igK,7x8nK0m0cP2ksQf0mjWdPS,7zEA1JpBgLz8Zj6cGcIvIV,68.0,False,205960.0,2020-10-22,day
12897,Jun 2021,48,Olivia Rodrigo,"Jealousy, Jealousy",838.863,24,-,-,-,21,22,0MMyJUC3WNnFS1lit5pTjk,1McMsnEElThX1knmY4oliG,6s84u2TUpR3wdUv4NgKA2j,86.0,False,173160.0,2021-05-21,day
12898,Jun 2021,49,Olivia Rodrigo,"1 Step Forward, 3 Steps Back",837.277,19,-,-,-,17,18,4wcBRRpIfesgcyUtis7PEg,1McMsnEElThX1knmY4oliG,6s84u2TUpR3wdUv4NgKA2j,81.0,True,163586.0,2021-05-21,day


In [105]:
songs_chart_by_month = fetch_song_chart_by_month()
song_chart_by_month = pd.merge(songs_chart_by_month, song_artist_df_all, left_on=["song", "artist"], right_on=["song", "artist"], how="left")
song_chart_by_month.to_parquet(data_path / "raw/song_chart_by_month.pq", index=False)