In [1]:
import pandas as pd
import json
import requests
import datetime
import time
import urllib.parse
import unicodedata
import re
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
spotify_token = "BQDVad7Y0vHsuEh7222Yi-C_4bw0UZEjWpPEZo09nM-n_-6tlPqdGoyCnOfp5dopAOBLWhGUN3sdOQ6cY1MBRKadYfLmlPwwIpaFG_lbcvk1T1VTPHMyzrk0SkIe-1BSTxoqm_fyCHpFDg7a5avuvMzAY6l1WdQWzUEwfc3YHlBKA8WY4liNBq4dPEfYa2PKdZSUSv2RXDk-XfZB7GjqYZecWNDZjyWjjhem"
genius_token = "WJrlisPS9YMSVBPq9yVSMR1HJmzw1kBdgjAAPo3HTjXGgEM2NpglSCjFxA29lb_D"

In [3]:
start_time = datetime.datetime.now()

In [4]:
dates = []
starting_date = datetime.datetime.strptime("2023-01-05", "%Y-%m-%d")
ending_date = datetime.datetime.strptime("2023-01-12", "%Y-%m-%d")
while starting_date <= ending_date:
    dates.append(starting_date.strftime("%Y-%m-%d"))
    starting_date = starting_date + datetime.timedelta(days=7)

dates


['2023-01-05', '2023-01-12']

In [5]:
def try_request(method, url, headers):
    error = True
    i = 0
    while error:
        try:
            response = requests.request(method, url, headers=headers)
            if response.status_code != 200 and response.status_code != 404:
                print(response)
                raise Exception("Retry")
            error = False
        except:
            if i > 10:
                raise Exception("Too many retries for request: " + url)
            time.sleep(1.0 + (i*5))
            i += 1
            print("Retrying " + str(i) +  ":\t" + url)
    return response
            

In [6]:
def normalize_string(s):
    s = unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode("utf-8")
    s = re.sub(r'[^a-zA-Z ]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()

    return s

In [7]:
def load_countries():
    with open('countries.json') as f:

        json_countries = json.load(f)
        map(lambda y: y['code'], json_countries)

        result = [x['code'] for x in json_countries]
        return result

countries = ["IT"] #load_countries()

In [8]:
def download_chart_for_country_and_date(country, date):
    time.sleep(0.01) 
    url = f'https://charts-spotify-com-service.spotify.com/auth/v0/charts/regional-{country}-weekly/{date}'
    headers = {
      'accept': 'application/json',
      'accept-language': 'en-US,en;q=0.9',
      'app-platform': 'Browser',
      'authorization': f"Bearer {spotify_token}",
      'cache-control': 'no-cache',
      'content-type': 'application/json',
      'origin': 'https://charts.spotify.com',
      'pragma': 'no-cache',
      'priority': 'u=1, i',
      'referer': 'https://charts.spotify.com/',
      'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
      'sec-ch-ua-mobile': '?0',
      'sec-ch-ua-platform': '"macOS"',
      'sec-fetch-dest': 'empty',
      'sec-fetch-mode': 'cors',
      'sec-fetch-site': 'same-site',
      'spotify-app-version': '0.0.0.production',
      'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36'
    }

    response = try_request("GET", url, headers=headers)

    if 'entries' not in response.json():
        return pd.DataFrame([])

    songs = []
    songs_basic = []
    for song in response.json()['entries']:
        if song['missingRequiredFields'] and song['trackMetadata']['artists'] == []:
            print("Skipping song due to missing artists date: " + str(date) + " country: " + country)
            print(song)
            continue
        new_song = {
            'spotifyId': song['trackMetadata']['trackUri'],
            'trackName': song['trackMetadata']['trackName'],
            'artistName': song['trackMetadata']['artists'][0]['name'],
            'releaseDate': song['trackMetadata']['releaseDate'],
            'currentRank': song['chartEntryData']['currentRank'],
            'peakRank': song['chartEntryData']['peakRank'],
            'weeksOnChart': song['chartEntryData']['appearancesOnChart'],
            'numStreams':song['chartEntryData']['rankingMetric']['value'],
            'entryDate': song['chartEntryData']['entryDate']
        }

        new_song_basic = {
            'spotifyId': song['trackMetadata']['trackUri'],
            'trackName': song['trackMetadata']['trackName'],
            'artistName': song['trackMetadata']['artists'][0]['name'],
            'releaseDate': song['trackMetadata']['releaseDate']
        }
        songs.append(new_song)
        songs_basic.append(new_song_basic)

    df = pd.json_normalize(songs)
    df = df.assign(Country=country)
    df = df.assign(Week=response.json()['displayChart']['date'])
    df = df.set_index(['Country', 'Week', 'spotifyId'])

    df_tracks = pd.json_normalize(songs_basic)

    return df, df_tracks

In [9]:
def download_all_charts_for_date(date):
    df = pd.DataFrame([])
    df_tracks = pd.DataFrame([])
    for country in countries:
        chart, chart_basic = download_chart_for_country_and_date(country, date)

        df = pd.concat([df, chart])
        df_tracks = pd.concat([df_tracks, chart_basic])
    df = df.reset_index()
    df_tracks = df_tracks.reset_index()
    return df, df_tracks

In [10]:
df = pd.DataFrame([])
df_tracks = pd.DataFrame([])
try:
    chart_info = json.loads("charting_data2024-05-09.json")
    print("Loading previous charting data")
except:
    print("Creating new charting data")
    chart_info = {}




Creating new charting data


In [11]:
for i_date in tqdm(dates):
    
    try:
        if i_date in chart_info:
            chart, chart_basic = chart_info[i_date]
            chart = pandas.read_json(chart)
            chart_basic = pandas.read_json(chart_basic)
        else:
            chart, chart_basic = download_all_charts_for_date(i_date)
            chart_info[i_date] = (chart.to_json(), chart_basic.to_json())
        
        df = pd.concat([df, chart])
        df_tracks = pd.concat([df_tracks, chart_basic])
    except Exception as e:
        with open("charting_data" + i_date + ".json" , "w" ) as write:
            json.dump( chart_info , write )
            print("Dumping into " + "charting_data" + i_date + ".json")
            raise e
        


df_tracks = df_tracks.assign(geniusId=None)
df_tracks
# chart = download_all_charts_for_date('2024-12-05')
# chart



00%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  4.34it/s]

Unnamed: 0,index,spotifyId,trackName,artistName,releaseDate,geniusId
0,0,spotify:track:2tTmW7RDtMQtBk7m2rYeSw,"Quevedo: Bzrp Music Sessions, Vol. 52",Bizarrap,2022-07-06,
1,1,spotify:track:0D3QZNAMH2d5MFkVlebo6h,Non lo Sai,Shiva,2022-11-25,
2,2,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,
3,3,spotify:track:1MboxS3hV7Wr8UVO59uRth,Alleluia (feat. Sfera Ebbasta),Shiva,2022-11-25,
4,4,spotify:track:1cc9BzqfV6aO0EV8c7jGnN,Take 4,Shiva,2022-11-25,
...,...,...,...,...,...,...
195,195,spotify:track:0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,The Weeknd,2020-03-20,
196,196,spotify:track:5odlY52u43F5BjByhxg7wg,golden hour,JVKE,2022-09-23,
197,197,spotify:track:6yaNNYWPjkWHbh1jADEyJO,KUMITE,Salmo,2021-10-01,
198,198,spotify:track:1laLs6CGimqWYgwnZelz8H,Caramello,Rocco Hunt,2021-11-05,


In [12]:
df_tracks = df_tracks.drop_duplicates(subset='spotifyId', keep='last')

df_tracks

Unnamed: 0,index,spotifyId,trackName,artistName,releaseDate,geniusId
2,2,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,
25,25,spotify:track:1EOTrGOcrCwTG1nhUnp0dV,MONEY,Geolier,2022-11-18,
126,126,spotify:track:3Ucr6hQQuY8cZ0UqXV8uO2,"Freed From Desire - prod. Molella, Phil Jay",Gala,1997-11-17,
156,156,spotify:track:2B4GHvToeLTOBB4QLzW3Ni,Pepas,Farruko,2021-10-01,
157,157,spotify:track:65SQUmt3OT8puVhZDbaQYB,NESSUNO (Concertos),Lazza,2022-04-07,
...,...,...,...,...,...,...
195,195,spotify:track:0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,The Weeknd,2020-03-20,
196,196,spotify:track:5odlY52u43F5BjByhxg7wg,golden hour,JVKE,2022-09-23,
197,197,spotify:track:6yaNNYWPjkWHbh1jADEyJO,KUMITE,Salmo,2021-10-01,
198,198,spotify:track:1laLs6CGimqWYgwnZelz8H,Caramello,Rocco Hunt,2021-11-05,


In [13]:
def find_genius_id(track_name, track_main_artist):
    track_name = re.sub(r"\s*\[.*?\]", "", track_name)

    if "feat." in track_name.lower() or "with " in track_name.lower() or "from " in track_name.lower():
        track_name = re.sub(r"\s*\(.*?\)", "", track_name)

    time.sleep(0.010)
    query = {
        "access_token": genius_token,
        "q": f"{track_name}, {track_main_artist}"
    }
    query = urllib.parse.urlencode(query)
    #print(query)
    url = f'https://api.genius.com/search?{query}'
    payload = {}
    headers = {
      'Accept': 'application/json, text/plain, */*',
      'Sec-Fetch-Site': 'same-site',
      'Origin': 'https://docs.genius.com',
      'Sec-Fetch-Dest': 'empty',
      'Accept-Language': 'en-GB,en;q=0.9',
      'If-None-Match': 'W/"2a87e230fcad2086285f6b15cfc4ff90"',
      'Sec-Fetch-Mode': 'cors',
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15',
      'Accept-Encoding': 'gzip, deflate, br',
      'Referer': 'https://docs.genius.com/',
      'Priority': 'u=3, i'
    }

    response = try_request("GET", url, headers=headers)

    response_json = response.json()

    hits = response_json.get('response', {}).get('hits', [])
    hits = [x for x in hits if normalize_string(track_main_artist.lower()) in normalize_string(x.get('result').get('artist_names').lower())]

    if not hits:
        if "(" in track_name:
            stripped_track_name = re.sub(r"\s*\(.*?\)", "", track_name)
            return find_genius_id(stripped_track_name, track_main_artist)
        elif "-" in track_name:
            stripped_track_name = re.sub(r"\s*-\s*.*", "", track_name)
            return find_genius_id(stripped_track_name, track_main_artist)
        elif re.search(r'[^a-zA-Z0-9 ]', track_name):
            stripped_track_name = re.sub(r'[^a-zA-Z ]', ' ', track_name)
            stripped_track_name = re.sub(r'\s+', ' ', stripped_track_name).strip()
            return find_genius_id(stripped_track_name, track_main_artist)
        else:
            return None  # Return None if no results

    track_response = hits[0]['result']

    release_date_components = track_response.get('release_date_components', {})
    if release_date_components:
        release_date = datetime.datetime(
            release_date_components.get('year', 1) or 1,
            release_date_components.get('month', 1) or 1,
            release_date_components.get('day', 1) or 1
        )
        formatted_release_date = release_date.strftime("%Y-%m-%d")
    else:
        formatted_release_date = None

    return {
        'geniusId': track_response.get('id'),
        'geniusTrackName': track_response.get('title'),
        'geniusArtistName': track_response.get('artist_names'),
        'geniusReleaseDate': formatted_release_date
    }


In [14]:
for index, row in tqdm(df_tracks[df_tracks['geniusId'].isna()].iterrows()):
    spotify_id = row.get('spotifyId')
    genius_data = find_genius_id(row["trackName"], row["artistName"])
    if not genius_data:
        continue
    df_tracks.loc[df_tracks["spotifyId"] == spotify_id, ["geniusId", "geniusTrackName", "geniusArtistName", "geniusReleaseDate"]] = [
        genius_data['geniusId'], genius_data['geniusTrackName'], genius_data['geniusArtistName'], genius_data['geniusReleaseDate']
    ]
# find_genius_id("die with a smile", "lady gaga")
df_tracks["geniusId"] = df_tracks["geniusId"].astype("Int64")
df_tracks


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tracks["geniusId"] = df_tracks["geniusId"].astype("Int64")


Unnamed: 0,index,spotifyId,trackName,artistName,releaseDate,geniusId,geniusTrackName,geniusArtistName,geniusReleaseDate
2,2,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,8498335,CHIAGNE,Geolier (Ft. Lazza & Takagi & Ketra),2022-10-28
25,25,spotify:track:1EOTrGOcrCwTG1nhUnp0dV,MONEY,Geolier,2022-11-18,8557768,MONEY,Geolier,2022-11-18
126,126,spotify:track:3Ucr6hQQuY8cZ0UqXV8uO2,"Freed From Desire - prod. Molella, Phil Jay",Gala,1997-11-17,1463574,Freed from Desire,Gala,1996-10-23
156,156,spotify:track:2B4GHvToeLTOBB4QLzW3Ni,Pepas,Farruko,2021-10-01,6942022,Pepas,Farruko,2021-06-24
157,157,spotify:track:65SQUmt3OT8puVhZDbaQYB,NESSUNO (Concertos),Lazza,2022-04-07,8616532,NESSUNO (Concertos),Lazza,2022-12-16
...,...,...,...,...,...,...,...,...,...
195,195,spotify:track:0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,The Weeknd,2020-03-20,5049949,Blinding Lights,The Weeknd,2019-11-29
196,196,spotify:track:5odlY52u43F5BjByhxg7wg,golden hour,JVKE,2022-09-23,8192862,golden hour,JVKE,2022-07-15
197,197,spotify:track:6yaNNYWPjkWHbh1jADEyJO,KUMITE,Salmo,2021-10-01,7205355,KUMITE,Salmo,2021-10-01
198,198,spotify:track:1laLs6CGimqWYgwnZelz8H,Caramello,Rocco Hunt,2021-11-05,8107786,Caramello,"Rocco Hunt, Elettra Lamborghini & Lola Indigo",2022-06-17


In [16]:
def get_genius_song_info(genius_id):
    time.sleep(0.010)
    query = {
        "access_token": genius_token,
    }
    query = urllib.parse.urlencode(query)

    url = f'https://api.genius.com/songs/{genius_id}?{query}'
    payload = {}
    headers = {
      'Accept': 'application/json, text/plain, */*',
      'Sec-Fetch-Site': 'same-site',
      'Origin': 'https://docs.genius.com',
      'Sec-Fetch-Dest': 'empty',
      'Accept-Language': 'en-GB,en;q=0.9',
      'If-None-Match': 'W/"2a87e230fcad2086285f6b15cfc4ff90"',
      'Sec-Fetch-Mode': 'cors',
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15',
      'Accept-Encoding': 'gzip, deflate, br',
      'Referer': 'https://docs.genius.com/',
      'Priority': 'u=3, i'
    }

    response = try_request("GET", url, headers=headers)
    response_json = response.json()

    song = response_json.get('response', {}).get('song', {})
    song_relationships = song.get('song_relationships', [])
    song_relationships = [x for x in song_relationships if x.get('relationship_type', None) in {'samples', 'interpolates'} and len(x.get('songs', [])) > 0]
    song_image = [{'id': genius_id, 'type': 'song', 'imageURL': song.get('song_art_image_url')}]
    
    producer_images = [{'id': x.get('id'),'type': 'artist', 'imageURL': x.get('image_url')} for x in song.get('producer_artists')]
    writer_images = [{'id': x.get('id'),'type': 'artist', 'imageURL': x.get('image_url')} for x in song.get('writer_artists')]
    artist_images = [{'id': x.get('id'),'type': 'artist', 'imageURL': x.get('image_url')} for x in song.get('primary_artists')]
    artist_images = artist_images + [{'id': x.get('id'),'type': 'artist', 'imageURL': x.get('image_url')} for x in song.get('featured_artists')]
    

    genius_data = {
        'song_language': song.get('language', None),
        'song_relationships': song_relationships,
        'primary_artists':[{'geniusId': genius_id, 'type': 'primary', 'artistId': x.get('id'), 'name': x.get('name')} for x in song.get('primary_artists')],
        'producers': [{'geniusId': genius_id, 'type': 'producer', 'artistId': x.get('id'), 'name': x.get('name')} for x in song.get('producer_artists')],
        'writers': [{'geniusId': genius_id, 'type': 'writer', 'artistId': x.get('id'), 'name': x.get('name')} for x in song.get('writer_artists')],
        'images' : producer_images + writer_images + artist_images + song_image,
        'featured_artists' : [{'geniusId': genius_id, 'type': 'feature', 'artistId': x.get('id'), 'name': x.get('name')} for x in song.get('featured_artists')]
    }

    return genius_data

def get_sampling_data(genius_id, genius_song_info):
    new_songs = []
    relationships = []
    # samples
    for samples in genius_song_info.get('song_relationships'):
        for songs in samples.get('songs'):

            genius_track = df_tracks[df_tracks["geniusId"] == genius_id]
            if genius_track["geniusArtistName"].iloc[0] == songs.get('artist_names') and genius_track["geniusTrackName"].iloc[0] in songs.get('title'):
                #print(f"skipping {songs.get('artist_names')} - {songs.get('title')} because of self-reference")
                continue

            release_date_components = songs.get('release_date_components', {})
            if release_date_components:
                release_date = datetime.datetime(
                    release_date_components.get('year', 1) or 1,
                    release_date_components.get('month', 1) or 1,
                    release_date_components.get('day', 1) or 1
                )
                formatted_release_date = release_date.strftime("%Y-%m-%d")
            else:
                formatted_release_date = None
            # print(genius_song_info.get('song_relationships'))
            x = {
                "geniusId": songs.get('id'),
                "geniusTrackName": songs.get('title'),
                "geniusArtistName": songs.get('artist_names'),
                "geniusReleaseDate": formatted_release_date,
            }

            new_songs.append(x)
            relationships.append({
                'from_genius_id': genius_id,
                'to_genius_id': songs.get('id'),
                'type': samples.get('relationship_type')
            })

    df_new_songs = pd.json_normalize(new_songs)
    df_relationships = pd.json_normalize(relationships)
    return df_new_songs, df_relationships

df_relationships = pd.DataFrame([])
df_contributions = pd.DataFrame([])
song_languages = []
images = []
for genius_id in tqdm(df_tracks[df_tracks["geniusId"].notna()]["geniusId"].tolist()):
    genius_song_info = get_genius_song_info(genius_id)
    ############################################################# NOTE: song relations do not yeild languages
    df_new_songs, df_new_relationships = get_sampling_data(genius_id, genius_song_info)
    df_tracks = pd.concat([df_tracks, df_new_songs])
    df_relationships = pd.concat([df_relationships, df_new_relationships])
    df_contributions = pd.concat([df_contributions, pd.json_normalize(genius_song_info['producers']), pd.json_normalize(genius_song_info['writers']), pd.json_normalize(genius_song_info['featured_artists']), pd.json_normalize(genius_song_info['primary_artists'])])
    song_languages.append({'geniusId': genius_id, 'trackLanguage':genius_song_info['song_language']})
    images.extend(genius_song_info['images'])
    
df_languages = pd.DataFrame(song_languages)
df_tracks = df_tracks.merge(df_languages, on='geniusId', how='left')
df_images = pd.DataFrame(images)
df_tracks


00%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 222/222 [03:49<00:00,  1.04s/it]

Unnamed: 0,index,spotifyId,trackName,artistName,releaseDate,geniusId,geniusTrackName,geniusArtistName,geniusReleaseDate,trackLanguage
0,2.0,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,8498335,CHIAGNE,Geolier (Ft. Lazza & Takagi & Ketra),2022-10-28,pl
1,2.0,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,8498335,CHIAGNE,Geolier (Ft. Lazza & Takagi & Ketra),2022-10-28,pl
2,25.0,spotify:track:1EOTrGOcrCwTG1nhUnp0dV,MONEY,Geolier,2022-11-18,8557768,MONEY,Geolier,2022-11-18,en
3,25.0,spotify:track:1EOTrGOcrCwTG1nhUnp0dV,MONEY,Geolier,2022-11-18,8557768,MONEY,Geolier,2022-11-18,en
4,126.0,spotify:track:3Ucr6hQQuY8cZ0UqXV8uO2,"Freed From Desire - prod. Molella, Phil Jay",Gala,1997-11-17,1463574,Freed from Desire,Gala,1996-10-23,en
...,...,...,...,...,...,...,...,...,...,...
310,,,,,,727466,The Hills,The Weeknd,2015-05-27,
311,,,,,,622976,Young Turks,Rod Stewart,1981-11-06,
312,,,,,,118904,Take on Me,a-ha,1985-04-05,
313,,,,,,8957789,Firefly,Power music (Ft. Jordan Robbins),2023-03-17,


In [18]:
df.to_csv("output.csv", encoding='utf-8', index=True, header=True)
df_tracks.to_csv("output_tracks.csv", encoding='utf-8', index=False, header=True)
df_relationships.to_csv("output_relationships.csv", encoding='utf-8', index=False, header=True)
df_images = df_images.drop_duplicates(subset=['id'])
df_images.to_csv("image_urls.csv", encoding='utf-8', index=False, header=True)

In [19]:
df_merged = df_tracks[df_tracks['spotifyId'].notna()].merge(df_relationships, left_on="geniusId", right_on="from_genius_id", how="inner")
df_final = df_merged.merge(df_tracks[df_tracks['spotifyId'].isna()], left_on="to_genius_id", right_on="geniusId", how="inner", suffixes=("_from", "_to"))
df_final = df_final.drop_duplicates()
df_final.to_csv("output_merged.csv", encoding='utf-8', index=False, header=True)

In [20]:
df_contributions
df_merged_contributions = df_tracks[df_tracks['spotifyId'].notna()].merge(df_contributions, left_on="geniusId", right_on="geniusId", how="inner")
df_merged_contributions
# df_merged_contributions.to_csv("output_merged_contributions.csv", encoding='utf-8', index=False, header=True)

Unnamed: 0,index,spotifyId,trackName,artistName,releaseDate,geniusId,geniusTrackName,geniusArtistName,geniusReleaseDate,trackLanguage,type,artistId,name
0,2.0,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,8498335,CHIAGNE,Geolier (Ft. Lazza & Takagi & Ketra),2022-10-28,pl,producer,667980,Takagi & Ketra
1,2.0,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,8498335,CHIAGNE,Geolier (Ft. Lazza & Takagi & Ketra),2022-10-28,pl,writer,1453163,Geolier
2,2.0,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,8498335,CHIAGNE,Geolier (Ft. Lazza & Takagi & Ketra),2022-10-28,pl,writer,380028,Lazza
3,2.0,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,8498335,CHIAGNE,Geolier (Ft. Lazza & Takagi & Ketra),2022-10-28,pl,writer,1022585,Davide Petrella
4,2.0,spotify:track:36EFgeHW1tOUyMAhZ6cjfD,CHIAGNE (feat. Lazza & Takagi & Ketra),Geolier,2022-10-28,8498335,CHIAGNE,Geolier (Ft. Lazza & Takagi & Ketra),2022-10-28,pl,writer,608753,Takagi
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630,198.0,spotify:track:1laLs6CGimqWYgwnZelz8H,Caramello,Rocco Hunt,2021-11-05,8107786,Caramello,"Rocco Hunt, Elettra Lamborghini & Lola Indigo",2022-06-17,it,primary,1530427,Lola Indigo
1631,199.0,spotify:track:1xK59OXxi2TAAAbmZK0kBL,Romantic Homicide,d4vd,2022-07-20,8214933,Romantic Homicide,d4vd,2022-07-20,en,producer,2649878,Dan Darmawan
1632,199.0,spotify:track:1xK59OXxi2TAAAbmZK0kBL,Romantic Homicide,d4vd,2022-07-20,8214933,Romantic Homicide,d4vd,2022-07-20,en,writer,3214453,d4vd
1633,199.0,spotify:track:1xK59OXxi2TAAAbmZK0kBL,Romantic Homicide,d4vd,2022-07-20,8214933,Romantic Homicide,d4vd,2022-07-20,en,writer,2649878,Dan Darmawan


In [21]:
df_contributions.to_csv("output_contributions.csv", encoding='utf-8', index=False, header=True)

In [None]:
def genres_from_genius_id(song_id):
    # Scrapes 'main' genres from genius.com
    page_url = 'http://genius.com' + "/songs/" + str(song_id)
    page = try_request("GET", page_url, None)
    if page.status_code == 404:
        # Try to get the webpage through API instead. Costs two more API calls, but is seemingly rare
        query = {
        "access_token": genius_token,
        }
        query = urllib.parse.urlencode(query)
        url = f'https://api.genius.com/songs/{genius_id}?{query}'
        response = try_request("GET", url, None)
        if response.status_code == 404:
            return [], []
        response_json = response.json()
        song = response_json.get('response', {}).get('song', {})
        url = song.get('url', "")
        if url == "":
            return [], []
        page = try_request("GET", url, None)
        
    html = BeautifulSoup(page.text, 'html.parser')
    tags = html.find(class_='SongTags-sc-b55131f0-1')
    if tags is None:
        print("No genre tags for ID: "+ str(song_id))
        return [], []
    first_tag_class = tags.find('a')['class']
    all_tags = tags.find_all('a')
    primary_tags = []
    secondary_tags = []
    for t in all_tags:
        t_class = t['class']
        if first_tag_class[1] == t_class[1]:
            primary_tags.append(t.text)
        else:
            secondary_tags.append(t.text)
    return primary_tags, secondary_tags

In [None]:
stop_time = datetime.datetime.now()
print("Total time: " + str(stop_time-start_time))

In [None]:
genres_list = []
for genius_id in tqdm(df_tracks[df_tracks["geniusId"].notna()]["geniusId"].tolist()):
    time.sleep(0.01)
    primary_genres, secondary_genres = genres_from_genius_id(genius_id)
    genres_list.append({'geniusId':genius_id, 'primaryGenres': primary_genres, 'secondaryGenres': secondary_genres})

df_genres = pd.json_normalize(genres_list)
df_genres

In [None]:
df_genres.to_csv("output_genres.csv", encoding='utf-8', index=False, header=True)