In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import nltk

import re
import requests
import pandas as pd
import numpy as np
import json
import os
import dotenv
import sys
import lyricsgenius
import base64
from glob import glob
sys.tracebacklimit = 0 # turn off the error tracebacks

# Connect to Spotify and Genius APIS

In [2]:
dotenv.load_dotenv()
geniusclientid = os.getenv('genius_id')
geniusclientsecret = os.getenv('genius_secret')
geniusclientaccesstoken = os.getenv('genius_token')
spotifyclientid = os.getenv('spot_id')
spotifyclientsecret = os.getenv('spot_secret')

In [8]:
genius = lyricsgenius.Genius(geniusclientaccesstoken)

In [None]:
# r = requests.get('https://httpbin.org/user-agent')
# useragent = json.loads(r.text)['user-agent']
# headers = {'User-Agent':useragent,
#            'From':'mtv2eva@virginia.edu'}

In [None]:
# root = 'https://api.genius.com'
# endpoint = '/search?'
# parameters = {'access_token':geniusclientaccesstoken,
#               'q':'Pink Floyd'}
# r = requests.get(root+endpoint, params=parameters, headers=headers, timeout=30)

# myjson = json.loads(r.text)
# myjson

In [9]:
username = 'michael_vaden'

redirect_uri= 'https://www.virginia.edu/'

client_credentials_manager = SpotifyClientCredentials(client_id=spotifyclientid, client_secret=spotifyclientsecret)

scope = "playlist-modify-public playlist-modify-private playlist-read-private playlist-read-collaborative user-library-modify"

token = util.prompt_for_user_token(username, scope, spotifyclientid, spotifyclientsecret, redirect_uri, show_dialog=True)

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, auth=token)

# Get Songs, Stats, and Lyrics from each decade playlist

In [10]:
def get_tracks(user, playlist):
        myjson = sp.user_playlist_tracks(user, playlist)

        tracks = myjson['items']

        while myjson['next']:
            myjson = sp.next(myjson)
            tracks.extend(myjson['items'])

        return pd.json_normalize(tracks)

def get_tracks_from_url(username, url):
    playlist_URI = url.split("/")[-1].split("?")[0]

    return get_tracks(username, playlist_URI)

def add_song_features(df):
    song_features = pd.DataFrame()
    for track in df['track.uri']:
        song_features = pd.concat([song_features, pd.json_normalize(sp.audio_features(track))])

    return df.merge(song_features, left_on='track.uri', right_on='uri')

def add_song_genres(df):
    df['genres'] = [sp.artist(pd.json_normalize(df['track.artists'][x])['id'][0])['genres'] for x in range(df.shape[0])]
    return df

In [11]:
# All out 70s
playlist_link_70s = 'https://open.spotify.com/playlist/37i9dQZF1DWTJ7xPn4vNaz?si=1c451c032af84894'

# All Out 80s
playlist_link_80s = 'https://open.spotify.com/playlist/37i9dQZF1DX4UtSsGT1Sbe?si=8144b29335e74604'

# All Out 90s
playlist_link_90s = 'https://open.spotify.com/playlist/37i9dQZF1DXbTxeAdrVG2l?si=98353266ac23448a'

# All Out 2000s
playlist_link_2000s = 'https://open.spotify.com/playlist/37i9dQZF1DX4o1oenSJRJd?si=141c98a067094ac9'

# All Out 2010s
playlist_link_2010s = 'https://open.spotify.com/playlist/37i9dQZF1DX5Ejj0EkURtP?si=292a3ad73e854085'

# All Out 2020s
playlist_link_2020s = 'https://open.spotify.com/playlist/37i9dQZF1DX2M1RktxUUHG?si=d19a9dc9fc14439d'

In [12]:
songs_70s = get_tracks_from_url(username, playlist_link_70s)
songs_70s = add_song_features(songs_70s)
songs_70s = add_song_genres(songs_70s)

In [13]:
songs_80s = get_tracks_from_url(username, playlist_link_80s)
songs_80s = add_song_features(songs_80s)
songs_80s = add_song_genres(songs_80s)

In [14]:
songs_90s = get_tracks_from_url(username, playlist_link_90s)
songs_90s = add_song_features(songs_90s)
songs_90s = add_song_genres(songs_90s)

In [15]:
songs_2000s = get_tracks_from_url(username, playlist_link_2000s)
songs_2000s = add_song_features(songs_2000s)
songs_2000s = add_song_genres(songs_2000s)

In [16]:
songs_2010s = get_tracks_from_url(username, playlist_link_2010s)
songs_2010s = add_song_features(songs_2010s)
songs_2010s = add_song_genres(songs_2010s)

In [17]:
songs_2020s = get_tracks_from_url(username, playlist_link_2020s)
songs_2020s = add_song_features(songs_2020s)
songs_2020s = add_song_genres(songs_2020s)

In [18]:
songs_2020s.columns

Index(['added_at', 'is_local', 'primary_color',
       'added_by.external_urls.spotify', 'added_by.href', 'added_by.id',
       'added_by.type', 'added_by.uri', 'track.preview_url',
       'track.available_markets', 'track.explicit', 'track.type',
       'track.episode', 'track.track', 'track.album.available_markets',
       'track.album.type', 'track.album.album_type', 'track.album.href',
       'track.album.id', 'track.album.images', 'track.album.name',
       'track.album.release_date', 'track.album.release_date_precision',
       'track.album.uri', 'track.album.artists',
       'track.album.external_urls.spotify', 'track.album.total_tracks',
       'track.artists', 'track.disc_number', 'track.track_number',
       'track.duration_ms', 'track.external_ids.isrc',
       'track.external_urls.spotify', 'track.href', 'track.id', 'track.name',
       'track.popularity', 'track.uri', 'track.is_local',
       'video_thumbnail.url', 'danceability', 'energy', 'key', 'loudness',
       'mode'

In [None]:
songs_70s_filtered = songs_70s[['track.artists', 'track.duration_ms', 'track.id', 'track.name', 'track.popularity', 'danceability', 'energy', 'key', 'loudness', \
             'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'genres']]

songs_70s_filtered['track.artists'] = songs_70s_filtered['track.artists'].apply(lambda x: x[0]['name'])

songs_80s_filtered = songs_80s[['track.artists', 'track.duration_ms', 'track.id', 'track.name', 'track.popularity', 'danceability', 'energy', 'key', 'loudness', \
             'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'genres']]

songs_80s_filtered['track.artists'] = songs_80s_filtered['track.artists'].apply(lambda x: x[0]['name'])

songs_90s_filtered = songs_90s[['track.artists', 'track.duration_ms', 'track.id', 'track.name', 'track.popularity', 'danceability', 'energy', 'key', 'loudness', \
             'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'genres']]
songs_90s_filtered['track.artists'] = songs_90s_filtered['track.artists'].apply(lambda x: x[0]['name'])

songs_2000s_filtered = songs_2000s[['track.artists', 'track.duration_ms', 'track.id', 'track.name', 'track.popularity', 'danceability', 'energy', 'key', 'loudness', \
             'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'genres']]
songs_2000s_filtered['track.artists'] = songs_2000s_filtered['track.artists'].apply(lambda x: x[0]['name'])

songs_2010s_filtered = songs_2010s[['track.artists', 'track.duration_ms', 'track.id', 'track.name', 'track.popularity', 'danceability', 'energy', 'key', 'loudness', \
             'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'genres']]
songs_2010s_filtered['track.artists'] = songs_2010s_filtered['track.artists'].apply(lambda x: x[0]['name'])

songs_2020s_filtered = songs_2020s[['track.artists', 'track.duration_ms', 'track.id', 'track.name', 'track.popularity', 'danceability', 'energy', 'key', 'loudness', \
             'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'genres']]
songs_2020s_filtered['track.artists'] = songs_2020s_filtered['track.artists'].apply(lambda x: x[0]['name'])

In [20]:
songs_2020s_filtered

Unnamed: 0,track.artists,track.duration_ms,track.id,track.name,track.popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres
0,RAYE,272373,5mHdCZtVyb4DcJw8799hZp,Escapism.,81,0.538,0.742,2,-5.355,1,0.1140,0.13800,0.000047,0.0934,0.250,96.107,4,"[uk contemporary r&b, uk pop]"
1,Noah Kahan,182346,0mflMxspEfB0VbI1kyLiAv,Stick Season,94,0.664,0.500,9,-6.935,1,0.0651,0.79900,0.000000,0.0966,0.801,117.896,4,[pov: indie]
2,Sam Fender,297933,39yxhEuYmm24botBP0O8sx,Seventeen Going Under,37,0.479,0.870,1,-4.876,1,0.0366,0.00788,0.004190,0.0826,0.587,161.964,4,"[modern rock, north east england indie]"
3,Tame Impala,237800,5hM5arv9KDbCHS0k9uqwjr,Borderline,82,0.621,0.873,5,-3.067,0,0.0369,0.04060,0.000009,0.0824,0.873,97.960,4,"[australian psych, modern rock, neo-psychedeli..."
4,Zach Bryan,228013,3WMj8moIAXJhHsyLaqIIHI,Something in the Orange,88,0.369,0.192,4,-12.151,0,0.0400,0.55500,0.000008,0.0954,0.148,175.212,3,[classic oklahoma country]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,21 Savage,193591,6pcywuOeGGWeOQzdUyti6k,Glock In My Lap,83,0.847,0.733,10,-6.439,0,0.1600,0.00574,0.000010,0.1510,0.202,130.029,4,"[atl hip hop, hip hop, rap]"
146,Roddy Ricch,196652,0nbXyq5TXYPCO7pr3N8S4I,The Box,83,0.896,0.586,10,-6.687,0,0.0559,0.10400,0.000000,0.7900,0.642,116.971,4,"[melodic rap, rap, trap]"
147,S1mba,167916,4slSrbTK1sNK4I1mDYEthf,Rover (feat. DTG),69,0.613,0.624,11,-6.660,0,0.2060,0.49600,0.000000,0.2830,0.803,62.948,5,[afroswing]
148,Pop Smoke,160000,1tkg4EHVoqnhR6iFEXb60y,What You Know Bout Love,79,0.709,0.548,10,-8.493,1,0.3530,0.65000,0.000002,0.1330,0.543,83.995,4,"[brooklyn drill, rap]"


In [21]:
def add_lyrics_to_df(song_df):

    lyrics_vector = []
    for i in range(len(song_df)):
        try:
            find_song = genius.search_song(song_df['track.name'][i], song_df['track.artists'][i])
            song_lyrics = find_song.lyrics

        except:
            song_lyrics = ''
        
        if len(song_lyrics.split()) > 1000:
            song_lyrics = ''

        # regex taken from Ryan Lipps' Repository for Text Analytics
        song_lyrics = re.sub(fr'(?i)\d+(.*?lyrics)', '', song_lyrics)
        song_lyrics = re.sub(fr'(?<!\s)Embed$', '', song_lyrics)
        song_lyrics = re.sub(fr'(?i)you might also like', '', song_lyrics)
        song_lyrics = re.sub(fr'(?i)See\s(.*?)\sLiveGet\stickets\sas\slow\sas\s\$\d+', '', song_lyrics)
        song_lyrics = re.sub(fr'(?i)\[instrumental (.*?)\]', '', song_lyrics)
        song_lyrics = re.sub(fr'(?i)\[interlude\]', '', song_lyrics)
        song_lyrics = re.sub(fr'(?i)\w+(\d+)\b', '', song_lyrics)
        song_lyrics = re.sub(fr'[.,;:"?!()-]\d+', '', song_lyrics)
        song_lyrics = re.sub(fr'  ', '', song_lyrics)
        song_lyrics = re.sub(fr'  ', '', song_lyrics)
        song_lyrics = re.sub(fr'  ', '', song_lyrics)
        song_lyrics = re.sub(fr'\[(.*?)\]', '', song_lyrics)
        song_lyrics = re.sub(fr'\s\s', '\n', song_lyrics)

        if '•' in song_lyrics: # accounting for weird list output
            song_lyrics = ''
        lyrics_vector.append(song_lyrics)
    
    song_df['lyrics'] = lyrics_vector

    return song_df

In [22]:
# songs_70s_with_lyrics = add_lyrics_to_df(songs_70s_filtered)

Searching for "Dreams - 2004 Remaster" by Fleetwood Mac...
Done.
Searching for "Tiny Dancer" by Elton John...
Done.
Searching for "Vienna" by Billy Joel...
Done.
Searching for "Baba O'Riley" by The Who...
Done.
Searching for "Right Down the Line" by Gerry Rafferty...
Done.
Searching for "Heroes - 2017 Remaster" by David Bowie...
Done.
Searching for "Rocket Man (I Think It's Going To Be A Long, Long Time)" by Elton John...
Done.
Searching for "The Chain - 2004 Remaster" by Fleetwood Mac...
Done.
Searching for "Wish You Were Here" by Pink Floyd...
Done.
Searching for "Ain't No Sunshine" by Bill Withers...
Searching for "Wild World" by Yusuf / Cat Stevens...
Done.
Searching for "American Pie" by Don McLean...
Done.
Searching for "Go Your Own Way - 2004 Remaster" by Fleetwood Mac...
Done.
Searching for "A Horse with No Name" by America...
Done.
Searching for "Beast Of Burden - Remastered 1994" by The Rolling Stones...
Done.
Searching for "Rich Girl" by Daryl Hall & John Oates...
Done.
Sear

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_df['lyrics'] = lyrics_vector


In [23]:
# songs_80s_with_lyrics = add_lyrics_to_df(songs_80s_filtered)

Searching for "Total Eclipse of the Heart" by Bonnie Tyler...
Done.
Searching for "Everywhere - 2017 Remaster" by Fleetwood Mac...
Done.
Searching for "Running Up That Hill (A Deal With God)" by Kate Bush...
Done.
Searching for "Heaven Is A Place On Earth - Promo 7" Edit" by Belinda Carlisle...
No results found for: 'Heaven Is A Place On Earth - Promo 7" Edit Belinda Carlisle'
Searching for "Smalltown Boy" by Bronski Beat...
Done.
Searching for "If I Could Turn Back Time" by Cher...
Searching for "Take My Breath Away - Love Theme from "Top Gun"" by Berlin...
Done.
Searching for "Lay All Your Love On Me" by ABBA...
Done.
Searching for "Everybody Wants To Rule The World" by Tears For Fears...
Done.
Searching for "Enjoy the Silence" by Depeche Mode...
Done.
Searching for "Fast Car" by Tracy Chapman...
Done.
Searching for "With Or Without You" by U2...
Done.
Searching for "Little Lies - 2017 Remaster" by Fleetwood Mac...
Done.
Searching for "I'm On Fire" by Bruce Springsteen...
Done.
Searc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_df['lyrics'] = lyrics_vector


In [24]:
# songs_90s_with_lyrics = add_lyrics_to_df(songs_90s_filtered)

Searching for "No Scrubs" by TLC...
Done.
Searching for "Bitter Sweet Symphony - Remastered 2016" by The Verve...
No results found for: 'Bitter Sweet Symphony - Remastered 2016 The Verve'
Searching for "Torn" by Natalie Imbruglia...
Done.
Searching for "You Get What You Give" by New Radicals...
Done.
Searching for "Bitch" by Meredith Brooks...
Done.
Searching for "Save Tonight" by Eagle-Eye Cherry...
Done.
Searching for "All I Wanna Do" by Sheryl Crow...
Searching for "She's so High" by Tal Bachman...
Done.
Searching for "Scar Tissue" by Red Hot Chili Peppers...
Done.
Searching for "Say My Name" by Destiny's Child...
Done.
Searching for "Iris" by The Goo Goo Dolls...
Done.
Searching for "Fade Into You" by Mazzy Star...
Done.
Searching for "Alright" by Supergrass...
Done.
Searching for "Closing Time" by Semisonic...
Searching for "Wonderwall - Remastered" by Oasis...
Done.
Searching for "1979 - Remastered 2012" by The Smashing Pumpkins...
Done.
Searching for "Under the Bridge" by Red Ho

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_df['lyrics'] = lyrics_vector


In [25]:
# songs_2000s_with_lyrics = add_lyrics_to_df(songs_2000s_filtered)

Searching for "Since U Been Gone" by Kelly Clarkson...
Done.
Searching for "Yeah! (feat. Lil Jon & Ludacris)" by USHER...
Done.
Searching for "Murder On The Dancefloor" by Sophie Ellis-Bextor...
Searching for "Sweet Disposition" by The Temper Trap...
Done.
Searching for "Crazy In Love (feat. Jay-Z)" by Beyoncé...
Done.
Searching for "Walking On A Dream" by Empire Of The Sun...
Done.
Searching for "Kids" by MGMT...
Done.
Searching for "Young Folks" by Peter Bjorn and John...
Done.
Searching for "Naive" by The Kooks...
Done.
Searching for "Apologize" by Timbaland...
Done.
Searching for "Home (2019 Remaster)" by Edward Sharpe & The Magnetic Zeros...
No results found for: 'Home (2019 Remaster) Edward Sharpe & The Magnetic Zeros'
Searching for "You've Got The Love" by Florence + The Machine...
Done.
Searching for "Intro" by The xx...
Specified song does not contain lyrics. Rejecting.
Searching for "Use Somebody" by Kings of Leon...
Done.
Searching for "Mr. Brightside" by The Killers...
Done

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_df['lyrics'] = lyrics_vector


In [26]:
# songs_2010s_with_lyrics = add_lyrics_to_df(songs_2010s_filtered)

Searching for "bad guy" by Billie Eilish...
Done.
Searching for "Midnight City" by M83...
Done.
Searching for "Stolen Dance" by Milky Chance...
Done.
Searching for "Ophelia" by The Lumineers...
Done.
Searching for "when the party's over" by Billie Eilish...
Done.
Searching for "Little Talks" by Of Monsters and Men...
Done.
Searching for "The Less I Know The Better" by Tame Impala...
Done.
Searching for "Feel So Close - Radio Edit" by Calvin Harris...
Done.
Searching for "Team" by Lorde...
Done.
Searching for "I Will Wait" by Mumford & Sons...
Done.
Searching for "Riptide" by Vance Joy...
Done.
Searching for "3 Nights" by Dominic Fike...
Done.
Searching for "Budapest" by George Ezra...
Done.
Searching for "Ho Hey" by The Lumineers...
Done.
Searching for "Roses" by The Chainsmokers...
Done.
Searching for "I Got U" by Duke Dumont...
Done.
Searching for "Electric Love" by BØRNS...
Done.
Searching for "Waves - Robin Schulz Radio Edit" by Mr. Probz...
Done.
Searching for "Nevermind" by Denni

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_df['lyrics'] = lyrics_vector


In [27]:
# songs_2020s_with_lyrics = add_lyrics_to_df(songs_2020s_filtered)

Searching for "Escapism." by RAYE...
Done.
Searching for "Stick Season" by Noah Kahan...
Done.
Searching for "Seventeen Going Under" by Sam Fender...
Done.
Searching for "Borderline" by Tame Impala...
Done.
Searching for "Something in the Orange" by Zach Bryan...
Done.
Searching for "Afraid To Feel" by LF SYSTEM...
Done.
Searching for "Where Are You Now" by Lost Frequencies...
Done.
Searching for "Meet Me At Our Spot" by THE ANXIETY...
Done.
Searching for "Heat Waves" by Glass Animals...
Done.
Searching for "Strangers" by Kenya Grace...
Done.
Searching for "Love Tonight (David Guetta Remix Edit)" by Shouse...
Done.
Searching for "Beautiful Things" by Benson Boone...
Done.
Searching for "Miracle (with Ellie Goulding)" by Calvin Harris...
Done.
Searching for "(It Goes Like) Nanana - Edit" by Peggy Gou...
Done.
Searching for "I Ain't Worried" by OneRepublic...
Done.
Searching for "Ferrari" by James Hype...
Done.
Searching for "Lose Control" by Teddy Swims...
Done.
Searching for "ceilings"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  song_df['lyrics'] = lyrics_vector


# Build LIB Table

In [None]:
# songs_70s_with_lyrics['Decade'] = "70s"
# songs_80s_with_lyrics['Decade'] = "80s"
# songs_90s_with_lyrics['Decade'] = "90s"
# songs_2000s_with_lyrics['Decade'] = "2000s"
# songs_2010s_with_lyrics['Decade'] = "2010s"
# songs_2020s_with_lyrics['Decade'] = "2020s"

In [119]:
# Save query results so we don't need to rerun
# songs_70s_with_lyrics.to_csv('data/song_dfs/70s_df.csv')
# songs_80s_with_lyrics.to_csv('data/song_dfs/80s_df.csv')
# songs_90s_with_lyrics.to_csv('data/song_dfs/90s_df.csv')
# songs_2000s_with_lyrics.to_csv('data/song_dfs/2000s_df.csv')
# songs_2010s_with_lyrics.to_csv('data/song_dfs/2010s_df.csv')
# songs_2020s_with_lyrics.to_csv('data/song_dfs/2020s_df.csv')

In [126]:
songs_70s_with_lyrics = pd.read_csv('data/song_dfs/70s_df.csv').drop('Unnamed: 0', axis=1)
songs_80s_with_lyrics = pd.read_csv('data/song_dfs/80s_df.csv').drop('Unnamed: 0', axis=1)
songs_90s_with_lyrics = pd.read_csv('data/song_dfs/90s_df.csv').drop('Unnamed: 0', axis=1)
songs_2000s_with_lyrics = pd.read_csv('data/song_dfs/2000s_df.csv').drop('Unnamed: 0', axis=1)
songs_2010s_with_lyrics = pd.read_csv('data/song_dfs/2010s_df.csv').drop('Unnamed: 0', axis=1)
songs_2020s_with_lyrics = pd.read_csv('data/song_dfs/2020s_df.csv').drop('Unnamed: 0', axis=1)

In [4]:
temp_lib = pd.concat([songs_70s_with_lyrics, songs_80s_with_lyrics, songs_90s_with_lyrics, songs_2000s_with_lyrics, songs_2010s_with_lyrics, songs_2020s_with_lyrics])

In [55]:
SONG_LIB = temp_lib.rename({'track.artists':'artist'}, axis = 1).drop('track.id', axis=1)

SONG_LIB.index.name = 'song_id'

SONG_LIB

Unnamed: 0_level_0,artist,track.duration_ms,track.name,track.popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,lyrics,Decade
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,Fleetwood Mac,257800,Dreams - 2004 Remaster,86,0.828,0.492,0,-9.744,1,0.0276,0.06440,0.004280,0.1280,0.789,120.151,4,"['album rock', 'classic rock', 'rock', 'soft r...",Stevie Nicks - Edge of Seventeen ( Remaster)\n...,70s
1,Elton John,377093,Tiny Dancer,78,0.414,0.428,0,-11.097,1,0.0278,0.38200,0.000243,0.1480,0.282,145.075,4,"['glam rock', 'mellow gold', 'piano rock', 'ro...","\nBlue jean baby, L.A. lady\nSeamstress for th...",70s
2,Billy Joel,214240,Vienna,84,0.532,0.495,10,-6.662,1,0.0343,0.65900,0.000000,0.0754,0.308,124.936,4,"['album rock', 'classic rock', 'heartland rock...","\nSlow down, you crazy child\nYou're so ambiti...",70s
3,The Who,300400,Baba O'Riley,76,0.489,0.724,5,-8.367,1,0.0352,0.31300,0.185000,0.2870,0.150,117.292,4,"['album rock', 'british invasion', 'classic ro...","\nOut here in the fields, I fight for my meals...",70s
4,Gerry Rafferty,267773,Right Down the Line,74,0.783,0.322,0,-15.091,1,0.0343,0.21300,0.002430,0.1140,0.770,128.226,4,"['art rock', 'classic rock', 'folk rock', 'mel...",\nYou know I need your love\nYou've got that h...,70s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,21 Savage,193591,Glock In My Lap,83,0.847,0.733,10,-6.439,0,0.1600,0.00574,0.000010,0.1510,0.202,130.029,4,"['atl hip hop', 'hip hop', 'rap']","\nY'all niggas stop playin', nigga\nY'all nigg...",2020s
146,Roddy Ricch,196652,The Box,83,0.896,0.586,10,-6.687,0,0.0559,0.10400,0.000000,0.7900,0.642,116.971,4,"['melodic rap', 'rap', 'trap']",\nPullin' out the coupe at the lot\nTold 'em f...,2020s
147,S1mba,167916,Rover (feat. DTG),69,0.613,0.624,11,-6.660,0,0.2060,0.49600,0.000000,0.2830,0.803,62.948,5,['afroswing'],\nShorty said she coming with her bredrins\n'C...,2020s
148,Pop Smoke,160000,What You Know Bout Love,79,0.709,0.548,10,-8.493,1,0.3530,0.65000,0.000002,0.1330,0.543,83.995,4,"['brooklyn drill', 'rap']",\nUh\n\nShawty go joggin' every morning (Every...,2020s


In [97]:
# More regex:
# Check if lyrics contain 'Remaster' or 'Remastered' or 'feat.' to remove text where it is an arbitrary list of songs
# Check if lyrics are less than 20 chars long
# Remove some figures from other languages 

SONG_LIB = SONG_LIB[~SONG_LIB['lyrics'].str.contains(r'übersehen', case=False, na=False)]
SONG_LIB = SONG_LIB[~SONG_LIB['lyrics'].str.contains(r'zurück', case=False, na=False)]
SONG_LIB = SONG_LIB[~SONG_LIB['lyrics'].str.contains(r'ọmọ', case=False, na=False)]
SONG_LIB = SONG_LIB[~SONG_LIB['lyrics'].str.contains(r'şöyle', case=False, na=False)]
SONG_LIB = SONG_LIB[~SONG_LIB['lyrics'].str.contains(r'在', case=False, na=False)]
SONG_LIB = SONG_LIB[~SONG_LIB['lyrics'].str.contains(r'Remaster|Remastered|feat.', case=False, na=False)]
SONG_LIB = SONG_LIB[SONG_LIB['lyrics'].str.len() >= 20]


# Reset index
SONG_LIB.reset_index(drop=True, inplace=True)

In [98]:
SONG_LIB

Unnamed: 0,artist,track.duration_ms,track.name,track.popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,lyrics,Decade
0,Elton John,377093,Tiny Dancer,78,0.414,0.428,0,-11.097,1,0.0278,0.38200,0.000243,0.1480,0.282,145.075,4,"['glam rock', 'mellow gold', 'piano rock', 'ro...","\nBlue jean baby, L.A. lady\nSeamstress for th...",70s
1,Billy Joel,214240,Vienna,84,0.532,0.495,10,-6.662,1,0.0343,0.65900,0.000000,0.0754,0.308,124.936,4,"['album rock', 'classic rock', 'heartland rock...","\nSlow down, you crazy child\nYou're so ambiti...",70s
2,The Who,300400,Baba O'Riley,76,0.489,0.724,5,-8.367,1,0.0352,0.31300,0.185000,0.2870,0.150,117.292,4,"['album rock', 'british invasion', 'classic ro...","\nOut here in the fields, I fight for my meals...",70s
3,Gerry Rafferty,267773,Right Down the Line,74,0.783,0.322,0,-15.091,1,0.0343,0.21300,0.002430,0.1140,0.770,128.226,4,"['art rock', 'classic rock', 'folk rock', 'mel...",\nYou know I need your love\nYou've got that h...,70s
4,Elton John,281613,"Rocket Man (I Think It's Going To Be A Long, L...",82,0.601,0.532,10,-9.119,1,0.0286,0.43300,0.000006,0.0925,0.342,136.576,4,"['glam rock', 'mellow gold', 'piano rock', 'ro...",\nShe packed my bags last night pre-flight\nZe...,70s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,21 Savage,193591,Glock In My Lap,83,0.847,0.733,10,-6.439,0,0.1600,0.00574,0.000010,0.1510,0.202,130.029,4,"['atl hip hop', 'hip hop', 'rap']","\nY'all niggas stop playin', nigga\nY'all nigg...",2020s
648,Roddy Ricch,196652,The Box,83,0.896,0.586,10,-6.687,0,0.0559,0.10400,0.000000,0.7900,0.642,116.971,4,"['melodic rap', 'rap', 'trap']",\nPullin' out the coupe at the lot\nTold 'em f...,2020s
649,S1mba,167916,Rover (feat. DTG),69,0.613,0.624,11,-6.660,0,0.2060,0.49600,0.000000,0.2830,0.803,62.948,5,['afroswing'],\nShorty said she coming with her bredrins\n'C...,2020s
650,Pop Smoke,160000,What You Know Bout Love,79,0.709,0.548,10,-8.493,1,0.3530,0.65000,0.000002,0.1330,0.543,83.995,4,"['brooklyn drill', 'rap']",\nUh\n\nShawty go joggin' every morning (Every...,2020s


In [77]:
SONG_LIB.dtypes

artist                object
track.duration_ms      int64
track.name            object
track.popularity       int64
danceability         float64
energy               float64
key                    int64
loudness             float64
mode                   int64
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
time_signature         int64
genres                object
lyrics                object
Decade                object
dtype: object

In [99]:
import os

# Write each decade as a text file to the 'decades' directory
for decade in SONG_LIB.Decade.unique():
    decade_text = ''
    decade_df = SONG_LIB.query("Decade == @decade")

    file_path = os.path.join("data", "decades", f"{decade}.txt")
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, 'w') as outfile:
        for i in range(len(decade_df)):
            lyrics = str(decade_df.iloc[i]['lyrics'])
            track = decade_df.iloc[i]['track.name']
            artist = decade_df.iloc[i]['artist']

            lyrics = f'\n\n[Trackname: {track}] [Artist: {artist}]\n' + lyrics
            outfile.write(lyrics)

    print(f'Wrote playlist {decade} to file {file_path}')


Wrote playlist 70s to file data/decades/70s.txt
Wrote playlist 80s to file data/decades/80s.txt
Wrote playlist 90s to file data/decades/90s.txt
Wrote playlist 2000s to file data/decades/2000s.txt
Wrote playlist 2010s to file data/decades/2010s.txt
Wrote playlist 2020s to file data/decades/2020s.txt


In [100]:
LIB = SONG_LIB.groupby('Decade').agg({
    'danceability':'mean',
    'energy':'mean',
    'loudness':'mean',
    'speechiness':'mean',
    'acousticness':'mean',
    'instrumentalness':'mean',
    'liveness':'mean',
    'valence':'mean',
    'tempo':'mean'
}).reset_index()

In [101]:
LIB['source_file_path'] = LIB.Decade.apply(lambda x: f'data/decades/{x}.txt')
LIB['song_regex'] = r'\[Trackname:\s[^\]]+\]\s\[Artist:\s[^\]]+\]'
#LIB['song_regex'] = '\[Trackname:\s[^\]]+\]'

LIB.index.name = 'decade_id'


In [102]:
LIB['document_length'] = LIB['source_file_path'].apply(lambda path: len(open(path, 'r').read()))

In [103]:
LIB

Unnamed: 0_level_0,Decade,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,source_file_path,song_regex,document_length
decade_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,2000s,0.593643,0.760881,-5.696056,0.067888,0.098199,0.037857,0.182493,0.537506,124.994357,data/decades/2000s.txt,\[Trackname:\s[^\]]+\]\s\[Artist:\s[^\]]+\],216931
1,2010s,0.634008,0.6732,-6.217608,0.064475,0.166805,0.022972,0.177836,0.461078,118.889683,data/decades/2010s.txt,\[Trackname:\s[^\]]+\]\s\[Artist:\s[^\]]+\],195291
2,2020s,0.675517,0.619595,-6.329922,0.079997,0.28491,0.00968,0.183418,0.542036,122.16781,data/decades/2020s.txt,\[Trackname:\s[^\]]+\]\s\[Artist:\s[^\]]+\],212348
3,70s,0.571966,0.611978,-10.020067,0.053069,0.310864,0.032363,0.171034,0.63227,122.576472,data/decades/70s.txt,\[Trackname:\s[^\]]+\]\s\[Artist:\s[^\]]+\],127933
4,80s,0.618161,0.681151,-9.199699,0.048652,0.17732,0.023871,0.155342,0.664387,119.188785,data/decades/80s.txt,\[Trackname:\s[^\]]+\]\s\[Artist:\s[^\]]+\],151152
5,90s,0.572694,0.718213,-7.624593,0.048655,0.110941,0.060286,0.175645,0.585344,115.385731,data/decades/90s.txt,\[Trackname:\s[^\]]+\]\s\[Artist:\s[^\]]+\],174393


# Parse Decades into CORPUS

In [104]:
source_file_list = sorted(glob(f"data/decades/*.*"))

source_file_list

['data/decades/2000s.txt',
 'data/decades/2010s.txt',
 'data/decades/2020s.txt',
 'data/decades/70s.txt',
 'data/decades/80s.txt',
 'data/decades/90s.txt']

In [105]:
nltk_resources = [
    'tokenizers/punkt', 
    'taggers/averaged_perceptron_tagger', 
    'corpora/stopwords', 
    'help/tagsets'
]
for rsc in nltk_resources:
    try:
        nltk.data.find(rsc)
    except IndexError:
        nltk.download(rsc)

In [85]:
OHCO = ['decade_id', 'song_num', 'stanza_num', 'line_num', 'token_num']

In [106]:
def parse_decade(decade_id: int, decade_name: str, song_regex: str):
    with open(f'data/decades/{decade_name}.txt', 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()
    LINES = pd.DataFrame({'line_str': lines}, columns=['line_str'])
    LINES.index.name = 'line_num'
    LINES['line_str'] = LINES['line_str'].str.strip()

    chap_pat = LIB.iloc[0].song_regex
    chap_lines = LINES['line_str'].str.match(chap_pat, case=False)

    LINES.loc[chap_lines, 'song_num'] = range(sum(chap_lines))
    LINES['song_num'] = LINES['song_num'].ffill()

    LINES = LINES.dropna(subset=['song_num'])
    LINES = LINES.loc[~chap_lines]
    LINES['song_num'] = LINES['song_num'].astype(int)

    SONGS = LINES.groupby('song_num')['line_str'].apply('\n'.join).to_frame('song_str')
    SONGS['song_str'] = SONGS['song_str'].str.strip()

    stanza_pat = r'\n\n+'
    STANZAS = SONGS['song_str'].str.split(stanza_pat, expand=True).stack().to_frame('stanza_str').sort_index()
    STANZAS.index.names = ['song_num', 'stanza_num']

    STANZAS['stanza_str'] = STANZAS['stanza_str'].str.strip()
    STANZAS = STANZAS[~STANZAS['stanza_str'].str.match(r'^\s*$')]

    line_pat = r'\n'
    LINES = STANZAS['stanza_str'].str.split(line_pat, expand=True).stack().to_frame('line_str')
    LINES.index.names = ['song_num', 'stanza_num', 'line_num']

    LINES = LINES[~LINES['line_str'].str.match(r'^\s*$')]
    LINES['line_str'] = LINES['line_str'].str.strip()

    TOKENS = LINES['line_str'].apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\
        .stack().to_frame('pos_tuple')
    TOKENS.index.names = ['song_num', 'stanza_num', 'line_num', 'token_num']

    TOKENS = pd.concat({decade_id: TOKENS}, names=['decade_id'])

    return TOKENS


In [107]:
decades = []

for decade_id in LIB.index:
    decades.append(parse_decade(decade_id, LIB.iloc[decade_id].Decade, LIB.iloc[decade_id].song_regex))

In [108]:
CORPUS = pd.concat(decades).sort_index()

CORPUS['pos'] = CORPUS.pos_tuple.apply(lambda x: x[1])
CORPUS['token_str'] = CORPUS.pos_tuple.apply(lambda x: x[0])
CORPUS['term_str'] = CORPUS.token_str.str.lower().str.replace(r"\W+", "", regex=True)
CORPUS['pos_group'] = CORPUS.pos.str[:2]

In [109]:
CORPUS = CORPUS[CORPUS.term_str != '']
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
decade_id,song_num,stanza_num,line_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,0,0,"(Here's, NNP)",NNP,Here's,heres,NN
0,0,0,0,1,"(the, DT)",DT,the,the,DT
0,0,0,0,2,"(thing,, NN)",NN,"thing,",thing,NN
0,0,0,0,3,"(we, PRP)",PRP,we,we,PR
0,0,0,0,4,"(started, VBD)",VBD,started,started,VB
...,...,...,...,...,...,...,...,...,...
5,107,9,2,1,"(y'all, PRP)",PRP,y'all,yall,PR
5,107,9,2,2,"(deep, VBP)",VBP,deep,deep,VB
5,107,9,3,0,"(Y'all, DT)",DT,Y'all,yall,DT
5,107,9,3,1,"(are, VBP)",VBP,are,are,VB


In [110]:
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()    
VOCAB['p'] = VOCAB.n/VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB['max_pos'] = CORPUS[['term_str', 'pos']].value_counts()\
    .unstack(fill_value=0)\
    .idxmax(1)
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts()\
    .unstack(fill_value=0)\
    .idxmax(1)

In [111]:
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

In [112]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [116]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,stem_porter,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,2,1,0.000010,16.660497,CD,CD,1,0
10k,2,3,0.000010,16.660497,CD,CD,10k,0
1k,1,2,0.000005,17.660497,CD,CD,1k,0
1s,1,2,0.000005,17.660497,CD,CD,1s,0
2,3,1,0.000014,16.075535,CD,CD,2,0
...,...,...,...,...,...,...,...,...
è,12,1,0.000058,14.075535,NNP,NN,è,0
él,3,2,0.000014,16.075535,NNP,NN,él,0
еlla,1,4,0.000005,17.660497,IN,IN,еlla,0
еsta,1,4,0.000005,17.660497,VBZ,VB,еsta,0


### Write Core Tables to CSV

In [114]:
CORPUS.columns

Index(['pos_tuple', 'pos', 'token_str', 'term_str', 'pos_group'], dtype='object')

In [120]:
LIB.to_csv('data/core_tables/LIB.csv')
CORPUS.to_csv('data/core_tables/CORPUS.csv')
VOCAB.to_csv('data/core_tables/VOCAB.csv')