Spotify Wrapped but for all music platforms

In [34]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
import plotly.express as px

spotify = pd.read_json('StreamingHistory_music_0.json')
youtube = pd.read_json('watch-history.json')

In [35]:
#Convert spotify endTime to datetime
spotify['endTime'] = pd.to_datetime(spotify['endTime']) 
spotify['date'] = pd.to_datetime(spotify['endTime'].dt.date)
spotify['time'] = spotify['endTime'].dt.strftime('%H:%M')
spotify['hour'] = spotify['endTime'].dt.strftime('%I %p')  
spotify['hour'] = spotify['hour'].str.lstrip('0')
spotify['year'] = spotify['endTime'].dt.strftime('%Y')
spotify['month'] = spotify['endTime'].dt.strftime('%m')
spotify['month'] = pd.to_numeric(spotify['month'])

spotify = spotify[spotify['year']=='2024'] #only data from 2024

spotify.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,date,time,hour,year,month
340,2024-01-02 17:19:00,The Happy Fits,Hold Me Down,223813,2024-01-02,17:19,5 PM,2024,1
341,2024-01-02 17:24:00,Olivia Rodrigo,vampire,219724,2024-01-02,17:24,5 PM,2024,1
342,2024-01-02 17:27:00,Ricky Montgomery,Eraser,183114,2024-01-02,17:27,5 PM,2024,1
343,2024-01-02 17:32:00,Olivia Rodrigo,the grudge,189386,2024-01-02,17:32,5 PM,2024,1
344,2024-01-02 17:35:00,AJR,The Dumb Song,225946,2024-01-02,17:35,5 PM,2024,1


In [36]:
youtube = youtube[youtube['header']=='YouTube Music'] #only take data from youtube music 
youtube = youtube.drop(['titleUrl', 'products', 'activityControls', 'description', 'details', 'header'], axis=1)

#Convert youtube ListTime to datetime
youtube['ListTime'] = pd.to_datetime(youtube['time'], errors='coerce', utc=True)
youtube['date'] = pd.to_datetime(youtube['ListTime'].dt.date) 
youtube['time'] = youtube['ListTime'].dt.strftime('%H:%M')
youtube['hour'] = youtube['ListTime'].dt.strftime('%I %p') 
youtube['hour'] = youtube['hour'].str.lstrip('0')
youtube['year'] = youtube['ListTime'].dt.strftime('%Y')
youtube['month'] = youtube['ListTime'].dt.strftime('%m')
youtube['month'] = pd.to_numeric(youtube['month'])

youtube = youtube[youtube['year']=='2024'] #only take data from 2024

youtube.head()

Unnamed: 0,title,subtitles,time,ListTime,date,hour,year,month
2360,Watched Imran Khan - Pata Chalgea (Un-Official...,"[{'name': 'imrankhanworld', 'url': 'https://ww...",19:58,2024-12-31 19:58:04.072000+00:00,2024-12-31,7 PM,2024,12.0
2361,Watched Gora Gora Rang,"[{'name': 'Imran Khan - Topic', 'url': 'https:...",19:55,2024-12-31 19:55:39.313000+00:00,2024-12-31,7 PM,2024,12.0
2363,Watched Bewafa,"[{'name': 'Imran Khan - Topic', 'url': 'https:...",19:51,2024-12-31 19:51:46.123000+00:00,2024-12-31,7 PM,2024,12.0
2364,Watched Amplifier,"[{'name': 'Imran Khan - Topic', 'url': 'https:...",19:47,2024-12-31 19:47:51.414000+00:00,2024-12-31,7 PM,2024,12.0
2365,Watched Gasolina,"[{'name': 'Daddy Yankee - Topic', 'url': 'http...",19:44,2024-12-31 19:44:32.050000+00:00,2024-12-31,7 PM,2024,12.0


In [37]:
#Eliminate 'Watched' from song titles
def delete_watched(value): 
    pattern = r'^Watched '
    if re.match(pattern, value):
        cleaned = re.sub(r'^Watched ', '', value)
        return cleaned

youtube['title'] = youtube['title'].apply(delete_watched) 

#Get artist names from list in subtitles
youtube = youtube[~youtube['subtitles'].apply(lambda x: isinstance(x, float))] #drop float rows

def get_name(value): #Index artist names 
    return value[0]['name']

youtube['artist'] = youtube['subtitles'].apply(get_name) #Make artist column 

def delete_topic(value): #delete '- Topic' in names 
    pattern = r'.*\- Topic$'
    if re.match(pattern, value):
        cleaned = re.sub(r' \- Topic', '', value)
        return cleaned, True #add cleaned_flag for songs with author
    else: return value, False

youtube[['artist', 'cleaned_flag']] = youtube['artist'].apply(delete_topic).apply(pd.Series)

#Eliminate random characters from titles and artists
youtube['title'] = youtube['title'].str.replace(r"[•·]", " ", regex=True)
youtube['artist'] = youtube['artist'].str.replace(r"[•·]", " ", regex=True)

artists = youtube[youtube['cleaned_flag']==True]['artist'].unique() #Create list of clean artists

sorted_artists = sorted(artists, key=len, reverse=True) #sort greatest to shortest to match longest

#Find artist from artist name in title
def find_artist(title): #Function to find artist name in title
    for artist in artists:
        if artist.lower() in title.lower():
            return artist
    return None
for idx, row in youtube[~youtube['cleaned_flag']].iterrows():
    artist_found = find_artist(row['title']) #change artist if found in title
    if artist_found:
        youtube.at[idx, 'artist'] = artist_found 

#Clean random words from song title
def clean_title(title):
    for artist in artists: #eliminate artist from title
        pattern = re.compile(re.escape(artist), re.IGNORECASE)
        title = pattern.sub('', title)
    cuts = ['(Un-Official Video)', '(Official Video)', '(lyrics)', '-', '(Official)', '(Audio)', '(Official Music Video)', '(feat. )']
    for phrase in cuts: #eliminate phrases from title
        pattern = re.compile(re.escape(phrase), re.IGNORECASE)
        title = pattern.sub('', title)
    title = title.strip()
    return title
youtube['title'] = youtube['title'].apply(clean_title)

In [38]:
#Take select columns from both dataframes to merge 
spotify2 = spotify[['artistName', 'trackName', 'date', 'hour', 'month','msPlayed']].copy()
youtube2 = youtube[['artist', 'title', 'date', 'hour', 'month']].copy()
#Apple calls it 'End Position In Milliseconds'

#Create new column for listening platform
spotify2['platform'] = 'spotify'
youtube2['platform'] = 'youtube'
youtube2['msPlayed'] = None

spotify2.rename(columns={'artistName': 'artist', 'trackName': 'title'}, inplace=True) #rename columns 

music = pd.concat([spotify2, youtube2], ignore_index=True) #merge dataframes 
music.head()

Unnamed: 0,artist,title,date,hour,month,msPlayed,platform
0,The Happy Fits,Hold Me Down,2024-01-02,5 PM,1.0,223813,spotify
1,Olivia Rodrigo,vampire,2024-01-02,5 PM,1.0,219724,spotify
2,Ricky Montgomery,Eraser,2024-01-02,5 PM,1.0,183114,spotify
3,Olivia Rodrigo,the grudge,2024-01-02,5 PM,1.0,189386,spotify
4,AJR,The Dumb Song,2024-01-02,5 PM,1.0,225946,spotify


In [7]:
#Print list of top 5 songs with artist name 
music['title'] = music['title'].astype(str)
song_freq = pd.crosstab(index=music['title'], columns='count') #create frequency count
song_freq = song_freq.reset_index()
top_songs = song_freq.sort_values('count', ascending=False)
song_freq['title'] = song_freq['title'].astype(str)

songs = song_freq.merge(music.drop_duplicates(subset='title'), on='title') #merge artist name 
songs = songs.sort_values('count', ascending=False)
songs = songs.drop(['date', 'hour'], axis=1)
songs.head() #print top songs with artist 

Unnamed: 0,title,count,artist,month,platform
634,"Good Luck, Babe!",39,Chappell Roan,4.0,spotify
288,Calling After Me,31,Wallows,4.0,spotify
666,HOT TO GO!,31,Chappell Roan,3.0,spotify
1388,She Wants Me (To Be Loved),30,The Happy Fits,2.0,spotify
590,From The Start,30,Laufey,1.0,spotify


In [8]:
#Create graph of top 10 songs colored by artist 
top10 = songs.head(10)
artists_names = top10['artist'].unique()

fig = px.bar(top10, x="title", y="count", color="artist", color_discrete_sequence=px.colors.qualitative.Pastel, 
             orientation="v", title="Top 10 Songs")
fig.update_layout(
    xaxis=dict(categoryorder='array', categoryarray=top10['title'].tolist()))
fig.show()

In [9]:
#Create pie chart of top 10 artists
artist_freq = pd.crosstab(index=music['artist'], columns='count') #frequency of top artists 
artist_freq = artist_freq.reset_index()
top_artist = artist_freq.sort_values('count', ascending=False)
artist10 = top_artist.head(10)

fig = px.pie(artist10, values='count', names='artist', title="Top 10 Artists")
fig.show()

In [10]:
monthly_counts = music.groupby(['artist', 'month']).size().reset_index(name='listen_count')

top5_artists_overall = (monthly_counts.groupby('artist')['listen_count'].sum().sort_values(ascending=False)
    .head(5).index.tolist())
color_palette = px.colors.qualitative.Plotly
custom_color_map = {artist: color_palette[i] for i, artist in enumerate(top5_artists_overall)}

top3_artists = pd.DataFrame()
for month in range(1,13):
    df = monthly_counts[monthly_counts['month']==month]
    df = df.sort_values('listen_count', ascending=False).head(3)
    top3_artists = pd.concat([top3_artists, df])
top3_artists = top3_artists.sort_values(by=["month", "listen_count"], ascending=[True, False])

def generate_grayscale(n, start=200, end=80):
    step = (start - end) // max(n - 1, 1)
    return [f"#{v:02x}{v:02x}{v:02x}" for v in range(start, end - 1, -step)]
grayscale_colors = generate_grayscale(10) 

all_artists = top3_artists['artist'].unique()
non_top5_artists = [a for a in all_artists if a not in top5_artists_overall]
for i, artist in enumerate(non_top5_artists):
    custom_color_map[artist] = grayscale_colors[i % len(grayscale_colors)]

fig = px.bar(top3_artists, x="month", y="listen_count", color="artist", 
             color_discrete_map=custom_color_map, barmode="stack", title="Top 3 artists per month")
for trace in fig.data:
    if trace.name not in top5_artists_overall:
        trace.showlegend = False
fig.show()


In [11]:
monthly_counts = music.groupby(['title', 'month']).size().reset_index(name='listen_count')

top3_songs = pd.DataFrame()
for month in range(1,13):
    df = monthly_counts[monthly_counts['month']==month]
    df = df.sort_values('listen_count', ascending=False).head(3)
    top3_songs = pd.concat([top3_songs, df])
top3_songs = top3_songs.sort_values(by=["month", "listen_count"], ascending=[True, False])

fig = px.bar(top3_songs, x="month", y="listen_count", color="title", barmode="stack", 
             title="Top 3 songs per month")
fig.show()

In [12]:
#Top 5 artists over time
top5_art = top_artist['artist'].head(5).unique() #list of top 5 artists
top5 = music[music['artist'].isin(top5_art)]
monthly_counts = top5.groupby(['artist', 'month']).size().reset_index(name='listen_count')

fig = px.line(monthly_counts, x="month", y="listen_count", color="artist", title="Top artists over year")
fig.show()

In [13]:
#Top 5 songs over time
top10 = songs.head(10)
artists_names = top10['artist'].unique()


top5_song = top10['title'].head(5).unique() #list of top 5 songs
top5_song = music[music['title'].isin(top5_song)]
monthly_song = top5_song.groupby(['title', 'month']).size().reset_index(name='listen_count')

fig = px.line(monthly_song, x="month", y="listen_count", color="title", title="Top songs over year")
fig.show()

In [None]:
music2 = music.dropna()
hours = music2.groupby('hour')['msPlayed'].sum().reset_index()
hours["hour"] = pd.to_numeric(hours["hour"])
hours["msPlayed"] = pd.to_numeric(hours["msPlayed"]) / (1000 * 60)
fig = px.bar(hours, x='hour',y="msPlayed",title='Hourly Listening Distribution')
fig.update_layout(yaxis_title='Minutes Played')
fig.show()

In [42]:
music['day_name'] = music['date'].dt.day_name()
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hour_order = ['12 AM', '1 AM', '2 AM', '3 AM', '4 AM', '5 AM','6 AM', '7 AM', '8 AM', '9 AM', '10 AM', '11 AM',
    '12 PM', '1 PM', '2 PM', '3 PM', '4 PM', '5 PM','6 PM', '7 PM', '8 PM', '9 PM', '10 PM', '11 PM']
#music.head()
fig = px.density_heatmap(music, x='hour', y='day_name', category_orders={'day_name':day_order, 'hour':hour_order}, title="Music Listening Through the Week")
fig.update_layout(yaxis_title="Day of the Week", xaxis_title="Hour")
fig.show()

In [15]:
#Biggest listening day and artist for that day 
daily_counts = music.groupby(['date']).size().reset_index(name='listen_count').sort_values('listen_count', ascending=False)
top_day = pd.to_datetime(daily_counts.head(1)['date'].values[0]).strftime('%Y-%m-%d')
topday_count = daily_counts.head(1)['listen_count'].values[0]

topday_df = music[music['date']==top_day]
topday_df = topday_df.groupby(['artist']).size().reset_index(name='listen_count').sort_values('listen_count', ascending=False)
topday_artist = topday_df.head(1)['artist'].values[0]

top_day = pd.to_datetime(top_day).strftime('%m-%d')

print("You listened to", topday_count, "songs on", top_day+ "! Big day for you. Big day for being a fan of", topday_artist, "too it seems.")

You listened to 97 songs on 06-06! Big day for you. Big day for being a fan of Noah Kahan too it seems.


In [16]:
repeat_counts = music.groupby(['date','title']).size().reset_index(name='listen_count').sort_values('listen_count', ascending=False)
repeated_song = repeat_counts.head(1)['title'].values[0]
repeated_day = pd.to_datetime(repeat_counts.head(1)['date'].values[0]).strftime('%m-%d')
repeated_counts = repeat_counts.head(1)['listen_count'].values[0]

print("You listened to", repeated_song, repeated_counts, "times on", repeated_day+ ". A new record for you. It's that good?")

You listened to Good Luck, Babe! 7 times on 04-05. A new record for you. It's that good?


In [17]:
num_songs= repeat_counts.size
if num_songs > 15921:
    print("Wow! You listened to", num_songs, "songs this year. Better than me...")
elif num_songs < 15921:
    print("Huh, you only listened to", num_songs, "songs... I could do better.")
else:
    print("You listened to", num_songs, "songs. Samesies!")

You listened to 15921 songs. Samesies!


In [18]:
#Monthly analysis of top artists
months = [1,2]
mdf = music[music['month'].isin(months)]
monthly_artists = mdf.groupby(['artist']).size().reset_index(name='count')
top5 = monthly_artists.sort_values('count', ascending=False).head(5)
pie = px.pie(top5, values='count', names='artist', title="Top 5 Artists for Select Months")
pie.show()

In [19]:
#Monthly analysis on top songs
months = [1,2]
mdf = music[music['month'].isin(months)]
monthly_songs = mdf.groupby(['title']).size().reset_index(name='count')
top5 = monthly_songs.sort_values('count', ascending=False).head(5)
pie = px.pie(top5, values='count', names='title', title="Top 5 Songs for Select Months")
pie.show()

In [20]:
#Artist deep dive 
big5arts = music.groupby(['artist']).size().reset_index(name='listen_count').sort_values('listen_count', ascending=False).head(5)
print(big5arts)


             artist  listen_count
282      Noah Kahan           448
397  The Happy Fits           406
220          Laufey           387
77    Chappell Roan           352
480      half•alive           247


In [21]:
chosen_artist = "Noah Kahan"
bigdog = music[music['artist']==chosen_artist]

bigdog_music = bigdog.groupby(['title']).size().reset_index(name='count').sort_values('count', ascending=False).head(3)
favsongs = bigdog_music['title'].tolist()

first_listen = bigdog['date'].min().strftime('%m-%d')
print("Love at first sight. On", first_listen, "precisely for you and", chosen_artist, "that is. \n Since then you've been a big fan of", (", ".join((favsongs)[:2]))+ ", and", favsongs[2])


Love at first sight. On 01-02 precisely for you and Noah Kahan that is. 
 Since then you've been a big fan of Northern Attitude, Maine, and New Perspective


In [39]:
new = music.groupby(['title', 'date']).size().reset_index(name='count').sort_values('count', ascending=False)
def find_firstday(title):
    adf = new[new['title']==title]
    date = adf['date'].min().strftime('%Y-%m-%d')
    return date

firstdays = pd.DataFrame()
firstdays['title'] = new['title']
firstdays['firstlisten'] = pd.to_datetime(firstdays['title'].apply(find_firstday))

songrank = music.groupby(['title']).size().reset_index(name='count').sort_values('count', ascending=False)
songrank['rank'] = range(1, len(songrank) + 1)

scatterdata = songrank.merge(firstdays.drop_duplicates(subset='title'), on='title')
scatterdata.head()

Unnamed: 0,title,count,rank,firstlisten
0,"Good Luck, Babe!",39,1,2024-04-05
1,Calling After Me,31,2,2024-04-03
2,HOT TO GO!,31,3,2024-03-24
3,She Wants Me (To Be Loved),30,4,2024-01-01
4,From The Start,30,5,2024-01-13


In [41]:
px.scatter(scatterdata, "firstlisten", "rank", trendline="ols")

In [55]:
#if (selected_platforms=='youtube')
mostminutes = music.groupby('artist')['msPlayed'].sum().reset_index().sort_values('msPlayed', ascending=False)
mostminutes['hours'] = mostminutes['msPlayed'] / 3600000

topmins = mostminutes.head(5)['artist'].tolist()

topsongs = music.groupby(['artist']).size().reset_index(name='listen_count').sort_values('listen_count', ascending=False).head(5)
topsongs = topsongs['artist'].tolist()

if (sorted(topsongs) == sorted(topmins)):
    text1 = "the same artists"
    if (topsongs==topmins):
        text2 = "and in the same order!"
    else:
        text2 = " but not in the same order."
else:
    text1= "different artists"
    text2= ", interesting"

print("If we look at listening based on minutes, your top artists are", topsongs,".")
print("Which is interesting when compared to top artists by song. You've got " + text1 + text2)

If we look at listening based on minutes, your top artists are ['Noah Kahan', 'The Happy Fits', 'Laufey', 'Chappell Roan', 'half•alive'] .
Which is interesting when compared to top artists by song. You've got the same artists but not in the same order.


In [53]:
print(topsongs, topmins)

['Noah Kahan', 'The Happy Fits', 'Laufey', 'Chappell Roan', 'half•alive'] ['Noah Kahan', 'The Happy Fits', 'Laufey', 'half•alive', 'Chappell Roan']


In [None]:
#Platforms throughout the year
fig = px.histogram(music, x='date', color='platform', nbins=24, barmode='stack',
    title='Platforms used throughout the year')
fig.update_layout(xaxis_title='date', yaxis_title='songs')
fig.show()