In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pytz
import plotly.express as px
import plotly.graph_objects as go

# Folder that holds the unzipped Spotify data export. It is defined once so
# the export location only has to be changed in a single place.
DATA_DIR = '/Users/JiIn/Desktop/Data Science/Spotify/MyData_3'

streaming_history_file_0 = DATA_DIR + '/StreamingHistory0.json'
streaming_history_file_1 = DATA_DIR + '/StreamingHistory1.json'
streaming_history_file_2 = DATA_DIR + '/StreamingHistory2.json'

streaming_history_0 = pd.read_json(streaming_history_file_0)
streaming_history_1 = pd.read_json(streaming_history_file_1)
streaming_history_2 = pd.read_json(streaming_history_file_2)

# ignore_index gives the combined frame unique row labels; without it the
# per-file labels are repeated, which makes label-aligned assignment fragile.
streaming_history = pd.concat(
    [streaming_history_0, streaming_history_1, streaming_history_2],
    ignore_index = True)

streaming_history['endTime'] = pd.to_datetime(streaming_history['endTime'])

# Index the plays by their end time, mark those timestamps as UTC and convert
# them to US/Eastern. Chained (no inplace) so re-running the cell is safe.
streaming_history = (
    streaming_history
    .set_index('endTime')
    .tz_localize('UTC')
    .tz_convert('US/Eastern')
)

# The remaining export files are loaded as-is.
playlist_file = DATA_DIR + '/Playlist1.json'
playlist = pd.read_json(playlist_file)

search_query_file = DATA_DIR + '/SearchQueries.json'
search_query = pd.read_json(search_query_file)

follow_file = DATA_DIR + '/Follow.json'
follow = pd.read_json(follow_file)


In [2]:
# Preview the first five rows of the combined, time-indexed listening history.
print('Original dataframe looks like below: ')
streaming_history.head(5)

Original dataframe looks like below: 


Unnamed: 0_level_0,artistName,msPlayed,trackName
endTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-03-15 17:30:00-04:00,The Weeknd,25806,Die For You
2019-03-16 12:06:00-04:00,SZA,171805,The Weekend - Funk Wav Remix
2019-03-16 12:10:00-04:00,Jorja Smith,242035,February 3rd
2019-03-16 12:12:00-04:00,Ari Lennox,150133,Night Drive
2019-03-16 12:16:00-04:00,tobi lou,205090,Darlin'


In [3]:
# Rank tracks by how many rows (plays) carry their title and show the ten
# most played.
# NOTE(review): plays are keyed on the title only, so two different songs that
# share a title are counted together.
print('Top 10 songs I listened to the most ')
track_play_counts = streaming_history['trackName'].value_counts()
track_play_counts.head(10)

Top 10 songs I listened to the most 


Dang! (feat. Anderson .Paak)              142
NO FUN                                    136
Grace                                     135
King James                                134
Demons                                    125
TEST DRIVE                                122
Sanctuary                                 121
Summertime Magic                          115
Yada Yada                                 115
Reachin' 2 Much (feat. Lalah Hathaway)    106
Name: trackName, dtype: int64

In [76]:
# I want to understand how much music I listen to throughout the day

streaming_history_day = streaming_history.copy()

# I break down the day into 6 three-hour categories and calculate the daily
# average number of plays for each category.
dayCat = ['Early Morning', 'Mid Morning', 'Early Afternoon', 'Mid Afternoon', 'Early Evening', 'Night']
dailyBreakDown = ['6am to 9am', '9am to 12pm', '12pm to 3pm', '3pm to 6pm', '6pm to 9pm', 'After 9pm']

# Left-closed hour bins so that each category matches its dailyBreakDown label:
# [6, 9) covers 6:00-8:59, ..., [21, 24) covers 21:00-23:59.
# Plays that end between midnight and 6am fall outside every bin; they keep a
# missing category and are left out of the chart.
hourBins = [6, 9, 12, 15, 18, 21, 24]
playHour = streaming_history_day.index.hour
streaming_history_day['dayCategory'] = pd.cut(playHour, bins = hourBins, labels = dayCat, right = False)
# Position of the category inside dayCat (missing for uncategorised plays).
streaming_history_day['dayCategoryNum'] = pd.cut(playHour, bins = hourBins, labels = False, right = False)

# 1000 milliseconds in a second, 60 seconds in a minute, thus 60000 milliseconds in a minute
streaming_history_day['minPlayed'] = streaming_history_day['msPlayed']/60000

# NOTE(review): the daily mean assumes the export spans one year; confirm, or
# derive the number of days from the index range instead.
DAYS_IN_EXPORT = 365

# Number of plays per category divided by the number of days. Grouping on the
# ordered categorical with observed = False returns one value per category, in
# dayCat order, even when a category has no plays, so x and y stay aligned.
dayCategoryFreq = round(
    streaming_history_day.groupby('dayCategory', observed = False)['minPlayed'].count()/DAYS_IN_EXPORT, 2)

fig = go.Figure()
fig.add_trace(go.Scatter(x = dayCat, y = dayCategoryFreq.values,
                    mode='lines+markers',
                    name='lines+markers',
                    hovertext = dailyBreakDown))
fig.update_layout(
    annotations=[
        dict(
            x = 0.0,
            y = 1.05,
            showarrow=False,
            text = "Mean music streaming frequency throughout the day",
            xref = "paper",
            yref = "paper"
        )],
    # title.font replaces the deprecated `titlefont` axis attribute
    yaxis = dict(
        title = dict(
            text = "Mean Frequency",
            font = dict(size=15))),
    title = {
        'text': 'Daily Streaming Frequency',
        'y':0.88,
        'x':0.08,
        'xanchor': 'left',
        'yanchor': 'top'}
)
fig.show()

fig.write_html("my_daily_streaming_frequency.html")

In [79]:
# Similarly, I want to see how much music I listened each month.
# Rows are counted per (month, calendar date), so every box in the plot is the
# distribution of daily play counts within one month number.
play_month = streaming_history.index.month
play_date = streaming_history.index.date
streaming_history_group = streaming_history.groupby([play_month, play_date]).count()

# Level 0 of the grouped index is the month number; it becomes the x axis.
fig = px.box(streaming_history_group, x = streaming_history_group.index.get_level_values(0), y="msPlayed")
fig.update_layout(
    annotations=[
        dict(
            x = 0.0,
            y = 1.05,
            showarrow=False,
            text = "Daily aggregate music streaming frequency broken down by month",
            xref = "paper",
            yref = "paper"
        )],
    # title.font replaces the deprecated `titlefont` axis attribute
    xaxis = dict(
        title = dict(
            text = "Month",
            font = dict(size=15)),
        tickmode = 'linear'),
    yaxis = dict(
        title = dict(
            text = "Frequency",
            font = dict(size=15))),
    title = {
        'text': 'Daily Streaming Frequency by Month',
        'y':0.95,
        'x':0.242,
        'xanchor': 'center',
        'yanchor': 'top'}
)

fig.show()

fig.write_html("my_monthly_streaming_frequency.html")


In [7]:
# Now I want to see what genres I listen to the most.
# Spotify's API is used further down to pull each artist's genres; this cell
# only picks the artists to look up.

def _five_most_played(artist_names):
    """Return the play counts of the five most frequent names in artist_names."""
    return artist_names.value_counts().head(5)

# For every hour of the day, the five most frequently played artists.
by_hour = streaming_history.index.hour
topArtistByHour = streaming_history.groupby(by_hour)['artistName'].apply(_five_most_played)

In [8]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials #To access authorised Spotify data
import os

# The app credentials are read from the environment; a missing variable raises
# KeyError here, before any client object is built.
client_id, client_secret = (
    os.environ[variable] for variable in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET')
)

# Client-credentials manager handed to the Spotify object used to access the API.
client_credentials_manager = SpotifyClientCredentials(
    client_id = client_id,
    client_secret = client_secret,
)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [10]:
# Create a dictionary for top artists by hour
# hour: [('artist name', 'id', popularity, [genres]), ...]
# Alongside it, collect:
#   genre_only - every genre occurrence (one per matching artist per hour)
#   genre_dict - genre -> set of artist names carrying that genre
import collections
byHour_dict = {}
genre_only = []
genre_dict = collections.defaultdict(set)

# The same artist can be a top artist in several hours, so the search response
# is cached per artist name and each name costs a single API request.
search_cache = {}

for (hour, artist) in topArtistByHour.index:
    if artist not in search_cache:
        search_cache[artist] = sp.search(q = artist, type = 'artist')
    result = search_cache[artist]

    hour_entries = byHour_dict.setdefault(hour, [])

    for returned_artist in result['artists']['items']:
        genres = returned_artist['genres']

        # The search can return artists whose name differs from the query.
        # Only exact name matches that carry genres are used, so the genres of
        # unrelated search hits are no longer attributed to this artist.
        if returned_artist['name'] != artist or not genres:
            continue

        genre_only += genres
        for genre in genres:
            genre_dict[genre].add(artist)

        hour_entries.append(
            (artist, returned_artist['id'], returned_artist['popularity'], genres))
            
#print(genre_dict)

In [13]:
from collections import Counter


# Count how often each genre occurs and build one row per genre, most frequent
# first. most_common() returns (genre, count) pairs in descending count order
# and yields an empty, correctly-labelled frame when no genres were collected.
genre_count = Counter(genre_only)
genre_df = pd.DataFrame(genre_count.most_common(), columns = ['Genre', 'Frequency'])
genre_df['Artist'] = genre_df['Genre'].map(genre_dict)
genre_df['ArtistCount'] = genre_df['Artist'].map(len)

# Every trace attribute is taken from the same 20-row slice so that bars,
# colours and hover text line up and the colour range reflects only the
# genres that are actually drawn.
top_genres = genre_df.iloc[:20]

fig = go.Figure()
fig.add_trace(go.Bar(
    x = top_genres['Frequency'],
    y = top_genres['Genre'],
    orientation='h',
    marker = dict(
        colorbar = dict(title = 'Number of Artists per Genre'),
        color = top_genres['ArtistCount'], #set color equal to a variable
        colorscale = 'Viridis', # one of plotly colorscales
        showscale = True
    ),
    # readable, stable hover text instead of the raw set representation
    hovertext = [', '.join(sorted(artists)) for artists in top_genres['Artist']]
))

fig.update_layout(
    # title.font replaces the deprecated `titlefont` axis attribute
    xaxis = dict(
        title = dict(
            text = "Frequency per Genre",
            font = dict(size=15))),
    title = {
        'text': 'My Favorite Top 20 Genres',
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'}
)

fig.show()
fig.write_html("my_favorite_genre.html")
    

In [14]:
from collections import Counter

# For every hour, flatten the genre lists (element 3 of each artist tuple in
# byHour_dict) into one list of genre occurrences.
byHour_agg_genre = {
    hour: [genre for artist_entry in artist_entries for genre in artist_entry[3]]
    for hour, artist_entries in byHour_dict.items()
}

# For every hour, keep the three most frequent genres together with their
# counts, as returned by Counter.most_common(3).
byHour_common_genre = {
    hour: Counter(hour_genres).most_common(3)
    for hour, hour_genres in byHour_agg_genre.items()
}

# for i in byHour_common_genre:
#     print(i, ':', byHour_common_genre[i])


In [15]:
import sys
import spotipy
import spotipy.util as util

# As a test of the user-authorised API, list the tracks saved in my library.
# The call below is current_user_saved_tracks() under the 'user-library-read'
# scope, so the heading describes saved tracks rather than listening history.
scope = 'user-library-read'

print('The songs saved in my library: ')
# NOTE(review): inside a notebook kernel sys.argv presumably holds the kernel
# launcher's own arguments, so sys.argv[1] is unlikely to be a real Spotify
# username - confirm, and consider setting the username explicitly.
if len(sys.argv) > 1:
    username = sys.argv[1]
else:
    print("Usage: %s username" % (sys.argv[0],))
    sys.exit()

token = util.prompt_for_user_token(username, scope)

if token:
    # Replaces the client-credentials `sp` object with a user-authorised one.
    sp = spotipy.Spotify(auth = token)
    results = sp.current_user_saved_tracks()
    for item in results['items']:
        track = item['track']
        # only the first listed artist of each track is shown
        print(track['name'] + ' - ' + track['artists'][0]['name'])
else:
    print("Can't get token for", username)

The songs I recently listened to: 
Fantasy - Alina Baraz
Paramount - Tora
Can't Buy The Mood - Two Another Remix - Tora
Jaigantic (Galimatias Remix) - Tora
Turmoil - Iman Omari
Pray For You - Jhené Aiko
10k Hours (feat. Nas) - Jhené Aiko
Off The Ground - Anderson .Paak
Lite Weight (feat. The Free Nationals United Fellowship Choir) - Anderson .Paak
Joypunks - Big Wild
Gray Area (feat. Mick Jenkins) - KAYTRANADA
Stay True (feat. Helena) - Men I Trust
Dazed (feat. Gabrielle & Geoffroy) - Men I Trust
Quiet (feat. Odile) - Men I Trust
Break for Lovers (feat. Helena) - Men I Trust
Alright - Men I Trust
Say, Can You Hear - Men I Trust
Numb - Men I Trust
Lauren - Men I Trust
Dragonball Durag - Thundercat


In [16]:
# My top artists over the medium term, read with a user-authorised token.
# NOTE(review): the saved output of this cell contains the OAuth redirect URL
# (client id and authorization code) - clear the output before sharing.
scope = 'user-top-read'
spotify = spotipy.Spotify()

if len(sys.argv) > 1:
    username = sys.argv[1]
else:
    print("Usage: %s username" % (sys.argv[0],))
    sys.exit()

token = util.prompt_for_user_token(username, scope)

#dictionary of artists where key = artist name and values are id, popularity, genre, and #followers
# Created before the token check so the name exists even if authorisation fails.
top_artist_dict = {}

if token:
    sp = spotipy.Spotify(auth = token)
    # The top-items endpoint documents a maximum of 50 items per request.
    top_artist_read = sp.current_user_top_artists(limit=50, offset=0, time_range='medium_term')
    print('popularity - artist - genres')

    for artist in top_artist_read['items']:
        top_artist_dict[artist['name']] = (artist['id'], artist['popularity'], artist['genres'], artist['followers']['total'])
        print(artist['popularity'], '-', artist['name'], '-', artist['genres'])
else:
    # Previously a missing token surfaced as a NameError on top_artist_read.
    print("Can't get token for", username)

#print(top_artist_dict)





            User authentication requires interaction with your
            web browser. Once you enter your credentials and
            give authorization, you will be redirected to
            a url.  Paste that url you were directed to to
            complete the authorization.

        
Opened https://accounts.spotify.com/authorize?client_id=b4e4f4490d4a4b1a9d325f9d591eb622&response_type=code&redirect_uri=http%3A%2F%2Flocalhost%3A9000%2Fcallback&scope=user-top-read in your browser


Enter the URL you were redirected to: http://localhost:9000/callback?code=AQDR0W4QyWvOQRYZhKDUqCmy0L4bIrf8iVNBXpz7reFyPSPoKeieQAhTbZR1sisjHSQ_Dd-XWg12OvbH1aqLwpfJnIwAJ458cA4yMS9M3UemaXCjcsK2zOpq3G_fXwxlum93DTGG3SJOdYciXKa2igmGNzGAJ0Ew-9f7uv7tmfaeOcmCVyhWuWflYZBpTYmLZZ5EAzMCK_pr-jZSXn7V4xhyDg


popularity - aritst - genres
83 - Chance the Rapper - ['chicago rap', 'conscious hip hop', 'hip hop', 'pop rap', 'rap']
96 - Drake - ['canadian hip hop', 'canadian pop', 'hip hop', 'pop rap', 'rap', 'toronto rap'

In [17]:
# Top 50 tracks in the long-term
scope = 'user-top-read'
spotify = spotipy.Spotify()

if len(sys.argv) > 1:
    username = sys.argv[1]
else:
    print("Usage: %s username" % (sys.argv[0],))
    sys.exit()

token = util.prompt_for_user_token(username, scope)

# Create a dictionary whose key is track id, and values are
# [artist, release date, track name, popularity, audio features, danceability].
# Created before the token check so the name exists even if authorisation fails.
top_track_dict = {}

if token:
    sp = spotipy.Spotify(auth = token)
    # The top-items endpoint documents a maximum of 50 items per request.
    top_track_read = sp.current_user_top_tracks(limit=50, offset=0, time_range='long_term')

    print('List of tracks I listened to the most ("popularity" - "track"  by "artist"):\n')
    print('\n')
    # NOTE(review): Spotify documents track popularity as a catalogue-wide
    # score rather than a per-listener one - confirm and reword this note.
    print('Note: "popularity" was calculated by Spotify by incorporating how much I listened to each track and its artist.')

    top_tracks = top_track_read['items']
    track_ids = [track['id'] for track in top_tracks]

    # One batched request for all tracks instead of two requests per track.
    # The result has one entry per requested id, in the same order.
    all_audio_features = sp.audio_features(track_ids) if track_ids else []

    for track, audio_features in zip(top_tracks, all_audio_features):
        track_name = track['name']
        track_artist = track['artists'][0]['name']
        track_release_date = track['album']['release_date']
        track_id = track['id']
        track_popularity = track['popularity']

        # A track without audio features comes back as None; keep the track
        # with empty features instead of failing on the lookup.
        audio_features = audio_features or {}
        audio_features_danceability = audio_features.get('danceability')
        print(track_popularity, '-', track_name, 'by', track_artist)

        top_track_dict[track_id] = [track_artist, track_release_date, track_name, track_popularity, audio_features, audio_features_danceability]
else:
    # Previously a missing token surfaced as a NameError on top_track_read.
    print("Can't get token for", username)

#print(top_track_dict)



List of tracks I listened to the most ("popularity" - "track"  by "aritst"):



Note: "popularity" was calculated by Spotify by incorporating how much I listened to each track and its artist.
69 - Summertime Magic by Childish Gambino
55 - And July (feat. DEAN & DJ Friz) by HEIZE
24 - thank u, next by Ariana Grande
70 - Feels Like Summer by Childish Gambino
30 - One At A Time Please by HONNE
54 - I Got You (Always and Forever) by Chance the Rapper
62 - GOD. by Kendrick Lamar
45 - Belong in the Sun by ¿Téo?
4 - My Boo - Hitman's Club Mix by Ghost Town DJs
79 - Passionfruit by Drake
34 - Take You High by HONNE
77 - The Weekend - Funk Wav Remix by SZA
68 - Fire & Desire by Drake
43 - Grace by ADOY
72 - Dang! (feat. Anderson .Paak) by Mac Miller
73 - TEST DRIVE by Joji
52 - Jenga (feat. Gaeko) by HEIZE
67 - Poetic Justice by Kendrick Lamar
50 - Give You Up - Darius Remix by Crayon
62 - Let Go by Beau Young Prince
71 - NO FUN by Joji
65 - Handsome by Chance the Rapper
51 - You, Clouds, Rain 

In [18]:
# Create a dataframe for top tracks using top_track_dict:
# one row per track id, one column per element of the stored list.
df_top_track = pd.DataFrame.from_dict(
    top_track_dict,
    orient='index',
    columns = ['track_artist', 'track_release_date', 'track_name', 'track_popularity',
               'audio_features', 'audio_features_danceability'])

# Hover label: "<track> by <artist> released on <date>"
hover_label = (df_top_track['track_name'] + ' by ' + df_top_track['track_artist']
               + ' released on ' + df_top_track['track_release_date'])

# Popularity against release date, coloured by danceability.
fig = go.Figure(data = go.Scatter(
    y = df_top_track['track_popularity'],
    x = df_top_track['track_release_date'],
    mode ='markers',
    marker = dict(
        colorbar = dict(title = 'Danceability'),
        size = 16,
        color = df_top_track['audio_features_danceability'], #set color equal to a variable
        colorscale = 'Viridis', # one of plotly colorscales
        showscale = True
    ),
    text = hover_label,
    hovertemplate =
        "<b>%{text}</b><br><br>"))


fig.update_layout(
    # title.font replaces the deprecated `titlefont` axis attribute
    yaxis = dict(
        title = dict(
            text = "Track Popularity",
            font = dict(size=15))),
    xaxis = dict(
        title = dict(
            text = "Track Release Date",
            font = dict(size=15))),
    title = {
        'text': 'Popularity Index vs. Release Date of Top 50 Tracks by Danceability',
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'}
)

fig.show()
fig.write_html("top_50_tracks.html")

In [19]:
# What do the audio features of my favorite tracks look like?

# Regroup the per-track audio features (element 4 of every top_track_dict
# value) into feature name -> list of that feature's values across tracks.
audio_feat_dict = {}
for track_record in top_track_dict.values():
    track_features = track_record[4]
    for feat_name in track_features:
        audio_feat_dict.setdefault(feat_name, []).append(track_features[feat_name])

# I am mainly interested in the below 5 features.
# The mean of each one across all tracks is stored under the feature's name.
feat_extract = ['danceability', 'speechiness', 'valence', 'acousticness', 'energy']
feat_extract_dict = {
    feat_name: np.mean(audio_feat_dict[feat_name])
    for feat_name in feat_extract
}

In [20]:
# One row per audio feature (feature name as the index) with its mean value
# in the single 'audio_feat' column.
spider_df = pd.Series(feat_extract_dict, name = 'audio_feat').to_frame()

# Closed radar ("spider") chart of the mean feature values, filled in and
# also saved as a standalone HTML file.
fig = px.line_polar(
    spider_df,
    r = spider_df['audio_feat'],
    theta = spider_df.index,
    line_close = True,
)
fig.update_traces(fill = 'toself')
fig.show()
fig.write_html("my_music_audio_feat.html")