# Spotify Wrapped Project
## By Noah Tri
## Version 1.0
---

### Part 1: Importing the full Spotify streaming history from JSON files

- Set your working directory to the folder that contains the extended streaming history you downloaded from the Accounts & Privacy page on Spotify.

In [None]:
## import files and create dataframe of all of them

import pandas as pd 
import numpy as np
import json 
import glob
import re
import time

# Collect every extended-streaming-history export in the working directory.
# Sorted so the load order (and therefore which duplicate row is kept) is deterministic.
files = sorted(glob.glob("Streaming_History_Audio*.json"))
if not files:
    # Fail early with a clear message instead of a KeyError on 'date_time' further down.
    raise FileNotFoundError(
        "No 'Streaming_History_Audio*.json' files found in the current working directory"
    )

all_streams_list = []
for file in files:
    # JSON is UTF-8 by specification; be explicit so non-ASCII titles load the
    # same way regardless of the platform's default text encoding.
    with open(file, encoding="utf-8") as file_contents:
        all_streams_list.extend(json.load(file_contents))

# One row per stream, with the long Spotify field names shortened.
all_streams_df = pd.DataFrame(all_streams_list).rename(
    columns={
        'ts': 'date_time',
        'master_metadata_track_name': 'song',
        'master_metadata_album_artist_name': 'artist',
        'master_metadata_album_album_name': 'album',
    }
)
all_streams_df['date_time'] = pd.to_datetime(all_streams_df['date_time'])

# Derived calendar columns (appended in this order; the positional selection below depends on it).
all_streams_df['date'] = all_streams_df['date_time'].dt.date
all_streams_df['year'] = all_streams_df['date_time'].dt.year
all_streams_df['month'] = all_streams_df['date_time'].dt.strftime("%B")

# Keep only rows that have a song name, drop exact duplicates, and select columns by position.
# NOTE(review): the positions assume the column order of the Spotify export — confirm if the export format changes.
has_song = all_streams_df['song'].notnull()
full_files = (
    all_streams_df[has_song]
    .drop_duplicates()
    .iloc[:, [0, 2, 5, 6, 7, 8, 12, 13, 14, 15, 19, 20, 21]]
    .reset_index(drop=True)
)

# Strip the URI prefix to get the bare track ID; astype(str) turns a missing URI into the string 'None'.
full_files['track_id'] = (
    full_files['spotify_track_uri'].astype(str).str.replace('spotify:track:', "", regex=False)
)

### Part 2: Setting up access for the Spotify API

- Get your Client ID and Client Secret from the Spotify Developer Dashboard (https://developer.spotify.com/dashboard)

In [None]:
import requests

# Spotify API credentials; replace both placeholder strings with your own values.
client_id = 'enter_your_client_id_here'
# NOTE(review): the variable name is misspelled ("clinet"); the token request reads it under this exact name.
clinet_secret = 'enter_your_client_secret_here'

# Root URL that the endpoint paths are appended to.
base_url = "https://api.spotify.com"

In [None]:
# Access Token
# Exchange the client credentials for a bearer token used by every later API call.
AUTH_URL = 'https://accounts.spotify.com/api/token'
params = {'client_id': client_id, 'client_secret': clinet_secret, 'grant_type': 'client_credentials'}

# The credentials are sent as the form-encoded request body (`data=`); the
# timeout stops the cell hanging forever if the service is unreachable.
auth_response = requests.post(AUTH_URL, data=params, timeout=30)
if auth_response.status_code != 200:
    # Raise explicitly (asserts are skipped under `python -O`) and surface the server's explanation.
    raise RuntimeError(
        f"Spotify token request failed with status {auth_response.status_code}: {auth_response.text}"
    )
access_token = auth_response.json()['access_token']

# Authorization header passed to every subsequent request.
headers = {'Authorization': f"Bearer {access_token}"}


### Part 3: Combining the API data with Streaming History data

1. Get all track information using track API
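2. For every song whose album type is not `album` (e.g. a single), replace its album information with that of the full album the song also appears on, if there is one. The candidate albums are found with the artist albums API and then the albums API.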

*Note: I chose to replace albums in step 2 for consistency in album statistics later.*

In [8]:
# Look up metadata for every distinct track in the streaming history.
all_track_ids = full_files['track_id'].unique().tolist()
full_track_length = len(all_track_ids)

# Drop the 'None' placeholder BEFORE batching so no request is ever sent with an empty ID list.
valid_track_ids = [track_id for track_id in all_track_ids if track_id != 'None']

TRACKS_PER_REQUEST = 50   # batch size used for the /v1/tracks endpoint
MAX_ARTIST_COLUMNS = 4    # artist_1 .. artist_4

tracks_url = base_url + "/v1/tracks"
all_track_dicts = []

for start in range(0, len(valid_track_ids), TRACKS_PER_REQUEST):
    # Slicing past the end of the list is safe, so the last (short) batch needs no special case.
    current_list = valid_track_ids[start:start + TRACKS_PER_REQUEST]

    # Make the API call
    track_resp = requests.get(tracks_url, headers=headers, params={"ids": ",".join(current_list)})
    if track_resp.status_code != 200:
        # Raise explicitly: asserts are skipped under `python -O`.
        raise RuntimeError(f"Track request failed with status {track_resp.status_code}")

    # Build one flat record per returned track
    for track in track_resp.json()['tracks']:
        if track is None:
            # Entries in the response can be null; skip them.
            continue

        album = track['album']
        temp_dict = {
            'song_name': track['name'],
            'track_duration_ms': track['duration_ms'],
            'disc_number': track['disc_number'],
            'track_number': track['track_number'],
            'album_tracks': album['total_tracks'],
            'album_name': album['name'],
            "album_type": album['album_type'],
            'release_date': album['release_date'],
            'release_date_precision': album['release_date_precision'],
            'track_id': str(track['id']),
            "track_link": track['external_urls']['spotify'],
            'album_id': album['id'],
            "album_link": album['external_urls']['spotify'],
        }

        # Cover art lives on the album object, not on the track itself; the
        # first image is used when the list is present and non-empty.
        images = album.get('images')
        if images:
            temp_dict['album_image'] = images[0]['url']

        # Record up to four artist names; the first artist's ID is kept for the album lookup later.
        for artist_number, artist in enumerate(track['artists'][:MAX_ARTIST_COLUMNS], start=1):
            temp_dict[f"artist_{artist_number}"] = artist['name']
            if artist_number == 1:
                temp_dict['artist_id'] = artist['id']

        all_track_dicts.append(temp_dict)

df_all_tracks = pd.DataFrame(all_track_dicts)

In [9]:
# Attach the API metadata to every stream, ordered chronologically.
option_1 = (
    full_files.merge(df_all_tracks, on='track_id', how='inner')
    .drop_duplicates()
    .sort_values('date_time')
    .reset_index(drop=True)
)

# Streams whose track is attached to something other than a full album (e.g. a single).
songs_wrong_album_type = option_1.loc[
    option_1['album_type'] != "album",
    ['song_name', 'artist_id', 'ms_played', 'track_duration_ms', 'artist_1'],
]

# The 500 artists with the most listening time among those streams.
# Select the column explicitly: `.sum('ms_played')` passed the column name as
# the `numeric_only` argument rather than choosing what to sum.
list_artist_id = (
    songs_wrong_album_type.groupby('artist_id')['ms_played']
    .sum()
    .sort_values(ascending=False)
    .head(500)
    .index.tolist()
)

In [None]:
# Gather (album name, album id) pairs for each selected artist's full albums.
artist_album_list = []

for artist_id in list_artist_id:
    url = f"{base_url}/v1/artists/{artist_id}/albums"
    query = {'limit': 50, 'include_groups': "album", 'market': "US"}
    artist_resp = requests.get(url, headers=headers, params=query)
    assert artist_resp.status_code == 200, artist_resp.status_code
    artist_album_list.extend(
        {"album_name": album['name'], 'album_id': album['id']}
        for album in artist_resp.json()['items']
    )

# De-duplicate the collected albums and renumber the rows from 0.
df_replace_albums = pd.DataFrame(artist_album_list).drop_duplicates().reset_index(drop=True)

In [None]:
## Fetch every candidate album and record, per album track, the columns that
## may later overwrite a song's album information.
song_list = []

ALBUMS_PER_REQUEST = 20   # batch size used for the /v1/albums endpoint
album_url = base_url + "/v1/albums"
album_length = df_replace_albums.shape[0]

for start in range(0, album_length, ALBUMS_PER_REQUEST):
    # Positional slicing handles the final short batch without a special case.
    current_albums = df_replace_albums['album_id'].iloc[start:start + ALBUMS_PER_REQUEST]
    album_id_string = ",".join(current_albums)

    # Brief pause between requests to stay gentle on the API.
    time.sleep(.3)

    # API request
    album_resp = requests.get(album_url, headers=headers, params={'ids': album_id_string, 'market': 'US'})
    if album_resp.status_code != 200:
        # Raise explicitly: asserts are skipped under `python -O`.
        raise RuntimeError(f"Album request failed with status {album_resp.status_code}")

    # Build one record per track of each returned album
    for album in album_resp.json()['albums']:
        if album is None:
            # NOTE(review): presumably the API can return null for an album it cannot resolve — skip rather than crash.
            continue

        # Cover art is a property of the album, so look it up once per album.
        album_images = album.get('images')

        for track in album['tracks']['items']:
            temp_dict = {
                "song_name": str(track['name']),
                'disc': track['disc_number'],
                'track_number': track['track_number'],
                'track_duration_ms': track['duration_ms'],
                'track_id': track['id'],
                "track_link": track['external_urls']['spotify'],
                'album_id': album['id'],
                'album_name': album['name'],
                'album_type': album['album_type'],
                'album_tracks': album['total_tracks'],
                'release_date': album['release_date'],
                'precision': album['release_date_precision'],
                'album_link': album['external_urls']['spotify'],
            }

            if album_images:
                temp_dict['album_image'] = album_images[0]['url']

            # Only the first listed artist is recorded.
            if track['artists']:
                temp_dict['artist_1'] = track['artists'][0]['name']

            song_list.append(temp_dict)


df_songs_replace_albums = pd.DataFrame(song_list)
# For each (song, first artist) pair keep the row with the smallest release_date value.
songs_first_album = df_songs_replace_albums.loc[df_songs_replace_albums.groupby(['song_name','artist_1'])['release_date'].idxmin()]

In [16]:
# For each (song, first artist, duration) combination keep the row with the smallest release_date value.
group_cols = ['song_name', 'artist_1', 'track_duration_ms']
earliest_idx = df_songs_replace_albums.groupby(group_cols)['release_date'].idxmin()
songs_first_album = df_songs_replace_albums.loc[earliest_idx]

# Restrict to song names that were streamed from a non-album release.
non_album_song_names = (
    option_1.loc[option_1['album_type'] != "album", 'song_name']
    .drop_duplicates()
    .reset_index(drop=True)
)
songs_first_album_filtered = songs_first_album.merge(non_album_song_names, how="inner")

# Short alias read by replace_album.
x = songs_first_album_filtered

In [None]:
# define function for changing album types that are singles/compilations
def replace_album(row, replacements=None):
    """Return `row` with its album fields swapped for a matching full-album entry.

    Parameters
    ----------
    row : pandas.Series
        One stream. Must provide 'album_type', 'song_name' and 'artist_1'.
    replacements : pandas.DataFrame, optional
        Candidate album tracks with columns 'song_name', 'artist_1',
        'album_name', 'album_type', 'track_id', 'disc', 'track_number',
        'album_tracks', 'track_link', 'album_id' and 'album_link'.
        Defaults to the module-level table `x`.

    Returns
    -------
    pandas.Series
        `row` itself when it already belongs to an album or no candidate
        matches on song name and first artist; otherwise a modified copy
        taking its album fields from the first matching candidate.
    """
    if row['album_type'] == "album":
        return row

    if replacements is None:
        # Fall back to the notebook-level lookup table.
        replacements = x

    same_song = replacements['song_name'] == row['song_name']
    same_artist = replacements['artist_1'] == row['artist_1']
    candidates = replacements[same_song & same_artist]

    if candidates.empty:
        return row

    source = candidates.iloc[0]

    # Work on a copy: pandas does not support functions passed to
    # DataFrame.apply mutating the object they are given.
    updated = row.copy()
    updated['album'] = source['album_name']
    updated['album_name'] = source['album_name']
    updated['album_type'] = source['album_type']
    updated['track_id'] = source['track_id']
    updated['disc_number'] = source['disc']
    updated['track_number'] = source['track_number']
    updated['album_tracks'] = source['album_tracks']
    updated['track_link'] = source['track_link']
    updated['album_id'] = source['album_id']
    updated['album_link'] = source['album_link']
    return updated

# Run replace_album on every row (axis=1) of option_1 to build the final table.
final_streaming_history = option_1.apply(replace_album, axis = 1)


In [None]:
## Sanity check: how many original streams have no (date_time, song) match in the final table
history_keys = final_streaming_history[['date_time', 'song']]
outer = full_files.merge(history_keys, how='outer', indicator=True)

# Rows flagged 'left_only' exist in the original history but not in the final table.
anti_join = outer.loc[outer['_merge'] == 'left_only'].drop(columns='_merge')
len(anti_join)

### Part 4: Writing final_streaming_history to csv

In [None]:
# Destination of the combined history; edit this path for your own machine.
output_csv_path = "/Users/noahtri/Desktop/Spotify/Final_Streaming_History.csv"

# Save without the DataFrame index column.
final_streaming_history.to_csv(output_csv_path, index=False)