# Spotify data preparation

### Import libraries

In [458]:
import os
import json
import pandas as pd
import requests
import pytz
from tzlocal import get_localzone
import datetime
import math
import json

### Define Functions

#### Import JSON

In [459]:
def union_json_files(directory):
    """
    Union multiple JSON files from a directory into a single dataframe.

    Only files whose name starts with "Streaming_" and ends with ".json" are
    read. Files are processed in sorted name order so the row order of the
    result does not depend on the arbitrary order returned by os.listdir.

    Parameters:
    - directory: Path to the directory containing the JSON files

    Returns:
    - DataFrame containing unioned data

    Raises:
    - ValueError: if the directory contains no matching file
    """
    # List files in the directory that start with "Streaming_" and end with ".json"
    file_paths = [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.startswith("Streaming_") and name.endswith(".json")
    ]

    # pd.concat would raise a ValueError with an unhelpful message on an empty
    # list; raise the same exception type with a message that names the cause.
    if not file_paths:
        raise ValueError(
            f'No "Streaming_*.json" files found in directory: {directory!r}'
        )

    dfs = []
    for path in file_paths:
        # JSON text is UTF-8 by specification (RFC 8259); do not depend on the
        # platform default encoding.
        with open(path, "r", encoding="utf-8") as file:
            dfs.append(pd.DataFrame(json.load(file)))

    # Concatenate all the dataframes together
    return pd.concat(dfs, ignore_index=True)


#### Prefilter Data

In [463]:
def prefilterData(df: pd.DataFrame):
    """
    Goal:
            - Creates timestamp
            - add or subtract hours depending on the user's location
            - Prefilters data to the current and last year
            - add start date feature
            - rename some columns
            - create feature to indicate podcasts

    Params: dataframe with spotify data from the union_json_files function.
            The dataframe passed in is not modified.

    Returns: modified df (a new dataframe)

    """
    # Parse ts as UTC. utc=True guarantees a tz-aware column, which
    # tz_convert below requires (it raises on tz-naive data).
    end_date = pd.to_datetime(df['ts'], utc=True)

    # drop() returns a new frame, so adding end_date afterwards does not
    # write into the caller's dataframe.
    df = df.drop('ts', axis=1)

    # Convert the UTC time into the local timezone of the machine running this
    local_tz = pytz.timezone(str(get_localzone()))  # Convert timezone to string
    df['end_date'] = end_date.dt.tz_convert(local_tz)

    # filters everything, where col episode_name AND master_metadata_track_name are null
    df = df.dropna(subset=['episode_name', 'master_metadata_track_name'], how='all')

    # Filter everything which is older than current year and last year.
    # "Current year" is the year of the newest entry in the data.
    lastYear = df['end_date'].max().year - 1

    # Explicit copy: the columns added below must not be written to a slice
    # of the unfiltered frame (avoids SettingWithCopyWarning).
    df = df[df['end_date'].dt.year >= lastYear].copy()

    # Add start_date as a feature
    df['start_date'] = df['end_date'] - pd.to_timedelta(df['ms_played'], unit='ms')

    # Get the Track ID from the URI (last ':'-separated segment)
    df['uri'] = df['spotify_track_uri'].str.split(':').str[-1]

    # Rename columns
    df = df.rename(columns={
        'master_metadata_track_name': 'track_name',
        'master_metadata_album_artist_name': 'artist',
        'master_metadata_album_album_name': 'album'
    })

    # Create feature if its a podcast or not: rows without a track name
    # survived the dropna above only because they have an episode name.
    df['podcast?'] = df['track_name'].isna()

    return df

#### API Call

##### OAuth

In [465]:
def getAuthentification(clientID, clientSecret, tokenURL, grantType, timeout=30):
    """
    Goal: Gets the OAuth token from the Spotify API

    Params:
        clientID, clientSecret: come from the Spotify App created here: https://developer.spotify.com/dashboard
        tokenURL: URL the token request is POSTed to
        grantType: value sent as "grant_type" in the request body
        timeout: seconds to wait for the server; without a timeout requests
                 can block indefinitely on an unresponsive server

    Returns: the "access_token" value of the JSON response when the status
             code is 200; otherwise None (status code and body are printed)

    """
    payload = {
        "client_id": clientID,
        "client_secret": clientSecret,
        "grant_type": grantType
    }

    response = requests.post(tokenURL, data=payload, timeout=timeout)

    if response.status_code != 200:
        print("Failed to obtain access token")
        print("Status Code:", response.status_code)
        print("Response:", response.text)
        return None

    return response.json().get('access_token')



In [466]:
# Smoke test of the token request.
# Credentials are read from the environment so that real secrets are not
# stored in the notebook; the fallbacks are placeholders that must be replaced.
clientID = os.environ.get('SPOTIFY_CLIENT_ID', 'yourClientID')
clientSecret = os.environ.get('SPOTIFY_CLIENT_SECRET', 'yourClientSecret')
# tokenURL must be defined here: it is passed to the call below.
tokenURL = 'https://accounts.spotify.com/api/token'
grantType = 'client_credentials'

getAuthentification(clientID, clientSecret, tokenURL, grantType)

'BQDl514RkziXMcW7BiVgMS63YoafMI9z_-r9Cu7Ykm-semZfXHBP-f-2GnyWT1ZTcTwYNdhknGgUlIYdY8ms5ecfdC8ayQgaDK5xesaJ9F0D7Ncp4jQ'

##### Prepare data for API call

In [468]:
def prepareSpotifyURIGroupsForAPI(df):
    """
    Goal: Bundle the distinct, non-null values of df['uri'] into comma-separated strings

    Params: df with a 'uri' column

    Returns: Series named 'uri', indexed by 'group' (1..number of groups); each
             value is the ','-joined ids of that group. The number of groups is
             ceil(distinct ids / 50) and ids are dealt out round-robin, so no
             group holds more than 50 ids.

    """
    distinct_ids = df['uri'].dropna().unique()
    total = len(distinct_ids)
    n_groups = math.ceil(total / 50)

    # Position p goes to group (p mod n_groups) + 1, i.e. 1, 2, ..., n_groups, 1, 2, ...
    labels = [position % n_groups + 1 for position in range(total)]

    frame = pd.DataFrame({'uri': distinct_ids})
    frame['group'] = pd.Series(labels, index=frame.index, dtype='int64')

    # Join the ids of every group into one string
    return frame.groupby('group')['uri'].apply(lambda ids: ','.join(ids))
    

##### Request

In [473]:
def trackRequest(prepared_uri):
    """
    Goal: Request the tracks endpoint once per entry of prepared_uri

    Params: prepared_uri, iterable of comma-separated track id strings

    Returns: list with the decoded JSON body of every request answered with
             status 200; other answers are printed and left out

    Note: the token is requested with the module-level clientID, clientSecret,
          tokenURL and grantType variables.

    """
    endpoint = 'https://api.spotify.com/v1/tracks?ids='
    targets = [endpoint + id_batch for id_batch in prepared_uri]

    token = getAuthentification(clientID, clientSecret, tokenURL, grantType)
    auth_header = {"Authorization": f"Bearer {token}"}

    payloads = []
    for target in targets:
        reply = requests.get(target, headers=auth_header)

        if reply.status_code != 200:
            print("Failed to access tracks endpoint")
            print("Status Code:", reply.status_code)
            print("Response:", reply.text)
            print(f"URL = {target}")
            continue

        payloads.append(reply.json())

    return payloads

##### Parse JSON

In [476]:
def parseAPIResponse(list: list) -> pd.DataFrame:

    """
    Goal: Parse the json from the API response and get only the track_id, duration and popularity. Other features can be extracted as well

    Params: list, that is the output from the trackRequest function: one dict
            per response, each holding the tracks under the 'tracks' key.
            (The parameter name shadows the builtin; it is kept so existing
            keyword callers continue to work.)

    Returns: df with the columns track_id, duration_ms and popularity; the df is
             joinable to the main df (output from the prefilterData function).
             The columns are present even when there is nothing to parse.

    """
    columns = ['track_id', 'duration_ms', 'popularity']
    track_info_list = []

    for batch in list:
        # A batch without a 'tracks' entry contributes no rows
        for track in batch.get('tracks') or []:
            # The tracks endpoint returns null in place of an id it could not
            # find; such an entry has no fields to read.
            if track is None:
                continue

            # store results in dict and append track infos to list
            track_info_list.append({
                'track_id': track.get('id', 'unknown'),
                'duration_ms': track.get('duration_ms', 0),
                'popularity': track.get('popularity', 'unknown')
            })

    # Passing columns keeps the schema stable for an empty result
    return pd.DataFrame(track_info_list, columns=columns)

#print(track_df[track_df['track_id'] == '507IFqPuG8mBQoKebfGG9t'])

##### Wrap functions together

In [477]:
def getTrackMetadata(df: pd.DataFrame,
                     clientID,
                     clientSecret,
                     tokenURL,
                     grantType) -> pd.DataFrame:
    """
    Goal: Run the full metadata lookup: group the uris, request them, parse the responses

    Params: pre-filtered df with a 'uri' column and the credentials for the token request

    Returns: df produced by parseAPIResponse

    """
    # NOTE(review): the token returned here is not used; trackRequest requests
    # its own token from the module-level credential variables.
    getAuthentification(clientID, clientSecret, tokenURL, grantType)

    uri_batches = prepareSpotifyURIGroupsForAPI(df)
    raw_responses = trackRequest(uri_batches)

    return parseAPIResponse(raw_responses)

#### Calculate top 10 tracks per year

In [478]:
def getTop10Tracks(df: pd.DataFrame) -> pd.DataFrame:
    """
    Goal: Get the top 10 tracks per year (by summed ms_played)

    Params: pre-filtered df with the columns uri, start_date and ms_played.
            NOTE: start_date is converted to datetime and a 'year' column is
            added to df in place.

    Returns: df with the columns uri and top10?; one row per track that is in
             the top 10 of at least one year. Each uri appears only once so
             that a merge on 'uri' cannot multiply rows.

    """
    df['start_date'] = pd.to_datetime(df['start_date'])

    # Extract the year into a new column
    df['year'] = df['start_date'].dt.year

    # Total play time per track and year
    played_per_year = (df.groupby(['uri', 'year'])['ms_played']
                       .sum()
                       .reset_index()
                       .dropna())

    # Within every year keep the 10 tracks with the highest play time
    top_entries = (played_per_year.sort_values(['year', 'ms_played'], ascending=False)
                   .groupby('year')
                   .head(10))

    # A track can be in the top 10 of more than one year; keep it once.
    # assign() builds a new frame instead of writing into the head() result.
    top_entries = (top_entries[['uri']]
                   .drop_duplicates()
                   .assign(**{'top10?': True}))

    return top_entries

#### Create final dataframe

In [479]:
def prepareFinalDf(preFiltered_data: pd.DataFrame,
                   metadata: pd.DataFrame,
                   top10_tracks: pd.DataFrame) -> pd.DataFrame:
    """
    Goal: Combine the streaming data with the track metadata and the top 10 flag

    Params:
        preFiltered_data: df with a 'uri' column
        metadata: df with a 'track_id' column that is matched against 'uri'
        top10_tracks: df with the columns 'uri' and 'top10?'

    Returns: df with the rows of preFiltered_data that have a metadata match
             (inner join), plus the metadata columns and a boolean 'top10?'
             column

    """
    # Join with metadata; rows without a matching track_id are dropped
    data_metadata = preFiltered_data.merge(
        metadata,
        left_on='uri',
        right_on='track_id',
        how='inner'
    )

    # Join with top10 tracks; rows that are not top 10 get a missing value
    final_df = data_metadata.merge(
        top10_tracks,
        how='left',
        on='uri'
    )

    # Turn the column into a real boolean: True stays True, missing values
    # become False. Comparing avoids fillna(False) on an object column, which
    # relies on deprecated silent downcasting.
    final_df['top10?'] = final_df['top10?'].eq(True)

    print('Success! Your table has been created!')

    return final_df

### Finale

In [None]:
# End-to-end run: read the JSON files, pre-filter them, fetch the track metadata,
# flag the top 10 tracks and build the final table.

# Directory containing the JSON files
directory = r"directory_with_your_spotify_json_files"

# Union the files
unioned_df = union_json_files(directory)

# Timestamps, year filter, start_date, uri and podcast flag
pre_filtered_data = prefilterData(unioned_df)

# Placeholders: replace with the values of your own Spotify App.
# These are module-level names; trackRequest reads clientID, clientSecret,
# tokenURL and grantType from module scope when it requests its token.
clientID = 'yourClientID'
clientSecret = 'yourClientSecret'
tokenURL = 'https://accounts.spotify.com/api/token'
grantType = 'client_credentials'

track_metadata = getTrackMetadata(df=pre_filtered_data,
                                  clientID=clientID,
                                  clientSecret=clientSecret,
                                  tokenURL=tokenURL,
                                  grantType=grantType)

# getTop10Tracks adds a 'year' column to pre_filtered_data in place, so the
# frame passed to prepareFinalDf below carries that column as well.
top_10_tracks = getTop10Tracks(pre_filtered_data)

final_df = prepareFinalDf(preFiltered_data= pre_filtered_data,
                          metadata=track_metadata,
                          top10_tracks=top_10_tracks)



# display is neither defined nor imported in this file; presumably the
# IPython/Jupyter display helper - confirm when running outside a notebook.
display(final_df)

In [481]:
# Write the final table to CSV. The file name is appended to the directory
# string without a path separator, so the file is created next to the
# directory (named "<directory>_full_streaming_history_python_version.csv"),
# not inside it.
final_df.to_csv(directory + "_full_streaming_history_python_version.csv")