### Packages and Libraries

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


### Constants

In [None]:
# Spotify streaming-history exports to load: numbered files 0-3 under data/.
file_paths = [f'data/StreamingHistory_music_{index}.json' for index in range(4)]

# Hand-assigned genre labels keyed by artist name. They replace the
# automatically simplified genre for these artists later in the notebook.
manual_genres = {
    'Ben Platt': 'pop/folk',
    'Delaney Bailey': 'folk/indie',
    'Laufey': 'jazz/pop',
    'Libianca': 'r&b/afrobeats',
    'Meryl Streep': 'indie',
    'SZA': 'r&b',
    'Steve Lacy': 'r&b/indie',
}

# Spotify API credentials. Left blank on purpose: fill them in locally and
# never commit real values.
CLIENT_ID = ""
CLIENT_SECRET = ""

# Shared Spotify client used for the artist/genre lookups.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
)



### Functions

In [2]:
def indonesia_genre(genres):
    """Flag whether any genre tag mentions Indonesia.

    Args:
        genres: Iterable of genre strings, or a missing value (``None`` or a
            float NaN) when no genres are available for the artist.

    Returns:
        ``'yes'`` if at least one genre contains the substring
        ``'indonesia'``; otherwise ``'no'``, including when ``genres`` is
        missing or empty.
    """
    # Artists whose lookup found nothing (or failed) carry a missing value
    # instead of a list; treat that as "no genres" rather than raising when
    # trying to iterate over it.
    if genres is None or isinstance(genres, float):
        return 'no'

    return 'yes' if any('indonesia' in genre for genre in genres) else 'no'

def simplify_genre(genres):
    """Collapse detailed genre tags into a short set of broad genre labels.

    Every tag is checked against a list of keywords by substring match, so a
    single tag can contribute several labels (``'indie pop'`` yields both
    ``'indie'`` and ``'pop'``).

    Args:
        genres: Iterable of genre strings, or a missing value (``None`` or a
            float NaN) when no genres are available for the artist.

    Returns:
        The distinct matched labels, sorted and joined with ``'/'``.
        ``'other'`` is returned when nothing matches or ``genres`` is missing
        or empty.
    """
    # (keyword searched for inside a tag, broad label it maps to)
    keyword_labels = (
        ('pop', 'pop'),
        ('jazz', 'jazz'),
        ('rock', 'rock'),
        ('hip hop', 'hip hop/rap'),
        ('rap', 'hip hop/rap'),
        ('indie', 'indie'),
        ('folk', 'folk'),
        ('r&b', 'r&b'),
        ('country', 'country'),
        ('worship', 'worship'),
        ('drill', 'drill'),
        ('alt z', 'pop'),  # "alt z" tags are folded into pop
        ('house', 'house'),
    )

    # Artists whose lookup found nothing (or failed) carry a missing value
    # instead of a list; report them as 'other' instead of raising.
    if genres is None or isinstance(genres, float):
        return 'other'

    simplified_genres = {
        label
        for genre in genres
        for keyword, label in keyword_labels
        if keyword in genre
    }

    if not simplified_genres:
        simplified_genres.add('other')

    return '/'.join(sorted(simplified_genres))


### Read Spotify JSON Files 

In [None]:
# Gather the records from every streaming-history export into one list,
# then build a single DataFrame from them.
all_data = []

for file_path in file_paths:
    # Decode as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which can garble or reject non-ASCII artist and track names.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    all_data.extend(data)

df = pd.DataFrame(all_data)

df.head()

### Data Cleaning

In [None]:
# Interpret the timestamps as UTC and convert them to Pacific time.
# `utc=True` accepts both naive and already tz-aware values, so this cell can
# be re-run safely; a separate `tz_localize('UTC')` raises on the second run
# because the column is tz-aware by then.
df['endTime'] = (
    pd.to_datetime(df['endTime'], utc=True)
    .dt.tz_convert('America/Los_Angeles')
)

# Calendar year in local (Pacific) time, used to select 2024 plays.
df['year'] = df['endTime'].dt.year

# Independent copy so later cells can add columns without touching `df`.
df_2024 = df[df['year'] == 2024].copy()

df_2024.head()

### Exploratory Analysis

In [None]:
# Number of plays per artist in 2024, most-played first.
top_artists_by_count = df_2024.groupby('artistName', as_index=False).size()
top_artists_by_count = top_artists_by_count.rename(columns={'size': 'playCount'})
top_artists_by_count = top_artists_by_count.sort_values(by='playCount', ascending=False)

top_artists_by_count.head(10)


In [None]:
# Hour of day (0-23) at which each play ended.
df_2024['hour'] = df_2024['endTime'].dt.hour

# Total milliseconds played per hour of day, saved for the visualisation.
hourly_listening = df_2024.groupby('hour')['msPlayed'].sum()
hourly_listening.to_csv("data_viz/hourly_listening.csv")

# Bar chart of listening time by hour, with a tick for every hour of the day.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(hourly_listening.index, hourly_listening.values)
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Total Listening Time (ms)')
ax.set_title('Hourly Listening Trends')
ax.set_xticks(range(24))
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

plt.show()

In [None]:
# Names of the five artists at the top of the play-count ranking.
df_artists_5 = top_artists_by_count.head(5)['artistName']

# Take an explicit copy: a column is added below, and assigning into a
# filtered slice of df_2024 triggers pandas' SettingWithCopyWarning.
df_top_5 = df_2024[df_2024['artistName'].isin(df_artists_5)].copy()
df_top_5['year_month'] = df_top_5['endTime'].dt.to_period('M')

# Total milliseconds played per artist per month, saved for the visualisation.
monthly_trends = (
    df_top_5.groupby(['artistName', 'year_month'])['msPlayed']
    .sum()
    .reset_index()
)
monthly_trends.to_csv("data_viz/monthly_trend_top_5.csv")

top_5_artists = df_top_5['artistName'].unique()

# Create the figure once; a second plt.figure() call would leave an extra,
# empty figure in the cell output.
plt.figure(figsize=(12, 6))
for artist in top_5_artists:
    artist_data = monthly_trends[monthly_trends['artistName'] == artist]
    plt.plot(
        artist_data['year_month'].astype(str),  # periods as strings for the x axis
        artist_data['msPlayed'],
        marker='o',
        label=artist
    )

plt.xlabel('Month')
plt.ylabel('Total Listening Time (ms)')
plt.title('Monthly Listening Trends for Top 5 Artists')
plt.xticks(rotation=45)
plt.legend(title="Artist")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [23]:
# Number of plays per artist in 2024.
artists_more_20 = df_2024.groupby('artistName', as_index=False).size()
artists_more_20 = artists_more_20.rename(columns={'size': 'playCount'})

# Keep only artists played more than 20 times; their names are the ones
# looked up for genres.
artists_more_20 = artists_more_20.loc[artists_more_20['playCount'] > 20]
list_artists_genre = artists_more_20['artistName'].to_list()

### Use Spotify API for Genre

In [25]:
# One record per artist: the genre list of the first search hit, or None
# when the search returns nothing or the request fails.
artist_genre_data = []

for artist_name in list_artists_genre:
    found_genres = None
    try:
        search_result = sp.search(q=f'artist:{artist_name}', type='artist', limit=1)
        matches = search_result['artists']['items']
        if matches:
            found_genres = matches[0]['genres']
    except Exception as e:
        # Best effort: report the failure and record the artist without genres.
        print(f"Error fetching data for artist {artist_name}: {e}")
    artist_genre_data.append({'artistName': artist_name, 'genres': found_genres})

df_artist_genres = pd.DataFrame(artist_genre_data)

# Drop artists whose genre list came back empty (rows holding None are kept).
has_empty_genre_list = df_artist_genres['genres'].apply(
    lambda value: isinstance(value, list) and not value
)
df_artist_genres = df_artist_genres[~has_empty_genre_list]

df_artist_genres = df_artist_genres.reset_index(drop=True)
df_artist_genres.head()

In [None]:
# Derive two columns from each artist's genre list: a yes/no Indonesian flag
# and the simplified genre label.
df_artist_genres = df_artist_genres.assign(
    indonesian_artist=lambda frame: frame['genres'].apply(indonesia_genre),
    simplified_genres=lambda frame: frame['genres'].apply(simplify_genre),
)
df_artist_genres.head()

In [None]:
# Override the simplified genre for artists that have a hand-assigned label
# in manual_genres; every other artist keeps the computed value.
df_artist_genres['simplified_genres'] = [
    manual_genres.get(artist, computed_genre)
    for artist, computed_genre in zip(
        df_artist_genres['artistName'], df_artist_genres['simplified_genres']
    )
]

df_artist_genres.to_csv("data_viz/df_artist_genre.csv")

In [None]:
# Number of artists per simplified genre label; keep the ten most common
# and save them for the visualisation.
genre_counts = df_artist_genres['simplified_genres'].value_counts()
genre_counts = genre_counts.iloc[:10]
genre_counts.to_csv("data_viz/df_artist_genre_count.csv")
genre_counts

In [None]:
# Bar chart of the genre counts, with slanted tick labels so long genre
# names stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
genre_counts.plot.bar(ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Count of Genres', fontsize=16)
ax.set_xlabel('Genre', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
fig.tight_layout()

plt.show()


In [None]:
# Count of artists flagged 'yes' / 'no' as Indonesian. Stored under its own
# name so the genre tallies held in `genre_counts` are not overwritten, which
# would make the genre bar chart plot the wrong data if it is re-run after
# this cell.
indonesian_artist_counts = df_artist_genres['indonesian_artist'].value_counts().head(10)

# Pie chart showing the share of each answer as a percentage.
plt.figure(figsize=(8, 8))
indonesian_artist_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    colors=plt.cm.Paired.colors,
    ylabel='',  # hide the default series-name label beside the pie
    title='Indonesian Artist?'
)
plt.tight_layout()
plt.show()