In [None]:
import os
import datetime

import pandas as pd
import matplotlib.pyplot as plt

import ipywidgets as widgets
from ipywidgets import interact

In [None]:
# GATHER ALL THE DATA

# Folder that holds the JSON files to load.
path_to_json = 'my_spotify_data/'

# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the row order of complete_df) reproducible between runs.
json_files = sorted(
    file_name for file_name in os.listdir(path_to_json)
    if file_name.endswith('.json')
)
if not json_files:
    # pd.concat([]) would otherwise fail with the unhelpful
    # "No objects to concatenate".
    raise FileNotFoundError(f"No .json files found in '{path_to_json}'")

frames = [
    pd.read_json(os.path.join(path_to_json, file_name))
    for file_name in json_files
]

# ignore_index=True gives every row a unique label; without it each file
# contributes its own 0..n-1 labels and the combined index has duplicates.
complete_df = pd.concat(frames, ignore_index=True)
complete_df

In [None]:
# SANITIZE DATA

# columns needed further down; every other column is dropped
columns_to_keep = [
    'ts',
    'date',
    'ms_played',
    'platform',
    'conn_country',
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
    'spotify_track_uri'
]

# rows without a track URI are treated as podcasts and removed
has_track_uri = complete_df['spotify_track_uri'].notna()

# only plays longer than 15 seconds (15000 ms) are kept
played_long_enough = complete_df['ms_played'] > 15000

# explicit copy so the new columns below are written to an independent frame
df = complete_df[has_track_uri & played_long_enough].copy()

# convert ts from string to datetime and derive the calendar date of each play
df['ts'] = pd.to_datetime(df['ts'], utc=False)
df['date'] = df['ts'].dt.date

# reduce to the needed columns and order the plays chronologically
df = df[columns_to_keep].sort_values(by=['ts'])
songs_df = df.copy()
songs_df

In [None]:
# TOP SONGS OF ALL TIME

# number of plays per track URI, most played first
play_counts = (
    songs_df.groupby(['spotify_track_uri'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
    .reset_index(drop=True)
)

# one row of metadata per track (the first play of each URI)
track_info = songs_df.drop_duplicates(subset='spotify_track_uri')

# attach track / artist / album names to the counts
df = play_counts.merge(track_info, on='spotify_track_uri')
df = df[[
    'master_metadata_track_name',
    'master_metadata_album_artist_name',
    'master_metadata_album_album_name',
    'count',
]]
df.head(20)

In [None]:
# TOP SONGS IN 2023

def top_songs_in_year(year):
    """Return the 20 most played tracks of one calendar year.

    Also prints the total listening time of that year.

    Parameters
    ----------
    year : int
        Calendar year, compared against the year of the 'ts' column.

    Returns
    -------
    pandas.DataFrame
        Track name, artist name, album name and play count, ordered by
        descending play count (at most 20 rows).
    """
    plays = songs_df[songs_df['ts'].dt.year == year]

    # 'ms_played' is in milliseconds; timedelta renders it human-readable
    listened = datetime.timedelta(milliseconds=int(plays['ms_played'].sum()))
    print(f"Time listened in {year}: {listened}")

    play_counts = (
        plays.groupby(['spotify_track_uri'])
        .size()
        .reset_index(name='count')
        .sort_values(by=['count'], ascending=False)
        .reset_index(drop=True)
    )

    # metadata is looked up in the full history, one row per track URI
    track_info = songs_df.drop_duplicates(subset='spotify_track_uri')
    ranking = play_counts.merge(track_info, on='spotify_track_uri')

    wanted_columns = [
        'master_metadata_track_name',
        'master_metadata_album_artist_name',
        'master_metadata_album_album_name',
        'count',
    ]
    return ranking[wanted_columns].head(20)


top_songs_in_year(2023)

In [None]:
# CREATE WIDGET UTILS

# Calendar days covered by the data. songs_df is sorted by 'ts', so the first
# and last rows hold the earliest and latest play.
# Both ends are normalized to midnight: pd.date_range steps in whole days from
# the start *including its time of day*, so with the raw timestamps the final
# day was missing from the slider whenever the last play happened earlier in
# the day than the first one.
first_day = pd.Timestamp(songs_df['ts'].values[0]).normalize()
last_day = pd.Timestamp(songs_df['ts'].values[-1]).normalize()
dates = pd.date_range(first_day, last_day, freq='D')

# create date range slider
# each option is a (label, value) pair: 'YYYY-MM-DD' text and the Timestamp
options = [(date.strftime('%Y-%m-%d'), date) for date in dates]
# start with the full range selected
index = (0, len(dates) - 1)

date_range_slider = widgets.SelectionRangeSlider(
    options=options,
    index=index,
    description='date range',
    orientation='horizontal',
    layout={'width': '1200px'}
)

In [None]:
# TOP SONGS IN A SPECIFIC TIME RANGE

@interact
def top_songs(date_range=date_range_slider):
    """Return the 20 most played tracks between two dates (both inclusive).

    date_range holds the start and end of the range at positions 0 and 1;
    only their calendar date is used.
    """
    first_day = pd.Timestamp(date_range[0]).date()
    last_day = pd.Timestamp(date_range[1]).date()

    within_range = (songs_df['date'] >= first_day) & (songs_df['date'] <= last_day)
    plays = songs_df[within_range]

    # plays per track URI, most played first
    play_counts = (
        plays.groupby(['spotify_track_uri'])
        .size()
        .reset_index(name='count')
        .sort_values(by=['count'], ascending=False)
        .reset_index(drop=True)
    )

    # metadata comes from the full history, one row per track URI
    track_info = songs_df.drop_duplicates(subset='spotify_track_uri')
    ranking = play_counts.merge(track_info, on='spotify_track_uri')

    wanted_columns = [
        'master_metadata_track_name',
        'master_metadata_album_artist_name',
        'master_metadata_album_album_name',
        'count',
    ]
    return ranking[wanted_columns].head(20)

In [None]:
# TOP ARTIST IN A SPECIFIC TIME RANGE

@interact
def top_artists(date_range=date_range_slider):
    """Return the 20 most played artists between two dates (both inclusive).

    date_range holds the start and end of the range at positions 0 and 1;
    only their calendar date is used.
    """
    first_day = pd.Timestamp(date_range[0]).date()
    last_day = pd.Timestamp(date_range[1]).date()

    within_range = (songs_df['date'] >= first_day) & (songs_df['date'] <= last_day)
    plays = songs_df[within_range]

    # plays per artist name, most played first, with a fresh 0..n-1 index
    artist_counts = (
        plays.groupby(['master_metadata_album_artist_name'])
        .size()
        .reset_index(name='count')
        .sort_values(by=['count'], ascending=False)
        .reset_index(drop=True)
    )

    return artist_counts.head(20)

In [None]:
# TOP ALBUM IN A SPECIFIC TIME RANGE

@interact
def top_albums(date_range=date_range_slider):
    """Return the 20 most played albums between two dates (both inclusive).

    An album is identified by its name *and* its artist. Grouping by the
    album name alone merged identically named albums of different artists
    into one row and credited all plays to whichever artist appeared first
    in the history.

    date_range holds the start and end of the range at positions 0 and 1;
    only their calendar date is used. Plays lacking an album or artist name
    are not counted (groupby skips missing keys).
    """
    album_col = 'master_metadata_album_album_name'
    artist_col = 'master_metadata_album_artist_name'

    first_day = pd.Timestamp(date_range[0]).date()
    last_day = pd.Timestamp(date_range[1]).date()

    within_range = (songs_df['date'] >= first_day) & (songs_df['date'] <= last_day)
    plays = songs_df[within_range]

    # plays per (album, artist) pair, most played first
    album_counts = (
        plays.groupby([album_col, artist_col])
        .size()
        .reset_index(name='count')
        .sort_values(by=['count'], ascending=False)
        .reset_index(drop=True)
    )

    return album_counts[[album_col, artist_col, 'count']].head(20)

In [None]:
# SONGS PLAYED PER YEAR

def plot_songs_per_year():
    """Draw a bar chart with the number of played tracks per calendar year."""
    # number of rows of songs_df per year of the 'ts' timestamp
    plays_per_year = songs_df.groupby(songs_df['ts'].dt.year).size()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(plays_per_year.index, plays_per_year.values)

    # Years are whole numbers: put exactly one tick on every bar, because the
    # automatic tick locator may otherwise pick fractional positions.
    ax.set_xticks(list(plays_per_year.index))

    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Played Tracks')
    ax.set_title('Number of Played Tracks per Year')

    plt.show()

plot_songs_per_year()

In [None]:
# TEMPORAL DISTRIBUTION OF PLAYED TRACKS OVER THE YEARS

def plot_temporal_distribution():
    """Plot, for every year, the number of played tracks per hour of the day.

    One line is drawn per year. The hour is taken from the 'ts' column as it
    is stored (NOTE(review): presumably UTC rather than local time - confirm).
    """
    plays = songs_df.assign(
        year=songs_df['ts'].dt.year,
        hour=songs_df['ts'].dt.hour,
    )
    counts = plays.groupby(['year', 'hour']).size().reset_index(name='count')

    all_hours = list(range(24))

    fig, ax = plt.subplots(figsize=(12, 8))

    for year, data in counts.groupby('year'):
        # Hours without any play do not appear in the grouped data; fill them
        # with 0 so the line drops to zero instead of bridging the gap.
        per_hour = data.set_index('hour')['count'].reindex(all_hours, fill_value=0)
        ax.plot(per_hour.index, per_hour.values, label=str(year))

    ax.set_xticks(all_hours)
    ax.set_title('Temporal Distribution of Played Tracks Over Years')
    ax.set_xlabel('Time (Hour)')
    ax.set_ylabel('Number of Played Tracks')
    ax.legend(title='Year')

    plt.show()

plot_temporal_distribution()

In [None]:
# WEEKDAY DISTRIBUTION OF PLAYED TRACKS OVER THE YEARS

def plot_weekday_distribution():
    """Plot, for every year, the number of played tracks per weekday.

    One line is drawn per year; weekday 0 is Monday and 6 is Sunday, as
    returned by the pandas `.dt.weekday` accessor.
    """
    plays = songs_df.assign(
        year=songs_df['ts'].dt.year,
        weekday=songs_df['ts'].dt.weekday,
    )
    counts = plays.groupby(['year', 'weekday']).size().reset_index(name='count')

    weekdays_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    all_weekdays = list(range(len(weekdays_order)))

    fig, ax = plt.subplots(figsize=(12, 8))

    for year, data in counts.groupby('year'):
        # Weekdays without any play do not appear in the grouped data; fill
        # them with 0 so the line drops to zero instead of bridging the gap.
        per_weekday = data.set_index('weekday')['count'].reindex(all_weekdays, fill_value=0)
        ax.plot(per_weekday.index, per_weekday.values, label=str(year))

    # label the numeric weekday positions with their names
    ax.set_xticks(all_weekdays)
    ax.set_xticklabels(weekdays_order)

    ax.set_title('Weekday Distribution of Played Tracks Over Years')
    ax.set_xlabel('Weekday')
    ax.set_ylabel('Number of Played Tracks')
    ax.legend(title='Year')

    plt.show()

plot_weekday_distribution()

In [None]:
# PLATFORM DISTRIBUTION

def categorize_device(device):
    """Map a raw platform/device description to a coarse platform category.

    The match is a case-insensitive substring search; the first keyword found
    (in the order listed below) decides the category.

    Parameters
    ----------
    device : str
        Raw platform description. Any non-string value (e.g. None or NaN for
        a missing entry) is categorized as 'Other' instead of raising.

    Returns
    -------
    str
        One of 'Android', 'Windows', 'iOS', 'Amazon', 'Linux' or 'Other'.
    """
    # only needed for data before 2023
    # as this data was much more detailed
    if not isinstance(device, str):
        return 'Other'

    device_lower = device.lower()

    # order matters: it is the precedence when several keywords match
    keyword_to_category = (
        ('android', 'Android'),
        ('windows', 'Windows'),
        ('ios', 'iOS'),
        ('amazon', 'Amazon'),
        ('linux', 'Linux'),
    )
    for keyword, category in keyword_to_category:
        if keyword in device_lower:
            return category

    return 'Other'


def plot_platform_distribution():
    """Draw a stacked bar chart of played tracks per year, split by platform.

    The raw 'platform' values are reduced to coarse categories with
    categorize_device before counting.
    """
    plays = songs_df.copy()
    plays['platform_category'] = plays['platform'].map(categorize_device)
    plays['year'] = plays['ts'].dt.year

    counts = plays.groupby(['year', 'platform_category']).size()
    counts = counts.reset_index(name='count')

    # one row per year, one column per platform; missing combinations become 0
    per_year = counts.pivot(
        index='year',
        columns='platform_category',
        values='count',
    ).fillna(0)

    fig, ax = plt.subplots(figsize=(12, 8))
    per_year.plot(kind='bar', stacked=True, ax=ax)

    ax.set_title('Platform/Device Distribution of Played Tracks Over Years')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Played Tracks')
    # legend is anchored outside the plot area, to the right of the axes
    ax.legend(title='Platform/Device',
              bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.show()


plot_platform_distribution()