In [None]:

import hashlib
import os
import warnings
import zipfile
import pandas as pd
import pytz

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides anything emitted via warnings.warn() in this
# notebook, as well as deprecation notices from the libraries in use.
warnings.filterwarnings('ignore')

# ------------------ CONFIGURATION ------------------
# Zip file that was downloaded from Spotify (e.g. your kids' or your own export).
# To analyze multiple files, run this notebook once per zip file.
spotify_data_zip_file = '/tmp/Spotify Extended Streaming History.zip'

# Base directory for the generated files; a unique subdirectory per zip file
# (named after the MD5 hash of the zip file) is created further down in this cell.
output_dir = '/tmp/social_media_analysis/generated/'

# Year to analyze, or 'all' to analyze all years,
# e.g. 2023 or 2024 or 'all'.
year_to_analyze = 'all'
# ---------------------------------------------------

# -- some checks
# Fail early with a clear message when the configured export is missing
# (isfile also rejects a directory, which would otherwise only fail in open()).
if not os.path.isfile(spotify_data_zip_file):
    raise FileNotFoundError(f"File not found: {spotify_data_zip_file}")

# Fingerprint the zip file with MD5 so that results of different zip files are
# stored separately. The file is read in small chunks so that large exports
# never have to fit into memory at once.
md5_hash = hashlib.md5()
with open(spotify_data_zip_file, "rb") as f:
    for chunk in iter(lambda: f.read(4096), b""):
        md5_hash.update(chunk)
zip_file_md5_hash = md5_hash.hexdigest()

# e.g. /tmp/social_media_analysis/generated/spotify_results-21fd2ff0510ae3aa7d5bdea50c26ed85
output_dir = os.path.join(output_dir, f'spotify_results-{zip_file_md5_hash}')

# exist_ok=True creates the directory when needed without a separate
# (race-prone) existence check.
os.makedirs(output_dir, exist_ok=True)

print(f"Output directory: {output_dir}")

# English three-letter weekday abbreviations -> German two-letter abbreviations
weekday_mapping = {
    'Mon': 'Mo',
    'Tue': 'Di',
    'Wed': 'Mi',
    'Thu': 'Do',
    'Fri': 'Fr',
    'Sat': 'Sa',
    'Sun': 'So',
}
# German abbreviation -> position in the week (Mo = 0 ... So = 6); derived from
# the insertion order of the mapping above.
weekday_to_num = {abbr: idx for idx, abbr in enumerate(weekday_mapping.values())}


def to_hours(millis: int):
    """Convert a duration in milliseconds to hours, rounded to two decimals.

    Besides plain numbers, this notebook also passes pandas Series; the
    division and ``round`` then apply element-wise.
    """
    seconds = millis / 1000
    minutes = seconds / 60
    hours = minutes / 60
    return round(hours, 2)


def to_minutes(millis: int):
    """Convert a duration in milliseconds to minutes, rounded to two decimals.

    Besides plain numbers, this notebook also passes pandas Series; the
    division and ``round`` then apply element-wise.
    """
    seconds = millis / 1000
    minutes = seconds / 60
    return round(minutes, 2)


def get_target_filename(filename: str) -> str:
    """Return the path of ``filename`` inside the module-level ``output_dir``."""
    target_path = os.path.join(output_dir, filename)
    return target_path


def get_raw_data() -> pd.DataFrame:
    """Load the episode streaming history from the configured Spotify export.

    Reads every member of ``spotify_data_zip_file`` whose name contains
    ``Streaming_History_Video_``, keeps only rows that have an
    ``episode_name`` and adds the helper columns used by the plots:
    ``minutes_played``, ``hours_played``, ``hour``, ``year_month``, ``year``
    and ``weekday`` (German abbreviation). Timestamps are converted to the
    Europe/Berlin timezone before the calendar columns are derived.

    NOTE(review): only the "Video" history files are read; confirm that the
    other history files in the export are excluded on purpose.

    Returns:
        The prepared DataFrame, restricted to ``year_to_analyze`` unless that
        setting is ``'all'``.

    Raises:
        ValueError: if the zip file contains no matching history data.
    """
    frames = []
    # Context managers guarantee that the archive and its members are closed,
    # even when parsing one of the JSON files fails.
    with zipfile.ZipFile(spotify_data_zip_file, 'r') as zf:
        for name in zf.namelist():
            if 'Streaming_History_Video_' not in name:
                continue
            with zf.open(name) as member:
                frames.append(pd.read_json(member))

    # pd.concat raises an unspecific error on an empty list, so guard it.
    # ignore_index gives every row a unique label across the source files.
    _dframe = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Raise instead of warn + exit: warnings are silenced globally in this
    # notebook and exit() would just stop the kernel without any explanation.
    if _dframe.empty:
        raise ValueError(f'No data found in the zip file {spotify_data_zip_file}')

    # only with episode_name; copy so that the column assignments below write
    # to an independent frame instead of a slice of the concatenated data
    _dframe = _dframe[_dframe['episode_name'].notnull()].copy()

    # Convert the `ts` field to a timezone-aware datetime and move it to the
    # Berlin timezone. utc=True also covers timestamps without an explicit
    # offset, on which tz_convert would otherwise fail.
    _berlin_tz = pytz.timezone('Europe/Berlin')
    _dframe['ts'] = pd.to_datetime(_dframe['ts'], utc=True).dt.tz_convert(_berlin_tz)

    _dframe['minutes_played'] = round(_dframe['ms_played'] / 1000 / 60)
    _dframe['hours_played'] = round(_dframe['ms_played'] / 1000 / 60 / 60, 2)
    _dframe['hour'] = _dframe['ts'].dt.hour
    _dframe['year_month'] = _dframe['ts'].dt.to_period('M')
    _dframe['year'] = _dframe['ts'].dt.year

    # day_name() returns English names regardless of the process locale
    # (strftime('%a') does not), so the lookup in weekday_mapping always hits.
    _dframe['weekday'] = _dframe['ts'].dt.day_name().str[:3].map(weekday_mapping)

    # only keep year_to_analyze or all
    if year_to_analyze != 'all':
        _dframe = _dframe[_dframe['year'] == int(year_to_analyze)]

    return _dframe

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df = get_raw_data()

# Total streamed milliseconds per calendar year
grouped_df = df.groupby('year')['ms_played'].sum().reset_index()

# Length of each calendar year in days (366 for leap years, otherwise 365)
grouped_df['days_in_year'] = [
    366 if pd.Timestamp(year=year, month=12, day=31).is_leap_year else 365
    for year in grouped_df['year']
]

# Average minutes streamed per day, spread over the whole calendar year
grouped_df['avg_minutes_per_day'] = to_minutes(grouped_df['ms_played']) / grouped_df['days_in_year']

# Bar chart with one bar per year
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(grouped_df['year'].astype(str), grouped_df['avg_minutes_per_day'])
ax.set_title('Average Minutes Played Per Day for Each Year')
ax.set_xlabel('Year')
ax.set_ylabel('Average Minutes Per Day')
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig(get_target_filename('average_minutes_per_day_per_year.png'))
plt.show()

In [None]:
def format_ms(x, pos):
    """Tick formatter: render a millisecond value as hours with an 'h' suffix.

    ``pos`` is accepted to match the (value, position) signature used for
    matplotlib tick formatters and is not used.
    """
    hours = round(x / 3.6e+6, 2)
    return f"{hours}h"


# Load the prepared streaming history that the per-year top-5 plots below use.
df = get_raw_data()


def top5(year: int, df1):
    """Plot the accumulated listening time of the five most-streamed shows of a year.

    Args:
        year: Calendar year to plot.
        df1: Streaming history with at least the columns ``ts``,
            ``episode_show_name`` and ``ms_played``.

    Side effects:
        Saves the chart as ``top5_shows_<year>.png`` in the output directory
        and displays it.
    """
    # Build the year mask from the frame that was passed in. The previous code
    # read the notebook-global `df`, which only worked while both were the
    # same object.
    _dframe = df1.loc[pd.to_datetime(df1['ts']).dt.year == year]

    # Determine the five shows with the most total playtime.
    top_show_df = _dframe.groupby(['episode_show_name'])[['ms_played']]
    top_shows_df = top_show_df.sum().sort_values('ms_played', ascending=False)[:5]
    top_shows = list(top_shows_df.index.values)

    fig, ax = plt.subplots(figsize=(10, 5))
    for show in top_shows:
        # Rolling 365-day sum of the time spent listening to the show.
        show_df = _dframe.loc[_dframe['episode_show_name'] == show][['ts', 'ms_played']]
        show_df = show_df.sort_values('ts')
        show_df = show_df.rolling('365D', on='ts').sum()
        ax.plot(show_df['ts'], show_df['ms_played'], label=show)
    ax.set_title(f"Top 5 Shows in {year}")
    # Show the millisecond values on the y axis as hours.
    ax.yaxis.set_major_formatter(format_ms)
    ax.legend()
    fig.savefig(get_target_filename(f"top5_shows_{year}.png"))
    plt.show()
    # This function is called once per year; release the figure explicitly so
    # figures do not pile up in memory.
    plt.close(fig)

# Render one top-5 chart for every calendar year present in the data
years = df['year'].unique()
for y in years:
    top5(year=y, df1=df)


In [None]:
# Load the prepared streaming history; it is used by the per-year plots below.
df = get_raw_data()

def stream_time_per_show(year, df1):
    """Plot the monthly stream time of the ten most-streamed shows of a year.

    Args:
        year: Calendar year to plot.
        df1: Streaming history with at least the columns ``ts``,
            ``episode_show_name``, ``year_month`` and ``ms_played``.

    Side effects:
        Saves the chart as ``stream_time_per_show_<year>.png`` in the output
        directory and displays it.
    """
    # Build the year mask from the frame that was passed in. The previous code
    # read the notebook-global `df`, which only worked while both were the
    # same object.
    df1 = df1.loc[pd.to_datetime(df1['ts']).dt.year == year]

    # Playtime per show and month, in hours.
    grouped_df1 = df1.groupby(['episode_show_name', 'year_month'])['ms_played'].sum().reset_index()
    grouped_df1['hours_played'] = to_hours(grouped_df1['ms_played'])

    # Restrict to the ten shows with the most total playtime in that year.
    top_shows = grouped_df1.groupby('episode_show_name')['ms_played'].sum().nlargest(10).index
    grouped_df1 = grouped_df1[grouped_df1['episode_show_name'].isin(top_shows)]

    # One column per show, one row per month.
    pivot_df = grouped_df1.pivot(index='year_month', columns='episode_show_name', values='hours_played')

    # Plot the results
    ax = pivot_df.plot(kind='line', figsize=(15, 7))
    fig = ax.get_figure()
    ax.set_title(f'Stream Time per show (Top 10) in {year}')
    ax.set_xlabel('Month')
    ax.set_ylabel('Hours')
    # Place the legend outside the plot area, to the right.
    ax.legend(title='Show Name', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    fig.savefig(get_target_filename(f"stream_time_per_show_{year}.png"))
    plt.show()
    # This function is called once per year; release the figure explicitly so
    # figures do not pile up in memory.
    plt.close(fig)


# Render one stream-time chart for every calendar year present in the data
years = df['year'].unique()
for y in years:
    stream_time_per_show(year=y, df1=df)

In [None]:
import matplotlib.pyplot as plt

# Load the data in this cell as well, like every other cell does, instead of
# relying on a `df` left over from a previously executed cell.
df = get_raw_data()

# Group by year_month and sum ms_played
grouped_df = df.groupby('year_month')['ms_played'].sum().reset_index()

# Convert ms_played to hours for better readability
grouped_df['hours_played'] = to_hours(grouped_df['ms_played'])

# Plot the results: one bar per month
plt.figure(figsize=(15, 7))
plt.bar(grouped_df['year_month'].astype(str), grouped_df['hours_played'])
plt.title('Hours per month')
plt.xlabel('Month/Year')
plt.ylabel('Hours')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(get_target_filename('hours_per_month.png'))
plt.show()

In [None]:
import matplotlib.pyplot as plt

df = get_raw_data()

# Total streamed milliseconds per month
grouped_df = df.groupby('year_month')['ms_played'].sum().reset_index()

# Length of each month in days
grouped_df['days_in_month'] = grouped_df['year_month'].dt.days_in_month

# Average minutes streamed per day of the month
grouped_df['avg_minutes_per_day'] = to_minutes(grouped_df['ms_played']) / grouped_df['days_in_month']

# Bar chart with dashed reference lines at 30, 60 and 120 minutes
fig, ax = plt.subplots(figsize=(15, 7))
ax.bar(grouped_df['year_month'].astype(str), grouped_df['avg_minutes_per_day'])
for minutes, color in ((30, 'g'), (60, 'y'), (120, 'r')):
    ax.axhline(y=minutes, color=color, linestyle='--', label=f'{minutes} Minutes')
ax.set_title('Average minutes per day')
ax.set_xlabel('Month/Year')
ax.set_ylabel('Minutes')
plt.xticks(rotation=45)
ax.legend()
fig.tight_layout()
fig.savefig(get_target_filename('average_minutes_per_day.png'))
plt.show()

In [None]:

import matplotlib.pyplot as plt

df = get_raw_data()

# Total streamed milliseconds per hour of the day
grouped_df = df.groupby('hour')['ms_played'].sum().reset_index()

# Overall streamed milliseconds, used as the 100 % reference
total_ms_played = grouped_df['ms_played'].sum()

# Share of the total stream time that falls into each hour, in percent
grouped_df['percentage_played'] = (grouped_df['ms_played'] / total_ms_played) * 100

# Bar chart with one bar per hour of the day
fig, ax = plt.subplots(figsize=(15, 7))
ax.bar(grouped_df['hour'], grouped_df['percentage_played'])

ax.set_title(f'% Distribution of {to_hours(total_ms_played)} total hours stream time per daytime')
ax.set_xlabel('Hour')
ax.set_ylabel('% of total time streamed')
ax.set_xticks(range(24))
fig.tight_layout()
fig.savefig(get_target_filename('percentage_distribution_per_hour.png'))
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches

df = get_raw_data()

# Group by weekday and (rounded) minutes played, and count the number of
# streams per combination; the count drives the bubble size below
grouped = df.groupby(['weekday', 'minutes_played']).size().reset_index(name='counts')

# Map weekdays to x-coordinates
grouped['x'] = grouped['weekday'].map(weekday_to_num)

# Normalize the minutes for colormap
norm = mcolors.Normalize(vmin=grouped['minutes_played'].min(), vmax=grouped['minutes_played'].max())
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap performs the same lookup and exists in old and new releases.
cmap = plt.get_cmap('Dark2')

# Create colors based on the normalized minutes
colors = cmap(norm(grouped['minutes_played']))

plt.figure(figsize=(10, 5))
# `s` is the marker area, so the area grows linearly with the stream count
plt.scatter(grouped['x'], grouped['minutes_played'], c=colors, s=grouped['counts'] * 10, alpha=0.7, edgecolors='white')

# Add legend for circle sizes; markersize is a diameter, hence the square root
# of the area used in the scatter call above
handles = [
    mpatches.Patch(color='white', label='Diameter:'),
    plt.Line2D([0], [0], marker='o', color='w', label='1 Stream', markersize=np.sqrt(1 * 10), markerfacecolor='gray',
               alpha=0.7),
    plt.Line2D([0], [0], marker='o', color='w', label='10 Streams', markersize=np.sqrt(10 * 10), markerfacecolor='gray',
               alpha=0.7),
    plt.Line2D([0], [0], marker='o', color='w', label='100 Streams', markersize=np.sqrt(100 * 10),
               markerfacecolor='gray', alpha=0.7)
]
plt.legend(handles=handles, loc='upper right', bbox_to_anchor=(1.3, 0.9), handletextpad=2, borderpad=2,
           labelspacing=1.5)
# Summary figures, placed to the right of the axes
plt.text(1.05, 1, f"Number of streams: {df.shape[0]}", transform=plt.gca().transAxes)
plt.text(1.05, 0.95, f"Minutes total: {df['minutes_played'].sum()}", transform=plt.gca().transAxes)
plt.title('Minutes per weekday')
plt.xticks(range(7), ['Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So'])
plt.xlabel('Weekday')
plt.ylabel('Minutes')
plt.grid(True)
# bbox_inches='tight' keeps the legend and texts outside the axes in the image
plt.savefig(get_target_filename('minutes_per_weekday.png'), bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pytz
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches

df = get_raw_data()

# Group by hour of day and (rounded) minutes played, and count the number of
# streams per combination; the count drives the bubble size below
grouped = df.groupby(['hour', 'minutes_played']).size().reset_index(name='counts')

# Normalize the minutes for colormap
norm = mcolors.Normalize(vmin=grouped['minutes_played'].min(), vmax=grouped['minutes_played'].max())
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap performs the same lookup and exists in old and new releases.
cmap = plt.get_cmap('Dark2')

# Create colors based on the normalized minutes
colors = cmap(norm(grouped['minutes_played']))

plt.figure(figsize=(10, 5))
# `s` is the marker area, so the area grows linearly with the stream count
plt.scatter(grouped['hour'], grouped['minutes_played'], c=colors, s=grouped['counts'] * 10, alpha=0.7,
            edgecolors='white')

# Add legend for circle sizes; markersize is a diameter, hence the square root
# of the area used in the scatter call above
handles = [
    mpatches.Patch(color='white', label='Diameter:'),
    plt.Line2D([0], [0], marker='o', color='w', label='1 Stream', markersize=np.sqrt(1 * 10), markerfacecolor='gray',
               alpha=0.7),
    plt.Line2D([0], [0], marker='o', color='w', label='10 Streams', markersize=np.sqrt(10 * 10), markerfacecolor='gray',
               alpha=0.7),
    plt.Line2D([0], [0], marker='o', color='w', label='100 Streams', markersize=np.sqrt(100 * 10),
               markerfacecolor='gray', alpha=0.7)
]
plt.legend(handles=handles, loc='upper right', bbox_to_anchor=(1.3, 0.9), handletextpad=2, borderpad=2,
           labelspacing=1.5)
# Summary figures, placed to the right of the axes
plt.text(1.05, 1, f"Number of streams: {df.shape[0]}", transform=plt.gca().transAxes)
plt.text(1.05, 0.95, f"Minutes total: {df['minutes_played'].sum()}", transform=plt.gca().transAxes)
plt.title('Minutes per hour')
plt.xlabel('Hour')
plt.ylabel('Minutes')
plt.grid(True)
# bbox_inches='tight' keeps the legend and texts outside the axes in the image
plt.savefig(get_target_filename('minutes_per_hour.png'), bbox_inches='tight')
plt.show()

In [None]:
import pandas as pd

df = get_raw_data()

# Per show: total hours played, total minutes played and number of streams
grouped_df = (
    df.groupby('episode_show_name')
    .agg(
        hours_played=('hours_played', 'sum'),
        minutes_played=('minutes_played', 'sum'),
        num_streams=('episode_show_name', 'count'),
    )
    .reset_index()
)

# Average stream length in minutes, computed from the unrounded totals
grouped_df['avg_minutes_per_stream'] = grouped_df['minutes_played'] / grouped_df['num_streams']

# Round all derived figures to 2 decimal places
for column in ('hours_played', 'minutes_played', 'avg_minutes_per_stream'):
    grouped_df[column] = grouped_df[column].round(2)

# Shows with the most minutes played come first
sorted_df = grouped_df.sort_values(by='minutes_played', ascending=False).reset_index(drop=True)

# Export the statistics as a semicolon-separated CSV file
sorted_df.to_csv(get_target_filename('show_stats_sorted_by_minutes.csv'), index=False, sep=';')

# Last expression of the cell: display the sorted DataFrame
sorted_df