In [1]:
import pandas as pd
import polars as pl
import os

In [2]:
# Directory (relative to this notebook) that contains the Apple Music export CSVs.
# A single-argument os.path.join() returns its argument unchanged, so the plain
# literal is used directly.
DATA_PATH = '../Apple Music Activity'
# File name of the per-track daily play-history export inside DATA_PATH.
FILE_NAME = 'Apple Music - Play History Daily Tracks.csv'

In [3]:
# Lazily scan the full play-history export and derive three helper columns:
#   date_played            - "Date Played" (a yyyymmdd integer) parsed into a Date
#   play_duration_hours    - play duration converted from milliseconds to hours
#   play_duration_minutes  - play duration converted from milliseconds to minutes
lf = pl.scan_csv(os.path.join(DATA_PATH, FILE_NAME)).with_columns(
    date_played=(
        pl.col("Date Played").cast(pl.Utf8).str.strptime(pl.Date, format="%Y%m%d")
    ),
    play_duration_hours=pl.col("Play Duration Milliseconds") / 3_600_000,
    play_duration_minutes=pl.col("Play Duration Milliseconds") / 60_000,
)

In [4]:
# Inspect the lazy query built above (it is only materialised later via .collect()).
lf

In [5]:
# Resolve the raw CSV schema (column names and inferred dtypes, before any
# derived columns are added). collect_schema() is the explicit way to resolve
# a LazyFrame's schema; the bare `.schema` property triggers a
# PerformanceWarning on recent Polars versions.
pl.scan_csv(os.path.join(DATA_PATH, FILE_NAME)).collect_schema()

  pl.scan_csv(os.path.join(DATA_PATH, FILE_NAME)).schema


Schema([('Country', String),
        ('Track Identifier', Int64),
        ('Media type', String),
        ('Date Played', Int64),
        ('Hours', String),
        ('Play Duration Milliseconds', Int64),
        ('End Reason Type', String),
        ('Source Type', String),
        ('Play Count', Int64),
        ('Skip Count', Int64),
        ('Ignore For Recommendations', Boolean),
        ('Track Reference', String),
        ('Track Description', String)])

In [6]:
# Keep only the plays whose parsed date falls in calendar year 2025 (still lazy).
lf_2025 = lf.filter(
    pl.col("date_played").dt.year().eq(2025)
)

In [7]:
# Execute the lazy 2025 query and materialise the result as an eager DataFrame.
df_2025 = lf_2025.collect()

In [8]:
# Display the collected 2025 frame.
df_2025

Country,Track Identifier,Media type,Date Played,Hours,Play Duration Milliseconds,End Reason Type,Source Type,Play Count,Skip Count,Ignore For Recommendations,Track Reference,Track Description,date_played,play_duration_hours,play_duration_minutes
str,i64,str,i64,str,i64,str,str,i64,i64,bool,str,str,date,f64,f64
"""United States""",1737150439,"""AUDIO""",20250101,"""5, 19""",459706,"""NATURAL_END_OF_TRACK""","""IPHONE, GAME_CONSOLE""",2,1,false,"""1737150439""","""Future, Metro Boomin & Kendric…",2025-01-01,0.127696,7.661767
"""United States""",1747373147,"""AUDIO""",20250101,"""5, 19""",452489,"""TRACK_SKIPPED_FORWARDS""","""IPHONE, GAME_CONSOLE""",2,1,false,"""1747373147""","""Fred again.., Anderson .Paak &…",2025-01-01,0.125691,7.541483
"""United States""",1440887515,"""AUDIO""",20250101,"""5, 19""",429860,"""NATURAL_END_OF_TRACK""","""IPHONE, GAME_CONSOLE""",3,2,false,"""1440887515""","""Playboi Carti - Magnolia""",2025-01-01,0.119406,7.164333
"""United States""",1687215995,"""AUDIO""",20250101,"""5, 19""",408918,"""NATURAL_END_OF_TRACK""","""IPHONE, GAME_CONSOLE""",2,0,false,"""1687215995""","""Lil Durk - All My Life (feat. …",2025-01-01,0.113588,6.8153
"""United States""",1760997262,"""AUDIO""",20250101,"""19""",264382,"""NATURAL_END_OF_TRACK""","""IPHONE""",1,0,false,"""1760997262""","""¥$, Kanye West & Ty Dolla $ign…",2025-01-01,0.073439,4.406367
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""United States""",1480847207,"""AUDIO""",20250504,"""6""",0,"""TRACK_SKIPPED_FORWARDS""","""IPHONE""",0,1,false,"""1480847207""","""Lauv - Modern Loneliness""",2025-05-04,0.0,0.0
"""United States""",1103026967,"""AUDIO""",20250504,"""17""",0,"""TRACK_SKIPPED_FORWARDS""","""IPHONE""",0,1,false,"""1103026967""","""RL Grime - Aurora""",2025-05-04,0.0,0.0
"""United States""",1158846625,"""AUDIO""",20250504,"""5""",0,"""MANUALLY_SELECTED_PLAYBACK_OF_…","""IPHONE""",0,1,false,"""1158846625""","""Pritam, Abhijeet Sawant & Adit…",2025-05-04,0.0,0.0
"""United States""",1683978546,"""AUDIO""",20250504,"""17""",0,"""TRACK_SKIPPED_FORWARDS""","""IPHONE""",0,1,false,"""1683978546""","""Mawi - No Love""",2025-05-04,0.0,0.0


In [9]:
# Rank tracks in the already-collected 2025 DataFrame by summed listening
# minutes and keep the ten highest, carrying the summed play count alongside.
top_songs_2025 = (
    df_2025
    .group_by('Track Description')
    .agg(
        total_minutes=pl.col('play_duration_minutes').sum(),
        total_play_count=pl.col('Play Count').sum(),
    )
    .sort(by='total_minutes', descending=True)
    .head(10)
)

In [10]:
# Display the top-10 table computed above.
top_songs_2025

Track Description,total_minutes,total_play_count
str,f64,i64
"""The Weeknd - Cry For Me""",131.49145,39
"""Daft Punk - Contact""",77.6999,16
"""Playboi Carti & The Weeknd - R…",75.700783,26
"""¥$, Kanye West & Ty Dolla $ign…",73.7893,22
"""Drake & 21 Savage - Spin Bout …",70.552417,19
"""The Weeknd & Anitta - São Paul…",67.431333,16
"""The Weeknd - Earned It (Fifty …",65.960483,15
"""The Weeknd & Playboi Carti - T…",65.440583,17
"""Kx5, deadmau5 & Kaskade - Esca…",62.989083,18
"""Kendrick Lamar - Not Like Us""",59.79015,16


In [12]:
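# NOTE: top_songs_2025 is built from the collected df_2025, so it is already an
# eager DataFrame and the call below is not needed:
# top = top_songs_2025.collect()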

In [13]:
# Plain-text rendering of the 2025 top-10 table.
print(top_songs_2025)

shape: (10, 3)
┌─────────────────────────────────┬───────────────┬──────────────────┐
│ Track Description               ┆ total_minutes ┆ total_play_count │
│ ---                             ┆ ---           ┆ ---              │
│ str                             ┆ f64           ┆ i64              │
╞═════════════════════════════════╪═══════════════╪══════════════════╡
│ The Weeknd - Cry For Me         ┆ 131.49145     ┆ 39               │
│ Daft Punk - Contact             ┆ 77.6999       ┆ 16               │
│ Playboi Carti & The Weeknd - R… ┆ 75.700783     ┆ 26               │
│ ¥$, Kanye West & Ty Dolla $ign… ┆ 73.7893       ┆ 22               │
│ Drake & 21 Savage - Spin Bout … ┆ 70.552417     ┆ 19               │
│ The Weeknd & Anitta - São Paul… ┆ 67.431333     ┆ 16               │
│ The Weeknd - Earned It (Fifty … ┆ 65.960483     ┆ 15               │
│ The Weeknd & Playboi Carti - T… ┆ 65.440583     ┆ 17               │
│ Kx5, deadmau5 & Kaskade - Esca… ┆ 62.989083     ┆ 18        

### April 2025

In [14]:
# Lazily scan the April 2025 export and add the play duration expressed in
# hours and in minutes (both converted from milliseconds).
lf_apr = pl.scan_csv(
    os.path.join(DATA_PATH, 'Apple Music - Play History Daily Tracks April 2025.csv')
).with_columns(
    play_duration_hours=pl.col("Play Duration Milliseconds") / 3_600_000,
    play_duration_minutes=pl.col("Play Duration Milliseconds") / 60_000,
)

In [15]:
# Inspect the April LazyFrame (not yet collected).
lf_apr

In [16]:
# Lazy query: sum the play count per track in the April export, order by that
# total (highest first) and keep the first eleven rows.
top_song_apr = (
    lf_apr
    .group_by('Track Description')
    .agg(
        # Minutes aggregate is left disabled, as in the original cell:
        # total_minutes=pl.col('play_duration_minutes').sum(),
        total_plays=pl.col('Play Count').sum(),
    )
    .sort(by='total_plays', descending=True)
    .head(11)
)

In [17]:
# Run the lazy April query and display the resulting table.
top_song_apr.collect()

Track Description,total_plays
str,i64
"""Daft Punk - Contact""",15
"""Martin Garrix & Arijit Singh -…",15
"""Billie Eilish - WILDFLOWER""",13
"""THE ANXIETY, WILLOW & Tyler Co…",11
"""Mismatched - Cast & Sickflip -…",10
…,…
"""Skrillex & Dj Smokey - SKRILLE…",9
"""Ramin Djawadi - The Prince Tha…",8
"""The Chainsmokers - Paris""",8
"""Drake - 9""",7


In [None]:
# Lazy query: total listening hours per raw "Date Played" value for 2025,
# ordered by that date key. Nothing is executed until .collect() is called.
daily_hours = (
    lf_2025
    .group_by('Date Played')
    .agg(total_hours=pl.col('play_duration_hours').sum())
    .sort(by='Date Played')
)

AttributeError: 'LazyFrame' object has no attribute 'groupby'

In [25]:
# Execute the daily-hours query and print the result as a plain-text table.
print(daily_hours.collect())

shape: (123, 2)
┌─────────────┬─────────────┐
│ Date Played ┆ total_hours │
│ ---         ┆ ---         │
│ i64         ┆ f64         │
╞═════════════╪═════════════╡
│ 20250101    ┆ 2.8945325   │
│ 20250102    ┆ 1.420362    │
│ 20250103    ┆ 8.223111    │
│ 20250104    ┆ 5.070216    │
│ 20250105    ┆ 5.58934     │
│ …           ┆ …           │
│ 20250430    ┆ 2.725068    │
│ 20250501    ┆ 4.670846    │
│ 20250502    ┆ 3.536968    │
│ 20250503    ┆ 3.462501    │
│ 20250504    ┆ 2.892364    │
└─────────────┴─────────────┘


In [None]:
def load_song_history(file_path: str) -> pl.LazyFrame:
    """Lazily load an Apple Music play-history CSV with derived helper columns.

    Mirrors the inline pipeline used for the main export earlier in this
    notebook so the same preparation can be reused for any export file.

    Args:
        file_path: Path to a play-history CSV. The file is expected to contain
            a "Date Played" column holding yyyymmdd values and a
            "Play Duration Milliseconds" column.

    Returns:
        A LazyFrame with every original column plus:
          * ``date_played`` - "Date Played" parsed into a ``pl.Date``
          * ``play_duration_hours`` - play duration in hours
          * ``play_duration_minutes`` - play duration in minutes
        Nothing is read until the caller invokes ``.collect()``.
    """
    return (
        pl.scan_csv(file_path)
        .with_columns(
            # "Date Played" is scanned as an integer such as 20250101; cast it
            # to text first so it can be parsed. strptime() requires the target
            # dtype (and here an explicit format) - calling it with no
            # arguments raises a TypeError.
            pl.col('Date Played')
            .cast(pl.Utf8)
            .str.strptime(pl.Date, format="%Y%m%d")
            # Alias so the raw "Date Played" column is kept alongside the
            # parsed one.
            .alias('date_played'),
            (pl.col('Play Duration Milliseconds') / (1000 * 60 * 60))
            .alias('play_duration_hours'),
            (pl.col('Play Duration Milliseconds') / (1000 * 60))
            .alias('play_duration_minutes'),
        )
    )