In [None]:
import duckdb

# Location of the watch-history JSON export that the rest of the notebook loads.
# Ensure data has been added to expected directory as per README instructions
# NOTE(review): this is an absolute path — presumably it matches the dev-container
# workspace layout; confirm/adjust if the repository is checked out elsewhere.
YTM_FILEPATH = "/workspaces/ytmusic_analytics/data/watch-history.json"

In [None]:
# Load the exported watch history into a DuckDB relation named `data`.
#
# Fail fast: every later cell reads `data`, so a missing or unreadable file must
# stop the run here instead of printing a message and surfacing later as an
# unrelated NameError.
try:
    data = duckdb.read_json(YTM_FILEPATH)
except (FileNotFoundError, duckdb.IOException) as e:
    # DuckDB reports missing/unreadable files as duckdb.IOException rather than
    # the built-in FileNotFoundError, so both are handled. The original error is
    # chained so the underlying detail stays visible in the traceback.
    raise FileNotFoundError(
        f"File not found or unreadable: {YTM_FILEPATH}. Please ensure the file exists."
    ) from e
# Any other exception propagates unchanged, with its full traceback.

In [None]:
# Summary statistics for the relation loaded from YTM_FILEPATH; as the last
# expression in the cell, the result is rendered as the cell output.
data.describe()

In [24]:
# Create table
# Builds `watch_history_clean` from the loaded `data` relation (the SQL refers to
# the Python variable `data` by name):
#   - strips a leading 'Watched ' prefix from `title` (other titles are unchanged),
#   - casts the "time" column to TIMESTAMP and exposes it as `ts`,
#   - passes the remaining selected columns through as-is.
# CREATE OR REPLACE means re-running this cell rebuilds the table from scratch.
q = """
CREATE OR REPLACE TABLE watch_history_clean AS
SELECT
  header,
  CASE
    WHEN title LIKE 'Watched %'
      THEN substr(title, length('Watched ')+1)
    ELSE title
  END AS title,
  titleUrl,
  subtitles,
  CAST("time" AS TIMESTAMP) AS ts,
  products,
  activityControls,
  description,
  details
FROM data;
"""

# Execute the statement on DuckDB's default connection.
duckdb.sql(q)

In [None]:
# Validate
# Spot-check: fetch at most 100 rows of watch_history_clean and print them.
q = """SELECT * 
FROM watch_history_clean
LIMIT 100"""
r = duckdb.sql(q)
# show() prints the relation's rows to the cell output.
r.show()

In [None]:
# Remove ads
# A row is treated as an ad when its `details` list has exactly one entry and
# that entry's name is 'From Google Ads'. The predicate is defined once so the
# before-count, the DELETE and the after-count can never drift apart.
ADS_PREDICATE = "len(details) = 1 AND details[1].name = 'From Google Ads'"

ads_to_remove_q = f"""
SELECT COUNT(*) AS ad_count
FROM watch_history_clean
WHERE {ADS_PREDICATE};
"""

remove_ads_q = f"""
DELETE FROM watch_history_clean
WHERE {ADS_PREDICATE};
"""

# Materialise the count as a plain int *before* deleting; a lazy relation
# evaluated later would see the table after the DELETE.
ads_before = duckdb.sql(ads_to_remove_q).fetchone()[0]
print(f"Ads to remove: {ads_before}")

duckdb.sql(remove_ads_q)

# Re-count with the same query to confirm nothing matching the predicate is left.
ads_after = duckdb.sql(ads_to_remove_q).fetchone()[0]
ads_removed = ads_before - ads_after
print(f"Number ads removed: {ads_removed}")

assert ads_after == 0, f"{ads_after} ad rows still present after DELETE"

In [37]:
# Create 2025 table
# Builds `yt_music_history_2025` from `watch_history_clean`, keeping only rows
# whose header is 'YouTube Music' and whose timestamp falls inside calendar
# year 2025 (half-open range, so later years are not swept into a table
# named "2025").
#
# The artist is the first `subtitles` entry's name with a trailing ' - Topic'
# removed. An anchored regexp_replace is used instead of slicing: DuckDB slice
# bounds are inclusive, so `name[:-8]` keeps the leading space of ' - Topic'
# and leaves a trailing space on the artist name.
q = """
CREATE OR REPLACE TABLE yt_music_history_2025 AS
SELECT
  ROW_NUMBER() OVER (ORDER BY ts)                      AS id,
  title                                                AS song_title,
  -- Strip a trailing ' - Topic' from the artist name, if present
  regexp_replace(subtitles[1].name, ' - Topic$', '')   AS song_artist,
  CAST(ts AS TIMESTAMP)                                AS listened_ts,
  titleUrl                                             AS youtube_url
FROM watch_history_clean
WHERE ts >= TIMESTAMP '2025-01-01'
  AND ts <  TIMESTAMP '2026-01-01'
  AND "header" = 'YouTube Music'"""

# Execute the statement on DuckDB's default connection.
duckdb.sql(q)


In [None]:
# Validate
# Spot-check: fetch at most 100 rows of yt_music_history_2025 and print them.
q = """SELECT *
FROM yt_music_history_2025
LIMIT 100"""

r = duckdb.sql(q)
# show() prints the relation's rows to the cell output.
r.show()