# Linker Demo

In [1]:
# Load the autoreload extension; mode 2 lets edits to imported modules be picked up live
%load_ext autoreload
%autoreload 2

In [2]:
# Imports and environment setup
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

# Ensure project root on sys.path
# The parent of the current working directory is treated as the project root,
# so this assumes the kernel was started one level below it — TODO confirm.
notebook_dir = Path.cwd().resolve()
project_root = notebook_dir.parent
if str(project_root) not in sys.path:
    # Prepend so the local `linker` package is found by the imports below
    sys.path.insert(0, str(project_root))

# Project-local imports
# These must stay after the sys.path adjustment above.
from linker import (
    MusicLinker, SoundtrackParser, Config, setup_logging,
    YouTubeClient, GeminiMatcher, SoundtrackMetadata
)
from linker.utils import save_results_to_json, save_results_to_csv

# Load .env and set up logging
# load_dotenv() runs before any cell reads the API keys from the environment.
load_dotenv()
setup_logging('INFO')

In [None]:
# Read both API keys from the process environment and stop early if either is absent
YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

# Report presence only; the key values themselves are never printed
print('YouTube key set:', bool(YOUTUBE_API_KEY))
print('Gemini key set:', bool(GEMINI_API_KEY))

# An empty string counts as missing, same as an unset variable
assert YOUTUBE_API_KEY, 'Missing YOUTUBE_API_KEY in environment'
assert GEMINI_API_KEY, 'Missing GEMINI_API_KEY in environment'

In [3]:
# Optional override: take the API keys from a local `keys` module when one is importable.
# When there is no such module, fall back to the environment instead of raising, so
# Restart & Run All also works for setups that only provide the keys through `.env`.
try:
    from keys import YOUTUBE_API_KEY, GEMINI_API_KEY
except ModuleNotFoundError:
    YOUTUBE_API_KEY = os.getenv('YOUTUBE_API_KEY')
    GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
    print('keys module not found; using API keys from the environment')
    if not (YOUTUBE_API_KEY and GEMINI_API_KEY):
        # Do not raise here: just make the cause of later client failures obvious
        print('Warning: YOUTUBE_API_KEY and/or GEMINI_API_KEY is not set')

## 1) YouTube API smoke test
Perform a simple search limited to a few results and print titles + URLs.

In [4]:
# Build a YouTube client from the API key and run one small search
yt = YouTubeClient(YOUTUBE_API_KEY)
query = 'Million Dollar Baby Blue Morgan'  # Adjust query freely

# `videos` is reused below: it is printed and passed to the Gemini matcher as candidates
videos = yt.search_videos(query=query, max_results=3)

2025-11-30 17:37:19 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0


In [5]:
# Print every candidate using the video object's own string form
for v in videos:
    print(v)

📺 Blue Morgan (End Credits) (From "Million Dollar Baby")
   Channel: Clint Eastwood - Topic
   URL: https://www.youtube.com/watch?v=iLcV1FEdTyc
   Views: 408,867
   Likes: 3,056
   Duration: PT4M30S
   Published: 2019-01-09
📺 Blue Morgan (End Credits) (From "Million Dollar Baby")
   Channel: Clint Eastwood - Topic
   URL: https://www.youtube.com/watch?v=UdHrnXTvMtg
   Views: 79,733
   Likes: 753
   Duration: PT4M31S
   Published: 2018-11-15
📺 Blue Morgan (Million Dollar Baby Soundtrack | Clint Eastwood)
   Channel: Dan Jones Guitar
   URL: https://www.youtube.com/watch?v=U98-fJHcHsQ
   Views: 37,073
   Likes: 836
   Duration: PT2M43S
   Published: 2018-08-30


## 2) Gemini API smoke test
Use the LLM matcher to pick the best match among the candidates we just fetched.
We’ll craft a small `SoundtrackMetadata` for context.

In [8]:
# Ask the Gemini matcher to pick the best candidate from `videos` (fetched by the YouTube search cell)
matcher = GeminiMatcher(GEMINI_API_KEY)
# Minimal soundtrack context; tune to your query above
snd = SoundtrackMetadata(
    title='Blue Morgan',
    performer=None,
    movie_title='Million Dollar Baby')

best, score = matcher.find_best_match(
    soundtrack=snd, candidates=videos, use_comments=False)

# Preview at most 200 characters of the explanation. The ellipsis is appended only
# when text was actually cut, so a short explanation is not shown as truncated.
reasoning_preview = score.reasoning if score and score.reasoning else None
if reasoning_preview and len(reasoning_preview) > 200:
    reasoning_preview = reasoning_preview[:200] + '...'

# Either result may be falsy; print None instead of failing on attribute access
print('Best match:', best.url if best else None)
print('Confidence:', score.confidence if score else None)
print('Reasoning:', reasoning_preview)

2025-11-30 17:41:49 - google_genai.models - INFO - AFC is enabled with max remote calls: 10.
2025-11-30 17:42:00 - httpx - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"


Best match: https://www.youtube.com/watch?v=iLcV1FEdTyc
Confidence: 0.98
Reasoning: Candidate 1 is the definitive best match for the 'Blue Morgan' soundtrack from 'Million Dollar Baby'. It is an official upload from the 'Clint Eastwood - Topic' channel, which is auto-generated by You...


## 3) Parse metadata from TTL (subset movie)
We’ll read some triples for a subset movie (e.g., `tt0405159`) and list a few tracks.

In [6]:
# Path is already imported in cell 2, ensure that cell is executed first
# The dataset location is a relative path, so it resolves against the kernel's working directory.
subset_root = str(Path('../data') / 'subset')
imdb_id = 'tt0094226'  # choose from: tt0094226, tt0120338, tt0405159
# `tracks_ttl` is reused by the end-to-end linking cell further down
tracks_ttl = SoundtrackParser.parse_soundtrack_ttl(subset_root, imdb_id)

# Report the total, then show at most the first five parsed tracks
print(f'TTL tracks found: {len(tracks_ttl)}')
for t in tracks_ttl[:5]:
    print(t)
    
# tracks_ttl[:2]

TTL tracks found: 2
🎵 MOOD INDIGO
   Composed by: Duke Ellington
   Lyrics by: Irving Mills, Barney Bigard
   From: The Untouchables
🎵 Vesti la giubba
   Performed by: Mario Del Monaco, Orchestra dell'Accademia Nazionale di Santa Cecilia
   Composed by: Ruggero Leoncavallo
   From: The Untouchables


## 4) Parse metadata from direct text (no TTL)
We can also ingest raw IMDb-style text to build `SoundtrackMetadata`.

In [None]:
# Build track metadata from pasted credits text instead of TTL files
soundtrack_text = '''Blue Morgan
Music by Clint Eastwood

Boxing Baby
Written by Kyle Eastwood and Michael Stevens

Blue Diner
Written by Kyle Eastwood and Michael Stevens
'''
tracks_text = SoundtrackParser.parse_soundtrack_text(
    soundtrack_text,
    movie_title='Million Dollar Baby',
)

# One compact line per track: just the title and the composer
for t in tracks_text:
    print(dict(title=t.title, composer=t.composer))

# Leave the parsed list as the cell's displayed value
tracks_text

## 5) End-to-end linking on a few tracks
Use `MusicLinker` to search, score (via Gemini), and return matches.
We’ll limit to a couple of tracks to keep the demo light.

In [7]:
# Model name handed to MusicLinker through its `gemini_model` argument
GEMINI_MODEL = "gemini-2.5-flash"

# Smaller settings for a quick demo
# `linker` is reused by the end-to-end demo cell and the subset batch cell below.
linker = MusicLinker(
    youtube_api_key=YOUTUBE_API_KEY,
    gemini_api_key=GEMINI_API_KEY,
    max_search_results=5,
    max_comments_per_video=0,  # NOTE(review): presumably means no comments are fetched — confirm in MusicLinker
    use_comments=False,
    gemini_model=GEMINI_MODEL
)

2025-11-30 18:16:08 - googleapiclient.discovery_cache - INFO - file_cache is only supported with oauth2client<4.0.0


In [8]:
# Use the first TTL track; only if there is none, fall back to the first text-parsed track
demo_tracks = tracks_ttl[:1] or tracks_text[:1]
print('Demo tracks:', [t.title for t in demo_tracks])

# Single worker keeps the demo sequential
results = linker.find_matches_batch(demo_tracks, max_workers=1)

for r in results:
    print('Song:', r.soundtrack.title)
    print('Query:', r.search_query)

    # Guard clause: report the failure reason and move on to the next result
    if not r.best_match:
        print('No match:', r.error)
        continue

    print('Best URL:', r.best_match.url)
    if r.match_score:
        print('Confidence:', r.match_score.confidence)

# Leave the raw result list as the cell's displayed value
results

2025-11-30 18:16:47 - linker.music_linker - INFO - Search attempt 1 for 'MOOD INDIGO'
2025-11-30 18:16:47 - linker.music_linker - INFO - Searching for: MOOD INDIGO Duke Ellington The Untouchables
2025-11-30 18:16:47 - linker.music_linker - INFO - Searching for: MOOD INDIGO Duke Ellington The Untouchables


Demo tracks: ['MOOD INDIGO']


Processing tracks:   0%|          | 0/1 [00:00<?, ?track/s]2025-11-30 18:16:48 - linker.music_linker - INFO - Found 5 candidates
2025-11-30 18:16:48 - linker.music_linker - INFO - Analyzing candidates with LLM...
2025-11-30 18:16:48 - google_genai.models - INFO - AFC is enabled with max remote calls: 10.
2025-11-30 18:16:48 - linker.music_linker - INFO - Found 5 candidates
2025-11-30 18:16:48 - linker.music_linker - INFO - Analyzing candidates with LLM...
2025-11-30 18:16:48 - google_genai.models - INFO - AFC is enabled with max remote calls: 10.
2025-11-30 18:17:01 - httpx - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
2025-11-30 18:17:01 - linker.music_linker - INFO - ✓ Found match for 'MOOD INDIGO': https://www.youtube.com/watch?v=VTajBUNaLCY (confidence: 0.95)
Processing tracks: 100%|██████████| 1/1 [00:13<00:00, 13.95s/track]2025-11-30 18:17:01 - httpx - INFO - HTTP Reques

Song: MOOD INDIGO
Query: MOOD INDIGO Duke Ellington The Untouchables
Best URL: https://www.youtube.com/watch?v=VTajBUNaLCY
Confidence: 0.95





[MusicLinkResult(soundtrack=SoundtrackMetadata(title='MOOD INDIGO', composer='Duke Ellington', lyrics_by='Irving Mills, Barney Bigard', performer=None, producer=None, movie_title='The Untouchables', additional_info=None, is_traditional=False, is_uncredited=False), best_match=YouTubeVideo(video_id='VTajBUNaLCY', title='«Mood Indigo» («The Untouchables»)', url='https://www.youtube.com/watch?v=VTajBUNaLCY', description='Written by Duke Ellingon, Irving Mills and Barney Bigard, arranged by Bob Wilber \n\nEnnio Morricone – «The Untouchables» (Music From The Motion Picture) \n\n© La-La Land Records, Paramount Pictures, Universal Music Special Markets \n1987/2012', channel_title='RetroPanorama', published_at='2023-02-22T13:07:42Z', view_count=551, like_count=14, duration='PT3M41S', comments=[]), match_score=MatchScore(confidence=0.95, reasoning="Candidate 2 is the best match because its title and description explicitly and accurately link the song 'Mood Indigo' to 'The Untouchables' 

## 7) Save results (optional)
Persist to JSON/CSV for later inspection.

In [None]:
# Persist whatever `results` currently holds to ./output (relative to the working directory)
out_dir = Path('output')
# Create the folder (and any parents) if needed; no error when it already exists
out_dir.mkdir(parents=True, exist_ok=True)
save_results_to_json(results, str(out_dir / 'demo_results.json'))
save_results_to_csv(results, str(out_dir / 'demo_results.csv'))
# NOTE: this message is a fixed string; it does not reflect a changed `out_dir`
print('Saved to output/demo_results.json and output/demo_results.csv')

## 6) Process entire subset dataset
Process all movies in the data/subset folder and save results for each movie in its respective movie_soundtrack folder.

In [None]:
import json  # NOTE(review): not used in this cell; kept in case other cells rely on it
from pathlib import Path


def _link_movie_soundtrack(movie_dir, subset_dir, music_linker):
    """Parse, link and persist the soundtrack of a single movie folder.

    Args:
        movie_dir: Path of the movie folder; its name is used as the IMDb id.
        subset_dir: Path of the dataset root handed to
            ``SoundtrackParser.parse_soundtrack_ttl``.
        music_linker: Object whose ``find_matches_batch`` produces the link results.

    Returns:
        The list of link results, or ``None`` when no tracks were parsed
        (nothing is written in that case).
    """
    movie_id = movie_dir.name

    # Parse soundtrack metadata from TTL
    tracks = SoundtrackParser.parse_soundtrack_ttl(str(subset_dir), movie_id)
    print(f"Found {len(tracks)} tracks")
    if not tracks:
        print(f"No tracks found for {movie_id}, skipping...")
        return None

    # Run the linker on all tracks, sequentially
    print("Finding YouTube matches...")
    movie_results = music_linker.find_matches_batch(tracks, max_workers=1)

    # Results live next to the movie data: <movie>/movie_soundtrack/<id>_results.json
    output_path = movie_dir / 'movie_soundtrack' / f'{movie_id}_results.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    save_results_to_json(movie_results, str(output_path))
    print(f"✓ Saved results to: {output_path}")

    # Summary: a track counts as matched when it has a best match
    matched = sum(1 for r in movie_results if r.best_match)
    print(f"Summary: {matched}/{len(movie_results)} tracks matched")
    return movie_results


# Get all movie folders in the subset
subset_path = Path('../data/subset')
if not subset_path.is_dir():
    # Fail with an actionable message; the path is relative to the working directory
    raise FileNotFoundError(
        f"Subset folder not found: {subset_path.resolve()} "
        "(the path is relative to the notebook's working directory)"
    )
movie_folders = sorted(d for d in subset_path.iterdir() if d.is_dir())

print(f"Found {len(movie_folders)} movies in subset: {[d.name for d in movie_folders]}")

# Process each movie. The per-movie work runs inside the helper so this loop does not
# overwrite the notebook-level `results` and `imdb_id` used by the demo and save cells.
for movie_folder in movie_folders:
    print(f"\n{'='*60}")
    print(f"Processing: {movie_folder.name}")
    print('='*60)

    try:
        _link_movie_soundtrack(movie_folder, subset_path, linker)
    except Exception as e:
        # Deliberate best-effort batch: report the failure and continue with the next movie
        print(f"✗ Error processing {movie_folder.name}: {e}")

print(f"\n{'='*60}")
print("All movies processed!")
print('='*60)

## Troubleshooting
- Ensure `.env` has `YOUTUBE_API_KEY` and `GEMINI_API_KEY`.
- If imports fail, run `pip install -r requirements.txt`.
- Reduce `max_search_results` or disable comments to limit API usage.
- Switch `imdb_id` to one of the available subset IDs: `tt0094226`, `tt0120338`, `tt0405159`.