In [None]:
import csv

file_path = "../podcasts-no-audio-13GB/metadata/spotify-podcasts-2020/metadata.tsv"

# Sanity check: does every show map to exactly one RSS feed, and vice versa?
#
# Comparing the sizes of two independent sets is not enough: one show with two
# links plus two shows sharing one link yields equal counts without being a
# one-to-one mapping. The relation is therefore tracked in both directions.
rss_links_by_show = {}     # show ID  -> set of distinct RSS links seen for it
show_ids_by_rss = {}       # RSS link -> set of distinct show IDs using it
shows_with_blank_rss = set()  # show IDs that had at least one row without a link

with open(file_path, encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")
    for row in reader:
        show_id = row["show_uri"].split(":")[-1]
        # DictReader fills short rows with None, hence the `or ""`.
        rss_link = (row["rss_link"] or "").strip()
        if not rss_link:
            shows_with_blank_rss.add(show_id)
            continue
        rss_links_by_show.setdefault(show_id, set()).add(rss_link)
        show_ids_by_rss.setdefault(rss_link, set()).add(show_id)

# Same names and meaning as before: shows / links that appear on at least one
# row carrying a non-empty RSS link.
unique_show_ids = set(rss_links_by_show)
unique_rss_links = set(show_ids_by_rss)

shows_with_several_links = {
    show_id for show_id, links in rss_links_by_show.items() if len(links) > 1
}
links_shared_by_shows = {
    link for link, show_ids in show_ids_by_rss.items() if len(show_ids) > 1
}
# A show is only "missing" a link if none of its rows ever provided one.
shows_without_rss = shows_with_blank_rss - unique_show_ids

print("Unique show IDs:", len(unique_show_ids))
print("Unique RSS links:", len(unique_rss_links))

if not (shows_with_several_links or links_shared_by_shows or shows_without_rss):
    print("✅ Each show has one unique RSS link.")
else:
    print("⚠️ Mismatch: Some shows share the same RSS link or are missing one.")
    print("  Shows with more than one RSS link:", len(shows_with_several_links))
    print("  RSS links shared by several shows:", len(links_shared_by_shows))
    print("  Shows without any RSS link:", len(shows_without_rss))


Unique show IDs: 18376
Unique RSS links: 18376
✅ Each show has one unique RSS link.


In [None]:
import csv
import feedparser
import pandas as pd

# Input and output paths
INPUT_TSV = "../podcasts-no-audio-13GB/metadata/spotify-podcasts-2020/metadata.tsv"
OUTPUT_CSV = "augmented_podcast_metadata.csv"


def _image_href(node):
    """Return the artwork URL found on a parsed feed or entry, or "" if none.

    `itunes_image` is tried first (as the original code did); `image` is the
    fallback, which is where feedparser usually stores the href of an
    <itunes:image> / <image> element.
    """
    for key in ("itunes_image", "image"):
        image = node.get(key)
        # Only dict-like values can carry an href; skip plain strings/None.
        if image and hasattr(image, "get"):
            href = image.get("href", "")
            if href:
                return href
    return ""


def _summarise_feed(rss_link):
    """Fetch and parse one RSS feed, keeping only what this cell needs.

    Returns a tuple `(show_image, episodes)` where `episodes` is a list of
    `(guid, audio_url, episode_image)` tuples, one per feed entry. Keeping
    this compact summary instead of the full parsed feed keeps the per-feed
    cache small.
    """
    feed = feedparser.parse(rss_link)

    # feedparser reports most fetch/parse problems through `bozo` instead of
    # raising, so surface them explicitly when nothing usable came back.
    if feed.get("bozo") and not feed.entries:
        print(f"Failed parsing {rss_link}: {feed.get('bozo_exception', 'unknown error')}")

    episodes = []
    for entry in feed.entries:
        guid = entry.get("guid") or entry.get("id") or entry.get("link", "") or ""
        # `.get` avoids an AttributeError on entries that carry no links at all.
        enclosures = entry.get("enclosures") or []
        audio_url = enclosures[0].get("href", "") if enclosures else ""
        episodes.append((guid, audio_url, _image_href(entry)))

    return _image_href(feed.feed), episodes


# Read original TSV
with open(INPUT_TSV, encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")
    data = list(reader)

# One network fetch per distinct feed: every episode of a show shares the
# same rss_link, so re-parsing it per row would repeat the download for each
# episode of the show.
feed_cache = {}
results = []

for row in data:
    episode_id = row["episode_uri"].split(":")[-1]
    show_id = row["show_uri"].split(":")[-1]
    rss_link = (row["rss_link"] or "").strip()

    audio_url = ""
    episode_image = ""
    show_image = ""

    if rss_link:
        if rss_link not in feed_cache:
            try:
                feed_cache[rss_link] = _summarise_feed(rss_link)
            except Exception as e:
                print(f"Failed parsing {rss_link}: {e}")
                # Remember the failure so the feed is not retried per episode.
                feed_cache[rss_link] = ("", [])

        show_image, episodes = feed_cache[rss_link]

        # NOTE(review): this match relies on the Spotify episode ID appearing
        # inside the feed's guid/id/link — confirm that holds for these feeds,
        # otherwise audio_url/episode_image stay empty for every row.
        # The `episode_id and` guard stops an empty ID from matching the
        # first entry ("" is a substring of every string).
        for guid, entry_audio, entry_image in episodes:
            if episode_id and episode_id in guid:
                audio_url = entry_audio
                episode_image = entry_image
                break

    results.append({
        "episode_id": episode_id,
        "episode_title": row["episode_name"],
        "episode_description": row["episode_description"],
        "show_id": show_id,
        "show_name": row["show_name"],
        "publisher": row["publisher"],
        "language": row["language"],
        "duration": float(row["duration"]) if row["duration"] else None,
        "rss_link": rss_link,
        "audio_url": audio_url,
        "episode_image": episode_image,
        "show_image": show_image
    })

# Save to CSV
pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
print(f"Saved augmented metadata to {OUTPUT_CSV}")
