In [1]:
import csv

file_path = "../podcasts-no-audio-13GB/metadata/spotify-podcasts-2020/metadata.tsv"

unique_show_ids = set()
unique_rss_links = set()

with open(file_path, encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")
    for row in reader:
        show_id = row["show_uri"].split(":")[-1]
        rss_link = row["rss_link"].strip()
        if rss_link:  # avoid empty ones
            unique_show_ids.add(show_id)
            unique_rss_links.add(rss_link)

print("Unique show IDs:", len(unique_show_ids))
print("Unique RSS links:", len(unique_rss_links))

if len(unique_show_ids) == len(unique_rss_links):
    print("✅ Each show has one unique RSS link.")
else:
    print("⚠️ Mismatch: Some shows share the same RSS link or are missing one.")

Unique show IDs: 18376
Unique RSS links: 18376
✅ Each show has one unique RSS link.


In [2]:
import feedparser
import pandas as pd

# Function to parse RSS feed and return a pandas DataFrame
def parse_rss_feed(rss_url):
    feed = feedparser.parse(rss_url)

    # Extract show-level data
    show_title = feed.feed.get("title", "")
    show_description = feed.feed.get("description", "")
    show_link = feed.feed.get("link", "")
    show_language = feed.feed.get("language", "")
    show_author = feed.feed.get("author", "")
    show_image = ""
    if "itunes_image" in feed.feed:
        show_image = feed.feed["itunes_image"].get("href", "")
    elif "image" in feed.feed and "href" in feed.feed["image"]:
        show_image = feed.feed["image"]["href"]

    # Extract episodes
    episodes = []
    for entry in feed.entries:
        episode = {
            "show_title": show_title,
            "show_description": show_description,
            "show_link": show_link,
            "show_language": show_language,
            "show_author": show_author,
            "show_image": show_image,
            "episode_title": entry.get("title", ""),
            "episode_description": entry.get("description", ""),
            "episode_link": entry.get("link", ""),
            "episode_guid": entry.get("guid", ""),
            "episode_pubDate": entry.get("published", ""),
            "episode_audio": entry.enclosures[0]["href"] if entry.enclosures else "",
            "episode_duration": entry.get("itunes_duration", ""),
            "episode_summary": entry.get("itunes_summary", ""),
            "episode_image": entry.get("itunes_image", {}).get("href", show_image),
            "episode_explicit": entry.get("itunes_explicit", ""),
            "episode_season": entry.get("itunes_season", ""),
            "episode_number": entry.get("itunes_episode", ""),
            "episode_type": entry.get("itunes_episodeType", "")
        }
        episodes.append(episode)

    df = pd.DataFrame(episodes)
    return df

# Example usage for a Jupyter Notebook
# Replace the URL below with your RSS feed link
rss_url = "https://anchor.fm/s/dfc3598/podcast/rss"
df = parse_rss_feed(rss_url)

# Display the DataFrame
df




Unnamed: 0,show_title,show_description,show_link,show_language,show_author,show_image,episode_title,episode_description,episode_link,episode_guid,episode_pubDate,episode_audio,episode_duration,episode_summary,episode_image,episode_explicit,episode_season,episode_number,episode_type
0,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,I Love Lucy Christmas Special,<p>Hunter and Brent talk about hair and the Co...,https://podcasters.spotify.com/pod/show/the-ch...,1e4617b1-fcf6-4f60-ae3e-fae3e2253237,"Wed, 04 Mar 2020 16:00:00 GMT",https://anchor.fm/s/dfc3598/podcast/play/10801...,00:36:08,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,24.0,
1,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,In Excelsis Deo,<p>Hunter and Brent return from the hiatus to ...,https://podcasters.spotify.com/pod/show/the-ch...,c1fe945c-0ea1-4dd7-8206-6a2ab485566d,"Wed, 26 Feb 2020 16:00:00 GMT",https://anchor.fm/s/dfc3598/podcast/play/10629...,00:35:48,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,23.0,
2,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,Guess Who's Coming to Christmas,<p>Special guest John joins Brent and Hunter a...,https://podcasters.spotify.com/pod/show/the-ch...,826e02cf-8251-40d4-81bb-e79482910c26,"Wed, 12 Feb 2020 17:00:00 GMT",https://anchor.fm/s/dfc3598/podcast/play/10327...,00:33:45,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,22.0,
3,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,Eddie Murphy/Lizzo,"<p>Brent, Hunter, and call-in guest Collin dis...",https://podcasters.spotify.com/pod/show/the-ch...,2d55cd28-eeb3-4ea5-883f-780ad409b627,"Wed, 05 Feb 2020 16:00:00 GMT",https://anchor.fm/s/dfc3598/podcast/play/10177...,00:36:11,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,21.0,
4,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,Express Christmas,<p>Brent and Hunter discuss the Academy Awards...,https://podcasters.spotify.com/pod/show/the-ch...,9b1f76bb-7e7f-436b-a46f-41e1b938bc80,"Wed, 29 Jan 2020 16:00:00 GMT",https://anchor.fm/s/dfc3598/podcast/play/10009...,00:32:23,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,20.0,
5,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,The Draft Dodger,<p>Hunter and Brent discuss Guatemalan Christm...,https://podcasters.spotify.com/pod/show/the-ch...,7d89f264-6789-40f7-9a2a-0e88ea8271c7,"Wed, 22 Jan 2020 04:54:39 GMT",https://anchor.fm/s/dfc3598/podcast/play/98592...,00:32:55,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,19.0,
6,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,Mr. Monk and the Secret Santa,<p>2020 vision! Brent and Hunter discuss their...,https://podcasters.spotify.com/pod/show/the-ch...,59f8152f-618d-49a1-aa2d-b8a6848eebb8,"Wed, 08 Jan 2020 04:55:05 GMT",https://anchor.fm/s/dfc3598/podcast/play/95560...,00:34:21,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,18.0,
7,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,The Christmas Christmas Episode,<p>MERRY CHRISTMAS! THE DAY THAT WE TALK ABOUT...,https://podcasters.spotify.com/pod/show/the-ch...,45788aa0-64bf-4850-95e7-2f986340ec6d,"Wed, 25 Dec 2019 16:53:51 GMT",https://anchor.fm/s/dfc3598/podcast/play/92806...,00:35:38,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,17.0,
8,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,Here Comes Santa Claus,<p>Brent and Hunter take you to the streets an...,https://podcasters.spotify.com/pod/show/the-ch...,c5ac1c7b-b0d1-43f4-816f-9ac1d3554590,"Wed, 18 Dec 2019 15:32:55 GMT",https://anchor.fm/s/dfc3598/podcast/play/90855...,00:37:28,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,16.0,
9,The Christmas Episode,Join Hunter and Brent each week while we tackl...,https://podcasters.spotify.com/pod/show/the-ch...,en,Hunter Babcock,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,Here Comes Aaron Carter,<p>Hunter and Brent can barely get through the...,https://podcasters.spotify.com/pod/show/the-ch...,791d3d8d-178f-fde9-2ada-e2817b84a0d9,"Thu, 12 Dec 2019 01:45:37 GMT",https://anchor.fm/s/dfc3598/podcast/play/89550...,00:27:59,,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,,2.0,15.0,


In [None]:
import pandas as pd
import feedparser

# === Settings ===
metadata_path = "../podcasts-no-audio-13GB/metadata/spotify-podcasts-2020/metadata.tsv"
rss_url = "https://anchor.fm/s/dfc3598/podcast/rss"

# === Load Metadata ===
metadata = pd.read_csv(metadata_path, delimiter="\t", encoding="utf-8")
metadata["show_id"] = metadata["show_uri"].apply(lambda x: x.split(":")[-1])
metadata["episode_id"] = metadata["episode_uri"].apply(lambda x: x.split(":")[-1])
metadata["rss_link"] = metadata["rss_link"].astype(str).str.strip()

# Keep only relevant columns
metadata = metadata[[
    "episode_name", "episode_id", "show_name", "show_id", "rss_link"
]]

# === Filter only the show for the given RSS ===
filtered_metadata = metadata[metadata["rss_link"] == rss_url].copy()

# === Parse the RSS Feed ===
feed = feedparser.parse(rss_url)

# Build a dictionary: {guid: entry}
rss_entries = {}
for entry in feed.entries:
    guid = entry.get("guid") or entry.get("id") or entry.get("link")
    if guid:
        rss_entries[guid] = entry

# Get show-level image
show_image = ""
if "itunes_image" in feed.feed:
    show_image = feed.feed["itunes_image"].get("href", "")
elif "image" in feed.feed and "href" in feed.feed["image"]:
    show_image = feed.feed["image"]["href"]

# Prepare final fields
filtered_metadata["image_show"] = show_image
filtered_metadata["image_episode"] = ""
filtered_metadata["audio_url"] = ""

for idx, row in filtered_metadata.iterrows():
    metadata_title = row["episode_name"].strip().lower()  # Normalize metadata title
    matched_entry = None

    # Log matching process
    for guid, entry in rss_entries.items():
        rss_title = entry.get("title", "").strip().lower()  # Normalize RSS title
        #print(f"Checking metadata title: {metadata_title} against RSS title: {rss_title}")
        if metadata_title == rss_title:  # Match by title
            matched_entry = entry
            break

    if matched_entry:
        # Check for audio enclosure
        if matched_entry.enclosures:
            audio_url = matched_entry.enclosures[0].get("href", "")
            filtered_metadata.at[idx, "audio_url"] = audio_url
            print(f"Matched! Metadata title: {metadata_title}, RSS title: {rss_title}, Audio URL: {audio_url}")
        else:
            print(f"⚠️ No audio enclosure for metadata title: {metadata_title}, RSS title: {rss_title}")

        # Set episode image
        filtered_metadata.at[idx, "image_episode"] = matched_entry.get("itunes_image", {}).get("href", show_image)
    else:
        print(f"⚠️ No match found for metadata title: {metadata_title}")

# === Save to CSV or show in notebook ===
output_path = "final_podcast_table.csv"
filtered_metadata = filtered_metadata[[
    "episode_name", "episode_id", "show_name", "show_id",
    "image_show", "image_episode", "audio_url"
]]
#filtered_metadata.to_csv(output_path, index=False)
#print(f"Saved final table to {output_path}")
filtered_metadata


Matched! Metadata title: the christmas (music) episode, RSS title: the christmas (music) episode, Audio URL: https://anchor.fm/s/dfc3598/podcast/play/8805681/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fproduction%2F2019-11-4%2F36606655-44100-2-4c13b6d408342.mp3
Matched! Metadata title: a very special family guy freakin' christmas, RSS title: a very special family guy freakin' christmas, Audio URL: https://anchor.fm/s/dfc3598/podcast/play/8689403/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fproduction%2F2019-10-27%2F35876883-44100-2-5f1cce13bd017.mp3
Matched! Metadata title: i'll be home for christmas (e.r.), RSS title: i'll be home for christmas (e.r.), Audio URL: https://anchor.fm/s/dfc3598/podcast/play/8429237/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fproduction%2F2019-10-13%2F34053917-44100-2-3151fbc0c0298.mp3


Unnamed: 0,episode_name,episode_id,show_name,show_id,image_show,image_episode,audio_url
13242,The Christmas (Music) Episode,0yVedZUm6lNBI2xBJsZigK,The Christmas Episode,5zn6fKhiv7cjHnkaVKptCj,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://anchor.fm/s/dfc3598/podcast/play/88056...
25725,A Very Special Family Guy Freakin' Christmas,1tb1d5MUQTHN4dCYS5BHbS,The Christmas Episode,5zn6fKhiv7cjHnkaVKptCj,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://anchor.fm/s/dfc3598/podcast/play/86894...
32709,I'll Be Home for Christmas (E.R.),2OtcivUOB008EV1RKuWxJ9,The Christmas Episode,5zn6fKhiv7cjHnkaVKptCj,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://anchor.fm/s/dfc3598/podcast/play/84292...


In [4]:
import feedparser

# RSS URL
rss_url = "https://anchor.fm/s/dfc3598/podcast/rss"

# Parse feed
feed = feedparser.parse(rss_url)

# Check each episode
missing_audio = []

for entry in feed.entries:
    if not entry.enclosures:
        missing_audio.append(entry.get("title", "Untitled Episode"))

if missing_audio:
    print(f"⚠️ Episodes without audio ({len(missing_audio)}):")
    for title in missing_audio:
        print(f"- {title}")
else:
    print(f"✅ All {len(feed.entries)} episodes have audio enclosures.")


✅ All 33 episodes have audio enclosures.


In [9]:
import pandas as pd
import feedparser

# === Settings ===
metadata_path = "../podcasts-no-audio-13GB/metadata/spotify-podcasts-2020/metadata.tsv"

# === Load Metadata ===
metadata = pd.read_csv(metadata_path, delimiter="\t", encoding="utf-8")
metadata["show_id"] = metadata["show_uri"].apply(lambda x: x.split(":")[-1])
metadata["episode_id"] = metadata["episode_uri"].apply(lambda x: x.split(":")[-1])
metadata["rss_link"] = metadata["rss_link"].astype(str).str.strip()

# Keep only relevant columns
metadata = metadata[[
    "episode_name", "episode_id", "show_name", "show_id", "rss_link"
]]

# === Get the first 3 unique RSS links ===
unique_rss_links = metadata["rss_link"].dropna().unique()[:3]

# === Initialize lists and counters ===
episodes_with_audio = []
unmatched_count = 0

# === Process each RSS link ===
for rss_url in unique_rss_links:
    print(f"Processing RSS feed: {rss_url}")

    # === Filter only the show for the given RSS ===
    filtered_metadata = metadata[metadata["rss_link"] == rss_url].copy()

    # === Parse the RSS Feed ===
    feed = feedparser.parse(rss_url)

    # Build a dictionary: {guid: entry}
    rss_entries = {}
    for entry in feed.entries:
        guid = entry.get("guid") or entry.get("id") or entry.get("link")
        if guid:
            rss_entries[guid] = entry

    # Get show-level image
    show_image = ""
    if "itunes_image" in feed.feed:
        show_image = feed.feed["itunes_image"].get("href", "")
    elif "image" in feed.feed and "href" in feed.feed["image"]:
        show_image = feed.feed["image"]["href"]

    # Prepare final fields
    filtered_metadata["image_show"] = show_image
    filtered_metadata["image_episode"] = ""
    filtered_metadata["audio_url"] = ""

    for idx, row in filtered_metadata.iterrows():
        metadata_title = row["episode_name"].strip().lower()  # Normalize metadata title
        matched_entry = None

        # Log matching process
        for guid, entry in rss_entries.items():
            rss_title = entry.get("title", "").strip().lower()  # Normalize RSS title
            if metadata_title == rss_title:  # Match by title
                matched_entry = entry
                break

        if matched_entry:
            # Check for audio enclosure
            if matched_entry.enclosures:
                audio_url = matched_entry.enclosures[0].get("href", "")
                filtered_metadata.at[idx, "audio_url"] = audio_url
                filtered_metadata.at[idx, "image_episode"] = matched_entry.get("itunes_image", {}).get("href", show_image)
                episodes_with_audio.append(filtered_metadata.loc[idx])
                #print(f"Matched! Metadata title: {metadata_title}, RSS title: {rss_title}, Audio URL: {audio_url}")
            else:
                unmatched_count += 1
                print(f"⚠️ No audio enclosure for metadata title: {metadata_title}, RSS title: {rss_title}")
        else:
            unmatched_count += 1
            #print(f"⚠️ No match found for metadata title: {metadata_title}")

# === Convert results to DataFrame ===
df_with_audio = pd.DataFrame(episodes_with_audio)

# === Ensure total entries match ===
total_checked = len(df_with_audio) + unmatched_count
print(f"Total entries checked: {total_checked}")
print(f"Entries with audio: {len(df_with_audio)}")
print(f"Unmatched entries: {unmatched_count}")

# Display the DataFrame
#print("Episodes with audio:")
display(df_with_audio)

Processing RSS feed: https://anchor.fm/s/11b84b68/podcast/rss
Processing RSS feed: https://anchor.fm/s/b07181c/podcast/rss
Processing RSS feed: https://anchor.fm/s/81a072c/podcast/rss
Total entries checked: 263
Entries with audio: 15
Unmatched entries: 248


Unnamed: 0,episode_name,episode_id,show_name,show_id,rss_link,image_show,image_episode,audio_url
0,1: It’s Christmas Time!,000A9sRBYdVh66csG2qEdj,Kream in your Koffee,2NYtxEZyYelR6RMKmjfPLB,https://anchor.fm/s/11b84b68/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/s/11b84b68/podcast/play/9079...
11957,2: Tan Hands Save Lives,0sTNg31EACSHfZlt41RHmS,Kream in your Koffee,2NYtxEZyYelR6RMKmjfPLB,https://anchor.fm/s/11b84b68/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/s/11b84b68/podcast/play/9090...
46132,6: #BYOD (with Liz Pickles),3Ny7dKZ1QHZwadslXJ8Umf,Kream in your Koffee,2NYtxEZyYelR6RMKmjfPLB,https://anchor.fm/s/11b84b68/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/s/11b84b68/podcast/play/9796...
29274,The Bodies In The Bathtubs - January 11 2020 -...,29YonFNXG3zgpovmN5jsPg,Morning Cup Of Murder,15iWCbU7QoO23EndPEO6aN,https://anchor.fm/s/b07181c/podcast/rss,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://chrt.fm/track/4EB79A/pscrb.fm/rss/p/tr...
31887,"Welcome To December - Teasers, Promos and more!",2LFdc0F51vDtF3eYB0PV4t,Morning Cup Of Murder,15iWCbU7QoO23EndPEO6aN,https://anchor.fm/s/b07181c/podcast/rss,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://chrt.fm/track/4EB79A/pscrb.fm/rss/p/tr...
33707,Bind Torture and Kill B.T.K. - January 15 2020...,2TN8ceYiYt3s4rKAdLrjtq,Morning Cup Of Murder,15iWCbU7QoO23EndPEO6aN,https://anchor.fm/s/b07181c/podcast/rss,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://chrt.fm/track/4EB79A/pscrb.fm/rss/p/tr...
48243,Murder Over A Locked Safe - January 28 2020 - ...,3XGKTbzFsWXhEhhhFuTVPQ,Morning Cup Of Murder,15iWCbU7QoO23EndPEO6aN,https://anchor.fm/s/b07181c/podcast/rss,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://chrt.fm/track/4EB79A/pscrb.fm/rss/p/tr...
54166,"Welcome to October 2019! Updates, Information ...",3yJGK1sfUNhk96hHDoaG19,Morning Cup Of Murder,15iWCbU7QoO23EndPEO6aN,https://anchor.fm/s/b07181c/podcast/rss,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://chrt.fm/track/4EB79A/pscrb.fm/rss/p/tr...
63415,Worst Serial Killer in Slovakian History - Dec...,4fMRDkraf5vDFj3nUjg40m,Morning Cup Of Murder,15iWCbU7QoO23EndPEO6aN,https://anchor.fm/s/b07181c/podcast/rss,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://chrt.fm/track/4EB79A/pscrb.fm/rss/p/tr...
81917,Intro to January with True Crime Finland and T...,62Qh8UhqU9Pb5vhwpPm8ZX,Morning Cup Of Murder,15iWCbU7QoO23EndPEO6aN,https://anchor.fm/s/b07181c/podcast/rss,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://megaphone.imgix.net/podcasts/a5cb16d4-...,https://chrt.fm/track/4EB79A/pscrb.fm/rss/p/tr...


In [10]:
# Count the number of unique RSS links in the metadata
unique_rss_links_count = metadata["rss_link"].nunique()
print(f"Number of unique RSS links in the metadata: {unique_rss_links_count}")

Number of unique RSS links in the metadata: 18376


In [18]:
pip install requests

Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (35 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Using cached requests-2.32.3-py3-none-any.whl (64 kB)
Using cached charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (145 kB)
Using cached idna-3.10-py3-none-any.whl (70 kB)
Installing collected packages: idna, charset-normalizer, requests
Successfully installed charset-normalizer-3.4.1 idna-3.10 requests-2.32.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import feedparser
import time
import requests

# === Settings ===
metadata_path = "../podcasts-no-audio-13GB/metadata/spotify-podcasts-2020/metadata.tsv"

# === Load Metadata ===
metadata = pd.read_csv(metadata_path, delimiter="\t", encoding="utf-8")
metadata["show_id"] = metadata["show_uri"].apply(lambda x: x.split(":")[-1])
metadata["episode_id"] = metadata["episode_uri"].apply(lambda x: x.split(":")[-1])
metadata["rss_link"] = metadata["rss_link"].astype(str).str.strip()

# Keep only relevant columns
metadata = metadata[[
    "episode_name", "episode_id", "show_name", "show_id", "rss_link"
]]

# === Get the first 500 unique RSS links ===
unique_rss_links = metadata["rss_link"].dropna().unique()[:500]

# === Initialize lists and counters ===
episodes_with_audio = []
unmatched_count = 0

# === Start timing ===
start_time = time.time()

# === Process each RSS link ===
for rss_url, index in unique_rss_links:
    print(f"{index}: Processing RSS feed: {rss_url}")

    # Check if the URL is accessible within 5 seconds
    try:
        response = requests.get(rss_url, timeout=5)
        if response.status_code != 200:
            print(f"⚠️ Unable to access RSS feed: {rss_url} (Status code: {response.status_code})")
            continue
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Timeout or error accessing RSS feed: {rss_url} ({e})")
        continue

    # === Filter only the show for the given RSS ===
    filtered_metadata = metadata[metadata["rss_link"] == rss_url].copy()

    # === Parse the RSS Feed ===
    feed = feedparser.parse(rss_url)

    # Build a dictionary: {guid: entry}
    rss_entries = {}
    for entry in feed.entries:
        guid = entry.get("guid") or entry.get("id") or entry.get("link")
        if guid:
            rss_entries[guid] = entry

    # Get show-level image
    show_image = ""
    if "itunes_image" in feed.feed:
        show_image = feed.feed["itunes_image"].get("href", "")
    elif "image" in feed.feed and "href" in feed.feed["image"]:
        show_image = feed.feed["image"]["href"]

    # Prepare final fields
    filtered_metadata["image_show"] = show_image
    filtered_metadata["image_episode"] = ""
    filtered_metadata["audio_url"] = ""

    for idx, row in filtered_metadata.iterrows():
        metadata_title = row["episode_name"].strip().lower()  # Normalize metadata title
        matched_entry = None

        # Log matching process
        for guid, entry in rss_entries.items():
            rss_title = entry.get("title", "").strip().lower()  # Normalize RSS title
            if metadata_title == rss_title:  # Match by title
                matched_entry = entry
                break

        if matched_entry:
            # Check for audio enclosure
            if matched_entry.enclosures:
                audio_url = matched_entry.enclosures[0].get("href", "")
                filtered_metadata.at[idx, "audio_url"] = audio_url
                filtered_metadata.at[idx, "image_episode"] = matched_entry.get("itunes_image", {}).get("href", show_image)
                episodes_with_audio.append(filtered_metadata.loc[idx])
                #print(f"Matched! Metadata title: {metadata_title}, RSS title: {rss_title}, Audio URL: {audio_url}")
            else:
                unmatched_count += 1
                print(f"⚠️ No audio enclosure for metadata title: {metadata_title}, RSS title: {rss_title}")
        else:
            unmatched_count += 1
            #print(f"⚠️ No match found for metadata title: {metadata_title}")

# === End timing ===
end_time = time.time()
elapsed_time = end_time - start_time

# === Convert results to DataFrame ===
df_with_audio = pd.DataFrame(episodes_with_audio)

# === Ensure total entries match ===
total_checked = len(df_with_audio) + unmatched_count
print(f"Total entries checked: {total_checked}")
print(f"Entries with audio: {len(df_with_audio)}")
print(f"Unmatched entries: {unmatched_count}")
print(f"Time taken to process 500 RSS links: {elapsed_time:.2f} seconds")

# Display the DataFrame
print("Episodes with audio:")
display(df_with_audio)

ModuleNotFoundError: No module named 'pandas'

In [None]:
import pandas as pd
import feedparser
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# === Settings ===
metadata_path = "../podcasts-no-audio-13GB/metadata/spotify-podcasts-2020/metadata.tsv"
MAX_WORKERS = 10  # Number of threads for parallel processing

# === Load Metadata ===
metadata = pd.read_csv(metadata_path, delimiter="\t", encoding="utf-8")
metadata["show_id"] = metadata["show_uri"].apply(lambda x: x.split(":")[-1])
metadata["episode_id"] = metadata["episode_uri"].apply(lambda x: x.split(":")[-1])
metadata["rss_link"] = metadata["rss_link"].astype(str).str.strip()

# Keep only relevant columns
metadata = metadata[[
    "episode_name", "episode_id", "show_name", "show_id", "rss_link"
]]

# === Get the first 500 unique RSS links ===
unique_rss_links = metadata["rss_link"].dropna().unique()[:500]

# === Initialize lists and counters ===
episodes_with_audio = []
unmatched_count = 0

# === Function to process a single RSS link ===
def process_rss_link(rss_url):
    global unmatched_count
    try:
        # Check if the URL is accessible within 5 seconds
        response = requests.get(rss_url, timeout=5)
        if response.status_code != 200:
            print(f"⚠️ Unable to access RSS feed: {rss_url} (Status code: {response.status_code})")
            return []

        # Parse the RSS Feed
        feed = feedparser.parse(rss_url)

        # Build a dictionary: {guid: entry}
        rss_entries = {}
        for entry in feed.entries:
            guid = entry.get("guid") or entry.get("id") or entry.get("link")
            if guid:
                rss_entries[guid] = entry

        # Get show-level image
        show_image = ""
        if "itunes_image" in feed.feed:
            show_image = feed.feed["itunes_image"].get("href", "")
        elif "image" in feed.feed and "href" in feed.feed["image"]:
            show_image = feed.feed["image"]["href"]

        # Filter metadata for the current RSS link
        filtered_metadata = metadata[metadata["rss_link"] == rss_url].copy()
        filtered_metadata["image_show"] = show_image
        filtered_metadata["image_episode"] = ""
        filtered_metadata["audio_url"] = ""

        matched_episodes = []
        for idx, row in filtered_metadata.iterrows():
            metadata_title = row["episode_name"].strip().lower()  # Normalize metadata title
            matched_entry = None

            # Match metadata title with RSS entry title
            for guid, entry in rss_entries.items():
                rss_title = entry.get("title", "").strip().lower()  # Normalize RSS title
                if metadata_title == rss_title:
                    matched_entry = entry
                    break

            if matched_entry:
                # Check for audio enclosure
                if matched_entry.enclosures:
                    audio_url = matched_entry.enclosures[0].get("href", "")
                    filtered_metadata.at[idx, "audio_url"] = audio_url
                    filtered_metadata.at[idx, "image_episode"] = matched_entry.get("itunes_image", {}).get("href", show_image)
                    matched_episodes.append(filtered_metadata.loc[idx])
                else:
                    unmatched_count += 1
            else:
                unmatched_count += 1

        return matched_episodes

    except requests.exceptions.RequestException as e:
        #print(f"⚠️ Timeout or error accessing RSS feed: {rss_url} ({e})")
        return []

# === Start timing ===
start_time = time.time()

# === Process RSS links in parallel ===
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(process_rss_link, rss_url): rss_url for rss_url in unique_rss_links}
    for future in as_completed(futures):
        rss_url = futures[future]
        try:
            episodes_with_audio.extend(future.result())
        except Exception as e:
            #print(f"⚠️ Error processing RSS feed: {rss_url} ({e})")

# === End timing ===
end_time = time.time()
elapsed_time = end_time - start_time

# === Convert results to DataFrame ===
df_with_audio = pd.DataFrame(episodes_with_audio)

# === Ensure total entries match ===
total_checked = len(df_with_audio) + unmatched_count
print(f"Total entries checked: {total_checked}")
print(f"Entries with audio: {len(df_with_audio)}")
print(f"Unmatched entries: {unmatched_count}")
print(f"Time taken to process 500 RSS links: {elapsed_time:.2f} seconds")

# Display the DataFrame
print("Episodes with audio:")
display(df_with_audio)

⚠️ Unable to access RSS feed: https://anchor.fm/s/fa62700/podcast/rss (Status code: 404)
⚠️ Unable to access RSS feed: https://anchor.fm/s/11a7abf0/podcast/rss (Status code: 404)
⚠️ Unable to access RSS feed: https://anchor.fm/s/e2aaf40/podcast/rss (Status code: 404)
⚠️ Unable to access RSS feed: https://anchor.fm/s/4b4f394/podcast/rss (Status code: 404)
⚠️ Unable to access RSS feed: https://anchor.fm/s/70bc028/podcast/rss (Status code: 404)
⚠️ Unable to access RSS feed: https://anchor.fm/s/d8cf584/podcast/rss (Status code: 404)
⚠️ Unable to access RSS feed: https://anchor.fm/s/e59de8c/podcast/rss (Status code: 404)
⚠️ Unable to access RSS feed: https://anchor.fm/s/129266f4/podcast/rss (Status code: 404)
⚠️ Unable to access RSS feed: https://anchor.fm/s/87afb18/podcast/rss (Status code: 404)
⚠️ Unable to access RSS feed: https://anchor.fm/s/fc369a0/podcast/rss (Status code: 404)
⚠️ Timeout or error accessing RSS feed: https://podcast.rss.com/elevate/feed.xml (HTTPSConnectionPool(host='

Unnamed: 0,episode_name,episode_id,show_name,show_id,rss_link,image_show,image_episode,audio_url
9,Talia and me part 2!,003EmVD7eAmRTBSwHXqi1W,The Good Sign,1dyTrS3vDtpClrURfKdo3q,https://anchor.fm/s/fef17a8/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://anchor.fm/s/fef17a8/podcast/play/88990...
5381,Me and my big sis,0OHm5UCels4t8Mk7KYwzC5,The Good Sign,1dyTrS3vDtpClrURfKdo3q,https://anchor.fm/s/fef17a8/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://anchor.fm/s/fef17a8/podcast/play/86810...
30697,Dealing with Disappointment,2G3wbzYhNY9Yinr7KuNumD,The Good Sign,1dyTrS3vDtpClrURfKdo3q,https://anchor.fm/s/fef17a8/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://anchor.fm/s/fef17a8/podcast/play/85188...
31671,Q and A,2KLGk8qQVQiCz8JApKcikY,The Good Sign,1dyTrS3vDtpClrURfKdo3q,https://anchor.fm/s/fef17a8/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://anchor.fm/s/fef17a8/podcast/play/87655...
34487,Donna Simantov reminds you to stay positive an...,2WpkZSA6TqXd4qLZpDQK4O,The Good Sign,1dyTrS3vDtpClrURfKdo3q,https://anchor.fm/s/fef17a8/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://d3t3ozftmdmh3i.cloudfront.net/staging/...,https://anchor.fm/s/fef17a8/podcast/play/82036...
...,...,...,...,...,...,...,...,...
104526,Trauma⎜Radial Head Fractures (ft. Dr. Joaquin ...,7w4m61Gx0FNc18NQzTNSc9,The Orthobullets Podcast,0I22C9iyvVT3M6DEILuD9F,https://anchor.fm/s/415caa8/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/s/415caa8/podcast/play/26920...
104596,Question Session⎪Femoroacetabular Impingement ...,7wPBa4ygniHYtBZTm5vQgD,The Orthobullets Podcast,0I22C9iyvVT3M6DEILuD9F,https://anchor.fm/s/415caa8/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/s/415caa8/podcast/play/49327...
105003,Hand⎪Replantation,7yOahSvk2aqSfXfidVuXUT,The Orthobullets Podcast,0I22C9iyvVT3M6DEILuD9F,https://anchor.fm/s/415caa8/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/s/415caa8/podcast/play/83981...
105180,Shoulder & Elbow⎪Osteochondritis Dissecans of ...,7zFqPzptxpgkt9eqIBGJKd,The Orthobullets Podcast,0I22C9iyvVT3M6DEILuD9F,https://anchor.fm/s/415caa8/podcast/rss,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://d3t3ozftmdmh3i.cloudfront.net/producti...,https://anchor.fm/s/415caa8/podcast/play/40964...


In [2]:
import pandas as pd
import feedparser
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# === Settings ===
metadata_path = "../podcasts-no-audio-13GB/metadata/spotify-podcasts-2020/metadata.tsv"
MAX_WORKERS = 10  # Number of threads for parallel processing

# === Load Metadata ===
metadata = pd.read_csv(metadata_path, delimiter="\t", encoding="utf-8")
metadata["show_id"] = metadata["show_uri"].apply(lambda x: x.split(":")[-1])
metadata["episode_id"] = metadata["episode_uri"].apply(lambda x: x.split(":")[-1])
metadata["rss_link"] = metadata["rss_link"].astype(str).str.strip()

# Keep only relevant columns
metadata = metadata[[
    "episode_name", "episode_id", "show_name", "show_id", "rss_link"
]]

# === Get all unique RSS links ===
unique_rss_links = metadata["rss_link"].dropna().unique()

# === Initialize lists and counters ===
episodes_with_audio = []
unmatched_count = 0

# === Function to process a single RSS link ===
def process_rss_link(rss_url):
    global unmatched_count
    try:
        # Check if the URL is accessible within 5 seconds
        response = requests.get(rss_url, timeout=5)
        if response.status_code != 200:
            #print(f"⚠️ Unable to access RSS feed: {rss_url} (Status code: {response.status_code})")
            return []

        # Parse the RSS Feed
        feed = feedparser.parse(rss_url)

        # Build a dictionary: {guid: entry}
        rss_entries = {}
        for entry in feed.entries:
            guid = entry.get("guid") or entry.get("id") or entry.get("link")
            if guid:
                rss_entries[guid] = entry

        # Get show-level image
        show_image = ""
        if "itunes_image" in feed.feed:
            show_image = feed.feed["itunes_image"].get("href", "")
        elif "image" in feed.feed and "href" in feed.feed["image"]:
            show_image = feed.feed["image"]["href"]

        # Filter metadata for the current RSS link
        filtered_metadata = metadata[metadata["rss_link"] == rss_url].copy()
        filtered_metadata["image_show"] = show_image
        filtered_metadata["image_episode"] = ""
        filtered_metadata["audio_url"] = ""

        matched_episodes = []
        for idx, row in filtered_metadata.iterrows():
            metadata_title = row["episode_name"].strip().lower()  # Normalize metadata title
            matched_entry = None

            # Match metadata title with RSS entry title
            for guid, entry in rss_entries.items():
                rss_title = entry.get("title", "").strip().lower()  # Normalize RSS title
                if metadata_title == rss_title:
                    matched_entry = entry
                    break

            if matched_entry:
                # Check for audio enclosure
                if matched_entry.enclosures:
                    audio_url = matched_entry.enclosures[0].get("href", "")
                    filtered_metadata.at[idx, "audio_url"] = audio_url
                    filtered_metadata.at[idx, "image_episode"] = matched_entry.get("itunes_image", {}).get("href", show_image)
                    matched_episodes.append(filtered_metadata.loc[idx])
                else:
                    unmatched_count += 1
            else:
                unmatched_count += 1

        return matched_episodes

    except requests.exceptions.RequestException as e:
        #print(f"⚠️ Timeout or error accessing RSS feed: {rss_url} ({e})")
        return []

# === Start timing ===
start_time = time.time()

# === Process RSS links in parallel ===
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(process_rss_link, rss_url): rss_url for rss_url in unique_rss_links}
    for future in as_completed(futures):
        rss_url = futures[future]
        try:
            episodes_with_audio.extend(future.result())
        except Exception as e:
            next
            #print(f"⚠️ Error processing RSS feed: {rss_url} ({e})")

# === End timing ===
end_time = time.time()
elapsed_time = end_time - start_time

# === Convert results to DataFrame ===
df_with_audio = pd.DataFrame(episodes_with_audio)

# === Ensure total entries match ===
total_checked = len(df_with_audio) + unmatched_count
print(f"Total entries checked: {total_checked}")
print(f"Entries with audio: {len(df_with_audio)}")
print(f"Unmatched entries: {unmatched_count}")
print(f"Time taken to process all RSS links: {elapsed_time:.2f} seconds")

# Save the DataFrame to a CSV file
output_path = "final_podcast_table_all.csv"
df_with_audio.to_csv(output_path, index=False)
print(f"Saved final table to {output_path}")

Total entries checked: 96709
Entries with audio: 82480
Unmatched entries: 14229
Time taken to process all RSS links: 1181.80 seconds
Saved final table to final_podcast_table_all.csv


In [2]:
import pandas as pd
import feedparser
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm 

# === Settings ===
metadata_path = "../../podcasts-no-audio-13GB/metadata/spotify-podcasts-2020/metadata.tsv"
MAX_WORKERS = 10  # Number of threads for parallel processing

# === Load Metadata ===
metadata = pd.read_csv(metadata_path, delimiter="\t", encoding="utf-8")
metadata["show_id"] = metadata["show_uri"].apply(lambda x: x.split(":")[-1])
metadata["episode_id"] = metadata["episode_uri"].apply(lambda x: x.split(":")[-1])
metadata["rss_link"] = metadata["rss_link"].astype(str).str.strip()

                
# Keep only relevant columns
metadata = metadata[[
    "episode_name", "episode_id", "show_name", "show_id", "rss_link", "language", "publisher", "episode_description", "duration"
]]

# === Get all unique RSS links ===
unique_rss_links = metadata["rss_link"].dropna().unique()

# === Initialize lists and counters ===
episodes_with_audio = []
unmatched_count = 0

# === Function to process a single RSS link ===
def process_rss_link(rss_url):
    global unmatched_count
    try:
        # Check if the URL is accessible within 5 seconds
        response = requests.get(rss_url, timeout=5)
        if response.status_code != 200:
            #print(f"⚠️ Unable to access RSS feed: {rss_url} (Status code: {response.status_code})")
            return []

        # Parse the RSS Feed
        feed = feedparser.parse(rss_url)

        # Build a dictionary: {guid: entry}
        rss_entries = {}
        for entry in feed.entries:
            guid = entry.get("guid") or entry.get("id") or entry.get("link")
            if guid:
                rss_entries[guid] = entry

        # Get show-level image
        show_image = ""
        if "itunes_image" in feed.feed:
            show_image = feed.feed["itunes_image"].get("href", "")
        elif "image" in feed.feed and "href" in feed.feed["image"]:
            show_image = feed.feed["image"]["href"]

        # Filter metadata for the current RSS link
        filtered_metadata = metadata[metadata["rss_link"] == rss_url].copy()
        filtered_metadata["image_show"] = show_image
        filtered_metadata["image_episode"] = ""
        filtered_metadata["audio_url"] = ""

        matched_episodes = []
        for idx, row in filtered_metadata.iterrows():
            metadata_title = row["episode_name"].strip().lower()  # Normalize metadata title
            matched_entry = None

            # Match metadata title with RSS entry title
            for guid, entry in rss_entries.items():
                rss_title = entry.get("title", "").strip().lower()  # Normalize RSS title
                if metadata_title == rss_title:
                    matched_entry = entry
                    break

            if matched_entry:
                # Check for audio enclosure
                if matched_entry.enclosures:
                    audio_url = matched_entry.enclosures[0].get("href", "")
                    filtered_metadata.at[idx, "audio_url"] = audio_url
                    filtered_metadata.at[idx, "image_episode"] = matched_entry.get("itunes_image", {}).get("href", show_image)
                    matched_episodes.append(filtered_metadata.loc[idx])
                else:
                    unmatched_count += 1
            else:
                unmatched_count += 1

        return matched_episodes

    except requests.exceptions.RequestException as e:
        #print(f"⚠️ Timeout or error accessing RSS feed: {rss_url} ({e})")
        return []

# === Start timing ===
start_time = time.time()

# === Process RSS links in parallel ===
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(process_rss_link, rss_url): rss_url for rss_url in unique_rss_links}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing RSS feeds"):
        rss_url = futures[future]
        try:
            episodes_with_audio.extend(future.result())
        except Exception as e:
            next
            #print(f"⚠️ Error processing RSS feed: {rss_url} ({e})")

# === Add unmatched episodes ===
matched_episode_ids = {episode["episode_id"] for episode in episodes_with_audio}
unmatched_metadata = metadata[~metadata["episode_id"].isin(matched_episode_ids)].copy()
unmatched_metadata["image_show"] = ""
unmatched_metadata["image_episode"] = ""
unmatched_metadata["audio_url"] = ""

# Combine matched and unmatched episodes
df_with_audio = pd.DataFrame(episodes_with_audio)
df_with_audio = pd.concat([df_with_audio, unmatched_metadata], ignore_index=True)

# === End timing ===
end_time = time.time()
elapsed_time = end_time - start_time

# === Ensure total entries match ===
total_checked = len(df_with_audio)
print(f"Total entries checked: {total_checked}")
print(f"Entries with audio: {len(df_with_audio) - len(unmatched_metadata)}")
print(f"Unmatched entries: {len(unmatched_metadata)}")
print(f"Time taken to process all RSS links: {elapsed_time:.2f} seconds")

# Save the DataFrame to a CSV file
output_path = "final_podcast_table_all_with_unmatched.csv"
df_with_audio.to_csv(output_path, index=False)
print(f"Saved final table to {output_path}")

Processing RSS feeds: 100%|██████████| 18376/18376 [20:40<00:00, 14.82it/s]


Total entries checked: 105360
Entries with audio: 82436
Unmatched entries: 22924
Time taken to process all RSS links: 1241.40 seconds
Saved final table to final_podcast_table_all_with_unmatched.csv
