In [1]:
!pip install selenium chromedriver-py tqdm pandas requests





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\mihir\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [6]:
# ======================== TED TALKS SCRAPER v5 ========================

import time
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from chromedriver_py import binary_path
import pandas as pd
import requests
import logging
from requests.exceptions import RequestException
import os

# ========================== CONFIGURATION ==========================

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

max_page = 50        # Adjust for full scrape
sleep_time = 1       # Delay between requests

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("window-size=1900,800")

# ===================================================================

def get_browser():
    """Launch Chrome browser in headless mode."""
    service = Service(executable_path=binary_path)
    return webdriver.Chrome(service=service, options=chrome_options)


def safe_api_call(url, payload, max_retries=3):
    """Make API calls with retry logic."""
    headers = {
        'Content-type': 'application/json; charset=UTF-8',
        "User-Agent": "Mozilla/5.0 (compatible; TED-Scraper/5.0)"
    }
    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=30)
            if response.status_code == 200:
                return response
            else:
                logging.warning(f"Attempt {attempt + 1}: Status code {response.status_code}")
        except RequestException as e:
            logging.warning(f"Attempt {attempt + 1}: Request failed - {e}")
        time.sleep(sleep_time * 2)
    return None


def talks_page():
    """Load TED.com once to handle cookies."""
    url = 'https://www.ted.com/talks?sort=newest'
    logging.info(f'Navigating to {url}')
    browser = get_browser()
    browser.get(url)
    time.sleep(sleep_time * 4)
    try:
        cookie_btn = browser.find_element(By.ID, 'onetrust-accept-btn-handler')
        cookie_btn.click()
        logging.info("Cookie acceptance clicked.")
    except Exception as e:
        logging.warning(f"Cookie acceptance failed: {e}")
    time.sleep(sleep_time)
    browser.quit()


# ========================== MAIN SCRAPER ==========================

talks_page()
logging.info(f"Starting TED Talks scraping for {max_page} pages...")

final = []
for page in tqdm(range(0, max_page), desc="Scraping talk metadata"):
    payload = [
        {"indexName": "newest",
         "params": {"attributeForDistinct": "objectID",
                    "distinct": 1,
                    "facets": ["subtitle_languages", "tags"],
                    "highlightPostTag": "__/ais-highlight__",
                    "highlightPreTag": "__ais-highlight__",
                    "hitsPerPage": 24,
                    "maxValuesPerFacet": 500,
                    "page": page, "query": "",
                    "tagFilters": ""}}
    ]
    response = safe_api_call('https://zenith-prod-alt.ted.com/api/search', payload)
    if response and response.status_code == 200:
        my_tedx = response.json()['results'][0]["hits"]
        final.extend(my_tedx)
    else:
        logging.error(f"Failed to fetch page {page}")
    time.sleep(sleep_time)

# Extract main metadata
final_list = []
for talk in final:
    if all(field in talk for field in ['objectID', 'slug', 'speakers', 'title']):
        slug = talk["slug"]
        final_list.append({
            'id': talk["objectID"],
            'slug': slug,
            'speakers': talk["speakers"],
            'title': talk["title"],
            'url': f'https://www.ted.com/talks/{slug}'
        })
    else:
        logging.warning(f"Skipping invalid talk: {talk.get('objectID', 'Unknown')}")

logging.info(f"Total talks scraped: {len(final_list)}")

# ========================== DETAIL SCRAPING ==========================

details, images, tags, related_videos = [], [], [], []
ready = []

for video in tqdm(final_list, desc="Fetching detailed info"):
    slug = video["slug"]
    if slug not in ready:
        query = f"""
        {{
          video(slug: "{slug}", language: "en") {{
            id
            title
            description
            socialDescription
            duration
            viewedCount
            commentsCount
            publishedAt
            presenterDisplayName
            primaryImageSet {{ url }}
            topics {{ nodes {{ name }} }}
            relatedVideos {{
              id slug title duration viewedCount presenterDisplayName
            }}
          }}
        }}
        """
        payload = [{"operationName": None, "variables": {}, "query": query}]
        response = safe_api_call('https://www.ted.com/graphql', payload)

        if response and response.status_code == 200:
            data = response.json()
            if len(data) > 0 and data[0].get("data", {}).get("video"):
                video_data = data[0]["data"]["video"]

                # Detailed info
                details.append({
                    "id": video["id"],
                    "slug": slug,
                    "title": video_data.get("title"),
                    "speakers": video["speakers"],
                    "presenterDisplayName": video_data.get("presenterDisplayName"),
                    "description": video_data.get("description"),
                    "duration": video_data.get("duration"),
                    "publishedAt": video_data.get("publishedAt"),
                    "views": video_data.get("viewedCount"),
                    "plays": video_data.get("viewedCount"),  # proxy for likes
                    "comments": video_data.get("commentsCount"),
                    "transcript_url": f"https://www.ted.com/talks/{slug}/transcript",
                    "category": ", ".join(
                        [t["name"] for t in video_data.get("topics", {}).get("nodes", [])]
                    )
                })

                # Image data
                if video_data.get("primaryImageSet"):
                    for image in video_data["primaryImageSet"]:
                        images.append({
                            "id": video["id"],
                            "url": image["url"]
                        })

                # Tag data
                if video_data.get("topics", {}).get("nodes"):
                    for topic in video_data["topics"]["nodes"]:
                        tags.append({
                            "id": video["id"],
                            "tag": topic["name"]
                        })

                # Related videos
                if video_data.get("relatedVideos"):
                    for related in video_data["relatedVideos"]:
                        related_videos.append({
                            "id": video["id"],
                            "related_id": related["id"],
                            "slug": related["slug"],
                            "title": related["title"],
                            "presenterDisplayName": related["presenterDisplayName"],
                            "duration": related.get("duration"),
                            "viewedCount": related.get("viewedCount")
                        })
        ready.append(slug)
        time.sleep(sleep_time)

# ========================== SAVE DATA ==========================

os.makedirs('ted_data_2', exist_ok=True)

if final_list:
    pd.DataFrame(final_list).to_csv('ted_data_2/ted_talks_list.csv', index=False)
    logging.info(f"Saved {len(final_list)} records to ted_talks_list.csv")

if details:
    pd.DataFrame(details).to_csv('ted_data_2/ted_talks_details.csv', index=False)
    logging.info(f"Saved {len(details)} records to ted_talks_details.csv")

if images:
    pd.DataFrame(images).to_csv('ted_data_2/ted_talks_images.csv', index=False)
    logging.info(f"Saved {len(images)} records to ted_talks_images.csv")

if tags:
    pd.DataFrame(tags).to_csv('ted_data_2/ted_talks_tags.csv', index=False)
    logging.info(f"Saved {len(tags)} records to ted_talks_tags.csv")

if related_videos:
    pd.DataFrame(related_videos).to_csv('ted_data_2/ted_talks_related_videos.csv', index=False)
    logging.info(f"Saved {len(related_videos)} records to ted_talks_related_videos.csv")

logging.info("✅ Scraping completed! All data saved in 'ted_data_2' folder.")


2025-10-29 15:16:47,630 - INFO - Navigating to https://www.ted.com/talks?sort=newest
  (Session info: chrome=141.0.7390.123); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
	GetHandleVerifier [0x0x7ff6cc1e1eb5+80197]
	GetHandleVerifier [0x0x7ff6cc1e1f10+80288]
	(No symbol) [0x0x7ff6cbf602fa]
	(No symbol) [0x0x7ff6cbfb7cd7]
	(No symbol) [0x0x7ff6cbfb7f9c]
	(No symbol) [0x0x7ff6cc00ba87]
	(No symbol) [0x0x7ff6cbfe03bf]
	(No symbol) [0x0x7ff6cc0087fb]
	(No symbol) [0x0x7ff6cbfe0153]
	(No symbol) [0x0x7ff6cbfa8b02]
	(No symbol) [0x0x7ff6cbfa98d3]
	GetHandleVerifier [0x0x7ff6cc49e83d+2949837]
	GetHandleVerifier [0x0x7ff6cc498c6a+2926330]
	GetHandleVerifier [0x0x7ff6cc4b86c7+3055959]
	GetHandleVerifier [0x0x7ff6cc1fcfee+191102]
	GetHandleVerifier [0x0x7ff6cc2050af+224063]
	GetHandleVerifier [0x0x7ff6cc1eaf64+117236]
	GetHandleVerifier [0x0x7ff6cc1eb119+117673]
	GetHandleVerifier [0x0x7f

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm  # Changed from tqdm.notebook
import time
import json
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the CSV file with TED talk metadata
df = pd.read_csv("ted_data/ted_talks_list.csv")  # Use the correct file name

# Ensure the 'transcript' column exists
if "transcript" not in df.columns:
    df["transcript"] = ""  # Create an empty column for transcripts

print(f"Loaded {len(df)} TED talks for transcript extraction")

# Function to extract transcript from TED talk HTML page
def extract_transcript_from_page(url):
    try:
        # Fetch the TED talk page
        response = requests.get(url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive"
        }, timeout=30)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            
            # Method 1: Check for JSON script containing the transcript
            script_tag = soup.find("script", type="application/ld+json")
            if script_tag:
                try:
                    data = json.loads(script_tag.string)
                    transcript = data.get("transcript")
                    if transcript:
                        logging.info(f"Found transcript via JSON-LD for {url}")
                        return transcript
                except json.JSONDecodeError as e:
                    logging.warning(f"JSON parsing error for {url}: {e}")
            
            # Method 2: Look for transcript in specific TED talk elements
            # Try multiple possible selectors for transcript content
            transcript_selectors = [
                "div[data-testid='transcript']",
                "div.transcript",
                "div.Grid__cell",
                "div.talk-transcript__body",
                "div.Transcript__content"
            ]
            
            for selector in transcript_selectors:
                transcript_elements = soup.select(selector)
                if transcript_elements:
                    transcript_text = " ".join([elem.get_text(strip=True) for elem in transcript_elements])
                    if len(transcript_text) > 100:  # Only return substantial text
                        logging.info(f"Found transcript via selector '{selector}' for {url}")
                        return transcript_text
            
            # Method 3: Look for paragraphs in main content
            main_content = soup.find("main") or soup.find("div", class_="main") or soup.body
            if main_content:
                paragraphs = main_content.find_all("p")
                transcript_text = " ".join([p.get_text(strip=True) for p in paragraphs])
                if len(transcript_text) > 200:  # Only return if substantial content
                    logging.info(f"Found content via paragraphs for {url}")
                    return transcript_text
            
            logging.warning(f"No transcript found for {url}")
            return ""
            
        else:
            logging.warning(f"Failed to fetch {url}. Status code: {response.status_code}")
            return ""
            
    except requests.exceptions.Timeout:
        logging.error(f"Timeout fetching {url}")
        return ""
    except requests.exceptions.RequestException as e:
        logging.error(f"Request error fetching {url}: {e}")
        return ""
    except Exception as e:
        logging.error(f"Unexpected error processing {url}: {e}")
        return ""

# Check current progress
transcripts_existing = df[df["transcript"].notna() & (df["transcript"] != "")].shape[0]
print(f"Already have {transcripts_existing} transcripts out of {len(df)} total talks")

# Iterate over the DataFrame and extract transcripts
print("Starting transcript extraction...")
success_count = 0

for index, row in tqdm(df.iterrows(), total=len(d`f), desc="Extracting transcripts"):
    # Skip if transcript already exists and is not empty
    if pd.notna(row.get("transcript")) and row["transcript"].strip():
        continue
    
    transcript = extract_transcript_from_page(row["url"])
    df.at[index, "transcript"] = transcript
    
    if transcript.strip():
        success_count += 1
    
    # Save progress periodically
    if (index + 1) % 10 == 0:
        df.to_csv("ted_data/ted_talks_transcripts_partial.csv", index=False)
        logging.info(f"Progress: {index + 1}/{len(df)} processed, {success_count} transcripts found")
    
    # Be respectful to the server
    time.sleep(1.5)

# Final save
output_file = "ted_data/ted_talks_transcripts_updated.csv"
df.to_csv(output_file, index=False)

# Print summary
final_transcripts = df[df["transcript"].notna() & (df["transcript"] != "")].shape[0]
print(f"\nTranscript extraction completed!")
print(f"Successfully extracted {final_transcripts} out of {len(df)} transcripts")
print(f"Results saved to: {output_file}")

# Show some statistics
if final_transcripts > 0:
    # Find a talk with transcript
    talks_with_transcripts = df[df["transcript"] != ""]
    if len(talks_with_transcripts) > 0:
        sample_talk = talks_with_transcripts.iloc[0]
        transcript_length = len(sample_talk["transcript"])
        print(f"\nSample talk: {sample_talk['title']}")
        print(f"Transcript length: {transcript_length} characters")
        print(f"Transcript preview: {sample_talk['transcript'][:200]}...")

    # Show distribution of transcript lengths
    transcript_lengths = talks_with_transcripts["transcript"].str.len()
    print(f"\nTranscript length statistics:")
    print(f"  Shortest: {transcript_lengths.min()} characters")
    print(f"  Longest: {transcript_lengths.max()} characters")
    print(f"  Average: {transcript_lengths.mean():.0f} characters")
else:
    print("\nNo transcripts were found. This could be due to:")
    print("1. Website structure changes")
    print("2. Rate limiting")
    print("3. Transcripts not being publicly available")
    print("Consider using TED's official API or checking if transcripts are available in different formats.")

Loaded 1200 TED talks for transcript extraction
Already have 0 transcripts out of 1200 total talks
Starting transcript extraction...


2025-10-17 11:48:49,740 - INFO - Found transcript via JSON-LD for https://www.ted.com/talks/imogen_ellen_napper_and_jim_bentley_the_weirdest_stuff_orbiting_earth
2025-10-17 11:48:52,643 - INFO - Found transcript via JSON-LD for https://www.ted.com/talks/esther_duflo_tax_the_rich_and_save_the_planet
2025-10-17 11:48:54,635 - INFO - Found transcript via JSON-LD for https://www.ted.com/talks/imran_razik_how_vampire_bats_drink_your_blood_without_you_noticing
2025-10-17 11:48:57,017 - INFO - Found transcript via JSON-LD for https://www.ted.com/talks/amaury_guichon_a_pastry_chef_works_his_chocolatier_magic_live
2025-10-17 11:48:59,057 - INFO - Found transcript via JSON-LD for https://www.ted.com/talks/kate_johnson_the_flourishing_future_of_women_s_sports
2025-10-17 11:49:01,024 - INFO - Found transcript via JSON-LD for https://www.ted.com/talks/claudia_vega_the_hidden_cost_of_buying_gold
2025-10-17 11:49:03,418 - INFO - Found transcript via JSON-LD for https://www.ted.com/talks/xu_hao_how_we


Transcript extraction completed!
Successfully extracted 1021 out of 1200 transcripts
Results saved to: ted_data/ted_talks_transcripts_updated.csv

Sample talk: The weirdest stuff orbiting Earth
Transcript length: 4040 characters
Transcript preview: In July of 1969, Neil Armstrong and Buzz Aldrin left the first human footprints on the moon. They also left two pairs of boots, a handful of tools, and four vomit bags. This lunar litter was far from ...

Transcript length statistics:
  Shortest: 1454 characters
  Longest: 57587 characters
  Average: 9322 characters


In [4]:
# Install necessary libraries if not already installed
# !pip install requests beautifulsoup4

import requests
from bs4 import BeautifulSoup

# URL of the TED Talk
url = "https://www.ted.com/talks/bob_mankoff_can_ai_master_the_art_of_humor?subtitle=en"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the script tag containing the transcript JSON
    script_tag = soup.find("script", type="application/ld+json")

    if script_tag:
        import json
        data = json.loads(script_tag.string)

        # Extract the transcript if available
        transcript = data.get("transcript")

        if transcript:
            print("Transcript extracted successfully!")
            print(transcript)
        else:
            print("Transcript not found in the page data.")
    else:
        print("Transcript data not found in the HTML source.")
else:
    print(f"Failed to fetch the page. Status code: {response.status_code}")


Transcript extracted successfully!
Whether you&apos;re thrilled by what AI can do for us or terrified by what AI is going to do to us, whether it can be funny, is probably not top of mind for you. It is for me. I don&apos;t care if it turns all of us into paperclips, as long as they&apos;re funny paper clips. (Laughter) And the fact that it makes stuff up, hallucinates, for me, that&apos;s not a bug, that&apos;s a feature. My entire career was making stuff up. They&apos;re called cartoons. This is probably the most famous one I hallucinated. There are a number of theories of humor that could explain this cartoon. There&apos;s the superiority theory. You&apos;re the guy on the phone, not on the other end. The incongruity theory. There&apos;s a mismatch between the politeness of the language and the rudeness of the message. And the benign violation theory of humor, which is sort of a golden ratio theory of humor, if you will. See, I got in that term (Laughter) Which says, for something t