In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

In [2]:
def initialize_driver():
    # Initialize WebDriver
    driver = webdriver.Chrome()
    return driver

In [3]:
url = "https://trends.google.com/trending?geo=US&hours=24"

In [4]:
# Initialize the WebDriver
driver = initialize_driver()
driver.get(url)

# Wait for the page to load
time.sleep(5)

# Extract trending topics
trends = driver.find_elements(By.XPATH, "//div[@role='row']")


In [5]:
# Locate the main trends table
try:
    trends_table = driver.find_element(By.XPATH, "//table[contains(@class, 'enOdEe-wZVHld-zg7Cn')]")
except Exception as e:
    print(f"Could not locate the trends table: {e}")


# Find all trend rows within the table, excluding the header
trend_rows = trends_table.find_elements(By.XPATH, ".//tr[@role='row']")

# If the first row is a header, we should skip it
if len(trend_rows) > 1:
    trend_rows = trend_rows[1:]

data = []
for trend in trend_rows[:25]:  # Limit to the top 25 trends
    try:
        # Extract trend name
        trend_name = trend.find_element(By.XPATH, ".//td[contains(@class, 'jvkLtd')]//div").text.strip()

        # Extract search volume
        search_volume = trend.find_element(By.XPATH, ".//td[contains(@class, 'dQOTjf')]//div").text.strip()

        # Extract trend breakdown (related terms)
        breakdown_elements = trend.find_elements(By.XPATH, ".//td[contains(@class, 'xm9Xec')]//button")
        breakdown_keywords = [b.text.strip() for b in breakdown_elements if b.text.strip()]

        # Extract time started
        time_started = trend.find_element(By.XPATH, ".//td[contains(@class, 'WirRge')]//div").text.strip()
        
        data.append({
            "Trend": trend_name,
            "Search Volume": search_volume,
            "Trend Breakdown": ", ".join(breakdown_keywords),
            "Time Started": time_started
        })
    except Exception as e:
        print(f"Error extracting trend: {e}")

In [6]:
driver.quit()

In [7]:
# Convert to DataFrame
df = pd.DataFrame(data)

In [8]:
df

Unnamed: 0,Trend,Search Volume,Trend Breakdown,Time Started
0,aston villa vs chelsea,"50K+\narrow_upward\n1,000%","aston villa - chelsea, chelsea, aston villa",3 hours ago
1,kash patel wife,"100K+\narrow_upward\n1,000%","alexis wilkins, kash patel girlfriend",22 hours ago
2,valencia c. f. - atlético madrid,"50K+\narrow_upward\n1,000%","valencia vs atlético madrid, atletico de madrid",3 hours ago
3,canadien,"50K+\narrow_upward\n1,000%",,14 hours ago
4,pope francis,"50K+\narrow_upward\n1,000%","pope francis news, the pope, pope benedict",1 hour ago
5,beterbiev vs bivol 2,50K+\narrow_upward\n300%,"bivol vs beterbiev 2 time, bivol vs beterbiev ...",19 hours ago
6,torino vs milan,"20K+\narrow_upward\n1,000%","torino - milan, milan vs torino, flashscore",4 hours ago
7,upmc,20K+\narrow_upward\n800%,"upmc memorial shooting, upmc memorial",3 hours ago
8,lisa franchetti,100K+\narrow_upward\n500%,"cq brown, joint chiefs of staff",18 hours ago
9,arsenal vs west ham,"50K+\narrow_upward\n1,000%","arsenal - west ham, west ham, arsenal",6 hours ago


In [9]:
driver.quit()

In [15]:
from transformers import pipeline

In [16]:
# Initialize the zero-shot classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


Device set to use mps:0


In [30]:
# Candidate labels
candidate_labels = ["sports", "politics", "entertainment", "science"]

In [31]:
df['Trend'][0]

'aston villa vs chelsea'

In [32]:
classifier(df['Trend'][0], candidate_labels)

{'sequence': 'aston villa vs chelsea',
 'labels': ['sports', 'entertainment', 'politics', 'science'],
 'scores': [0.5877823233604431,
  0.21702110767364502,
  0.11135935038328171,
  0.08383721858263016]}

In [34]:
def get_category(text):
    # Check if the text is null (NaN or None)
    if pd.isnull(text):
        return 'other'
    result = classifier(text, candidate_labels)
    return result['labels'][0]

# Apply the function to create the 'category' column
df['category'] = df['Trend'].apply(get_category)


In [35]:
df

Unnamed: 0,Trend,Search Volume,Trend Breakdown,Time Started,category
0,aston villa vs chelsea,"50K+\narrow_upward\n1,000%","aston villa - chelsea, chelsea, aston villa",3 hours ago,sports
1,kash patel wife,"100K+\narrow_upward\n1,000%","alexis wilkins, kash patel girlfriend",22 hours ago,entertainment
2,valencia c. f. - atlético madrid,"50K+\narrow_upward\n1,000%","valencia vs atlético madrid, atletico de madrid",3 hours ago,entertainment
3,canadien,"50K+\narrow_upward\n1,000%",,14 hours ago,entertainment
4,pope francis,"50K+\narrow_upward\n1,000%","pope francis news, the pope, pope benedict",1 hour ago,entertainment
5,beterbiev vs bivol 2,50K+\narrow_upward\n300%,"bivol vs beterbiev 2 time, bivol vs beterbiev ...",19 hours ago,sports
6,torino vs milan,"20K+\narrow_upward\n1,000%","torino - milan, milan vs torino, flashscore",4 hours ago,sports
7,upmc,20K+\narrow_upward\n800%,"upmc memorial shooting, upmc memorial",3 hours ago,science
8,lisa franchetti,100K+\narrow_upward\n500%,"cq brown, joint chiefs of staff",18 hours ago,entertainment
9,arsenal vs west ham,"50K+\narrow_upward\n1,000%","arsenal - west ham, west ham, arsenal",6 hours ago,sports


Search

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import pandas as pd
from urllib.parse import quote
import time

In [11]:
def get_npr_articles(query, num_pages=1):
    driver = initialize_driver()
    articles = []
    
    try:
        for page in range(1, num_pages + 1):
            encoded_query = quote(query)
            url = f"https://www.npr.org/search/?query={encoded_query}&page={page}&range%5BlastModifiedDate%5D%5Bmin%5D=1739577600"
            print(f"Accessing page: {url}")
            
            driver.get(url)
            time.sleep(3)  # Allow page to load completely
            
            # First try to find any articles
            items = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "article.item"))
            )
            print(f"Found {len(items)} articles")
            
            for item in items:
                try:
                    # Find story info within each article
                    story_info = item.find_element(By.CLASS_NAME, "storyInfo")
                    
                    # Get title and link
                    title_element = story_info.find_element(By.CSS_SELECTOR, "h2.title a")
                    title = title_element.text.strip()
                    link = title_element.get_attribute("href")
                    
                    print(f"Found article: {title}")
                    
                    articles.append({
                        "title": title,
                        "url": link
                    })
                    
                except Exception as e:
                    print(f"Error processing individual article: {e}")
                    continue
            
            time.sleep(1)
            
    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        driver.quit()
    
    df = pd.DataFrame(articles)
    print(f"Total articles found: {len(df)}")
    return df

In [12]:
query = df['Trend'][0]

In [13]:
articles = get_npr_articles(query)

Accessing page: https://www.npr.org/search/?query=aston%20villa%20vs%20chelsea&page=1&range%5BlastModifiedDate%5D%5Bmin%5D=1739577600
Error during scraping: Message: 

Total articles found: 0


In [14]:
article = articles['url'][0]

KeyError: 'url'

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [None]:
def get_transcript_url(driver, article_url):
    try:
        driver.get(article_url)
        transcript_link = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.audio-tool.audio-tool-transcript"))
        )
        return transcript_link.get_attribute("href")
    except TimeoutException:
        print(f"No transcript found for {article_url}")
        return None
    except Exception as e:
        print(f"Error finding transcript: {e}")
        return None
    
def get_npr_transcript(article_url):
    driver = initialize_driver()
    try:
        transcript_url = get_transcript_url(driver, article_url)
        if not transcript_url:
            return None
            
        driver.get(transcript_url)
        
        # Wait for transcript container
        transcript_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "transcript.storytext"))
        )
        
        # Get all paragraphs
        paragraphs = transcript_container.find_elements(By.TAG_NAME, "p")
        transcript_text = " ".join([p.text for p in paragraphs if p.text.strip()])
        
        return {
            'article_url': article_url,
            'transcript_url': transcript_url,
            'transcript': transcript_text
        }
        
    except Exception as e:
        print(f"Error extracting transcript: {e}")
        return None
    finally:
        driver.quit()

In [None]:
transcript = get_npr_transcript(article)
if transcript:
    print(transcript)

In [None]:
transcript['transcript']

In [None]:
driver.quit()