In [17]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

# Set up Selenium WebDriver (Chrome, headless so no browser window is opened)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

url = "https://www.aajtak.in/"

try:
    # Load the webpage
    driver.get(url)

    # Fixed pause to give the page's JavaScript time to run before the snapshot
    time.sleep(5)

    # Capture the HTML while the browser session is still alive
    page_html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page raised;
    # otherwise the headless Chrome process is left running.
    driver.quit()

# Parse the captured HTML with BeautifulSoup
soup = BeautifulSoup(page_html, 'html.parser')

# Find headline tags (you may need to inspect the site's structure for accuracy)
headlines_html = soup.find_all('h3')

# Extract the stripped text once per tag and drop tags with no text
headlines = [
    text
    for text in (h.get_text(strip=True) for h in headlines_html)
    if text
]

# Create DataFrame: one row per headline; 'source' and 'scraped_time' are
# scalars, so every row gets the same URL and the same timestamp.
df = pd.DataFrame({
    'headline': headlines,
    'source': url,
    'scraped_time': datetime.now()
})

# Show the first few headlines
print(df.head())


                                            headline                  source  \
0  LIVE: आरसीबी की धमाकेदार शुरुआत, कोहली कर रहे ...  https://www.aajtak.in/   
1  Lucknow Super Giants (LSG) vs Royal Challenger...  https://www.aajtak.in/   
2  20 करोड़ कर्ज, 5 साल से अंडरग्राउंड... कार में...  https://www.aajtak.in/   
3  ऑनस्क्रीन Kiss करने में छूटे थे एक्टर के पसीने...  https://www.aajtak.in/   
4  भारत-पाक युद्ध के बीच शादी टालने का मन बना चुक...  https://www.aajtak.in/   

                scraped_time  
0 2025-05-27 22:23:41.095379  
1 2025-05-27 22:23:41.095379  
2 2025-05-27 22:23:41.095379  
3 2025-05-27 22:23:41.095379  
4 2025-05-27 22:23:41.095379  


### **Practiced the Modified Code (scraper refactored into a reusable function)**

In [16]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

def scrape_headlines(url):
    """Scrape the text of every non-empty <h3> element from a rendered page.

    The page is opened in headless Chrome, given a fixed 5-second pause so
    that client-side scripts can run, and then parsed with BeautifulSoup.

    Args:
        url: Address of the page to load.

    Returns:
        pandas.DataFrame with one row per headline and the columns
        'headline' (stripped tag text), 'source' (the ``url`` argument) and
        'scraped_time' (a single ``datetime.now()`` value shared by all rows).

    Raises:
        Any exception raised by Selenium while starting the browser or
        loading the page is propagated; the browser is closed first.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(5)
        # Grab the HTML before the session is closed in ``finally``.
        page_html = driver.page_source
    finally:
        # Guarantees the Chrome process is released even when loading fails.
        driver.quit()

    soup = BeautifulSoup(page_html, 'html.parser')
    # Strip each tag's text once and keep only the non-empty results.
    stripped_texts = (tag.get_text(strip=True) for tag in soup.find_all('h3'))
    headlines = [text for text in stripped_texts if text]
    df = pd.DataFrame({
        'headline': headlines,
        'source': url,
        'scraped_time': datetime.now()
    })
    return df

# Page to scrape in this cell.
url = "https://www.aajtak.in/"
# Run the scraper and print only the first rows of the result as a preview.
print(
    scrape_headlines(url=url).head()
)

                                            headline                  source  \
0  LIVE: आरसीबी की धमाकेदार शुरुआत, कोहली कर रहे ...  https://www.aajtak.in/   
1  Lucknow Super Giants (LSG) vs Royal Challenger...  https://www.aajtak.in/   
2  20 करोड़ कर्ज, 5 साल से अंडरग्राउंड... कार में...  https://www.aajtak.in/   
3  गाजा में इजरायली हमले में मरने वालों की संख्या...  https://www.aajtak.in/   
4  ऑनस्क्रीन Kiss करने में छूटे थे एक्टर के पसीने...  https://www.aajtak.in/   

                scraped_time  
0 2025-05-27 22:22:44.595874  
1 2025-05-27 22:22:44.595874  
2 2025-05-27 22:22:44.595874  
3 2025-05-27 22:22:44.595874  
4 2025-05-27 22:22:44.595874  


### **Used a New Link: "Times Now"**

In [20]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_headlines(url):
    """Scrape the text of every non-empty <h3> element from a rendered page.

    The page is opened in headless Chrome with a custom user-agent argument.
    Instead of a fixed sleep, the function waits (up to 30 seconds) until at
    least one <h3> element is present, then parses the page with
    BeautifulSoup.

    Args:
        url: Address of the page to load.

    Returns:
        pandas.DataFrame with one row per headline and the columns
        'headline' (stripped tag text), 'source' (the ``url`` argument) and
        'scraped_time' (a single ``datetime.now()`` value shared by all rows).
        If the page load or the wait for an <h3> times out, a message is
        printed and an empty DataFrame (no columns) is returned.

    Raises:
        Selenium errors other than a timeout are propagated; the browser is
        closed first.
    """
    # Selenium signals both page-load and WebDriverWait timeouts with its own
    # TimeoutException, which is not a subclass of the builtin TimeoutError.
    # Imported locally so this cell does not depend on edits to other cells.
    from selenium.common.exceptions import TimeoutException

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124')
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_page_load_timeout(300)
        driver.get(url)
        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "h3")))
        # Grab the HTML before the session is closed in ``finally``.
        page_html = driver.page_source
    except (TimeoutException, TimeoutError):
        # The builtin TimeoutError is still caught for backward compatibility.
        print("Page load timed out")
        return pd.DataFrame()
    finally:
        # Runs on success, on timeout and on any other error, so the Chrome
        # process is never left behind.
        driver.quit()

    soup = BeautifulSoup(page_html, 'html.parser')
    # Strip each tag's text once and keep only the non-empty results.
    stripped_texts = (tag.get_text(strip=True) for tag in soup.find_all('h3'))
    headlines = [text for text in stripped_texts if text]
    df = pd.DataFrame({
        'headline': headlines,
        'source': url,
        'scraped_time': datetime.now()
    })
    return df

# Page to scrape in this cell.
url = "https://www.timesnownews.com/"
# Run the scraper and print only the first rows of the result as a preview.
print(
    scrape_headlines(url=url).head()
)

                                            headline  \
0  Trump Targets Harvard, Again: $100 Million in ...   
1  Ravi Mohan Sues Estranged Wife Aarti, Mother-I...   
2                  LSG vs RCB Live Score And Updates   
3  India to Develop 5th-Gen Jet: Why AMCA Is Idea...   
4  JAC 10th 2025 Result Live: Jharkhand Class 10 ...   

                          source               scraped_time  
0  https://www.timesnownews.com/ 2025-05-27 22:35:39.954440  
1  https://www.timesnownews.com/ 2025-05-27 22:35:39.954440  
2  https://www.timesnownews.com/ 2025-05-27 22:35:39.954440  
3  https://www.timesnownews.com/ 2025-05-27 22:35:39.954440  
4  https://www.timesnownews.com/ 2025-05-27 22:35:39.954440  
