In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# List to hold all scraped data across all pages
all_scraped_data = []

# Loop through all pages
for i in range(1, 9):
    url = f"https://www.thehindu.com/sport/cricket/?page={i}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all article elements
        articles = soup.find_all('div', class_='element row-element')
        
        # Loop through each article and extract data
        for article in articles:
            try:
                # Extract title
                title_tag = article.find('h3', class_='title big')
                title = title_tag.get_text(strip=True) if title_tag else None
                
                # Extract link
                link = title_tag.find('a')['href'] if title_tag and title_tag.find('a') else None
                
                # Extract by-line
                by_line_tag = article.find('div', class_='by-line')
                by_line = by_line_tag.get_text(strip=True) if by_line_tag else None
                
                # Append the data to the global list
                if title and link:
                    all_scraped_data.append({
                        'title': title,
                        'link': link,
                        'by-line': by_line,
                    })
            except AttributeError:
                # Skip articles with unexpected structures
                continue
    else:
        print(f"Failed to retrieve page {i}. Status code: {response.status_code}")

# Create a DataFrame after the loop is complete
df = pd.DataFrame(all_scraped_data)

# Print the DataFrame or save it to a CSV file
print(df)
# df.to_csv('scraped_data.csv', index=False)


                                                 title  \
0    National junior athletics | Kiran rewrites his...   
1    Syed Mushtaq Ali Trophy | Ghosh holds his nerv...   
2    Pink ball charm — darling of the masses, villa...   
3    World Test Championship table: South Africa go...   
4    Syed Mushtaq Ali Trophy | Shami shows consiste...   
..                                                 ...   
99        Urvil smashes second-fastest century in T20s   
100  Saurashtra puts in all-round show to ease past...   
101  Jaiswal will score more than 40 Test hundreds,...   
102  Phillip Hughes: Australia remembers cricketer ...   
103  IPL Auction 2025: Of high bids and inflated pr...   

                                                  link           by-line  
0    https://www.thehindu.com/sport/cricket/nationa...      Y.B. Sarangi  
1    https://www.thehindu.com/sport/cricket/syed-mu...      Ashwin Achal  
2    https://www.thehindu.com/sport/cricket/pink-ba...        R. Kaushik  
3  

In [13]:
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
for i in range(1,62):
    url = f"https://www.factcheck.org/scicheck/page/{i}/"

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all articles in the SciCheck section
        articles = soup.find_all('article')
        
        # List to hold the scraped data
        scraped_data = []
        
        # Loop through each article and extract the title, link, date, and summary
        for article in articles:
            title = article.find('h3', class_='entry-title').get_text(strip=True)
            link = article.find('h3', class_='entry-title').find('a')['href']
            date = article.find('div', class_='entry-meta').get_text(strip=True)
            summary = article.find('div', class_='entry-content').get_text(strip=True)
            
            
            # Append the data to the list
            scraped_data.append({
                'title': title,
                'link': link,
                'date': date,
                'summary': summary
            })
        data_list = []

        # Loop through the scraped data to populate the list
        for data in scraped_data:
            data_list.append({
                "title": data['title'],
                "link": data['link'],
                "date": data['date'],
                "summary": data['summary']
            })

        # Create a DataFrame from the list of dictionaries
        df = pd.DataFrame(data_list)

        # Display the DataFrame
        print(df)
        # Print the scraped data with classification
        for data in scraped_data:
            print(f"Title: {data['title']}")
            print(f"Link: {data['link']}")
            print(f"Date: {data['date']}")
            print(f"Summary: {data['summary']}")
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

                                               title  \
0     Sen. Mullin’s Misleading Vaccine Testing Claim   
1  As Trump Taps RFK Jr. for Health Secretary, a ...   
2  Trump Embraces RFK Jr.’s Views on Vaccines, Fl...   
3                 Harris vs. Trump on Climate Change   
4  Florida’s 2024-2025 COVID-19 Vaccine Guidance ...   
5             Q&A on the 2024-2025 COVID-19 Vaccines   
6  Baseless Claims Proliferate on Hurricanes and ...   
7  Posts Sharing Mpox Misinformation Recycle Clai...   
8     Q&A on the Second International Mpox Emergency   
9  Trump Clings to Inaccurate Climate Change Talk...   

                                                link                date  \
0  https://www.factcheck.org/2024/12/sen-mullins-...    December 6, 2024   
1  https://www.factcheck.org/2024/11/as-trump-tap...   November 22, 2024   
2  https://www.factcheck.org/2024/11/trump-embrac...    November 4, 2024   
3  https://www.factcheck.org/2024/11/harris-vs-tr...    November 1, 2024   
4  

In [7]:
df.head()

Unnamed: 0,title,link,by-line
0,National junior athletics | Kiran rewrites his...,https://www.thehindu.com/sport/cricket/nationa...,Y.B. Sarangi
1,Syed Mushtaq Ali Trophy | Ghosh holds his nerv...,https://www.thehindu.com/sport/cricket/syed-mu...,Ashwin Achal
2,"Pink ball charm — darling of the masses, villa...",https://www.thehindu.com/sport/cricket/pink-ba...,R. Kaushik
3,World Test Championship table: South Africa go...,https://www.thehindu.com/sport/cricket/world-t...,PTI
4,Syed Mushtaq Ali Trophy | Shami shows consiste...,https://www.thehindu.com/sport/cricket/shami-b...,Ashwin Achal


In [8]:
df.shape

(104, 3)

In [9]:
df.to_csv("Web_Scrapping_Hindu.csv")