In [17]:
'''
Qn1:
Scrape the details of most viewed videos on YouTube from Wikipedia. Url 
= https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_videos You need to find following details: A)
Rank 
B) Name 
C) Artist 
D) Upload date 
E) View
'''
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Exception handling block
try:
    # GET request to Wikipedia page
    url = "https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_videos"
    response = requests.get(url)

    # Raise an exception for failures
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    
    table = soup.find('table', {'class': 'wikitable sortable'})
    
    if table is None:
        raise Exception("Could not find the table on the Wikipedia page.")

    # Scrape the details (Rank, Name, Artist, Upload date, Views)
    videos = []
    rows = table.find_all('tr')[1:]  # Skip the header row

    for row in rows:
        cols = row.find_all('td')

        # Check if row has expected number of columns
        if len(cols) < 5:
            continue  # Skip rows that don't have enough columns

        try:
            
            rank = cols[0].text.strip()

            # Name 
            name = cols[1].text.strip().replace('"', '')

            # Artist/Creator
            artist = cols[2].text.strip()

            # Upload date
            upload_date = cols[3].text.strip()

            # View count
            views = cols[4].text.strip()

            
            videos.append({
                'Rank': rank,
                'Name': name,
                'Artist': artist,
                'Upload Date': upload_date,
                'Views': views
            })
        
        except IndexError as e:
            print(f"Error while accessing table columns: {e}")
        except Exception as e:
            print(f"An error occurred while scraping the row: {e}")

    # Convert the data to a DataFrame
    df = pd.DataFrame(videos)

    # Display DataFrame
    print(df)

except requests.exceptions.RequestException as e:
    print(f"An error occurred while making the HTTP request: {e}")
except Exception as e:
    print(f"An error occurred: {e}")


                                                 Rank  \
0                               "Baby Shark Dance"[7]   
1                                     "Despacito"[10]   
2                                 "See You Again"[21]   
3                                "Gangnam Style"⁂[31]   
4                                         "Baby"*[69]   
5                                   "Bad Romance"[73]   
6                         "Charlie Bit My Finger"[77]   
7   "Evolution of Dance" (3rd time as most viewed ...   
8                               "Girlfriend"‡[81][82]   
9   "Evolution of Dance" (2nd time as most viewed ...   
10                     "Music Is My Hot Hot Sex"‡[87]   
11                          "Evolution of Dance"*[79]   
12                   "Pokémon Theme Music Video"‡[92]   
13                     "Myspace – The Movie"‡[97][98]   
14                          "Phony Photo Booth"‡[101]   
15                  "The Chronic of Narnia Rap"‡[107]   
16                 "Ronaldinho:

In [1]:
'''Qn2:

Scrape the details team India’s international fixtures from bcci.tv. 
Url = https://www.bcci.tv/. 
You need to find following details: 
A) Series 
B) Place 
C) Date 
D) Time 
Note: - From bcci.tv home page you have reach to the international fixture page through code.

'''

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
import time

# scrape the data
def scrape_bcci_fixtures():
    try:

        chrome_options = Options()
        chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"  # Path

        #Selenium WebDriver with path to chromedriver
        path_chromedriver = r'C:\chromedriver-win64\chromedriver.exe'  
        service = Service(path_chromedriver)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.get('https://www.bcci.tv/')

        
        time.sleep(5)
        
        # Navigate to international fixtures page
        try:
            fixtures_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//a[@href='/international/fixtures']"))
            )
            fixtures_button.click()
        except TimeoutException:
            print("Couldn't find the Fixtures button.")
            driver.quit()
            return
        
        # Wait for page to load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'fixture-card')))

        # Extract fixtures
        fixtures = driver.find_elements(By.CLASS_NAME, 'fixture-card')

        for fixture in fixtures:
            try:
                # Extract series 
                series_name = fixture.find_element(By.CLASS_NAME, 'fixture-card__series-name').text
                
                # Extract location
                place = fixture.find_element(By.CLASS_NAME, 'fixture-card__venue').text
                
                # Extract date,time
                date_time = fixture.find_element(By.CLASS_NAME, 'fixture-card__datetime').text
                
                # Split date,time
                date, time_ = date_time.split("•")

                # Print details
                print(f"Series: {series_name}\nPlace: {place}\nDate: {date.strip()}\nTime: {time_.strip()}\n")

            except NoSuchElementException:
                print("An element was not found on the fixture card. Skipping.")
            except Exception as e:
                print(f"An unexpected error occurred: {str(e)}")

    except WebDriverException as e:
        print(f"Error with the WebDriver: {str(e)}")
    finally:
        # Close the driver
        driver.quit()

# Call scraping function
scrape_bcci_fixtures()


Couldn't find the Fixtures button. Exiting.


In [2]:
'''
Qn3:

Scrape the details of State-wise GDP of India from statisticstime.com. 
Url = http://statisticstimes.com/ 
You have to find following details: A) Rank 
B) State 
C) GSDP(18-19)- at current prices 
D) GSDP(19-20)- at current prices 
E) Share(18-19) 
F) GDP($ billion) 
Note: - From statisticstimes home page you have to reach to economy page through code.

'''

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import pandas as pd

# Function to scrape state-wise GDP details
def scrape_gdp_data():
    try:

        chrome_options = Options()
        chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"  # Path

        
        path_chromedriver = r'C:\chromedriver-win64\chromedriver.exe'  # path to chromedriver executable
        service = Service(path_chromedriver)
        driver = webdriver.Chrome(service=service, options=chrome_options)    
        
        # Navigate to homepage
        driver.get("http://statisticstimes.com/")

        # Wait for page to load and click on the Economy link
        try:
          
            economy_link = WebDriverWait(driver, 50).until(
                EC.presence_of_element_located((By.LINK_TEXT, "Economy"))
            )
            economy_link.click()

        except TimeoutException:
            print("Economy page link not found or page took too long to load.")
            driver.quit()
            return

        # Wait for the State-wise GDP section to load
        try:
            gdp_link = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.LINK_TEXT, "Indian states by GDP"))
            )
            gdp_link.click()

        except TimeoutException:
            print("GDP page link not found or page took too long to load.")
            driver.quit()
            return

        # scrape the data.
        try:
           
            table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "display.dataTable"))
            )
            rows = table.find_elements(By.TAG_NAME, "tr")

            # Extract header 
            headers = [header.text for header in rows[0].find_elements(By.TAG_NAME, "th")]

            # Extract data rowwise
            data = []
            for row in rows[1:]:
                cols = row.find_elements(By.TAG_NAME, "td")
                if len(cols) == 6:  # Ensure there are 6 columns as per the required data
                    data.append({
                        'Rank': cols[0].text,
                        'State': cols[1].text,
                        'GSDP(18-19)': cols[2].text,
                        'GSDP(19-20)': cols[3].text,
                        'Share(18-19)': cols[4].text,
                        'GDP($ billion)': cols[5].text
                    })

            # Store the data DataFrame
            df = pd.DataFrame(data)
            print(df)
            # Save to a CSV file
            df.to_csv('india_gdp_by_state.csv', index=False)

        except NoSuchElementException as e:
            print("Unable to locate the GDP data table.", e)
        
    except WebDriverException as e:
        print("Error initializing WebDriver", e)
    
    finally:
        
        driver.quit()

# Call scraper function
scrape_gdp_data()


Economy page link not found or page took too long to load.


In [None]:
'''
Qn4:
Scrape the details of trending repositories on Github.com. 
Url = https://github.com/ 
You have to find the following details: 
A) Repository title 
B) Repository description 
C) Contributors count 
D) Language used 
Note: - From the home page you have to click on the trending option from Explore menu through code.
'''


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import pandas as pd

# Exception handling block
try:
    
    chrome_options = Options()
    chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"  # Path    
    path_chromedriver = r'C:\chromedriver-win64\chromedriver.exe'  # path to chromedriver executable
    service = Service(path_chromedriver)
    driver = webdriver.Chrome(service=service, options=chrome_options)  

    # Step 1: Open the GitHub homepage
    driver.get("https://github.com/")
    
    # Step 2: Wait until the 'Explore' link is available and click on it
    wait = WebDriverWait(driver, 10)
    explore_menu = wait.until(EC.element_to_be_clickable((By.XPATH, "//summary[contains(text(),'Explore')]")))
    explore_menu.click()
    
    # Step 3: Wait until the 'Trending' link is available and click on it
    trending_link = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[contains(text(),'Trending')]")))
    trending_link.click()

    # Step 4: Scrape details for the trending repositories
    trending_repos = []
    repo_elements = driver.find_elements(By.XPATH, "//article[@class='Box-row']")

    for repo in repo_elements:
        try:
            # Repository Title
            repo_title = repo.find_element(By.XPATH, ".//h1/a").text
            
            # Repository Description
            try:
                repo_description = repo.find_element(By.XPATH, ".//p").text
            except NoSuchElementException:
                repo_description = "No description provided"
            
            # Contributors Count
            try:
                contributors = repo.find_elements(By.XPATH, ".//a[contains(@href, '/graphs/contributors')]")
                contributors_count = len(contributors)
            except NoSuchElementException:
                contributors_count = "Not available"
            
            # Language Used
            try:
                language = repo.find_element(By.XPATH, ".//span[@itemprop='programmingLanguage']").text
            except NoSuchElementException:
                language = "Not specified"

            # Append data to list
            trending_repos.append({
                "Repository Title": repo_title,
                "Repository Description": repo_description,
                "Contributors Count": contributors_count,
                "Language Used": language
            })
        
        except NoSuchElementException as e:
            print(f"Could not find an element for a repository: {e}")

    # Step 5: Convert to DataFrame and display
    df = pd.DataFrame(trending_repos)
    print(df)
    
except NoSuchElementException as e:
    print("Element not found on the page:", e)
except TimeoutException as e:
    print("Loading took too much time:", e)
except Exception as e:
    print("An error occurred:", e)
finally:
    # Close the WebDriver session
    driver.quit()


In [None]:
'''
Qn5:
Scrape the details of top 100 songs on billiboard.com. Url = https:/www.billboard.com/ You have to find the 
following details: 
A) Song name 
B) Artist name 
C) Last week rank 
D) Peak rank 
E) Weeks on board 
 Note: - From the home page you have to click on the charts option then hot 100-page link through code. 
'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import pandas as pd



# Exception handling block
try:
    chrome_options = Options()
    chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"  # Path    
    path_chromedriver = r'C:\chromedriver-win64\chromedriver.exe'  # path to chromedriver executable
    service = Service(path_chromedriver)
    driver = webdriver.Chrome(service=service, options=chrome_options)
        
    # Step 1: Open the Billboard website homepage
    driver.get("https://www.billboard.com/")

    # Step 2: Wait until the 'Charts' link is available and click on it
    wait = WebDriverWait(driver, 10)
    charts_link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Charts")))
    charts_link.click()

    # Step 3: Wait until the 'Hot 100' link is available and click on it
    hot_100_link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Hot 100")))
    hot_100_link.click()

    # Step 4: Scrape details for the top 100 songs
    songs_data = []
    song_elements = driver.find_elements(By.XPATH, "//ul[@class='o-chart-results-list-row']")

    for song in song_elements[:100]:  # Extract details for the top 100
        try:
            # Song Name
            song_name = song.find_element(By.XPATH, ".//h3").text

            # Artist Name
            artist_name = song.find_element(By.XPATH, ".//span[@class='c-label']").text

            # Last Week Rank
            last_week_rank = song.find_element(By.XPATH, ".//li[@class='lrv-u-width-100p u-width-full']/span[1]").text

            # Peak Rank
            peak_rank = song.find_element(By.XPATH, ".//li[@class='lrv-u-width-100p u-width-full']/span[2]").text

            # Weeks on Board
            weeks_on_board = song.find_element(By.XPATH, ".//li[@class='lrv-u-width-100p u-width-full']/span[3]").text

            # Append data to list
            songs_data.append({
                "Song Name": song_name,
                "Artist Name": artist_name,
                "Last Week Rank": last_week_rank,
                "Peak Rank": peak_rank,
                "Weeks on Board": weeks_on_board
            })
        
        except NoSuchElementException as e:
            print(f"Could not find an element for a song: {e}")

    # Step 5: Convert to DataFrame and display
    df = pd.DataFrame(songs_data)
    print(df)
    
except NoSuchElementException as e:
    print("Element not found on the page:", e)
except TimeoutException as e:
    print("Loading took too much time:", e)
except Exception as e:
    print("An error occurred:", e)
finally:
    # Close the WebDriver session
    driver.quit()


In [3]:
'''
Qn6: Scrape the details of Highest selling novels. 
A) Book name 
B) Author name 
C) Volumes sold 
D) Publisher 
E) Genre 
 Url - https://www.theguardian.com/news/datablog/2012/aug/09/best-selling-books-all-time-fifty-shades-grey-compare
 
''' 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
import pandas as pd

# Function to scrape highest-selling novel details
def scrape_best_selling_books():
    try:

        chrome_options = Options()
        chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"  # Path

        
        path_chromedriver = r'C:\chromedriver-win64\chromedriver.exe'  # path to chromedriver executable
        service = Service(path_chromedriver)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        
        # Navigate to the  URL
        driver.get("https://www.theguardian.com/news/datablog/2012/aug/09/best-selling-books-all-time-fifty-shades-grey-compare")

        # Wait for the page to load and locate table containing book data
        try:
            # locate the table using XPath or CSS selector
            table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//table"))
            )
            print("Table found, extracting data...")

        except TimeoutException:
            print("Table not found or page took too long to load.")
            driver.quit()
            return

        # Scrape table rows
        try:
            # Find all rows in table
            rows = table.find_elements(By.TAG_NAME, "tr")

            # Extract headers 
            headers = [header.text for header in rows[0].find_elements(By.TAG_NAME, "th")]

            # Extract data from each row 
            data = []
            for row in rows[1:]:
                cols = row.find_elements(By.TAG_NAME, "td")
                if len(cols) == 5:  # Ensure there are 5 columns as per the required data
                    data.append({
                        'Book Name': cols[0].text,
                        'Author Name': cols[1].text,
                        'Volumes Sold': cols[2].text,
                        'Publisher': cols[3].text,
                        'Genre': cols[4].text
                    })

            # Store data in a pandas DataFrame
            df = pd.DataFrame(data)
            print(df)

            # Save to a CSV file
            df.to_csv('best_selling_books.csv', index=False)

        except NoSuchElementException as e:
            print("Unable to locate table rows or data.", e)

    except WebDriverException as e:
        print("Error initializing WebDriver", e)

    finally:
        
        driver.quit()

# Call function
scrape_best_selling_books()


Table found, extracting data...
Empty DataFrame
Columns: []
Index: []


In [4]:
'''
Qn 7:

Scrape the details most watched tv series of all time from imdb.com. 
Url = https://www.imdb.com/list/ls095964455/ You have 
to find the following details: 
A) Name 
B) Year span 
C) Genre 
D) Run time 
E) Ratings 
F) Votes

'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
import pandas as pd

# Function to scrape most-watched TV series from IMDb
def scrape_most_watched_tv_series():
    try:

        chrome_options = Options()
        chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"  # Path

        #Selenium WebDriver with the correct path to chromedriver
        path_chromedriver = r'C:\chromedriver-win64\chromedriver.exe'  # path to your chromedriver executable
        service = Service(path_chromedriver)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Navigate to the IMDb list URL
        driver.get("https://www.imdb.com/list/ls053826112/")

        # Wait for the page to load and locate the TV series elements
        try:
            # Wait until the list of TV series is present on the page
            tv_series_elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, "//div[@class='lister-item-content']"))
            )
            print("TV series elements found, extracting data...")

        except TimeoutException:
            print("TV series list not found or page took too long to load.")
            driver.quit()
            return

        # Scrape data from the TV series elements
        try:
            data = []
            for series in tv_series_elements:
                try:
                    # Extracting the required fields
                    name = series.find_element(By.XPATH, ".//h3/a").text
                    year_span = series.find_element(By.XPATH, ".//h3/span[contains(@class, 'lister-item-year')]").text
                    genre = series.find_element(By.XPATH, ".//span[@class='genre']").text.strip()
                    run_time = series.find_element(By.XPATH, ".//span[@class='runtime']").text
                    rating = series.find_element(By.XPATH, ".//div[@class='ipl-rating-star small']//span[@class='ipl-rating-star__rating']").text
                    votes = series.find_element(By.XPATH, ".//span[@name='nv']").text

                    # Append the data into a dictionary
                    data.append({
                        'Name': name,
                        'Year Span': year_span,
                        'Genre': genre,
                        'Run Time': run_time,
                        'Ratings': rating,
                        'Votes': votes
                    })

                except NoSuchElementException as e:
                    print(f"Error extracting data for a TV series: {e}")
                    continue

            # Store the data in a pandas DataFrame
            df = pd.DataFrame(data)
            print(df)

            # Save the data to a CSV file
            df.to_csv('most_watched_tv_series.csv', index=False)

        except NoSuchElementException as e:
            print("Unable to locate necessary elements.", e)

    except WebDriverException as e:
        print("Error initializing WebDriver", e)

    finally:
        # Close the browser
        driver.quit()

# Run the scraper function
scrape_most_watched_tv_series()


TV series list not found or page took too long to load.


In [9]:
'''
Qn8:
Details of Datasets from UCI machine learning repositories. 
Url = https://archive.ics.uci.edu/ You 
have to find the following details: 
A) Dataset name 
B) Data type 
C) Task 
D) Attribute type 
E) No of instances 
F) No of attribute G) Year 
 Note: - from the home page you have to go to the Show All Dataset page through code.
 '''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
import pandas as pd




def get_dataset_details(url):
  

  try:
    
    chrome_options = Options()
    chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"  # Path

      
    path_chromedriver = r'C:\chromedriver-win64\chromedriver.exe'  # path to chromedriver executable
    service = Service(path_chromedriver)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.get(url)

    # Wait for the "View All Datasets" button to be clickable and then click it
    view_all_datasets_button = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.LINK_TEXT, "View All Datasets"))
    )
    view_all_datasets_button.click()

    # Wait for the dataset table to load
    dataset_table = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table.table"))
    )

    # Extract dataset details from the table
    rows = dataset_table.find_elements(By.TAG_NAME, "tr")[1:]  # Skip the header row
    dataset_details = []
    for row in rows:
      columns = row.find_elements(By.TAG_NAME, "td")
      dataset_name = columns[0].text.strip()
      data_type = columns[1].text.strip()
      task = columns[2].text.strip()
      attribute_type = columns[3].text.strip()
      no_of_instances = columns[4].text.strip()
      no_of_attribute = columns[5].text.strip()
      year = columns[6].text.strip()

      dataset_details.append({
          "Dataset Name": dataset_name,
          "Data Type": data_type,
          "Task": task,
          "Attribute Type": attribute_type,
          "No of Instances": no_of_instances,
          "No of Attributes": no_of_attribute,
          "Year": year
      })

    return dataset_details

  except TimeoutException:
    print("Loading took too much time!")
  except NoSuchElementException:
    print("Could not find the element!")
  finally:
    driver.quit()


if __name__ == "__main__":
  url = "https://archive.ics.uci.edu/"
  datasets = get_dataset_details(url)
  if datasets:
    for dataset in datasets:
      print(dataset)

Loading took too much time!
