# In [2]:
# Import necessary libraries
import csv
import os
import requests
from bs4 import BeautifulSoup, Tag
from urllib.parse import urljoin

# Define the range of datasets to scrape.
# Both values are passed to get_uci_datasets() below, which numbers the dataset
# rows of the repository listing starting from 1.
start_index = 1
end_index = 623

# Helper function to remove <sup> tags from a given BeautifulSoup tag
def remove_sup_tags(tag):
    """Detach every <sup> element found inside ``tag`` and return ``tag``.

    The tag is modified in place; the same object is handed back so the call
    can be chained.
    """
    superscripts = tag.find_all("sup")
    for superscript in superscripts:
        superscript.extract()
    return tag


def has_papers_that_cite_this_data_set(tag):
    """Tell whether ``tag`` is a paragraph of the citing-papers section.

    A tag qualifies when it is a ``<p>`` whose class list is exactly
    ``["normal"]`` and the closest preceding ``<p class="small-heading">``
    in the document contains the text "Papers That Cite This Data Set".
    Intended to be used as a filter function for BeautifulSoup searches.
    """
    is_normal_paragraph = tag.name == "p" and tag.get("class") == ["normal"]
    if not is_normal_paragraph:
        return False

    # Look backwards through the document for the heading that governs this paragraph.
    heading = tag.find_previous("p", class_="small-heading")
    if heading:
        return "Papers That Cite This Data Set" in heading.get_text(strip=True)
    return False


def _extract_section_text(soup, heading):
    """Return the text of the "normal" paragraphs directly following a heading.

    Looks for ``<p class="small-heading">`` whose string equals ``heading`` and
    collects the consecutive ``<p class="normal">`` siblings that follow it.
    Each paragraph's inner HTML is split on ``<br/>`` and every piece is
    stripped; the pieces are joined with newlines. Returns "" when the heading
    is absent or has no such paragraphs after it.
    """
    heading_tag = soup.find("p", class_="small-heading", string=heading)
    if heading_tag is None:
        return ""

    lines = []
    sibling = heading_tag.find_next_sibling()
    # Stop at the first sibling that is not a "normal" paragraph (typically the
    # next section heading).
    while (
        sibling is not None
        and sibling.name == "p"
        and "normal" in sibling.get("class", [])
    ):
        lines.extend(part.strip() for part in sibling.decode_contents().split("<br/>"))
        sibling = sibling.find_next_sibling()

    return "\n".join(lines)


def _extract_citing_papers(soup):
    """Return ``(papers, count)`` for the "Papers That Cite This Data Set" section.

    ``papers`` is a "; "-joined string of the non-empty, stripped text pieces
    obtained by splitting each "normal" paragraph at its ``<br>`` tags, and
    ``count`` is the number of pieces. Returns ``("", 0)`` when the section is
    not found. The ``<br>`` tags of the visited paragraphs are replaced in the
    soup, so this should run after any other extraction that needs them.
    """
    sibling = soup.find(has_papers_that_cite_this_data_set)
    papers = []

    # NOTE(review): this walk visits every following <p> sibling and does not
    # stop at the next "small-heading", so "normal" paragraphs of any later
    # section among these siblings are collected as well - confirm this is intended.
    while sibling is not None:
        if "normal" in sibling.get("class", []):
            # Turn each <br> into a text placeholder so the visible text can be
            # split into one entry per line break.
            for br in sibling.find_all("br"):
                br.replace_with("|||")
            papers.extend(
                entry.strip() for entry in sibling.text.split("|||") if entry.strip()
            )
        sibling = sibling.find_next_sibling("p")

    return "; ".join(papers), len(papers)


def get_dataset_details(url):
    """Fetch a dataset page and extract its descriptive details.

    Args:
        url: Address of the dataset page to download and parse.

    Returns:
        A 9-tuple ``(area, date_donated, web_hits, attribute_info, source,
        data_set_information, relevant_papers, papers_that_cite_this_data_set,
        num_papers)``. The first eight items are strings ("" when the item is
        not found) and ``num_papers`` is an int. If the page has no
        ``<table border="1">`` at all, every item - including ``num_papers`` -
        is the empty string.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps one unresponsive page from stalling the whole scrape.
    response = requests.get(url, timeout=30)

    # Decode leniently: undecodable bytes become replacement characters.
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    table = soup.find("table", {"border": "1"})
    if table is None:
        return "", "", "", "", "", "", "", "", ""

    area = ""
    date_donated = ""
    web_hits = ""

    # Summary rows have six cells; the fifth holds a label and the sixth its value.
    for row in table.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) != 6:
            continue
        label = cols[4].get_text(strip=True)
        value = cols[5].get_text(strip=True)
        if "Area:" in label:
            area = value
        if "Date Donated" in label:
            date_donated = value
        if "Number of Web Hits:" in label:
            web_hits = value

    attribute_info = _extract_section_text(soup, "Attribute Information:")
    source = _extract_section_text(soup, "Source:")
    data_set_information = _extract_section_text(soup, "Data Set Information:")
    relevant_papers = _extract_section_text(soup, "Relevant Papers:")

    # Done last because it rewrites <br> tags inside the soup.
    papers_that_cite_this_data_set, num_papers = _extract_citing_papers(soup)

    return (
        area,
        date_donated,
        web_hits,
        attribute_info,
        source,
        data_set_information,
        relevant_papers,
        papers_that_cite_this_data_set,
        num_papers,
    )
   
def fetch_data_folder_url(url):
    """Return the absolute URL of the data folder linked from a dataset page.

    The page at ``url`` is downloaded and searched for the first ``<a>`` whose
    ``href`` contains "machine-learning-databases".

    Args:
        url: Address of the dataset page.

    Returns:
        The link target resolved against ``url``, or ``None`` when the page has
        no such link.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps one unresponsive page from stalling the whole scrape.
    response = requests.get(url, timeout=30)
    # Decode leniently: undecodable bytes become replacement characters.
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    data_folder_tag = soup.find(
        "a",
        href=lambda href: bool(href) and "machine-learning-databases" in href,
    )
    if data_folder_tag is None:
        return None

    # urljoin copes with both relative and absolute href values.
    return urljoin(url, data_folder_tag["href"])

def fetch_dataset_file(url):
    """Find the first link on a page whose text looks like a dataset file name.

    Links are examined in page order. For each link the known formats are tried
    in a fixed order and the first one whose ".<format>" occurs anywhere in the
    link text wins (a substring test, not a strict suffix test).

    Args:
        url: Address of the page listing the files (the data folder).

    Returns:
        ``(file_url, file_format)`` where ``file_url`` is the link target
        resolved against ``url`` and ``file_format`` is the matched format
        without the leading dot (e.g. "data", "csv", "Z"). ``("", "")`` when no
        link matches.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps one unresponsive page from stalling the whole scrape.
    response = requests.get(url, timeout=30)
    # Decode leniently: undecodable bytes become replacement characters.
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    # Priority order per link. "xlsx" has to come before "xls": the test is a
    # substring match, so ".xls" would otherwise claim every ".xlsx" file.
    # Matching is case-sensitive, hence the separate "Tar"/"tar" entries.
    known_formats = (
        "data", "txt", "csv", "xlsx", "xls", "rar", "arff", "mat", "Z",
        "zip", "Tar", "gz", "dat", "tar", "7z", "json", "tsv",
    )

    # href=True skips anchors that have no target to return.
    for link in soup.find_all("a", href=True):
        link_text = link.text
        for file_format in known_formats:
            if f".{file_format}" in link_text:
                return urljoin(url, link["href"]), file_format

    return "", ""

def fetch_names_file(url):
    """Find the first link on a page whose text looks like a names/info file.

    Links are examined in page order. For each link the known formats are tried
    in a fixed order and the first one whose ".<format>" occurs anywhere in the
    link text wins (a substring test, not a strict suffix test).

    Args:
        url: Address of the page listing the files (the data folder).

    Returns:
        ``(file_url, file_format)`` where ``file_url`` is the link target
        resolved against ``url`` and ``file_format`` is one of "names", "info",
        "docx" or "doc". ``("", "")`` when no link matches.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps one unresponsive page from stalling the whole scrape.
    response = requests.get(url, timeout=30)
    # Decode leniently: undecodable bytes become replacement characters.
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    # Priority order per link. "docx" has to come before "doc": the test is a
    # substring match, so ".doc" would otherwise claim every ".docx" file.
    known_formats = ("names", "info", "docx", "doc")

    # href=True skips anchors that have no target to return.
    for link in soup.find_all("a", href=True):
        link_text = link.text
        for file_format in known_formats:
            if f".{file_format}" in link_text:
                return urljoin(url, link["href"]), file_format

    return "", ""

def get_uci_datasets(start_index, end_index):
    """Scrape the UCI Machine Learning Repository listing for a range of datasets.

    The listing page is downloaded and its ``<table border="1">`` is walked row
    by row, skipping the first row. Rows with fewer than nine cells are
    ignored; the remaining rows are numbered from 1 in page order, and those
    numbered ``start_index`` through ``end_index`` (both inclusive) are
    processed. For each one, the listing columns are combined with the result
    of ``get_dataset_details`` for the dataset's own page.

    Args:
        start_index: 1-based position of the first dataset to include.
        end_index: 1-based position of the last dataset to include.

    Returns:
        A list of dicts, one per dataset, with the keys "index", "name", "url",
        "instances", "attributes", "year", "area", "date_donated", "web_hits",
        "attribute_info", "source", "data_set_information", "relevant_papers",
        "papers_that_cite_this_data_set", "num_papers", and the five file
        fields ("data_folder_url", "dataset_file_url", "dataset_file_format",
        "names_file_url", "names_file_format") initialised to "". An empty list
        (after printing "Table not found") when the listing table is missing.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    url = "https://archive.ics.uci.edu/ml/datasets.php"

    # A timeout keeps an unresponsive server from stalling the scrape forever.
    response = requests.get(url, timeout=30)
    # Decode leniently: undecodable bytes become replacement characters.
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    table = soup.find("table", {"border": "1"})
    if table is None:
        print("Table not found")
        return []

    dataset_list = []

    # 1-based position of the current dataset row within the listing.
    position = 0

    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")

        # Rows with fewer than nine cells are not dataset rows.
        if len(cols) < 9:
            continue

        position += 1
        if position < start_index:
            continue
        # Strictly greater: the dataset at end_index itself is part of the range.
        if position > end_index:
            break

        # Without a link there is no dataset page to scrape for this row.
        link = cols[0].find("a", href=True)
        if link is None:
            continue

        dataset_name = cols[0].get_text(strip=True)
        dataset_url = urljoin(url, link["href"])
        instances = cols[6].get_text(strip=True)
        attributes = cols[7].get_text(strip=True)
        year = cols[8].get_text(strip=True)

        (
            area,
            date_donated,
            web_hits,
            attribute_info,
            source,
            data_set_information,
            relevant_papers,
            papers_that_cite_this_data_set,
            num_papers,
        ) = get_dataset_details(dataset_url)

        # Progress output: dataset position and its number of citing papers.
        print(position, "num_papers", num_papers)

        dataset_list.append({
            "index": position,
            "name": dataset_name,
            "url": dataset_url,
            "instances": instances,
            "attributes": attributes,
            "year": year,
            "area": area,
            "date_donated": date_donated,
            "web_hits": web_hits,
            "attribute_info": attribute_info,
            "source": source,
            "data_set_information": data_set_information,
            "relevant_papers": relevant_papers,
            "papers_that_cite_this_data_set": papers_that_cite_this_data_set,
            "num_papers": num_papers,
            # Filled in later, once the data folder has been inspected.
            "data_folder_url": "",
            "dataset_file_url": "",
            "dataset_file_format": "",
            "names_file_url": "",
            "names_file_format": "",
        })

    return dataset_list

# Scrape the listing and the per-dataset detail pages for the configured range.
datasets = get_uci_datasets(start_index, end_index)

# Column order of the output CSV.
fieldnames = [
    "index", "name", "url", "instances", "attributes", "year", "area",
    "date_donated", "web_hits", "data_folder_url", "dataset_file_url",
    "dataset_file_format", "names_file_url", "names_file_format",
    "attribute_info", "source", "data_set_information", "relevant_papers",
    "papers_that_cite_this_data_set", "num_papers",
]

# Save results to CSV file.
# The encoding is explicit because the scraped text is not limited to ASCII and
# the platform default encoding may be unable to represent it.
with open("uci_datasets.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for i, dataset in enumerate(datasets):
        print(f"Index: {i}")

        # Locate the data folder, then the dataset and names files inside it.
        data_folder_url = fetch_data_folder_url(dataset["url"])
        dataset["data_folder_url"] = data_folder_url

        if data_folder_url is not None:
            dataset_file_url, dataset_file_format = fetch_dataset_file(data_folder_url)
            dataset["dataset_file_url"] = dataset_file_url
            dataset["dataset_file_format"] = dataset_file_format

            names_file_url, names_file_format = fetch_names_file(data_folder_url)
            dataset["names_file_url"] = names_file_url
            dataset["names_file_format"] = names_file_format
        else:
            dataset["dataset_file_url"] = ""
            dataset["dataset_file_format"] = ""
            dataset["names_file_url"] = ""
            dataset["names_file_format"] = ""

        # Write the dataset's own "index" (its position in the repository
        # listing) so the column stays correct when start_index is not 1.
        writer.writerow({field: dataset[field] for field in fieldnames})





# Captured cell output:
# 1 num_papers 29
# 2 num_papers 51
# 3 num_papers 6
# 4 num_papers 4
# Index: 0
# Index: 1
# Index: 2
# Index: 3
