In [3]:
import csv
import os
import requests
from bs4 import BeautifulSoup, Tag
from urllib.parse import urljoin

# Position (counted from 1) of the first dataset row in the listing to scrape;
# rows before it are counted but skipped by get_uci_datasets.
start_index = 1
# get_uci_datasets stops as soon as its row counter reaches this value, so the
# row at this position itself is not scraped.
end_index = 623

def remove_sup_tags(tag):
    """Detach every ``<sup>`` element found under *tag* and return *tag*.

    The tag is modified in place; the same object is handed back so the call
    can be chained.
    """
    superscripts = tag.find_all("sup")
    for superscript in superscripts:
        # extract() unlinks the element from the tree it lives in.
        superscript.extract()
    return tag

def has_papers_that_cite_this_data_set(tag):
    """Tell whether *tag* is a citing-papers paragraph.

    Intended as a filter function for ``soup.find``. A tag qualifies when it
    is a ``<p>`` whose class list is exactly ``["normal"]`` and the closest
    preceding ``<p class="small-heading">`` contains the text
    "Papers That Cite This Data Set".
    """
    is_normal_paragraph = tag.name == "p" and tag.get("class") == ["normal"]
    if is_normal_paragraph:
        heading = tag.find_previous("p", class_="small-heading")
        if heading:
            heading_text = heading.get_text(strip=True)
            return "Papers That Cite This Data Set" in heading_text
    return False

def _collect_section_text(soup, heading):
    """Return the body of the page section introduced by *heading*.

    Looks for a ``<p class="small-heading">`` whose string equals *heading*,
    then walks the sibling elements that follow it for as long as they are
    ``<p>`` tags carrying the ``normal`` class. Each paragraph's inner HTML is
    split on ``<br/>``, the pieces are stripped, and everything is joined with
    newlines. Returns ``""`` when the heading is not on the page.
    """
    heading_tag = soup.find("p", class_="small-heading", string=heading)
    if not heading_tag:
        return ""

    lines = []
    sibling = heading_tag.find_next_sibling()
    while sibling and sibling.name == "p" and "normal" in sibling.get("class", []):
        # decode_contents() renders line breaks as "<br/>", so that is the
        # separator between the logical lines of a paragraph.
        lines.extend(part.strip() for part in sibling.decode_contents().split("<br/>"))
        sibling = sibling.find_next_sibling()

    return "\n".join(lines)


def get_dataset_details(url):
    """Scrape the detail page of a single dataset.

    Args:
        url: Address of the dataset's detail page.

    Returns:
        A 9-tuple ``(area, date_donated, web_hits, attribute_info, source,
        data_set_information, relevant_papers,
        papers_that_cite_this_data_set, num_papers)``. The first eight items
        are strings (empty when the information is missing) and ``num_papers``
        is always an int. Citing papers are joined with ``"; "``. When the
        page has no ``border="1"`` table, every string is empty and
        ``num_papers`` is 0.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps one unresponsive page from stalling the whole scrape.
    response = requests.get(url, timeout=30)
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    table = soup.find("table", {"border": "1"})
    if table is None:
        # num_papers is a count, so it stays an int on this path as well.
        return "", "", "", "", "", "", "", "", 0

    area = ""
    date_donated = ""
    web_hits = ""

    for row in table.find_all("tr"):
        cols = row.find_all("td")
        # Only six-cell rows are inspected; the label is read from the fifth
        # cell and its value from the sixth.
        if len(cols) != 6:
            continue

        label = cols[4].get_text(strip=True)
        value = cols[5].get_text(strip=True)
        if "Area:" in label:
            area = value
        if "Date Donated" in label:
            date_donated = value
        if "Number of Web Hits:" in label:
            web_hits = value

    attribute_info = _collect_section_text(soup, "Attribute Information:")
    source = _collect_section_text(soup, "Source:")
    data_set_information = _collect_section_text(soup, "Data Set Information:")
    relevant_papers = _collect_section_text(soup, "Relevant Papers:")

    papers = []
    sibling = soup.find(has_papers_that_cite_this_data_set)
    while sibling and sibling.name == "p":
        if "normal" in sibling.get("class", []):
            # Swap each <br> for a marker so the paragraph's text can be
            # split into individual paper entries.
            for br in sibling.find_all("br"):
                br.replace_with("|||")
            # Consecutive breaks produce empty pieces; drop them.
            papers.extend(
                entry.strip() for entry in sibling.text.split("|||") if entry.strip()
            )
        sibling = sibling.find_next_sibling("p")

    papers_that_cite_this_data_set = "; ".join(papers)
    num_papers = len(papers)

    return (
        area,
        date_donated,
        web_hits,
        attribute_info,
        source,
        data_set_information,
        relevant_papers,
        papers_that_cite_this_data_set,
        num_papers,
    )
   
def fetch_data_folder_url(url):
    """Find the link to a dataset's data folder on its detail page.

    Args:
        url: Address of the dataset's detail page.

    Returns:
        The absolute URL of the first anchor whose ``href`` contains
        "machine-learning-databases", or ``None`` when there is no such link.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # A timeout keeps one unresponsive page from stalling the whole scrape.
    response = requests.get(url, timeout=30)
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    data_folder_tag = soup.find(
        "a", href=lambda href: href and "machine-learning-databases" in href
    )
    if data_folder_tag is None:
        return None

    # The href may be relative, so resolve it against the page's own URL.
    return urljoin(url, data_folder_tag["href"])

def fetch_dataset_file(url):
    """Locate the first data file linked from a data-folder listing.

    Args:
        url: Address of the data-folder listing page.

    Returns:
        A ``(file_url, file_format)`` pair for the first link whose text
        contains one of the known extensions, where ``file_format`` is the
        extension without its dot. ``("", "")`` when no link matches.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # Tried in this order for every link. An extension that contains another
    # one must come first (".xlsx" before ".xls", ".data" before ".dat"),
    # otherwise the shorter one always wins and the longer is never reported.
    known_formats = (
        "data",
        "txt",
        "csv",
        "xlsx",
        "xls",
        "rar",
        "arff",
        "mat",
        "Z",
        "zip",
        "Tar",
        "gz",
        "dat",
        "tar",
        "7z",
        "json",
        "tsv",
    )

    # A timeout keeps one unresponsive page from stalling the whole scrape.
    response = requests.get(url, timeout=30)
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    # href=True skips anchors that have no target to return.
    for link in soup.find_all("a", href=True):
        link_text = link.text
        for file_format in known_formats:
            if f".{file_format}" in link_text:
                return urljoin(url, link["href"]), file_format

    # return an empty string and file format if no data file is found
    return "", ""

def fetch_names_file(url):
    """Locate the first description file linked from a data-folder listing.

    Args:
        url: Address of the data-folder listing page.

    Returns:
        A ``(file_url, file_format)`` pair for the first link whose text
        contains ".names", ".info", ".docx" or ".doc", where ``file_format``
        is the extension without its dot. ``("", "")`` when no link matches.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # ".docx" is tried before ".doc": the shorter extension is a substring of
    # the longer one and would otherwise claim every ".docx" link.
    known_formats = ("names", "info", "docx", "doc")

    # A timeout keeps one unresponsive page from stalling the whole scrape.
    response = requests.get(url, timeout=30)
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    # href=True skips anchors that have no target to return.
    for link in soup.find_all("a", href=True):
        link_text = link.text
        for file_format in known_formats:
            if f".{file_format}" in link_text:
                return urljoin(url, link["href"]), file_format

    return "", ""

def get_uci_datasets(start_index, end_index):
    """Scrape the UCI dataset listing and the detail page of each dataset.

    Dataset rows are counted from 1. Rows counted below *start_index* are
    skipped, and scraping stops once the counter reaches *end_index*, so the
    row at *end_index* itself is not included.

    Args:
        start_index: Position of the first dataset row to scrape.
        end_index: Position at which scraping stops (not scraped itself).

    Returns:
        A list with one dict per scraped dataset. Each dict holds the listing
        columns, the values returned by ``get_dataset_details`` and empty
        placeholders for the data-folder and file URLs. The list is empty when
        the listing table cannot be found.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    url = "https://archive.ics.uci.edu/ml/datasets.php"
    # A timeout keeps an unresponsive server from hanging the scrape forever.
    response = requests.get(url, timeout=30)
    content = response.content.decode("utf-8", "replace")
    soup = BeautifulSoup(content, "html.parser")

    table = soup.find("table", {"border": "1"})
    if table is None:
        print("Table not found")
        return []

    rows = table.find_all("tr")

    dataset_list = []
    processed_datasets = 1

    # The first row is not treated as a dataset.
    for row in rows[1:]:
        if processed_datasets >= end_index:
            break

        cols = row.find_all("td")

        # Rows with fewer than nine cells are not dataset rows; they are
        # skipped without advancing the counter.
        if len(cols) < 9:
            continue

        if processed_datasets < start_index:
            processed_datasets += 1
            continue

        link = cols[0].find("a")
        if link is None or link.get("href") is None:
            # Without a link there is no detail page to visit, so the row is
            # skipped in the same way as the short rows above.
            continue

        dataset_name = cols[0].get_text(strip=True)
        dataset_url = urljoin(url, link["href"])
        instances = cols[6].get_text(strip=True)
        attributes = cols[7].get_text(strip=True)
        year = cols[8].get_text(strip=True)

        (
            area,
            date_donated,
            web_hits,
            attribute_info,
            source,
            data_set_information,
            relevant_papers,
            papers_that_cite_this_data_set,
            num_papers,
        ) = get_dataset_details(dataset_url)

        print(processed_datasets, "num_papers", num_papers)

        dataset_list.append({
            "index": processed_datasets,
            "name": dataset_name,
            "url": dataset_url,
            "instances": instances,
            "attributes": attributes,
            "year": year,
            "area": area,
            "date_donated": date_donated,
            "web_hits": web_hits,
            "attribute_info": attribute_info,
            "source": source,
            "data_set_information": data_set_information,
            "relevant_papers": relevant_papers,
            "papers_that_cite_this_data_set": papers_that_cite_this_data_set,
            "num_papers": num_papers,
            # Placeholders for values that are not determined here.
            "data_folder_url": "",
            "dataset_file_url": "",
            "dataset_file_format": "",
            "names_file_url": "",
            "names_file_format": "",
        })

        processed_datasets += 1

    return dataset_list

# Scrape the listing and every detail page for the configured index range.
datasets = get_uci_datasets(start_index, end_index)

# Column order of the output file.
fieldnames = [
    "index",
    "name",
    "url",
    "instances",
    "attributes",
    "year",
    "area",
    "date_donated",
    "web_hits",
    "data_folder_url",
    "dataset_file_url",
    "dataset_file_format",
    "names_file_url",
    "names_file_format",
    "attribute_info",
    "source",
    "data_set_information",
    "relevant_papers",
    "papers_that_cite_this_data_set",
    "num_papers",
]

# Save results to CSV file
# The encoding is fixed to UTF-8 because scraped text can contain characters
# that a platform's default encoding cannot represent, which would make the
# write fail part-way through.
with open("uci_datasets.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    for i, dataset in enumerate(datasets):
        print(f"Index: {i}")

        data_folder_url = fetch_data_folder_url(dataset["url"])
        dataset["data_folder_url"] = data_folder_url

        if data_folder_url is not None:
            dataset_file_url, dataset_file_format = fetch_dataset_file(data_folder_url)
            dataset["dataset_file_url"] = dataset_file_url
            dataset["dataset_file_format"] = dataset_file_format

            names_file_url, names_file_format = fetch_names_file(data_folder_url)
            dataset["names_file_url"] = names_file_url
            dataset["names_file_format"] = names_file_format
        else:
            # No data folder: leave the file URLs empty. The two format fields
            # are not touched here and keep the values they arrived with.
            dataset["dataset_file_url"] = ""
            dataset["names_file_url"] = ""

        # One value per output column, taken from the dataset record. The
        # index column is the 1-based position within this run.
        row = {field: dataset[field] for field in fieldnames}
        row["index"] = i + 1
        writer.writerow(row)





1 num_papers 29
2 num_papers 51
3 num_papers 6
4 num_papers 4
5 num_papers 3
6 num_papers 0
7 num_papers 19
8 num_papers 19
9 num_papers 12
10 num_papers 3
11 num_papers 0
12 num_papers 8
13 num_papers 1
14 num_papers 91
15 num_papers 40
16 num_papers 40
17 num_papers 40
18 num_papers 2
19 num_papers 16
20 num_papers 24
21 num_papers 26
22 num_papers 26
23 num_papers 26
24 num_papers 1
25 num_papers 5
26 num_papers 2
27 num_papers 4
28 num_papers 1
29 num_papers 2
30 num_papers 3
31 num_papers 6
32 num_papers 6
33 num_papers 8
34 num_papers 53
35 num_papers 0
36 num_papers 0
37 num_papers 0
38 num_papers 5
39 num_papers 12
40 num_papers 3
41 num_papers 0
42 num_papers 52
43 num_papers 4
44 num_papers 5
45 num_papers 58
46 num_papers 33
47 num_papers 13
48 num_papers 0
49 num_papers 15
50 num_papers 3
51 num_papers 55
52 num_papers 100
53 num_papers 11
54 num_papers 0
55 num_papers 17
56 num_papers 9
57 num_papers 11
58 num_papers 16
59 num_papers 23
60 num_papers 0
61 num_papers 3
62 n