# UCI Webscraper

# Setup

## Imports

In [None]:
# Scraping imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import By
import selenium.webdriver.support.expected_conditions as EC

from bs4 import BeautifulSoup
import pandas as pd
from flatten_json import flatten
from tqdm import tqdm

## Variable Initialization

In [None]:
# Browser options for the Selenium session; the '--headless' switch is passed to Chrome.
chrome_options = Options()
chrome_options.add_argument('--headless')

In [None]:
# Chrome session that is handed to every scraping function below.
# NOTE(review): nothing in this notebook calls driver.quit(); presumably the
# browser process outlives the run - confirm and close it when finished.
driver = webdriver.Chrome(options=chrome_options)

In [None]:
def _combine_paths(base_path, path_dict):
    """Prefixes every selector in a dictionary with a common base selector.

    Parameters
    ----------
    base_path : str
        CSS selector placed in front of each entry.
    path_dict : dict
        Mapping of attribute name to a selector relative to `base_path`.

    Returns
    -------
    dict
        New mapping with the same keys (in the same order) whose values
        are '<base_path> > <path>'. `path_dict` itself is not modified.
    """

    combined = {}
    for attr, path in path_dict.items():
        # ' > ' is the CSS child combinator joining the two selectors
        combined[attr] = f'{base_path} > {path}'

    return combined

In [None]:
# Address under which each dataset page lives (a dataset id is appended to it).
base_url = 'https://archive-beta.ics.uci.edu/ml/datasets'

# Selector prefix shared by the attribute selectors defined further down.
base_path = (
    'div:nth-child(2) '
    '> div '
    '> div.MuiGrid-root.MuiGrid-container.MuiGrid-align-items-xs-flex-start.MuiGrid-justify-xs-center'
)

# Element whose presence is awaited before a dataset page is parsed:
# the second cell of the first table row under the fourth child of the md-9 column.
wait_path = (
    f'{base_path} '
    '> div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-12.MuiGrid-grid-md-9 '
    '> div:nth-child(4) '
    '> div.MuiCollapse-container.MuiCollapse-entered '
    '> div > div > div > div '
    '> table > tbody '
    '> tr:nth-child(1) > td:nth-child(2) > p'
)

In [None]:
# Selectors (relative to base_path) for attributes that can match several
# elements on a page; each is written as its chain of child steps.
variable_attribute_paths = {
    'creators': ' > '.join([
        'div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-12.MuiGrid-grid-md-9',
        'div:nth-child(3)',
        'div.MuiCollapse-container.MuiCollapse-entered',
        'div',
        'div',
        'div',
        'div',
        'ul',
        'li',
        'div',
        'span',
        'h6',
    ]),
    'keywords': ' > '.join([
        'div.MuiGrid-root.MuiGrid-grid-xs-12.MuiGrid-grid-md-3',
        'div:nth-child(1)',
        'div.MuiCardContent-root',
        'div',
        'span.MuiChip-label',
    ]),
}

def _build_single_attribute_paths():
    """Assembles the selectors for attributes that match a single element.

    The selectors share a handful of long prefixes; those are spelled out
    once here and reused so each entry only states what is specific to it.

    Returns
    -------
    dict
        Mapping of attribute name to CSS selector (relative to base_path).
    """

    md9_column = 'div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-12.MuiGrid-grid-md-9'
    md3_column = 'div.MuiGrid-root.MuiGrid-grid-xs-12.MuiGrid-grid-md-3'

    # Card content under the second child of the md-9 column
    second_card = (
        f'{md9_column} '
        '> div:nth-child(2) '
        '> div.MuiCardContent-root.MuiGrid-root.MuiGrid-container.MuiGrid-justify-xs-space-between'
    )
    # Grid inside that card; its numbered children hold one attribute each
    spacing_grid = f'{second_card} > div.MuiGrid-root.MuiGrid-container.MuiGrid-spacing-xs-3'

    # Card addressed through the generated class name 'jss15'
    jss15_card = f'{md9_column} > div.MuiPaper-root.MuiCard-root.jss15.MuiPaper-elevation3.MuiPaper-rounded'
    card_header = f'{jss15_card} > div.MuiCardHeader-root > div.MuiCardHeader-content'
    card_counts = (
        f'{jss15_card} '
        '> div.MuiCardContent-root.MuiGrid-root.MuiGrid-container.MuiGrid-align-items-xs-center.MuiGrid-justify-xs-space-between '
        '> div:nth-child(1)'
    )

    return {
        'abstract': f'{second_card} > div.MuiGrid-root.MuiGrid-container.MuiGrid-direction-xs-column > div > p',
        'associated_tasks': f'{spacing_grid} > div:nth-child(4) > p',
        'dataset': f'{card_header} > h5',
        'dataset_characteristics': f'{spacing_grid} > div:nth-child(2) > p',
        'doi': f'{spacing_grid} > div:nth-child(5) > p',
        'donation_date': f'{card_header} > span > p',
        'license': (
            f'{md3_column} '
            '> div:nth-child(2) '
            '> div.MuiCardContent-root '
            '> p:nth-child(1) '
            '> a.MuiTypography-root.MuiLink-root.MuiLink-underlineHover.MuiTypography-colorInherit'
        ),
        'num_citations': f'{card_counts} > div:nth-child(2) > p',
        'num_instances': f'{spacing_grid} > div:nth-child(6) > p',
        'num_views': f'{card_counts} > div:nth-child(1) > p',
        'subject_area': f'{spacing_grid} > div:nth-child(3) > p',
    }


single_attribute_paths = _build_single_attribute_paths()

# Turn the relative selectors into full ones by prefixing base_path.
single_attribute_paths, variable_attribute_paths = [
    _combine_paths(base_path, relative_paths)
    for relative_paths in (single_attribute_paths, variable_attribute_paths)
]

In [None]:
def _table_body_path(child_index):
    """Returns the selector of the table body under the given child of the md-9 column.

    Parameters
    ----------
    child_index : int
        Value used in the 'div:nth-child(...)' step below the md-9 column.

    Returns
    -------
    str
    """

    return (
        'div:nth-child(2) > div '
        '> div.MuiGrid-root.MuiGrid-container.MuiGrid-align-items-xs-flex-start.MuiGrid-justify-xs-center '
        '> div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-12.MuiGrid-grid-md-9 '
        f'> div:nth-child({child_index}) '
        '> div.MuiCollapse-container.MuiCollapse-entered '
        '> div > div > div > div > table > tbody'
    )


def _value_cell_paths(row_names):
    """Maps each name to the second cell of its table row.

    Parameters
    ----------
    row_names : list
        Attribute names in row order; the first name maps to row 1.

    Returns
    -------
    dict
    """

    return {name: f'tr:nth-child({row}) > td:nth-child(2) > p'
            for row, name in enumerate(row_names, start=1)}


tabular_base_path = _table_body_path(5)
descriptive_question_base_path = _table_body_path(4)

tabular_attribute_paths = _value_cell_paths([
    'missing_values',
    'missing_value_placeholder',
    'num_attributes',
])

descriptive_question_attribute_paths = _value_cell_paths([
    'creation_purpose',
    'funders',
    'instances_represent',
    'recommended_data_split',
    'sensitive_data',
    'preprocessing_done',
    'previous_tasks',
    'additional_info',
    'citation_requests/acknowledgements',
])

# Prefix each set of row selectors with the selector of its own table body.
tabular_attribute_paths, descriptive_question_attribute_paths = [
    _combine_paths(table_base_path, row_paths)
    for table_base_path, row_paths in (
        (tabular_base_path, tabular_attribute_paths),
        (descriptive_question_base_path, descriptive_question_attribute_paths),
    )
]

In [None]:
# The descriptive-question cells are single-valued as well, so they are added
# to the single-valued selectors (entries with the same key are overwritten).
single_attribute_paths = dict(single_attribute_paths)
single_attribute_paths.update(descriptive_question_attribute_paths)

## Helper Functions

In [None]:
def clean_results(results):
    """Cleans the results scraped from the dataset page.

    Strips the label text from 'donation_date', 'num_citations' and
    'num_views' and converts the two counts to int. An entry is left
    untouched when it is missing, is not a string (e.g. None, or an int
    produced by an earlier call), or does not contain its label, so the
    function can safely be applied to partial or already-cleaned results.

    Parameters
    ----------
    results : dict
        Scraped values keyed by attribute name. Modified in place.

    Returns
    -------
    results : dict
        The same dictionary object that was passed in.

    Raises
    ------
    ValueError
        If a count still is not an integer after its label and any
        thousands separators have been removed.
    """

    def _to_count(text):
        # Drop thousands separators ('1,234') before converting
        return int(text.replace(',', ''))

    # (attribute, label to remove, conversion applied to the remaining text)
    cleaners = (
        ('donation_date', 'Donated on', str),
        ('num_citations', 'citations', _to_count),
        ('num_views', 'views', _to_count),
    )

    for attribute, label, convert in cleaners:
        value = results.get(attribute)
        # Only strings can carry the label; skip None and cleaned values
        if isinstance(value, str) and label in value:
            results[attribute] = convert(value.replace(label, '').strip())

    return results

def is_tabular(soup):
    """Reports whether a dataset page describes a tabular dataset.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed dataset page.

    Returns
    -------
    boolean
        True when the page text contains 'Tabular Data Properties'.
    """

    section_title = 'Tabular Data Properties'
    page_text = soup.text

    return section_title in page_text

# Scraping

## Gather Dataset IDs

In [None]:
def get_dataset_ids(dataset_list_url, instance_path, driver, timeout=5):
    """Returns the dataset ids for all datasets on the given page.

    Parameters
    ----------
    dataset_list_url : str
        Web url for page containing links to the datasets to scrape.
    instance_path : str
        CSS Selector path for the datasets on the page.
    driver : WebDriver
        Selenium webdriver to use for html extraction.
    timeout : int or float, optional (default=5)
        Seconds to wait for the first dataset link to appear.

    Returns
    -------
    dataset_ids : list
        Last path segment of each matched link's 'href', in page order.
        Matched elements without an 'href' are skipped.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If no element matching `instance_path` appears within `timeout`.
    """

    # Get the requested url
    driver.get(dataset_list_url)

    # Wait for instances to load on page
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, instance_path)))

    # Create parsable html object. The parser is named explicitly so the
    # result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Gather the instances and parse the ids
    dataset_ids = []
    for instance in soup.select(instance_path):
        href = instance.get('href')
        if not href:
            continue
        # Ignore a trailing '/' so the id is never an empty string
        dataset_ids.append(href.rstrip('/').split('/')[-1])

    return dataset_ids

## Scraping Functionality

In [None]:
def get_single_attribute_value(soup, attribute, paths=None):
    """Retrieves the requested value from the soup object.

    For a page attribute with a single value ('abstract', 'num_instances', etc),
    returns the value. For attributes with potentially multiple values, such as
    'keywords', use get_variable_attribute_values(...).

    Parameters
    ----------
    soup : BeautifulSoup
        BeautifulSoup object containing the html to be parsed.
    attribute : str
        Name of the attribute to extract from the soup.
    paths : dict, optional (default=None)
        Mapping of attribute name to CSS selector. When omitted, the
        module-level `single_attribute_paths` is used.

    Returns
    -------
    str or None
        Text of the first element matching the attribute's selector, or
        None when the page has no such element.

    Raises
    ------
    KeyError
        If `attribute` has no selector in the mapping.
    """

    if paths is None:
        paths = single_attribute_paths

    tag = soup.select_one(paths[attribute])

    # select_one returns None when nothing matches; report that as a
    # missing value instead of failing on `.text`
    return tag.text if tag is not None else None

In [None]:
def get_variable_attribute_values(soup, attribute, paths=None):
    """Retrieves the requested values from the soup object.

    For a page attribute with potentially multiple values, such as 'keywords',
    return the values as a list. For attributes with a single value, such as
    'abstract', use get_single_attribute_value(...).

    Parameters
    ----------
    soup : BeautifulSoup
        BeautifulSoup object containing the html to be parsed.
    attribute : str
        Name of the attribute to extract from the soup.
    paths : dict, optional (default=None)
        Mapping of attribute name to CSS selector. When omitted, the
        module-level `variable_attribute_paths` is used.

    Returns
    -------
    list
        Text of every element matching the attribute's selector; empty
        when nothing matches.

    Raises
    ------
    KeyError
        If `attribute` has no selector in the mapping.
    """

    if paths is None:
        paths = variable_attribute_paths

    return [tag.text for tag in soup.select(paths[attribute])]

In [None]:
def get_individual_page_data(url, 
                             driver, 
                             single_attribute_paths=None, 
                             variable_attribute_paths=None, 
                             clean=True,
                             flatten_output=False,
                             **kwargs):
    """Returns all data from the requested page.

    Parameters
    ----------
    url : str
    driver : WebDriver
        Selenium webdriver to use for html extraction.
    single_attribute_paths : dict, optional (default=None)
        Selector paths to use for data extraction on
        single-valued attributes.
    variable_attribute_paths : dict, optional (default=None)
        Selector paths to use for data extraction on 
        variable-valued attributes.
    clean : boolean, optional (default=True)
        Flag for passing the extracted values through clean_results.
    flatten_output : boolean, optional (default=False)
        Flag for specifying if nested output should be flattened.
    **kwargs
        tabular_attribute_paths : dict, optional
            Extra single-valued selector paths, applied only when the
            page is tabular (see is_tabular).

    Returns
    -------
    result_dict : dict
        Attribute name mapped to its text (None when the page has no
        matching element) for single-valued attributes, and to a list
        of texts for variable-valued attributes.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the element at `wait_path` does not appear within 5 seconds.
    """

    tabular_attribute_paths = kwargs.get('tabular_attribute_paths')

    # Get the requested url
    driver.get(url)

    # Wait for pertinent sections to load
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, wait_path)))

    # Extract and convert html data. The parser is named explicitly so the
    # result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Work on copies so the caller's dictionaries are never modified and a
    # missing (None) argument behaves like an empty mapping
    single_paths = dict(single_attribute_paths or {})
    variable_paths = dict(variable_attribute_paths or {})

    # Add tabular info
    if tabular_attribute_paths and is_tabular(soup):
        single_paths.update(tabular_attribute_paths)

    # Retrieve attribute values from parsed html. The selectors passed to
    # this function are used directly, so attributes that exist only in the
    # arguments (e.g. the tabular ones) are resolved as well.
    result_dict = {}
    for attribute, path in single_paths.items():
        tag = soup.select_one(path)
        result_dict[attribute] = tag.text if tag is not None else None
    for attribute, path in variable_paths.items():
        result_dict[attribute] = [tag.text for tag in soup.select(path)]

    # Clean results (if instructed)
    if clean:
        result_dict = clean_results(result_dict)

    # Flatten output (if instructed)
    if flatten_output:
        result_dict = flatten(result_dict)

    return result_dict

In [None]:
def get_all_page_data(base_url, 
                      driver,
                      page_ids,
                      single_attribute_paths=None, 
                      variable_attribute_paths=None, 
                      clean=True,
                      flatten_output=False,
                      **kwargs):
    """Returns data for all pages for the requested base url.

    Parameters
    ----------
    base_url : str
    driver : WebDriver
        Selenium webdriver to use for html extraction.
    page_ids : list-like
        dataset ids to use for pulling up each page.
    single_attribute_paths : dict, optional (default=None)
        Selector paths to use for data extraction on 
        single-valued attributes.
    variable_attribute_paths : dict, optional (default=None)
        Selector paths to use for data extraction on 
        variable-valued attributes.
    clean : boolean, optional (default=True)
    flatten_output : boolean, optional (default=False)
        Flag for specifying if nested output should be flattened.
    **kwargs
        Forwarded unchanged to get_individual_page_data
        (e.g. tabular_attribute_paths).

    Returns
    -------
    dataset_df : DataFrame
        One row per page id, in the order given; empty when `page_ids`
        is empty.
    """

    # Retrieve (and optionally clean/flatten) the results of every page.
    # Rows are collected first and the frame is built once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    records = [
        get_individual_page_data(url=f'{base_url}/{page_id}',
                                 driver=driver,
                                 single_attribute_paths=single_attribute_paths,
                                 variable_attribute_paths=variable_attribute_paths,
                                 clean=clean,
                                 flatten_output=flatten_output,
                                 **kwargs)
        for page_id in tqdm(page_ids)
    ]

    dataset_df = pd.DataFrame(records)

    # Remove unnecessary nested columns
    # Datasets that don't have nested data will force the DataFrame to keep the nested column names
    if flatten_output and variable_attribute_paths:
        # errors='ignore': such a column only exists when at least one
        # page produced an empty list for it
        dataset_df = dataset_df.drop(columns=list(variable_attribute_paths), errors='ignore')

    return dataset_df

## Scraping Process

In [None]:
# Collect the id of every dataset linked from the listing page.
# The URL-encoded query string requests p[offset]=0, p[limit]=591,
# p[orderBy]=NumHits and p[order]=desc.
# instance_path selects the <a> element of each listed dataset.
dataset_ids = get_dataset_ids(dataset_list_url='https://archive-beta.ics.uci.edu/ml/datasets?&p%5Boffset%5D=0&p%5Blimit%5D=591&p%5BorderBy%5D=NumHits&p%5Border%5D=desc',
                              instance_path='div:nth-child(2) > div > div > div.jss10 > div > div.MuiTableContainer-root > table > tbody > tr > div > li > div > div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-12.MuiGrid-grid-md-11 > div > span > div > div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-8.MuiGrid-grid-sm-10 > p > a',
                              driver=driver)

In [None]:
# Scrape every collected dataset page into one DataFrame.
# flatten_output=True is requested; `clean` is left at its default (True).
dataset_df = get_all_page_data(base_url=base_url, 
                               driver=driver, 
                               page_ids=dataset_ids,
                               single_attribute_paths=single_attribute_paths, 
                               variable_attribute_paths=variable_attribute_paths,
                               flatten_output=True)

### View the results

In [None]:
# Show every column when displaying the frame. The option is given by its
# full name: the short form 'max_columns' is a pattern that matches more than
# one option on recent pandas versions, which makes set_option raise.
pd.set_option('display.max_columns', None)
dataset_df.head()