In [None]:
import json

In [None]:
# Collects one selector mapping per repository, keyed by repository name.
# The export cell at the end of the notebook writes each entry to its own JSON file.
path_dicts_to_export = {}

# Initialize path dicts

## Dataverse

In [None]:
# CSS selectors used for Dataverse pages, one dict per group of attributes.
#
# Selectors that contain a backslash (escaping ':' inside an element id) are
# written as raw strings. In a normal string literal '\:' is an invalid escape
# sequence: Python keeps the backslash but emits a warning, and newer versions
# are stricter about it. The raw-string values are byte-identical.

# Selectors for fields on the dataset metadata view.
dataset_metadata_attribute_paths = {
    'deposit_date': '#metadata_dateOfDeposit > td',
    'num_downloads': '#metrics-body > div',
    'language': '#metadata_language > td',
    'topic_class_value': '#metadata_topicClassification > td',
    'kind_of_data': '#metadata_kindOfData > td',
    'date_of_collection': '#metadata_dateOfCollection > td',
    'notes_text': '#metadata_notesText > td',
    'contributor': '#metadata_contributor > td',
    'grant_info': '#metadata_grantNumber > td',
    'publication_citation': '#datasetCitationActionSummaryBlock > div:nth-child(1) > div > div > div.citation.margin-bottom > span',
    'full_author_info': '#metadata_author > td',
    'global_id': '#metadata_persistentId > td'
}

# Selector for the dataset terms-of-use fragment.
dataset_terms_attribute_paths = {
    'terms_of_use': r'#datasetForm\:tabView\:touFragment'
}

# Selectors for the dataset files table, its paginator controls and the
# per-file sub-elements.
dataset_file_attribute_paths = {
    'file_count': r'#datasetForm\:tabView\:filesTable\:filesHeaderCount > span',
    'page_size_selection': 'select.ui-paginator-rpp-options',
    'file_table': r'#datasetForm\:tabView\:filesTable_data',
    'file_metadata': 'td.col-file-metadata',
    'page_increase': 'a.ui-paginator-next',
    'file_name': 'div > a',
    'date_published': 'div.dateCreatePublish-block',
    'num_downloads': 'div.downloads-block',
    'unf': 'div.unf-block'
}

# Selectors for fields on an individual file's page.
file_metadata_attribute_paths = {
    'deposit_date': '#fileDepositDateBlock > td',
    'num_downloads': '#metrics-body > div',
    'download_url': '#fileDownloadUrlBlock > td > code'
}

In [None]:
# Group the Dataverse selector dicts by the kind of page they apply to:
# 'dataset' holds the three dataset-level dicts, 'file' holds the file page dict.
dataverse_path_dicts = dict(
    dataset=dict(
        file_attribute_paths=dataset_file_attribute_paths,
        metadata_attribute_paths=dataset_metadata_attribute_paths,
        terms_attribute_paths=dataset_terms_attribute_paths,
    ),
    file=dict(
        landing_attribute_paths=file_metadata_attribute_paths,
    ),
)

In [None]:
# Register the Dataverse selectors for export under the 'dataverse' key.
path_dicts_to_export.update(dataverse=dataverse_path_dicts)

## Dryad

In [None]:
# The view and download counters share one selector shape and differ only in
# the nth-of-type index of their container inside #show_metrics.
_dryad_metric_number = '#show_metrics > div:nth-of-type({}) > div.o-metrics__number'

# CSS selectors used for Dryad pages.
dryad_path_dict = {
    'numViews': _dryad_metric_number.format(1),
    'numDownloads': _dryad_metric_number.format(2),
    'numCitations': '#metrics_citation_count',
    'title': '#display_resource > h1',
}

In [None]:
# Register the Dryad selectors for export under the 'dryad' key.
path_dicts_to_export.update(dryad=dryad_path_dict)

## OpenML

In [None]:
# CSS selectors used for OpenML pages.
openml_path_dict = {
    'num_tasks': '#data_overview > h3:nth-of-type(3)',
    # Deliberately ends at ':nth-of-type' with no '(n)' index; presumably the
    # consumer appends the index per task -- verify against the caller.
    # (Was an f-string with no placeholders; the stray 'f' prefix is removed.)
    'task': '#data_overview > div:nth-of-type',
    'downloads': '#downloadcount'
}

In [None]:
# Register the OpenML selectors for export under the 'openml' key.
path_dicts_to_export.update(openml=openml_path_dict)

## UCI

In [None]:
def _combine_paths(base_path, path_dict):
    return {attr: f'{base_path} > {path}' for attr, path in path_dict.items()}

In [None]:
# Common ancestor selector that is prepended to the UCI attribute selectors.
base_path = (
    'div:nth-child(2) > div > '
    'div.MuiGrid-root.MuiGrid-container'
    '.MuiGrid-align-items-xs-flex-start.MuiGrid-justify-xs-center'
)

In [None]:
# Both "wait" selectors address the <p> in the first row / second column of a
# table; they differ only in which child <div> of the md-9 column is used.
_first_row_value_template = ' > '.join([
    'div:nth-child(2)',
    'div',
    'div.MuiGrid-root.MuiGrid-container.MuiGrid-align-items-xs-flex-start.MuiGrid-justify-xs-center',
    'div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-12.MuiGrid-grid-md-9',
    'div:nth-child({section})',
    'div.MuiCollapse-container.MuiCollapse-entered',
    'div',
    'div',
    'div',
    'div',
    'table',
    'tbody',
    'tr:nth-child(1)',
    'td:nth-child(2)',
    'p',
])

# Full (already absolute) selectors; unlike the attribute dicts these are not
# passed through _combine_paths in the visible notebook.
structural_paths = {
    'instance_path': ' > '.join([
        'div:nth-child(2)',
        'div',
        'div',
        'div.jss11',
        'div',
        'div.MuiTableContainer-root',
        'table',
        'tbody',
        'tr',
        'div',
        'li',
        'div',
        'div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-12.MuiGrid-grid-md-11',
        'div',
        'span',
        'div',
        'div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-8.MuiGrid-grid-sm-10',
        'p',
        'a',
    ]),
    'wait_path': _first_row_value_template.format(section=4),
    'tabular_wait_path': _first_row_value_template.format(section=5),
}

def _chain(*steps):
    """Join selector steps with the CSS child combinator (' > ')."""
    return ' > '.join(steps)


# Selector fragments that recur across the attribute paths below.
_col_md9 = 'div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-12.MuiGrid-grid-md-9'
_col_md3 = 'div.MuiGrid-root.MuiGrid-grid-xs-12.MuiGrid-grid-md-3'
_jss16_card = 'div.MuiPaper-root.MuiCard-root.jss16.MuiPaper-elevation3.MuiPaper-rounded'

# md-9 column > second child > its card content container.
_second_card_content = _chain(
    _col_md9,
    'div:nth-child(2)',
    'div.MuiCardContent-root.MuiGrid-root.MuiGrid-container.MuiGrid-justify-xs-space-between',
)
# md-9 column > jss16 card > header content.
_jss16_header_content = _chain(
    _col_md9,
    _jss16_card,
    'div.MuiCardHeader-root',
    'div.MuiCardHeader-content',
)
# md-9 column > jss16 card > first child of its centred card content container.
_jss16_counters = _chain(
    _col_md9,
    _jss16_card,
    'div.MuiCardContent-root.MuiGrid-root.MuiGrid-container.MuiGrid-align-items-xs-center.MuiGrid-justify-xs-space-between',
    'div:nth-child(1)',
)


def _spacing_grid_cell(position):
    """Selector for the <p> in the ``position``-th child of the spacing-xs-3 grid."""
    return _chain(
        _second_card_content,
        'div.MuiGrid-root.MuiGrid-container.MuiGrid-spacing-xs-3',
        f'div:nth-child({position})',
        'p',
    )


# Selectors relative to base_path (they are prefixed with it afterwards).
variable_attribute_paths = {
    'creators': _chain(
        _col_md9,
        'div:nth-child(3)',
        'div.MuiCollapse-container.MuiCollapse-entered',
        'div',
        'div',
        'div',
        'div',
        'ul',
        'li',
        'div',
        'span',
        'h6',
    ),
    'keywords': _chain(
        _col_md3,
        'div:nth-child(1)',
        'div.MuiCardContent-root',
        'div',
        'span.MuiChip-label',
    ),
}

# Selectors relative to base_path (they are prefixed with it afterwards).
# Key order is kept alphabetical, exactly as before, because it determines the
# key order of the exported JSON.
single_attribute_paths = {
    'abstract': _chain(
        _second_card_content,
        'div.MuiGrid-root.MuiGrid-container.MuiGrid-direction-xs-column',
        'div',
        'p',
    ),
    'associated_tasks': _spacing_grid_cell(4),
    'dataset': _chain(_jss16_header_content, 'h5'),
    'dataset_characteristics': _spacing_grid_cell(2),
    'doi': _spacing_grid_cell(5),
    'donation_date': _chain(_jss16_header_content, 'span', 'p'),
    'license': _chain(
        _col_md3,
        'div:nth-child(2)',
        'div.MuiCardContent-root',
        'p:nth-child(1)',
        'a.MuiTypography-root.MuiLink-root.MuiLink-underlineHover.MuiTypography-colorInherit',
    ),
    'num_citations': _chain(_jss16_counters, 'div:nth-child(2)', 'p'),
    'num_instances': _spacing_grid_cell(6),
    'num_views': _chain(_jss16_counters, 'div:nth-child(1)', 'p'),
    'subject_area': _spacing_grid_cell(3),
}

# Turn the relative selectors into full ones by prefixing each with base_path.
# Both names are rebound to the new, prefixed dicts.
single_attribute_paths, variable_attribute_paths = (
    _combine_paths(base_path, relative_paths)
    for relative_paths in (single_attribute_paths, variable_attribute_paths)
)

In [None]:
# Selector for the <tbody> of a table inside the md-9 column; '{}' is the
# nth-child index of the <div> that contains the table.
_table_body_template = (
    'div:nth-child(2) > div > '
    'div.MuiGrid-root.MuiGrid-container.MuiGrid-align-items-xs-flex-start.MuiGrid-justify-xs-center > '
    'div.MuiGrid-root.MuiGrid-item.MuiGrid-grid-xs-12.MuiGrid-grid-md-9 > '
    'div:nth-child({}) > '
    'div.MuiCollapse-container.MuiCollapse-entered > '
    'div > div > div > div > table > tbody'
)
tabular_base_path = _table_body_template.format(5)
descriptive_question_base_path = _table_body_template.format(4)

# Selector (relative to a <tbody>) for the <p> in the second cell of row '{}'.
_row_value_template = 'tr:nth-child({}) > td:nth-child(2) > p'


def _row_value_paths(attribute_names):
    """Map each name to the value cell of its table row (rows numbered from 1, in list order)."""
    return {
        name: _row_value_template.format(row_number)
        for row_number, name in enumerate(attribute_names, start=1)
    }


tabular_attribute_paths = _row_value_paths([
    'missing_values',
    'missing_value_placeholder',
    'num_attributes',
])

descriptive_question_attribute_paths = _row_value_paths([
    'creation_purpose',
    'funders',
    'instances_represent',
    'recommended_data_split',
    'sensitive_data',
    'preprocessing_done',
    'previous_tasks',
    'additional_info',
    'citation_requests/acknowledgements',
])

# Prefix each group of row selectors with the <tbody> selector of its table.
# Both names are rebound to the new, prefixed dicts.
tabular_attribute_paths, descriptive_question_attribute_paths = (
    _combine_paths(table_body_path, row_paths)
    for table_body_path, row_paths in (
        (tabular_base_path, tabular_attribute_paths),
        (descriptive_question_base_path, descriptive_question_attribute_paths),
    )
)

In [None]:
# Fold the descriptive-question selectors into the single-attribute selectors
# (a new dict is built; on a key clash the descriptive-question entry wins).
# NOTE(review): tabular_attribute_paths is built above but is not merged here
# nor exported anywhere in this notebook -- confirm whether that is intentional.
single_attribute_paths = dict(single_attribute_paths)
single_attribute_paths.update(descriptive_question_attribute_paths)

In [None]:
# Bundle the three groups of UCI selectors into the structure that is exported.
uci_path_dict = dict(
    structural_paths=structural_paths,
    single_attribute_paths=single_attribute_paths,
    variable_attribute_paths=variable_attribute_paths,
)

In [None]:
# Register the UCI selectors for export under the 'uci' key.
path_dicts_to_export.update(uci=uci_path_dict)

# Export path dicts

In [None]:
# Write one JSON file per repository to ../paths/<repo_name>_paths.json,
# overwriting any existing file. The ../paths directory must already exist:
# opening a file in 'w' mode does not create missing directories.
for repo_name, path_dict in path_dicts_to_export.items():
    with open(f'../paths/{repo_name}_paths.json', 'w') as f:
        f.write(json.dumps(path_dict))