In [1]:
%load_ext autoreload
%autoreload 2
import arff
import numpy as np
import openml
import sklearn.datasets

In [2]:
# Point the client at the OpenML test server so that repeated runs of
# this example do not pollute the live server with countless copies
# of the same dataset.
TEST_SERVER_URL = 'https://test.openml.org/api/v1/xml'
openml.config.server = TEST_SERVER_URL

In [3]:
# Fetch an example dataset bundled with scikit-learn; it is the
# dataset that will be uploaded to OpenML.org via the API.
breast_cancer = sklearn.datasets.load_breast_cancer()

# Meta-data: dataset name used for the upload and the free-text description.
name = 'BreastCancer(scikit-learn)'
description = breast_cancer.DESCR

# Feature matrix and target vector, followed by their human-readable names.
X, y = breast_cancer.data, breast_cancer.target
attribute_names, targets = breast_cancer.feature_names, breast_cancer.target_names

In [4]:
# OpenML does not distinguish between the attributes and
# targets on the data level and stores all data in a
# single matrix. The target feature is indicated as
# meta-data of the dataset (and tasks on that data)
attribute_names = list(attribute_names)

# Every feature is declared as numeric ('REAL'); 'class' is declared as a
# nominal attribute whose allowed values are the class names in `targets`.
attributes = [
    (attribute_name, 'REAL') for attribute_name in attribute_names
] + [('class', tuple(targets))]

# `y` holds integer indices into `targets`. Appending it as-is would put
# 0.0/1.0 into a column that is declared to contain the class names, so the
# data would not match the attribute declaration. Map every index to its
# class name instead. An object array is used because the matrix now mixes
# floats (features) and strings (class).
class_column = targets[y].astype(object).reshape((-1, 1))
data = np.concatenate((X.astype(object), class_column), axis=1)

In [5]:
# Bundle relation name, description, attribute declarations and data rows
# into a single dictionary (presumably the structure expected by the `arff`
# package imported at the top -- TODO confirm).
arff_object = dict(
    relation=name,
    description=description,
    attributes=attributes,
    data=data,
)

In [6]:
# Create the dataset object holding the meta-data of the upload.
# The definition of all fields can be found in the XSD files
# describing the expected format:
# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd
# NOTE(review): only meta-data is passed here; the `arff_object` built in an
# earlier cell is not handed to the constructor -- confirm how the data
# itself is meant to be attached before publishing.
dataset = openml.datasets.OpenMLDataset(
    # The dataset ID will be assigned by the server
    dataset_id=None,
    # The name of the dataset (needs to be unique).
    # Must not be longer than 128 characters and only contain
    # a-z, A-Z, 0-9 and the following special characters: _\-\.(),
    name=name,
    # The dataset version is assigned by the server. If a dataset with
    # the same name is uploaded multiple times, the version is
    # incremented (therefore, it should be None in the beginning).
    version=None,
    # Textual description of the dataset
    description=description,
    # Format of the dataset file (this example prepares ARFF data)
    format='ARFF',
    # License under which the data is/will be distributed
    licence='BSD (from scikit-learn)',
    # This will be added by the server and will point to the dataset
    # file that we upload
    url=None,
    # Name of the target. Can also have multiple values (comma-separated).
    default_target_attribute='class',
    # The attribute that represents the row-id column, if present in the dataset.
    row_id_attribute=None,
    # Attributes that should be excluded in modelling, such as identifiers and indexes.
    ignore_attribute=None,
    # A version label which is provided by the user
    version_label='test',
    # How to cite the paper
    citation=(
        "W.N. Street, W.H. Wolberg and O.L. Mangasarian. "
        "Nuclear feature extraction for breast tumor diagnosis. "
        "IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, "
        "volume 1905, pages 861-870, San Jose, CA, 1993."
    ),
    # Tag assigned to the dataset (none given here)
    tag=None,
    # Links to the original data source and to the accompanying paper
    # (meaning taken from the parameter names -- see the XSD above for details)
    original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)',
    paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1'
)

In [7]:
# Show the XML description generated for the dataset, then send the
# dataset to the configured server.
description_xml = dataset._to_xml()
print(description_xml)
dataset.publish()

<oml:data_set_description xmlns:oml="http://openml.org/openml">
	<name>BreastCancer(scikit-learn)</name>
	<description>Breast Cancer Wisconsin (Diagnostic) Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in

OpenMLServerException: Problem validating uploaded description file