In [1]:
import openml
import pandas as pd
from openml.datasets import edit_dataset, fork_dataset, get_dataset

## Get list of OpenML datasets

In [5]:
# Fetch the listing of datasets via openml, requesting the result as a DataFrame.
openml_df = openml.datasets.list_datasets(output_format="dataframe")

In [6]:
# Show a table with some key data properties.
# The columns are selected from `openml_df` (the listing fetched in the
# previous cell). `datalist` itself is not defined before this line, so
# indexing it here would raise NameError on a fresh kernel.
datalist = openml_df[
    ["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]
]

### Sample of datasets in datalist

In [8]:
# Preview the first 10 rows of the reduced dataset listing.
datalist.head(10)

Unnamed: 0,did,name,NumberOfInstances,NumberOfFeatures,NumberOfClasses
2,2,anneal,898.0,39.0,5.0
3,3,kr-vs-kp,3196.0,37.0,2.0
4,4,labor,57.0,17.0,2.0
5,5,arrhythmia,452.0,280.0,13.0
6,6,letter,20000.0,17.0,26.0
7,7,audiology,226.0,70.0,24.0
8,8,liver-disorders,345.0,6.0,0.0
9,9,autos,205.0,26.0,6.0
10,10,lymph,148.0,19.0,4.0
11,11,balance-scale,625.0,5.0,3.0


## Download dataset using Dataset ID (did)
This example downloads the adult census dataset, whose dataset ID (did) is 1590.
You can look up the did of a dataset by filtering `datalist` on its name:

`datalist.query('name == "adult"')`

In [10]:
# Datasets are fetched by their numeric OpenML dataset ID (did).
dataset = get_dataset(1590)

# Summary, one item per line: dataset name and default target attribute,
# the download URL, and the first 500 characters of the description.
print(
    f"This is dataset '{dataset.name}', the target feature is "
    f"'{dataset.default_target_attribute}'",
    f"URL: {dataset.url}",
    dataset.description[:500],
    sep="\n",
)

This is dataset 'adult', the target feature is 'class'
URL: https://api.openml.org/data/v1/download/1595261/adult.arff
**Author**: Ronny Kohavi and Barry Becker  
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Adult) - 1996  
**Please cite**: Ron Kohavi, "Scaling Up the Accuracy of Naive-Bayes Classifiers: a Decision-Tree Hybrid", Proceedings of the Second International Conference on Knowledge Discovery and Data Mining, 1996  

Prediction task is to determine whether a person makes over 50K a year. Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean records was


## Datasets can be downloaded in different formats. Here are two examples:
1. download data as an array
2. download data as a DataFrame

In [11]:
# Download in "array" format: features come back as `X` and the target as `y`.
# The last two return values are kept as `categorical_indicator` and
# `attribute_names`; the latter is used below as the feature column labels.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute
)

# Combine features and target into one DataFrame. The target column is named
# after the dataset's default target attribute (the same value passed as
# `target` above) rather than a hard-coded "class", so this cell stays correct
# if `dataset` is swapped for one with a different target name.
adult = pd.DataFrame(X, columns=attribute_names)
adult[dataset.default_target_attribute] = y
print(adult.head(10))

    age  workclass    fnlwgt  education  education-num  marital-status  \
0  25.0        0.0  226802.0        2.0            7.0             2.0   
1  38.0        0.0   89814.0        3.0            9.0             0.0   
2  28.0        4.0  336951.0        5.0           12.0             0.0   
3  44.0        0.0  160323.0        1.0           10.0             0.0   
4  18.0        NaN  103497.0        1.0           10.0             2.0   
5  34.0        0.0  198693.0       12.0            6.0             2.0   
6  29.0        NaN  227026.0        3.0            9.0             2.0   
7  63.0        1.0  104626.0        4.0           15.0             0.0   
8  24.0        0.0  369667.0        1.0           10.0             2.0   
9  55.0        0.0  104996.0        8.0            4.0             0.0   

   occupation  relationship  race  sex  capital-gain  capital-loss  \
0         7.0           1.0   4.0  1.0           0.0           0.0   
1         9.0           2.0   0.0  1.0       

In [12]:
# Same dataset, this time requested in "dataframe" format; the features (X)
# and the target (y) are returned as separate objects.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe",
    target=dataset.default_target_attribute,
)

# Show the leading rows of the features, then of the target.
print(X.head(), y.head(), sep="\n")

   age  workclass    fnlwgt     education  education-num      marital-status  \
0   25    Private  226802.0          11th              7       Never-married   
1   38    Private   89814.0       HS-grad              9  Married-civ-spouse   
2   28  Local-gov  336951.0    Assoc-acdm             12  Married-civ-spouse   
3   44    Private  160323.0  Some-college             10  Married-civ-spouse   
4   18        NaN  103497.0  Some-college             10       Never-married   

          occupation relationship   race     sex  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male           0.0           0.0   
1    Farming-fishing      Husband  White    Male           0.0           0.0   
2    Protective-serv      Husband  White    Male           0.0           0.0   
3  Machine-op-inspct      Husband  Black    Male        7688.0           0.0   
4                NaN    Own-child  White  Female           0.0           0.0   

   hours-per-week native-country  
0  