Skip to content

Commit

Permalink
A DatasetLoader for loading data from OpenML via scikit-learn
Browse files Browse the repository at this point in the history
  • Loading branch information
dnouri committed Oct 25, 2018
1 parent 01609ec commit 44e2849
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 1 deletion.
19 changes: 19 additions & 0 deletions palladium/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pandas.io.parsers
import pandas.io.sql
from sklearn.datasets import fetch_openml
from sqlalchemy import create_engine

from .interfaces import DatasetLoader
Expand Down Expand Up @@ -107,6 +108,24 @@ def __call__(self):
return data, target


class OpenML(DatasetLoader):
    """A :class:`~palladium.interfaces.DatasetLoader` backed by OpenML.

    Loading is delegated to scikit-learn's
    :func:`sklearn.datasets.fetch_openml`; the dataset is looked up by
    the name given at construction time.
    """

    def __init__(self, name):
        """
        :param str name:
          Name of the dataset on OpenML, passed as the first argument
          to :func:`sklearn.datasets.fetch_openml`.
          Examples: "wine-quality-red", "diabetes"
        """
        self.name = name

    def __call__(self):
        """Fetch the dataset and return it as a ``(data, target)`` tuple.

        The two elements are the ``data`` and ``target`` attributes of
        the object returned by :func:`sklearn.datasets.fetch_openml`.
        """
        bunch = fetch_openml(self.name)
        data, target = bunch.data, bunch.target
        return data, target


class EmptyDatasetLoader(DatasetLoader):
"""This :class:`~palladium.interfaces.DatasetLoader` can be used if no
actual data should be loaded. Returns a ``(None, None)`` tuple.
Expand Down
13 changes: 13 additions & 0 deletions palladium/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,19 @@ def test_concurrency(self, sql):
[th.join() for th in threads]


class TestOpenML:
    """Tests for :class:`palladium.dataset.OpenML`."""

    @pytest.fixture
    def OpenML(self):
        # Import inside the fixture so the class under test is only
        # resolved when a test actually requests it.
        from palladium.dataset import OpenML as loader_cls
        return loader_cls

    @pytest.mark.slow
    def test_wine_quality(self, OpenML):
        # Marked slow: calls the loader for real rather than mocking it.
        loader = OpenML('wine-quality-red')
        X, y = loader()

        n_samples, n_features = 1599, 11
        assert X.shape == (n_samples, n_features)
        assert y.shape == (n_samples,)


def test_empty_dataset_loader():
from palladium.dataset import EmptyDatasetLoader
edl = EmptyDatasetLoader()
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ psutil==5.4.5
python-dateutil==2.7.2
pytz==2018.4
requests==2.18.4
scikit-learn==0.19.1
scikit-learn==0.20.0
scipy==1.0.1
six==1.11.0
SQLAlchemy==1.2.7
Expand Down

0 comments on commit 44e2849

Please sign in to comment.