Add TabNet Datasets #1153

Merged
merged 9 commits on Apr 23, 2021
Changes from 8 commits
8 changes: 4 additions & 4 deletions ludwig/datasets/ames_housing/__init__.py
@@ -22,11 +22,11 @@
 from ludwig.datasets.mixins.process import MultifileJoinProcessMixin


-def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, kaggle_username=None, kaggle_api_key=None):
+def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, kaggle_username=None, kaggle_key=None):
     dataset = AmesHousing(
         cache_dir=cache_dir,
         kaggle_username=kaggle_username,
-        kaggle_api_key=kaggle_api_key
+        kaggle_key=kaggle_key
     )
     return dataset.load(split=split)

@@ -42,7 +42,7 @@ class AmesHousing(CSVLoadMixin, MultifileJoinProcessMixin, KaggleDownloadMixin,
     def __init__(self,
                  cache_dir=DEFAULT_CACHE_LOCATION,
                  kaggle_username=None,
-                 kaggle_api_key=None):
+                 kaggle_key=None):
         self.kaggle_username = kaggle_username
-        self.kaggle_api_key = kaggle_api_key
+        self.kaggle_key = kaggle_key
         super().__init__(dataset_name='ames_housing', cache_dir=cache_dir)
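The kaggle_api_key → kaggle_key rename shows up in every Kaggle-backed loader. A minimal usage sketch of the renamed keyword (the credential values are placeholders; real ones can also come from ~/.kaggle/kaggle.json):

from ludwig.datasets import ames_housing

# Placeholders: pass real Kaggle credentials, or rely on
# ~/.kaggle/kaggle.json / the KAGGLE_USERNAME and KAGGLE_KEY env vars.
df = ames_housing.load(
    kaggle_username='your_username',
    kaggle_key='your_api_key',
)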
3 changes: 0 additions & 3 deletions ludwig/datasets/dbpedia/__init__.py
@@ -46,6 +46,3 @@ def process_downloaded_dataset(self):
             os.path.join(self.processed_dataset_path, self.csv_filename),
             index=False
         )
-
-
-
8 changes: 4 additions & 4 deletions ludwig/datasets/goemotions/__init__.py
@@ -17,10 +17,10 @@
 from ludwig.datasets.base_dataset import BaseDataset, DEFAULT_CACHE_LOCATION
 from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin
 from ludwig.datasets.mixins.load import CSVLoadMixin
-from ludwig.datasets.mixins.process import *
+from ludwig.datasets.mixins.process import MultifileJoinProcessMixin


-def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False):
+def load(cache_dir=DEFAULT_CACHE_LOCATION, split=True):
     dataset = GoEmotions(cache_dir=cache_dir)
     return dataset.load(split=split)

@@ -37,9 +37,9 @@ class GoEmotions(UncompressedFileDownloadMixin, MultifileJoinProcessMixin,
     def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
         super().__init__(dataset_name="goemotions", cache_dir=cache_dir)

-    def read_file(self, filetype, filename):
+    def read_file(self, filetype, filename, header=None):
         file_df = pd.read_table(os.path.join(self.raw_dataset_path, filename),
-                                header=None)
+                                header=header)
         return file_df

     def process_downloaded_dataset(self):
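Note the goemotions default flips to split=True, so callers now get the pre-split frames back instead of one DataFrame. A sketch of both calling conventions (the train/test/validation tuple ordering follows CSVLoadMixin and is an assumption here, not spelled out in this diff):

from ludwig.datasets import goemotions

# New default: split=True returns the dataset already partitioned
# (assumed ordering: train, test, validation).
train_df, test_df, val_df = goemotions.load()

# The old single-frame behavior is still available explicitly.
full_df = goemotions.load(split=False)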
9 changes: 4 additions & 5 deletions ludwig/datasets/mixins/kaggle.py
@@ -19,7 +19,7 @@ class KaggleDownloadMixin:
     raw_temp_path: str
     name: str
     kaggle_username: str
-    kaggle_api_key: str
+    kaggle_key: str

     def download_raw_dataset(self):
         """
@@ -28,7 +28,7 @@ def download_raw_dataset(self):
         kaggle.json file we lookup the passed in username and the api key and
         perform authentication.
         """
-        with self.update_env(KAGGLE_USERNAME=self.kaggle_username, KAGGLE_API_KEY=self.kaggle_api_key):
+        with self.update_env(KAGGLE_USERNAME=self.kaggle_username, KAGGLE_KEY=self.kaggle_key):
             # Call authenticate explicitly to pick up new credentials if necessary
             api = create_kaggle_client()
             api.authenticate()
@@ -37,8 +37,8 @@ def download_raw_dataset(self):
         # Download all files for a competition
         api.competition_download_files(self.competition_name, path=self.raw_temp_path)

-        titanic_zip = os.path.join(self.raw_temp_path, self.archive_filename)
-        with ZipFile(titanic_zip, 'r') as z:
+        competition_zipfile = os.path.join(self.raw_temp_path, self.archive_filename)
+        with ZipFile(competition_zipfile, 'r') as z:
             z.extractall(self.raw_temp_path)
         os.rename(self.raw_temp_path, self.raw_dataset_path)

@@ -60,4 +60,3 @@ def competition_name(self):
     @property
     def archive_filename(self):
         return self.config["archive_filename"]
-
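The env-var rename matters because the official kaggle client reads its credentials from KAGGLE_USERNAME and KAGGLE_KEY; KAGGLE_API_KEY is never consulted, so the old spelling silently fell back to kaggle.json. For context, a rough sketch of what an update_env-style helper does — an illustration under assumptions, not the mixin's actual implementation:

import os
from contextlib import contextmanager


@contextmanager
def update_env(**overrides):
    """Temporarily overlay variables onto os.environ, restoring them on exit."""
    saved = {key: os.environ.get(key) for key in overrides}
    os.environ.update({k: v for k, v in overrides.items() if v is not None})
    try:
        yield
    finally:
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value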

1 change: 0 additions & 1 deletion ludwig/datasets/mixins/load.py
@@ -57,4 +57,3 @@ def load_processed_dataset(self, split) -> Union[pd.DataFrame,
     @property
     def csv_filename(self):
         return self.config["csv_filename"]
-
6 changes: 3 additions & 3 deletions ludwig/datasets/mixins/process.py
@@ -45,8 +45,8 @@ def read_file(self, filetype, filename, header=0):
                 os.path.join(self.raw_dataset_path, filename), lines=True)
         elif filetype == 'tsv':
             file_df = pd.read_table(
-                os.path.join(self.raw_dataset_path, filename))
-        elif filetype == 'csv':
+                os.path.join(self.raw_dataset_path, filename))
+        elif filetype == 'csv' or filetype == 'data':
             file_df = pd.read_csv(
                 os.path.join(self.raw_dataset_path, filename), header=header)
         else:
@@ -56,7 +56,7 @@ def read_file(self, filetype, filename, header=0):
     def process_downloaded_dataset(self, header=0):
         """Processes dataset

-        :param header: indicates whether raw data files contain headers
+        :param header: indicates whether raw data files contain headers
         """
         downloaded_files = self.download_filenames
         filetype = self.download_file_type
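The new 'data' branch exists because UCI ships plain comma-separated text under a .data extension, so pd.read_csv handles it unchanged. For instance (a standalone sketch; a local copy of the file is assumed):

import pandas as pd

# agaricus-lepiota.data has no header row, hence header=None.
df = pd.read_csv('agaricus-lepiota.data', header=None)
print(df.shape)  # expected: (8124, 23) — 22 attributes plus the class label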
58 changes: 58 additions & 0 deletions ludwig/datasets/mushroom_edibility/__init__.py
@@ -0,0 +1,58 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2021 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import pandas as pd

from ludwig.datasets.base_dataset import BaseDataset, DEFAULT_CACHE_LOCATION
from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin
from ludwig.datasets.mixins.load import CSVLoadMixin
from ludwig.datasets.mixins.process import MultifileJoinProcessMixin

def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False):
dataset = MushroomEdibility(cache_dir=cache_dir)
return dataset.load(split=split)

class MushroomEdibility(UncompressedFileDownloadMixin, MultifileJoinProcessMixin,
CSVLoadMixin, BaseDataset):
"""
The Mushroom Edibility dataset

Additional Details:

http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names
"""
def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(dataset_name="mushroom_edibility", cache_dir=cache_dir)

def process_downloaded_dataset(self):
super().process_downloaded_dataset(header=None)
processed_df = pd.read_csv(os.path.join(self.processed_dataset_path,
self.csv_filename))
columns = [
"class", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor",
"gill-attachment", "gill-spacing", "gill-size", "gill-color",
"stalk-shape", "stalk-root", "stalk-surface-above-ring",
"stalk-surface-below-ring", "stalk-color-above-ring",
"stalk-color-below-ring", "veil-type", "veil-color", "ring-number",
"ring-type", "spore-print-color", "population", "habitat", "split"
]
processed_df.columns = columns
processed_df.to_csv(
os.path.join(self.processed_dataset_path, self.csv_filename),
index=False
)

7 changes: 7 additions & 0 deletions ludwig/datasets/mushroom_edibility/config.yaml
@@ -0,0 +1,7 @@
version: 1.0
download_urls:
- "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
split_filenames:
train_file: agaricus-lepiota.data
download_file_type: data
csv_filename: mushroom_edibility.csv
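Putting the module and config together, a minimal load sketch:

from ludwig.datasets import mushroom_edibility

# First call downloads agaricus-lepiota.data, renames the columns,
# and caches the result as mushroom_edibility.csv.
df = mushroom_edibility.load(split=False)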
54 changes: 54 additions & 0 deletions ludwig/datasets/poker_hand/__init__.py
@@ -0,0 +1,54 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2021 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import pandas as pd

from ludwig.datasets.base_dataset import BaseDataset, DEFAULT_CACHE_LOCATION
from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin
from ludwig.datasets.mixins.load import CSVLoadMixin
from ludwig.datasets.mixins.process import MultifileJoinProcessMixin


def load(cache_dir=DEFAULT_CACHE_LOCATION, split=True):
dataset = PokerHand(cache_dir=cache_dir)
return dataset.load(split=split)

class PokerHand(UncompressedFileDownloadMixin, MultifileJoinProcessMixin,
CSVLoadMixin, BaseDataset):
"""
The Poker Hand dataset

Additional Details:

http://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand.names
"""
def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(dataset_name="poker_hand", cache_dir=cache_dir)

def process_downloaded_dataset(self):
super().process_downloaded_dataset(header=None)
processed_df = pd.read_csv(os.path.join(self.processed_dataset_path,
self.csv_filename))
columns = [
"S1", "C1", "S2", "C2", "S3", "C3", "S4", "C4", "S5", "C5", "hand", "split"
]
processed_df.columns = columns
processed_df.to_csv(
os.path.join(self.processed_dataset_path, self.csv_filename),
index=False
)

9 changes: 9 additions & 0 deletions ludwig/datasets/poker_hand/config.yaml
@@ -0,0 +1,9 @@
version: 1.0
download_urls:
- "http://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-training-true.data"
- "http://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-testing.data"
split_filenames:
train_file: poker-hand-training-true.data
test_file: poker-hand-testing.data
download_file_type: data
csv_filename: poker_hand.csv
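Since the config lists separate train and test files, MultifileJoinProcessMixin tags each row with a split column, and split=True (the default here) hands the partitions back separately. A sketch, with the tuple ordering assumed from CSVLoadMixin:

from ludwig.datasets import poker_hand

# Assumed ordering: train, test, validation (validation may be empty,
# since the config defines only train and test files).
train_df, test_df, val_df = poker_hand.load()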
75 changes: 75 additions & 0 deletions ludwig/datasets/sarcos/__init__.py
@@ -0,0 +1,75 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2021 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
from scipy.io import loadmat
import pandas as pd

from ludwig.datasets.base_dataset import BaseDataset, DEFAULT_CACHE_LOCATION
from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin
from ludwig.datasets.mixins.load import CSVLoadMixin
from ludwig.datasets.mixins.process import MultifileJoinProcessMixin

def load(cache_dir=DEFAULT_CACHE_LOCATION, split=True):
dataset = Sarcos(cache_dir=cache_dir)
return dataset.load(split=split)

class Sarcos(UncompressedFileDownloadMixin, MultifileJoinProcessMixin,
CSVLoadMixin, BaseDataset):
"""
The Sarcos dataset

Details:
The data relates to an inverse dynamics problem for a seven
degrees-of-freedom SARCOS anthropomorphic robot arm. The
task is to map from a 21-dimensional input space (7 joint
positions, 7 joint velocities, 7 joint accelerations) to the
corresponding 7 joint torques. There are 44,484 training
examples and 4,449 test examples. The first 21 columns are
the input variables, and the 22nd column is used as the target
variable.

Dataset source:
Locally Weighted Projection Regression: An O(n) Algorithm for
Incremental Real Time Learning in High Dimensional Space,
S. Vijayakumar and S. Schaal, Proc ICML 2000.
http://www.gaussianprocess.org/gpml/data/
"""
def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(dataset_name="sarcos", cache_dir=cache_dir)

def read_file(self, filetype, filename, header=0):
mat = loadmat(os.path.join(self.raw_dataset_path, filename))
file_df = pd.DataFrame(mat[filename.split('.')[0]])
return file_df

def process_downloaded_dataset(self):
super().process_downloaded_dataset()
processed_df = pd.read_csv(os.path.join(self.processed_dataset_path,
self.csv_filename))
columns = []
columns += [f'position_{i}' for i in range(1, 8)]
columns += [f'velocity_{i}' for i in range(1, 8)]
columns += [f'acceleration_{i}' for i in range(1, 8)]
columns += [f'torque_{i}' for i in range(1, 8)]
columns += ['split']

processed_df.columns = columns
processed_df.to_csv(
os.path.join(self.processed_dataset_path, self.csv_filename),
index=False
)

9 changes: 9 additions & 0 deletions ludwig/datasets/sarcos/config.yaml
@@ -0,0 +1,9 @@
version: 1.0
download_urls:
- "http://www.gaussianprocess.org/gpml/data/sarcos_inv.mat"
- "http://www.gaussianprocess.org/gpml/data/sarcos_inv_test.mat"
split_filenames:
train_file: sarcos_inv.mat
test_file: sarcos_inv_test.mat
download_file_type: mat
csv_filename: sarcos.csv
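The .mat handling in Sarcos.read_file keys the MATLAB structure by the file stem, roughly equivalent to this standalone sketch (a local copy of the file is assumed):

from scipy.io import loadmat
import pandas as pd

# sarcos_inv.mat stores its array under the key 'sarcos_inv'
# (the filename minus extension): 44,484 rows x 28 columns
# (7 positions, 7 velocities, 7 accelerations, 7 torques).
mat = loadmat('sarcos_inv.mat')
df = pd.DataFrame(mat['sarcos_inv'])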
8 changes: 4 additions & 4 deletions ludwig/datasets/titanic/__init__.py
@@ -21,11 +21,11 @@
 from ludwig.datasets.mixins.load import CSVLoadMixin


-def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, kaggle_username=None, kaggle_api_key=None):
+def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, kaggle_username=None, kaggle_key=None):
     dataset = Titanic(
         cache_dir=cache_dir,
         kaggle_username=kaggle_username,
-        kaggle_api_key=kaggle_api_key
+        kaggle_key=kaggle_key
     )
     return dataset.load(split=split)

@@ -41,9 +41,9 @@ class Titanic(CSVLoadMixin, KaggleDownloadMixin, BaseDataset):
     def __init__(self,
                  cache_dir=DEFAULT_CACHE_LOCATION,
                  kaggle_username=None,
-                 kaggle_api_key=None):
+                 kaggle_key=None):
         self.kaggle_username = kaggle_username
-        self.kaggle_api_key = kaggle_api_key
+        self.kaggle_key = kaggle_key
         super().__init__(dataset_name='titanic', cache_dir=cache_dir)

     def process_downloaded_dataset(self):