Skip to content

Commit

Permalink
[crossport from master] For kdd datasets, do not include unlabeled test data by default (#1704)
Browse files Browse the repository at this point in the history

Co-authored-by: Anne Holler <anne@vmware.com>
  • Loading branch information
amholler and anneholler committed Jan 25, 2022
1 parent 5515431 commit 4f32c39
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 23 deletions.
8 changes: 4 additions & 4 deletions ludwig/datasets/kdd_appetency/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
from ludwig.datasets.kdd_dataset import KDDCup2009Dataset


def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False):
dataset = KDDAppetency(cache_dir=cache_dir)
def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, include_test_download=False):
dataset = KDDAppetency(cache_dir=cache_dir, include_test_download=include_test_download)
return dataset.load(split=split)


Expand All @@ -32,5 +32,5 @@ class KDDAppetency(KDDCup2009Dataset):
https://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data
"""

def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(task_name="appetency", cache_dir=cache_dir)
def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION, include_test_download=False):
super().__init__(task_name="appetency", cache_dir=cache_dir, include_test_download=include_test_download)
8 changes: 4 additions & 4 deletions ludwig/datasets/kdd_churn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
from ludwig.datasets.kdd_dataset import KDDCup2009Dataset


def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False):
dataset = KDDChurn(cache_dir=cache_dir)
def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, include_test_download=False):
dataset = KDDChurn(cache_dir=cache_dir, include_test_download=include_test_download)
return dataset.load(split=split)


Expand All @@ -48,5 +48,5 @@ class KDDChurn(KDDCup2009Dataset):
https://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data
"""

def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(task_name="churn", cache_dir=cache_dir)
def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION, include_test_download=False):
super().__init__(task_name="churn", cache_dir=cache_dir, include_test_download=include_test_download)
24 changes: 13 additions & 11 deletions ludwig/datasets/kdd_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ class KDDCup2009Dataset(UncompressedFileDownloadMixin,
https://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data
"""

def __init__(self, task_name, cache_dir=DEFAULT_CACHE_LOCATION):
def __init__(self, task_name, cache_dir=DEFAULT_CACHE_LOCATION, include_test_download=False):
super().__init__(dataset_name="kdd_" + task_name, cache_dir=cache_dir)
self.task_name = task_name
self.include_test_download = include_test_download

def process_downloaded_dataset(self, header=0):
zip_file = ZipFile(
Expand All @@ -46,12 +47,6 @@ def process_downloaded_dataset(self, header=0):
train_df = pd.read_csv(zip_file.open("orange_small_train.data"),
sep='\t')

zip_file = ZipFile(
os.path.join(self.raw_dataset_path, "orange_small_test.data.zip")
)
test_df = pd.read_csv(zip_file.open("orange_small_test.data"),
sep='\t')

train_df = process_categorical_features(train_df, categorical_features)
train_df = process_numerical_features(train_df, categorical_features)

Expand Down Expand Up @@ -87,10 +82,17 @@ def process_downloaded_dataset(self, header=0):
processed_val_df['target'] = targets.iloc[val_idcs]
processed_val_df['split'] = 1

test_df['target'] = ''
test_df['split'] = 2

df = pd.concat([processed_train_df, processed_val_df, test_df])
if self.include_test_download:
zip_file = ZipFile(
os.path.join(self.raw_dataset_path, "orange_small_test.data.zip")
)
test_df = pd.read_csv(zip_file.open("orange_small_test.data"),
sep='\t')
test_df['target'] = '' # no ground truth labels for test download
test_df['split'] = 2
df = pd.concat([processed_train_df, processed_val_df, test_df])
else:
df = pd.concat([processed_train_df, processed_val_df])

makedirs(self.processed_temp_path, exist_ok=True)
df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename),
Expand Down
8 changes: 4 additions & 4 deletions ludwig/datasets/kdd_upselling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
from ludwig.datasets.kdd_dataset import KDDCup2009Dataset


def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False):
dataset = KDDUpselling(cache_dir=cache_dir)
def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, include_test_download=False):
dataset = KDDUpselling(cache_dir=cache_dir, include_test_download=include_test_download)
return dataset.load(split=split)


Expand All @@ -48,5 +48,5 @@ class KDDUpselling(KDDCup2009Dataset):
https://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data
"""

def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(task_name="upselling", cache_dir=cache_dir)
def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION, include_test_download=False):
super().__init__(task_name="upselling", cache_dir=cache_dir, include_test_download=include_test_download)

0 comments on commit 4f32c39

Please sign in to comment.