[crossport from master] For kdd datasets, do not include unlabeled test data by default (#1704)

Co-authored-by: Anne Holler <anne@vmware.com>
ludwig-ai · Jan 25, 2022 · 4f32c39 · 4f32c39
1 parent 5515431
commit 4f32c39
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 23 deletions.
diff --git a/ludwig/datasets/kdd_appetency/__init__.py b/ludwig/datasets/kdd_appetency/__init__.py
@@ -18,8 +18,8 @@
 from ludwig.datasets.kdd_dataset import KDDCup2009Dataset
 
 
-def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False):
-    dataset = KDDAppetency(cache_dir=cache_dir)
+def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, include_test_download=False):
+    dataset = KDDAppetency(cache_dir=cache_dir, include_test_download=include_test_download)
     return dataset.load(split=split)
 
 
@@ -32,5 +32,5 @@ class KDDAppetency(KDDCup2009Dataset):
     https://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data
     """
 
-    def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
-        super().__init__(task_name="appetency", cache_dir=cache_dir)
+    def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION, include_test_download=False):
+        super().__init__(task_name="appetency", cache_dir=cache_dir, include_test_download=include_test_download)
diff --git a/ludwig/datasets/kdd_churn/__init__.py b/ludwig/datasets/kdd_churn/__init__.py
@@ -34,8 +34,8 @@
 from ludwig.datasets.kdd_dataset import KDDCup2009Dataset
 
 
-def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False):
-    dataset = KDDChurn(cache_dir=cache_dir)
+def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, include_test_download=False):
+    dataset = KDDChurn(cache_dir=cache_dir, include_test_download=include_test_download)
     return dataset.load(split=split)
 
 
@@ -48,5 +48,5 @@ class KDDChurn(KDDCup2009Dataset):
     https://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data
     """
 
-    def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
-        super().__init__(task_name="churn", cache_dir=cache_dir)
+    def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION, include_test_download=False):
+        super().__init__(task_name="churn", cache_dir=cache_dir, include_test_download=include_test_download)
diff --git a/ludwig/datasets/kdd_dataset.py b/ludwig/datasets/kdd_dataset.py
@@ -35,9 +35,10 @@ class KDDCup2009Dataset(UncompressedFileDownloadMixin,
     https://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data
     """
 
-    def __init__(self, task_name, cache_dir=DEFAULT_CACHE_LOCATION):
+    def __init__(self, task_name, cache_dir=DEFAULT_CACHE_LOCATION, include_test_download=False):
         super().__init__(dataset_name="kdd_" + task_name, cache_dir=cache_dir)
         self.task_name = task_name
+        self.include_test_download = include_test_download
 
     def process_downloaded_dataset(self, header=0):
         zip_file = ZipFile(
@@ -46,12 +47,6 @@ def process_downloaded_dataset(self, header=0):
         train_df = pd.read_csv(zip_file.open("orange_small_train.data"),
                                sep='\t')
 
-        zip_file = ZipFile(
-            os.path.join(self.raw_dataset_path, "orange_small_test.data.zip")
-        )
-        test_df = pd.read_csv(zip_file.open("orange_small_test.data"),
-                              sep='\t')
-
         train_df = process_categorical_features(train_df, categorical_features)
         train_df = process_numerical_features(train_df, categorical_features)
 
@@ -87,10 +82,17 @@ def process_downloaded_dataset(self, header=0):
         processed_val_df['target'] = targets.iloc[val_idcs]
         processed_val_df['split'] = 1
 
-        test_df['target'] = ''
-        test_df['split'] = 2
-
-        df = pd.concat([processed_train_df, processed_val_df, test_df])
+        if self.include_test_download:
+            zip_file = ZipFile(
+                os.path.join(self.raw_dataset_path, "orange_small_test.data.zip")
+            )
+            test_df = pd.read_csv(zip_file.open("orange_small_test.data"),
+                                  sep='\t')
+            test_df['target'] = ''  # no ground truth labels for test download
+            test_df['split'] = 2
+            df = pd.concat([processed_train_df, processed_val_df, test_df])
+        else:
+            df = pd.concat([processed_train_df, processed_val_df])
 
         makedirs(self.processed_temp_path, exist_ok=True)
         df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename),

diff --git a/ludwig/datasets/kdd_upselling/__init__.py b/ludwig/datasets/kdd_upselling/__init__.py
@@ -34,8 +34,8 @@
 from ludwig.datasets.kdd_dataset import KDDCup2009Dataset
 
 
-def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False):
-    dataset = KDDUpselling(cache_dir=cache_dir)
+def load(cache_dir=DEFAULT_CACHE_LOCATION, split=False, include_test_download=False):
+    dataset = KDDUpselling(cache_dir=cache_dir, include_test_download=include_test_download)
     return dataset.load(split=split)
 
 
@@ -48,5 +48,5 @@ class KDDUpselling(KDDCup2009Dataset):
     https://www.kdd.org/kdd-cup/view/kdd-cup-2009/Data
     """
 
-    def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
-        super().__init__(task_name="upselling", cache_dir=cache_dir)
+    def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION, include_test_download=False):
+        super().__init__(task_name="upselling", cache_dir=cache_dir, include_test_download=include_test_download)