diff --git a/src/python/nimbusml/datasets/datasets.py b/src/python/nimbusml/datasets/datasets.py index 56c325a6..9f040ff1 100644 --- a/src/python/nimbusml/datasets/datasets.py +++ b/src/python/nimbusml/datasets/datasets.py @@ -3,7 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------------------------- """ -Datasets used in MicrosoftML unittests. +Datasets used in MicrosoftML unittests. """ import copy import os @@ -15,6 +15,8 @@ __all__ = ["get_dataset", "available_datasets"] +DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') + class DataSet: """ @@ -175,11 +177,7 @@ def load(self): # isCase ~ age + parity + education + spontaneous + induced # education age parity induced case spontaneous stratum # pooled.stratum - this = os.path.join( - os.path.dirname(__file__), - "data", - "gplv2", - "infert.csv") + this = os.path.join(DATA_DIR, "gplv2", "infert.csv") self.__dict__['_data'] = pandas.read_csv(this) self.__dict__['case'] = self._data["case"] self._finalize() @@ -229,11 +227,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "gplv2", - "infert.csv") + return os.path.join(DATA_DIR, "gplv2", "infert.csv") class DataSetAirQuality(DataSet): @@ -262,11 +256,7 @@ def load(self): # isCase ~ age + parity + education + spontaneous + induced # education age parity induced case spontaneous stratum # pooled.stratum - this = os.path.join( - os.path.dirname(__file__), - "data", - "gplv2", - "airquality.csv") + this = os.path.join(DATA_DIR, "gplv2", "airquality.csv") self.__dict__['_data'] = pandas.read_csv(this) self._finalize() @@ -294,11 +284,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "gplv2", - "airquality.csv") + return os.path.join(DATA_DIR, "gplv2", "airquality.csv") class Topics(DataSet): @@ -324,8 +310,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join(os.path.dirname(__file__), "data", - "topics.csv") + return os.path.join(DATA_DIR, "topics.csv") class Timeseries(DataSet): @@ -351,10 +336,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "timeseries.csv") + return os.path.join(DATA_DIR, "timeseries.csv") class WikiDetox_Train(DataSet): @@ -379,10 +361,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train-250.wikipedia.sample.tsv") + return os.path.join(DATA_DIR, "train-250.wikipedia.sample.tsv") class WikiDetox_Test(DataSet): @@ -407,10 +386,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test.wikipedia.sample.tsv") + return os.path.join(DATA_DIR, "test.wikipedia.sample.tsv") class FS_Train(DataSet): @@ -435,10 +411,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train_fs.csv") + return os.path.join(DATA_DIR, "train_fs.csv") class FS_Test(DataSet): @@ -463,10 +436,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test_fs.csv") + return os.path.join(DATA_DIR, "test_fs.csv") class MSLTR_Train(DataSet): @@ -492,10 +462,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train-msltr.sample.csv") + return os.path.join(DATA_DIR, "train-msltr.sample.csv") class MSLTR_Test(DataSet): @@ -521,10 +488,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test-msltr.sample.csv") + return os.path.join(DATA_DIR, "test-msltr.sample.csv") class Uci_Train(DataSet): @@ -548,10 +512,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train-500.uciadult.sample.csv") + return os.path.join(DATA_DIR, "train-500.uciadult.sample.csv") class Uci_Test(DataSet): @@ -575,10 +536,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test-100.uciadult.sample.csv") + return os.path.join(DATA_DIR, "test-100.uciadult.sample.csv") class Generated_Twitter_Train(DataSet): @@ -603,10 +561,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train-twitter.gen-sample.tsv") + return os.path.join(DATA_DIR, "train-twitter.gen-sample.tsv") class Generated_Twitter_Test(DataSet): @@ -631,10 +586,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test-twitter.gen-sample.tsv") + return os.path.join(DATA_DIR, "test-twitter.gen-sample.tsv") class Generated_Ticket_Train(DataSet): @@ -659,10 +611,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "train-ticketchoice.csv") + return os.path.join(DATA_DIR, "train-ticketchoice.csv") class Generated_Ticket_Test(DataSet): @@ -687,10 +636,7 @@ def as_filepath(self): """ Return file name. """ - return os.path.join( - os.path.dirname(__file__), - "data", - "test-ticketchoice.csv") + return os.path.join(DATA_DIR, "test-ticketchoice.csv") _datasets = dict( diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py index 9a4eba53..074ce92f 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py @@ -2,8 +2,7 @@ # WordEmbedding: pre-trained transform to generate word embeddings import pandas from nimbusml import Pipeline -from nimbusml.feature_extraction.text import WordEmbedding -from nimbusml.feature_extraction.text.ngramfeaturizer import NGramFeaturizer +from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding from nimbusml.feature_extraction.text.extractor import Ngram # create the data @@ -22,5 +21,12 @@ ]) y = pipeline.fit_transform(customer_reviews) -# view the review embeddings -# print(y.head()) +# view a small subset of the review embeddings +print(y.iloc[:5, -3:]) +# review_TransformedText.147 review_TransformedText.148 review_TransformedText.149 +# 0 1.918661 -0.714531 3.062141 +# 1 1.891922 -0.248650 1.706620 +# 2 1.601611 0.309785 3.379576 +# 3 1.970666 1.477450 3.110802 +# 4 2.521791 0.122538 3.129919 +