New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GSOC]add cli executable for data_split #650
Changes from 1 commit
a35c390
de4226c
2cad593
c8a60b2
e41b3db
ef01c05
9e88669
8ad3b90
2417a19
ac016fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ endmacro () | |
|
||
# Recurse into each method mlpack provides. | ||
set(DIRS | ||
preprocess | ||
adaboost | ||
amf | ||
ann | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Source files in this directory that get compiled into mlpack.
# A file that is not listed here is left out of the library.
set(SOURCES
)

# Prefix every listed source with the path of this directory.
set(DIR_SRCS)
foreach(src ${SOURCES})
  list(APPEND DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${src})
endforeach()

# Hand the prefixed sources up to the parent scope, where the list of all
# mlpack sources is accumulated.
set(MLPACK_SRCS ${MLPACK_SRCS} ${DIR_SRCS} PARENT_SCOPE)

# Only the split tool is enabled; the other entries remain commented out.
#add_cli_executable(preprocess_stats)
add_cli_executable(preprocess_split)
#add_cli_executable(preprocess_scan)
#add_cli_executable(preprocess_imputer)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
/** | ||
* @file preprocess_split_main.cpp | ||
* @author Keon Woo Kim | ||
* | ||
* split data CLI executable | ||
*/ | ||
#include <mlpack/core.hpp> | ||
#include <mlpack/core/data/split_data.hpp> | ||
|
||
// Program title and description handed to PROGRAM_INFO.
PROGRAM_INFO("Split into Train and Test Data", "This "
    "utility takes data and labels and split into a training "
    "set and a test set.");

// Define parameters for data
PARAM_STRING_REQ("input_file", "File containing data", "i");
PARAM_STRING_REQ("output_train_data", "File name to save train data", "d");
PARAM_STRING_REQ("output_test_data", "File name to save test data", "D");

// Define parameters for labels
PARAM_STRING_REQ("input_label", "File containing labels", "I");
PARAM_STRING_REQ("output_train_label", "File name to save train label", "l");
PARAM_STRING_REQ("output_test_label", "File name to save test label", "L");

// Define optional test ratio, default is 0.2 (Test 20% Train 80%)
// Adjacent string literals are joined verbatim, so the first piece must end
// with a space; otherwise the description reads "0.2if not set".
PARAM_DOUBLE("test_ratio", "Ratio of test set, defaults to 0.2 "
    "if not set", "r", 0.2);
|
||
using namespace mlpack; | ||
using namespace arma; | ||
using namespace std; | ||
|
||
// Entry point of the split tool.
// Reads the file names and the test ratio through CLI::GetParam, loads the
// data and label files, splits them with data::TrainTestSplit, logs the column
// count of each of the four results and saves each result to its output file.
int main(int argc, char** argv) | ||
{ | ||
// Parse command line options. | ||
CLI::ParseCommandLine(argc, argv); | ||
|
||
// data | ||
const string inputFile = CLI::GetParam<string>("input_file"); | ||
const string outputTrainData = CLI::GetParam<string>("output_train_data"); | ||
const string outputTestData = CLI::GetParam<string>("output_test_data"); | ||
// labels | ||
const string inputLabel = CLI::GetParam<string>("input_label"); | ||
const string outputTrainLabel = CLI::GetParam<string>("output_train_label"); | ||
const string outputTestLabel = CLI::GetParam<string>("output_test_label"); | ||
|
||
// Ratio | ||
// NOTE(review): the value is forwarded to TrainTestSplit as read; this
// function performs no range check on it.
const double testRatio = CLI::GetParam<double>("test_ratio"); | ||
|
||
// container for input data and labels | ||
// Labels are held as size_t values, data as an arma::mat.
arma::mat data; | ||
arma::Mat<size_t> labels; | ||
|
||
// Load Data and Labels | ||
// NOTE(review): the third argument of data::Load is presumably the flag that
// makes a failed load fatal - confirm against the data::Load declaration.
// Only row 0 of the loaded label matrix is used; any further rows are ignored.
data::Load(inputFile, data, true); | ||
data::Load(inputLabel, labels, true); | ||
arma::Row<size_t> labels_row = labels.row(0); // extract first row | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we assume decimal labels, so we can't use the tool in combination with e.g. linear regression? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I only thought of integer labels by looking at the test data. |
||
// Split Data | ||
const auto value = data::TrainTestSplit(data, labels_row, testRatio); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does make_tuple call the copy constructor? If that's the case we should avoid this interface in favor of the pass by reference interface. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think so. As discussed in this pull, I think having 6 parameters for TrainTestSplit(input, train, test, and other 3 for labels) can be used as an alternative. Or we can make it like |
||
// Element order of the split result as used below:
// 0 = train data, 1 = test data, 2 = train labels, 3 = test labels.
Log::Info << "Train Data Count: " << get<0>(value).n_cols << endl; | ||
Log::Info << "Test Data Count: " << get<1>(value).n_cols << endl; | ||
Log::Info << "Train Label Count: " << get<2>(value).n_cols << endl; | ||
Log::Info << "Test Label Count: " << get<3>(value).n_cols << endl; | ||
|
||
// NOTE(review): the third argument of data::Save is false in every call
// below, unlike the true passed to data::Load - presumably save failures are
// not treated as fatal; confirm against the data::Save declaration.
// Save Train Data | ||
data::Save(outputTrainData, get<0>(value), false); | ||
|
||
// Save Test Data | ||
data::Save(outputTestData, get<1>(value), false); | ||
|
||
// Save Train Label | ||
data::Save(outputTrainLabel, get<2>(value), false); | ||
|
||
// Save Test Label | ||
data::Save(outputTestLabel, get<3>(value), false); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think for consistency with other programs in mlpack I would go with parameters --training_file (-t), --training_labels_file (-l), --test_file (-T), and --test_labels_file (-L) for the output parameters.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I agree. I will update it.