Skip to content

Commit

Permalink
Draft tokenize_melt() for unpivotr-like import
Browse files Browse the repository at this point in the history
  • Loading branch information
nacnudus authored and jimhester committed Dec 12, 2017
1 parent ddbb5f4 commit c260ffe
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 1 deletion.
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -90,6 +90,7 @@ export(spec_table)
export(spec_tsv)
export(stop_for_problems)
export(tokenize)
export(tokenize_melt)
export(tokenizer_csv)
export(tokenizer_delim)
export(tokenizer_fwf)
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Expand Up @@ -29,6 +29,10 @@ tokenize_ <- function(sourceSpec, tokenizerSpec, n_max) {
.Call(`_readr_tokenize_`, sourceSpec, tokenizerSpec, n_max)
}

# R-side binding for the C++ function tokenize_melt_: passes its four
# arguments, unchanged and in order, to the compiled routine registered as
# `_readr_tokenize_melt_`.
tokenize_melt_ <- function(sourceSpec, tokenizerSpec, n_max, locale_) {
.Call(`_readr_tokenize_melt_`, sourceSpec, tokenizerSpec, n_max, locale_)
}

# R-side binding for the compiled routine `_readr_parse_vector_`: all five
# arguments are passed through as-is; `trim_ws` defaults to TRUE when omitted.
parse_vector_ <- function(x, collectorSpec, locale_, na, trim_ws = TRUE) {
.Call(`_readr_parse_vector_`, x, collectorSpec, locale_, na, trim_ws)
}
Expand Down
7 changes: 7 additions & 0 deletions R/tokenizer.R
Expand Up @@ -21,6 +21,13 @@ tokenize <- function(file, tokenizer = tokenizer_csv(), skip = 0, n_max = -1L) {
tokenize_(ds, tokenizer, n_max)
}

#' @export
#' @rdname Tokenizers
tokenize_melt <- function(file, tokenizer = tokenizer_csv(), skip = 0, n_max = -1L, locale = readr::default_locale()) {
  # Build the datasource from `file` (forwarding `skip`) and hand it, together
  # with the tokenizer spec, row limit and locale, to the compiled routine.
  tokenize_melt_(datasource(file, skip = skip), tokenizer, n_max, locale)
}

#' Tokenizers.
#'
#' Explicitly create tokenizer objects. Usually you will not call these
Expand Down
6 changes: 5 additions & 1 deletion man/Tokenizers.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/Collector.h
Expand Up @@ -253,5 +253,6 @@ collectorsCreate(Rcpp::ListOf<Rcpp::List> specs, LocaleInfo* pLocale);
void collectorsResize(std::vector<CollectorPtr>& collectors, int n);
void collectorsClear(std::vector<CollectorPtr>& collectors);
std::string collectorGuess(Rcpp::CharacterVector input, Rcpp::List locale_);
// Defined in CollectorGuess.cpp. Returns the first type name, tried in the
// order "logical", "integer", "double", "number", "time", "date", "datetime",
// whose check accepts `input`; returns "character" otherwise, and also when
// `input` is empty or all missing.
std::string collectorGuessInternal(Rcpp::CharacterVector input, Rcpp::List locale_);

#endif
37 changes: 37 additions & 0 deletions src/CollectorGuess.cpp
Expand Up @@ -48,6 +48,17 @@ bool isNumber(const std::string& x, LocaleInfo* pLocale) {
return ok && begin == x.begin() && end == x.end();
}

// Returns true when parseInt() accepts the whole of `x` and `x` does not
// start with a redundant leading zero.
//
// `pLocale` is not used here; the parameter is kept so this predicate has the
// same shape as the other is*() checks handed to canParse().
bool isInteger(const std::string& x, LocaleInfo* pLocale) {
  // Reject a leading zero only when more characters follow it (e.g. "007").
  // A lone "0" is a perfectly valid integer and must not be rejected, which
  // mirrors the `x.size() > 1` guard used by isDouble().
  if (x[0] == '0' && x.size() > 1)
    return false;

  double res = 0;
  std::string::const_iterator begin = x.begin(), end = x.end();

  // `begin == end` afterwards means parseInt consumed every character, so
  // trailing junk such as "12abc" is not treated as an integer.
  return parseInt(begin, end, res) && begin == end;
}

bool isDouble(const std::string& x, LocaleInfo* pLocale) {
// Leading zero not followed by decimal mark
if (x[0] == '0' && x.size() > 1 && x[1] != pLocale->decimalMark_)
Expand Down Expand Up @@ -118,3 +129,29 @@ std::string collectorGuess(CharacterVector input, List locale_) {
// Otherwise can always parse as a character
return "character";
}

// Guess a type name for `input` under the locale described by `locale_`.
//
// Candidate types are tried in a fixed order, strictest first, and the first
// one that canParse() accepts is returned. "character" is the fallback, and is
// also returned straight away for empty or all-missing input.
std::string collectorGuessInternal(CharacterVector input, List locale_) {
  LocaleInfo localeInfo(locale_);
  LocaleInfo* pLocale = &localeInfo;

  // Nothing to inspect: no elements at all ...
  if (input.size() == 0) {
    return "character";
  }
  // ... or nothing but missing values.
  if (allMissing(input)) {
    return "character";
  }

  // Strictest check first, most permissive last.
  if (canParse(input, isLogical, pLocale)) {
    return "logical";
  }
  if (canParse(input, isInteger, pLocale)) {
    return "integer";
  }
  if (canParse(input, isDouble, pLocale)) {
    return "double";
  }
  if (canParse(input, isNumber, pLocale)) {
    return "number";
  }
  if (canParse(input, isTime, pLocale)) {
    return "time";
  }
  if (canParse(input, isDate, pLocale)) {
    return "date";
  }
  if (canParse(input, isDateTime, pLocale)) {
    return "datetime";
  }

  // Anything can be kept as text.
  return "character";
}
15 changes: 15 additions & 0 deletions src/RcppExports.cpp
Expand Up @@ -97,6 +97,20 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// tokenize_melt_
// .Call entry point for tokenize_melt_(): each incoming SEXP is converted to
// the corresponding C++ parameter type, the function is invoked, and its List
// result is wrapped back into an R object. This symbol is the one registered
// in CallEntries with 4 arguments.
List tokenize_melt_(List sourceSpec, List tokenizerSpec, int n_max, List locale_);
RcppExport SEXP _readr_tokenize_melt_(SEXP sourceSpecSEXP, SEXP tokenizerSpecSEXP, SEXP n_maxSEXP, SEXP locale_SEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
// Argument conversion, in the same order as the C++ signature above.
Rcpp::traits::input_parameter< List >::type sourceSpec(sourceSpecSEXP);
Rcpp::traits::input_parameter< List >::type tokenizerSpec(tokenizerSpecSEXP);
Rcpp::traits::input_parameter< int >::type n_max(n_maxSEXP);
Rcpp::traits::input_parameter< List >::type locale_(locale_SEXP);
rcpp_result_gen = Rcpp::wrap(tokenize_melt_(sourceSpec, tokenizerSpec, n_max, locale_));
return rcpp_result_gen;
END_RCPP
}
// parse_vector_
SEXP parse_vector_(CharacterVector x, List collectorSpec, List locale_, const std::vector<std::string>& na, const bool trim_ws);
RcppExport SEXP _readr_parse_vector_(SEXP xSEXP, SEXP collectorSpecSEXP, SEXP locale_SEXP, SEXP naSEXP, SEXP trim_wsSEXP) {
Expand Down Expand Up @@ -327,6 +341,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_readr_count_fields_", (DL_FUNC) &_readr_count_fields_, 3},
{"_readr_guess_header_", (DL_FUNC) &_readr_guess_header_, 3},
{"_readr_tokenize_", (DL_FUNC) &_readr_tokenize_, 3},
{"_readr_tokenize_melt_", (DL_FUNC) &_readr_tokenize_melt_, 4},
{"_readr_parse_vector_", (DL_FUNC) &_readr_parse_vector_, 5},
{"_readr_read_file_", (DL_FUNC) &_readr_read_file_, 2},
{"_readr_read_file_raw_", (DL_FUNC) &_readr_read_file_raw_, 1},
Expand Down
43 changes: 43 additions & 0 deletions src/parse.cpp
Expand Up @@ -110,6 +110,49 @@ RObject tokenize_(List sourceSpec, List tokenizerSpec, int n_max) {
return warnings.addAsAttribute(out);
}

// Tokenize a source and return one record per token ("melted" layout).
//
// Parameters:
//   sourceSpec     - specification passed to Source::create()
//   tokenizerSpec  - specification passed to Tokenizer::create()
//   n_max          - when > 0, reading stops at the first token whose row
//                    index is >= n_max; otherwise all tokens are read
//   locale_        - forwarded to collectorGuessInternal() for type guessing
//
// Returns a list with class c("tbl_df", "tbl", "data.frame") holding four
// parallel columns:
//   row, col   - position reported by the token (t.row(), t.col())
//   value      - the token's text (t.asString())
//   data_type  - type name guessed for that text by collectorGuessInternal()
//
// NOTE(review): `warnings` is handed to the tokenizer but never attached to
// the result, whereas tokenize_() returns warnings.addAsAttribute(out).
// Confirm whether dropping tokenizer warnings here is intended.
// [[Rcpp::export]]
List tokenize_melt_(List sourceSpec, List tokenizerSpec, int n_max, List locale_) {
  Warnings warnings;

  SourcePtr source = Source::create(sourceSpec);
  TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec);
  tokenizer->tokenize(source->begin(), source->end());
  tokenizer->setWarnings(&warnings);

  std::vector<int> row;
  std::vector<int> col;
  std::vector<std::string> val;
  std::vector<std::string> type;

  for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF;
       t = tokenizer->nextToken()) {
    if (n_max > 0 && t.row() >= static_cast<size_t>(n_max))
      break;

    // Materialise the token text once; it is both stored and type-guessed.
    const std::string value = t.asString();
    const std::string guessed = collectorGuessInternal(value, locale_);

    // The four vectors grow in lock-step, one entry per token.
    row.push_back(static_cast<int>(t.row()));
    col.push_back(static_cast<int>(t.col()));
    val.push_back(value);
    type.push_back(guessed);
  }

  const int n = static_cast<int>(row.size());

  List out = List::create(_["row"] = row,
                          _["col"] = col,
                          _["value"] = val,
                          _["data_type"] = type);
  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");
  // Compact row-names form: c(NA, -n) stands for the row names 1..n.
  out.attr("row.names") = IntegerVector::create(NA_INTEGER, -n);
  return out;
}

// [[Rcpp::export]]
SEXP parse_vector_(
CharacterVector x,
Expand Down

0 comments on commit c260ffe

Please sign in to comment.