Skip to content

Commit

Permalink
Merge 'develop' gtools-0.7.2 (2017-09-28); gisid, glevelsof, bugfixes
Browse files Browse the repository at this point in the history
Features

* `gisid` is added as a working replacement for `isid` and `isid, missok`.
  `gisid` takes `if` and `in` statements; however, it does not implement
  `isid, sort` or `isid using`.
* `glevelsof` is added as a working replacement for `levelsof`.
  All `levelsof` features are available.
* Temporary variable no longer created for `egen, tag` or `egen, group`
* Fixes #6
    * Variables are sorted internally for `egen, group`, which matches `egen`.
    * Variables are sorted internally for `gcollapse`, which is faster.
* Various internal enhancements:
    * The hash is validated faster
    * Hash validation is also used to read in group variables
    * Integer bijection now sorts by the integers correctly,
      obviating the need for a second sort.
    * No need to validate the hash with integer bijection.
    * The memory usage is marginally leaner.
    * Reorganized all the files, making the code-base easier to maintain.
* Various commented internal code deleted.

Enhancements

* Fixes #13 so
  `gcollapse` maintains source formats on targets.
* Improved internal handling of if conditions for `egen`.
* `egen` now only processes observations in range for `id, group`
* `egen, group` now marginally faster when all vars are integers

Bug fixes

* Prior versions de-facto used a 64-bit hash instead of a 128-bit hash.
  The new version should use the 128-bit hash correctly.
* Prior versions would fail if there was only 1 observation.
* Fixes #15
  which was introduced trying to fix
  #15

Backwards-incompatible

* `gcollapse, unsorted` no longer supported (due to internal sorting)
  • Loading branch information
mcaceresb committed Sep 29, 2017
2 parents feff102 + 82c2458 commit 1832fe8
Show file tree
Hide file tree
Showing 90 changed files with 9,137 additions and 6,081 deletions.
54 changes: 31 additions & 23 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,48 +1,49 @@
EXECUTION=normal
LEGACY=

ifeq ($(OS),Windows_NT)
SPOOKYLIB = -l:spookyhash.dll
SPOOKYLIB = spookyhash.dll
OSFLAGS = -shared
GCC = x86_64-w64-mingw32-gcc-5.4.0.exe
PREMAKE = premake5.exe
OUT = build/gtools_windows.plugin
OUTM = build/gtools_windows_multi.plugin build/gtools_multi.o
OUTE = build/env_set_windows.plugin
OUT = build/gtools_windows$(LEGACY).plugin
OUTM = build/gtools_windows_multi$(LEGACY).plugin build/gtools_multi$(LEGACY).o
OUTE = build/env_set_windows$(LEGACY).plugin
OPENMP = -fopenmp -DGMULTI=1
else
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
OSFLAGS = -shared -fPIC -DSYSTEM=OPUNIX
OUT = build/gtools_unix.plugin build/gtools.o
OUTM = build/gtools_unix_multi.plugin build/gtools_multi.o
OUTE = build/env_set_unix.plugin
SPOOKYLIB = -l:libspookyhash.a
OUT = build/gtools_unix$(LEGACY).plugin build/gtools$(LEGACY).o
OUTM = build/gtools_unix_multi$(LEGACY).plugin build/gtools_multi$(LEGACY).o
OUTE = build/env_set_unix$(LEGACY).plugin
SPOOKYLIB = libspookyhash.a
endif
ifeq ($(UNAME_S),Darwin)
OSFLAGS = -bundle -DSYSTEM=APPLEMAC
OUT = build/gtools_macosx.plugin
OUTM = build/gtools_macosx_multi.plugin build/gtools_multi.o
OUTE = build/env_set_macosx.plugin
SPOOKYLIB = -l:libspookyhash.so
OUT = build/gtools_macosx$(LEGACY).plugin
OUTM = build/gtools_macosx_multi$(LEGACY).plugin build/gtools_multi$(LEGACY).o
OUTE = build/env_set_macosx$(LEGACY).plugin
SPOOKYLIB = libspookyhash.a
endif
GCC = gcc
PREMAKE = premake5
OPENMP = -fopenmp -DGMULTI=1
endif

ifeq ($(EXECUTION),windows)
SPOOKYLIB = -l:spookyhash.dll
SPOOKYLIB = spookyhash.dll
OSFLAGS = -shared
GCC = x86_64-w64-mingw32-gcc
OUT = build/gtools_windows.plugin
OUTM = build/gtools_windows_multi.plugin build/gtools_multi.o
OUTE = build/env_set_windows.plugin
OUT = build/gtools_windows$(LEGACY).plugin
OUTM = build/gtools_windows_multi$(LEGACY).plugin build/gtools_multi$(LEGACY).o
OUTE = build/env_set_windows$(LEGACY).plugin
endif

SPI = 2.0
SPT = 0.2
CFLAGS = -Wall -O2 $(OSFLAGS)
SPOOKY = -L./lib/spookyhash/build/bin/Release -L./lib/spookyhash/build $(SPOOKYLIB)
CFLAGS = -Wall -O3 $(OSFLAGS)
SPOOKY = -L./lib/spookyhash/build/bin/Release -L./lib/spookyhash/build -l:$(SPOOKYLIB)
AUX = build/stplugin.o

# OpenMP only tested on Linux
Expand Down Expand Up @@ -76,13 +77,20 @@ spooky:
cd lib/spookyhash/build && make clean
cd lib/spookyhash/build && make
mkdir -p ./build
cp -f ./lib/spookyhash/build/libspookyhash.so ./build/libspookyhash.so
# cp -f ./lib/spookyhash/build/$(SPOOKYLIB) ./build/$(SPOOKYLIB)
else ifeq ($(UNAME_S),Linux)
ifeq ($(LEGACY),_legacy)
spooky:
# cd lib/spookyhash/build && $(PREMAKE) gmake
cd lib/spookyhash/build && make clean
cd lib/spookyhash/build && make CFLAGS+=-fPIC
else
spooky:
cd lib/spookyhash/build && $(PREMAKE) gmake
cd lib/spookyhash/build && make clean
cd lib/spookyhash/build && make
endif
endif

spookytest:
cd lib/spookyhash/build && ./bin/Release/spookyhash-test
Expand All @@ -102,18 +110,18 @@ gtools_other: src/plugin/gtools.c src/plugin/spi/stplugin.c
mkdir -p ./lib/spookyhash/build/bin/Release
$(GCC) $(CFLAGS) -o $(OUT) src/plugin/spi/stplugin.c src/plugin/gtools.c $(SPOOKY)
# $(GCC) $(CFLAGS) -c -o build/stplugin.o src/plugin/spi/stplugin.c
# $(GCC) $(CFLAGS) -c -o build/gtools_multi.o src/plugin/gtools.c $(OPENMP)
# $(GCC) $(CFLAGS) -c -o build/gtools_multi$(LEGACY).o src/plugin/gtools.c $(OPENMP)
# $(GCC) $(CFLAGS) -o $(OUTM) $(AUX) $(SPOOKY) $(OPENMP) # Does not load
# $(GCC) -Wall -O2 -o $(OUTM) $(AUX) $(SPOOKY) $(OPENMP) # Crashes
# $(GCC) -Wall -O3 -o $(OUTM) $(AUX) $(SPOOKY) $(OPENMP) # Crashes
$(GCC) $(CFLAGS) -o $(OUTE) src/plugin/spi/stplugin.c src/plugin/env_set.c

gtools_nix: src/plugin/gtools.c src/plugin/spi/stplugin.c
mkdir -p ./build
mkdir -p ./lib/spookyhash/build/bin/Release
$(GCC) $(CFLAGS) -c -o build/stplugin.o src/plugin/spi/stplugin.c
$(GCC) $(CFLAGS) -c -o build/gtools.o src/plugin/gtools.c
$(GCC) $(CFLAGS) -c -o build/gtools$(LEGACY).o src/plugin/gtools.c
$(GCC) $(CFLAGS) -o $(OUT) $(AUX) $(SPOOKY)
$(GCC) $(CFLAGS) -c -o build/gtools_multi.o src/plugin/gtools.c $(OPENMP)
$(GCC) $(CFLAGS) -c -o build/gtools_multi$(LEGACY).o src/plugin/gtools.c $(OPENMP)
$(GCC) $(CFLAGS) -o $(OUTM) $(AUX) $(SPOOKY) $(OPENMP)
$(GCC) $(CFLAGS) -o $(OUTE) src/plugin/spi/stplugin.c src/plugin/env_set.c

Expand Down
93 changes: 57 additions & 36 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,49 @@

[Overview](#faster-stata-for-group-operations)
| [Installation](#installation)
| [Benchmarks](#benchmarks)
| [Benchmarks](#collapse-benchmarks)
| [Building](#building)
| [FAQs](#faqs)
| [License](#license)

_Gtools_ is a Stata package that provides a fast implementation of
common group commands like collapse and egen using C plugins for a
_Gtools_ is a Stata package that provides a fast implementation of common
group commands like collapse, egen, isid, and levelsof using C plugins for a
massive speed improvement.

`version 0.6.16 13Sep2017`
`version 0.7.2 28Sep2017`
Builds: Linux [![Travis Build Status](https://travis-ci.org/mcaceresb/stata-gtools.svg?branch=master)](https://travis-ci.org/mcaceresb/stata-gtools),
Windows (Cygwin) [![Appveyor Build status](https://ci.appveyor.com/api/projects/status/2bh1q9bulx3pl81p/branch/master?svg=true)](https://ci.appveyor.com/project/mcaceresb/stata-gtools)

Faster Stata for Group Operations
---------------------------------

This package's aim is to provide a fast implementation of group commands in
Stata using C plugins. At the moment, the package's main feature is a faster
implementation of `collapse`, called `gcollapse`, that is also faster than
Sergio Correia's `fcollapse` from `ftools` (further, group variables can be a
mix of string and numeric, like `collapse`). It also provides some (limited)
support for by-able `egen` functions via `gegen`.

In our benchmarks, `gcollapse` was 5 to 120 times faster than `collapse`
and 3 to 20 times faster than `fcollapse` (the speed gain is smaller for
simpler statistics, such as sums, and larger for complex statistics, such as
percentiles). The key insight is two-fold: First, hashing the data and sorting
the hash is a lot faster than sorting the data before processing it by group.
Second, compiled C code is much faster than Stata commands.
Stata using C plugins. This includes:

| Function | Replaces | Extras | Unsupported |
| ----------- | ---------- | ------------------ | ------------------ |
| `gcollapse` | `collapse` | Quantiles, `merge` | Weights |
| `gegen` | `egen` | Quantiles | See [FAQs](#faqs) for available functions |
| `gisid` | `isid` | `if`, `in` | `using`, `sort` |
| `glevelsof` | `levelsof` | Multiple variables | |

The key insight is two-fold: First, hashing the data and sorting the hash is
a lot faster than sorting the data before processing it by group. Second,
compiled C code is much faster than Stata commands. This insight is used
in all `gtools` functions to achieve their speedup.

The package's main feature is a faster implementation of `collapse`, called
`gcollapse`, that is also faster than Sergio Correia's `fcollapse` from
`ftools` (further, group variables can be a mix of string and numeric,
like `collapse`). In our benchmarks, `gcollapse` was 5 to 120 times faster
than `collapse` and 3 to 20 times faster than `fcollapse` (the speed gain
is smaller for simpler statistics, such as sums, and larger for complex
statistics, such as percentiles).

The current release only provides Unix (Linux) and Windows versions of the C
plugin. Further, multi-threading is only available on Linux. OSX versions and
a multi-threaded Windows version are planned for a future release.

If you plan to use the plugin extensively, check out the [FAQs](#faqs) for
caveats and details on the plugin.

Expand All @@ -49,16 +58,27 @@ net install gtools, from(https://raw.githubusercontent.com/mcaceresb/stata-gtool
* ado uninstall gtools
```

The syntax is identical to `collapse`, except weights are not yet supported:
The syntax is generally analogous to the standard commands (see the corresponding
help files for full syntax and options):
```stata
gcollapse (stat) target = source [(stat) target = source ...], by(varlist)
gcollapse (mean) mean_x1 = x1 (median) median_x1 = x1, by(groupvar)
gcollapse (stat) target = source [(stat) target = source ...], by(varlist) [options]
gcollapse (mean) mean_x1 = x1 (median) median_x1 = x1, by(groupvar) [options]
gegen target = stat(source), by(varlist) [options]
gegen mean_x1 = mean(x1), by(groupvar)
gisid varlist [if] [in], [options]
gisid groupvar, missok
glevelsof varlist [if] [in], [options]
glevelsof groupvar, local(levels) sep(" | ")
```

Support for weights is planned for a future release.
Support for weights for `gcollapse` and `gegen` is planned for a future
release. See the [FAQs](#faqs) for a list of supported functions.

Benchmarks
----------
Collapse benchmarks
-------------------

See `src/test/bench_gcollapse.do` for the benchmark code. I run 3 sets of benchmarks:
- `ftools`-style benchmarks: Collapse a large number of observations
Expand Down Expand Up @@ -105,7 +125,6 @@ percentiles.

### Benchmark details: In the style of `ftools`


Vary N for J = 100 and collapse 15 variables:
```
vars = y1-y15 ~ 123.456 + U(0, 1)
Expand Down Expand Up @@ -192,7 +211,7 @@ collapsing):
`fcollapse` did better for a modest number of groups, but it performed
poorly for very few groups and for a large number of groups. Overall
`gcollapse` was 7-16 times faster. I have not benchmarked `collapse`
against version `0.6.4` in this case because each run will take over
against version `0.7.0` in this case because each run will take over
an hour and have not found the time. I ran a "smaller" version of this
benchmark: Vary J for N = 5,000,000
```
Expand All @@ -212,8 +231,7 @@ Building

### Requirements

If you want to compile the plugin yourself, atop the C standard library
you will need
If you want to compile the plugin yourself, you will need
- The GNU Compiler Collection (`gcc`)
- [`premake5`](https://premake.github.io)
- [`centaurean`'s implementation of SpookyHash](https://github.com/centaurean/spookyhash)
Expand Down Expand Up @@ -353,14 +371,18 @@ which allows computing quantiles; e.g. 2.5 or 97.5).

### Important differences from `egen`

- Generating group IDs is different than `egen`: `gegen` does not care to
sort the groups before processing; it just groups them together. This means
that **`gegen group` will produce different output than `egen group`**. The
former tags groups as they appear, whereas the latter tags the first group
as it would appear sorted as 1, the second as it would appear sorted as 2,
and so on. This is discussed in [issue #6](https://github.com/mcaceresb/stata-gtools/issues/6)
- Most egen function are not yet supported by `gegen`; only
the functions noted above are currently available.
- Most `egen` functions are not yet supported by `gegen`; only the functions
noted above are currently available.

### Important differences from `flevelsof`

- It can take a `varlist` and not just a `varname`

### Important differences from `isid`

- No support for `using`.
- Option `sort` is not available.
- It can check IDs with `if` and `in`

### Stata on Windows

Expand Down Expand Up @@ -524,7 +546,6 @@ In order of priority:

- [ ] Compile for OSX.
- [ ] Multi-threaded version on windows.
- [ ] Implement a way to sort multi-dimensional mixed-type indices in C.
- [ ] Fix Windows bug where comma-format is not correctly displayed.
- [ ] Add support for weights.
- [ ] Provide `sumup` and `sum` alternative, `gsum`.
Expand Down
24 changes: 21 additions & 3 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def is_exe(fpath):
return program
else:
for epath in os.environ["PATH"].split(os.pathsep):
epath = path.strip('"')
epath = epath.strip('"')
exe_file = path.join(epath, program)
if is_exe(exe_file):
return exe_file
Expand All @@ -54,6 +54,13 @@ def is_exe(fpath):
default = None,
required = False,
help = "Arguments to pass to Stata executable")
parser.add_argument('--make-flags',
nargs = 1,
type = str,
metavar = 'MAKE_FLAGS',
default = None,
required = False,
help = "Arguments to pass to make")
parser.add_argument('--clean',
dest = 'clean',
action = 'store_true',
Expand Down Expand Up @@ -91,6 +98,10 @@ def makedirs_safe(directory):
"gcollapse.sthlp",
"gegen.ado",
"gegen.sthlp",
"gisid.ado",
"gisid.sthlp",
"glevelsof.ado",
"glevelsof.sthlp",
"gtools.ado",
"gtools.sthlp"
]
Expand Down Expand Up @@ -161,12 +172,13 @@ def makedirs_safe(directory):
if platform in ["linux", "linux2", "win32", "cygwin", "darwin"]:
print("Trying to compile plugins for -gtools-")
print("(note: this assumes you have already compiled SpookyHash)")
rc = system("make")
make_flags = args['make_flags'] if args['make_flags'] is not None else ""
rc = system("make {0}".format(make_flags))
print("Success!" if rc == 0 else "Failed.")
if args['windows']:
rc = system("make EXECUTION=windows clean")
rc = system("make EXECUTION=windows spooky")
rc = system("make EXECUTION=windows")
rc = system("make EXECUTION=windows {0}".format(make_flags))
else:
print("Don't know platform '{0}'; compile manually.".format(platform))
exit(198)
Expand All @@ -179,6 +191,8 @@ def makedirs_safe(directory):
testfile = open(path.join("src", "test", "gtools_tests.do")).readlines()
files = [path.join("src", "test", "test_gcollapse.do"),
path.join("src", "test", "test_gegen.do"),
path.join("src", "test", "test_gisid.do"),
path.join("src", "test", "test_glevelsof.do"),
path.join("src", "test", "bench_gcollapse.do")]

with open(path.join("build", "gtools_tests.do"), 'w') as outfile:
Expand All @@ -200,9 +214,13 @@ def makedirs_safe(directory):
copy2(path.join("src", "stata.toc"), gdir)
copy2(path.join("src", "ado", "gcollapse.ado"), gdir)
copy2(path.join("src", "ado", "gegen.ado"), gdir)
copy2(path.join("src", "ado", "gisid.ado"), gdir)
copy2(path.join("src", "ado", "glevelsof.ado"), gdir)
copy2(path.join("src", "ado", "gtools.ado"), gdir)
copy2(path.join("doc", "gcollapse.sthlp"), gdir)
copy2(path.join("doc", "gegen.sthlp"), gdir)
copy2(path.join("doc", "gisid.sthlp"), gdir)
copy2(path.join("doc", "glevelsof.sthlp"), gdir)
copy2(path.join("doc", "gtools.sthlp"), gdir)

# Copy files to .zip folder in ./releases
Expand Down
Loading

0 comments on commit 1832fe8

Please sign in to comment.