Skip to content

Commit

Permalink
libc: use RegexCache in regex tests
Browse files Browse the repository at this point in the history
  • Loading branch information
melvinw committed Dec 30, 2023
1 parent 39e1146 commit dddb4da
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 89 deletions.
149 changes: 65 additions & 84 deletions cpp/libc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include <fnmatch.h>
#include <glob.h>
#include <locale.h>
#include <regex.h>
#include <stdlib.h> // getenv()
#include <sys/ioctl.h>
#include <unistd.h> // gethostname()
Expand All @@ -17,110 +16,92 @@

namespace libc {

class RegexCache {
public:
static const int kDefaultSize = 100;

struct CacheEntry {
CacheEntry() = delete;
CacheEntry(const CacheEntry&) = delete;

CacheEntry(BigStr* pat, int cflags) : pat_() {
int status = ::regcomp(&compiled_, pat->data_, cflags);
if (status != 0) {
char error_desc[50];
regerror(status, &compiled_, error_desc, 50);
RegexCache::CacheEntry::CacheEntry(BigStr* pat, int cflags) : pat_() {
int status = ::regcomp(&compiled_, pat->data_, cflags);
if (status != 0) {
char error_desc[50];
regerror(status, &compiled_, error_desc, 50);

char error_message[80];
snprintf(error_message, 80, "Invalid regex %s (%s)", pat->data_,
error_desc);
char error_message[80];
snprintf(error_message, 80, "Invalid regex %s (%s)", pat->data_,
error_desc);

throw Alloc<ValueError>(StrFromC(error_message));
}
throw Alloc<ValueError>(StrFromC(error_message));
}

pat_ = static_cast<char*>(malloc(len(pat) + 1));
memcpy(pat_, pat->data_, len(pat) + 1);
pat_hash_ = hash(pat);
}
pat_ = static_cast<char*>(malloc(len(pat) + 1));
memcpy(pat_, pat->data_, len(pat) + 1);
pat_hash_ = hash(pat);
}

~CacheEntry() {
DCHECK(pat_ != nullptr);
free(pat_);
regfree(&compiled_);
}
// Releases the two resources an entry holds: the state produced by regcomp()
// and the malloc'd copy of the pattern text.
RegexCache::CacheEntry::~CacheEntry() {
  // The constructor assigns pat_ only after a successful compile, so a
  // destructible entry is expected to have one.
  DCHECK(pat_ != nullptr);
  regfree(&compiled_);
  free(pat_);
}

char* pat_;
int pat_hash_;
regex_t compiled_;
};

RegexCache(int capacity) : capacity_(capacity), access_list_() {
// Override if env var is set.
char* e = getenv("OILS_REGEX_CACHE_SIZE");
if (e) {
int result;
if (StringToInteger(e, strlen(e), 10, &result)) {
capacity_ = result;
}
// Creates an empty cache limited to `capacity` entries.
//
// When the OILS_REGEX_CACHE_SIZE environment variable is set and
// StringToInteger() accepts it as a base-10 number, that value replaces
// `capacity`.
//
// NOTE(review): the parsed value is an int stored into a size_t member, so a
// negative setting would become a very large capacity -- confirm whether that
// is intended.
RegexCache::RegexCache(int capacity) : capacity_(capacity), access_list_() {
  char* env_value = getenv("OILS_REGEX_CACHE_SIZE");
  if (env_value == nullptr) {
    return;  // No override requested.
  }

  int parsed;
  const bool ok = StringToInteger(env_value, strlen(env_value), 10, &parsed);
  if (ok) {
    capacity_ = parsed;
  }
}

~RegexCache() {
for (auto& it : access_list_) {
delete it;
}
// Destroys every entry still held in access_list_; the entries were created
// with `new`, so each one is released with `delete`.
RegexCache::~RegexCache() {
  for (RegexCache::CacheEntry* entry : access_list_) {
    delete entry;
  }
}

regex_t* regcomp(BigStr* pat, int cflags) {
CacheEntry* entry = TakeEntry(pat);
if (entry == nullptr) {
// Dealing with a new entry. Make space and compile.
MaybeEvict();
entry = new CacheEntry(pat, cflags);
}

SetMostRecent(entry);

return &entry->compiled_;
regex_t* RegexCache::regcomp(BigStr* pat, int cflags) {
RegexCache::CacheEntry* entry = TakeEntry(pat);
if (entry == nullptr) {
// Dealing with a new entry. Make space and compile.
MaybeEvict();
entry = new RegexCache::CacheEntry(pat, cflags);
}

private:
CacheEntry* TakeEntry(BigStr* pat) {
auto it = std::find_if(access_list_.begin(), access_list_.end(),
[pat](CacheEntry* entry) {
return hash(pat) == entry->pat_hash_ &&
strcmp(pat->data_, entry->pat_) == 0;
});
if (it == access_list_.end()) {
return nullptr;
}
SetMostRecent(entry);

return &entry->compiled_;
}

CacheEntry* ret = *it;
access_list_.erase(it);
return ret;
RegexCache::CacheEntry* RegexCache::TakeEntry(BigStr* pat) {
auto it = std::find_if(access_list_.begin(), access_list_.end(),
[pat](RegexCache::CacheEntry* entry) {
return hash(pat) == entry->pat_hash_ &&
strcmp(pat->data_, entry->pat_) == 0;
});
if (it == access_list_.end()) {
return nullptr;
}

void MaybeEvict() {
if (access_list_.size() < capacity_) {
return;
}
RegexCache::CacheEntry* ret = *it;
access_list_.erase(it);
return ret;
}

// Evict the least recently used entry.
if (access_list_.size()) {
delete *access_list_.begin();
access_list_.erase(access_list_.begin());
}
void RegexCache::MaybeEvict() {
if (access_list_.size() < capacity_) {
return;
}

void SetMostRecent(CacheEntry* entry) {
access_list_.push_back(entry);
// Evict the least recently used entry.
if (access_list_.size()) {
delete *access_list_.begin();
access_list_.erase(access_list_.begin());
}
}

size_t capacity_;
std::vector<CacheEntry*> access_list_;
};
// Records `entry` as the most recently used one by placing it at the back of
// access_list_; eviction removes from the front.
void RegexCache::SetMostRecent(RegexCache::CacheEntry* entry) {
  access_list_.emplace_back(entry);
}

static RegexCache gRegexCache(RegexCache::kDefaultSize);
// Shared cache instance with external linkage; libc.h declares it `extern` so
// other translation units (such as the tests) can use it.
RegexCache gRegexCache(RegexCache::kDefaultSize);

BigStr* gethostname() {
// Note: Fixed issue #1656 - OS X and FreeBSD don't have HOST_NAME_MAX
Expand Down
33 changes: 33 additions & 0 deletions cpp/libc.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#ifndef LIBC_H
#define LIBC_H

#include <regex.h>
#include <stdlib.h>

#include "mycpp/runtime.h"
Expand Down Expand Up @@ -33,6 +34,38 @@ List<int>* regex_search(BigStr* pattern, int cflags, BigStr* str, int eflags,
int wcswidth(BigStr* str);
int get_terminal_width();

// Cache of compiled POSIX regexes with least-recently-used eviction.
//
// Entries are heap-allocated and owned by the cache: they are deleted on
// eviction and in the destructor. Copying the cache would therefore delete
// the same entries twice, so copy operations are disabled.
//
// NOTE(review): lookup compares only the pattern text (hash + strcmp), not
// cflags, so a pattern requested again with different cflags appears to get
// the earlier compilation back -- confirm callers never mix cflags.
class RegexCache {
 public:
  static const int kDefaultSize = 100;

  // A single compiled pattern together with the key used to find it again.
  struct CacheEntry {
    CacheEntry() = delete;
    // Holds malloc'd and regcomp'd state, so copies are not allowed.
    CacheEntry(const CacheEntry&) = delete;
    CacheEntry& operator=(const CacheEntry&) = delete;

    // Compiles `pat` with `cflags`. Throws a ValueError (created with Alloc)
    // when regcomp() reports an error.
    CacheEntry(BigStr* pat, int cflags);
    ~CacheEntry();

    char* pat_;         // malloc'd copy of the pattern text
    int pat_hash_;      // hash of the pattern, compared before strcmp
    regex_t compiled_;  // filled in by regcomp(), released by regfree()
  };

  // `capacity` is the maximum number of cached entries. The
  // OILS_REGEX_CACHE_SIZE environment variable overrides it when it parses
  // as an integer.
  explicit RegexCache(int capacity);
  ~RegexCache();

  // The cache owns raw CacheEntry pointers; forbid copies to avoid a double
  // delete.
  RegexCache(const RegexCache&) = delete;
  RegexCache& operator=(const RegexCache&) = delete;

  // Returns the compiled regex for `pat`, compiling and inserting it on a
  // miss. The returned pointer refers to storage inside a cache entry and
  // stops being valid once that entry is evicted or the cache is destroyed.
  regex_t* regcomp(BigStr* pat, int cflags);

 private:
  // Removes and returns the entry matching `pat`, or nullptr if none exists.
  CacheEntry* TakeEntry(BigStr* pat);
  // Deletes the least recently used entry when the cache is at capacity.
  void MaybeEvict();
  // Appends `entry` to the most-recently-used end of access_list_.
  void SetMostRecent(CacheEntry* entry);

  size_t capacity_;
  // Ordered from least recently used (front) to most recently used (back).
  std::vector<CacheEntry*> access_list_;
};

extern RegexCache gRegexCache;

} // namespace libc

#endif // LIBC_H
9 changes: 4 additions & 5 deletions cpp/libc_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,13 @@ TEST for_test_coverage() {
}

void FindAll(const char* p, const char* s) {
regex_t pat;
regex_t* pat;

int cflags = REG_EXTENDED;
if (regcomp(&pat, p, cflags) != 0) {
if ((pat = libc::gRegexCache.regcomp(StrFromC(p), cflags)) == nullptr) {
FAIL();
}
int outlen = pat.re_nsub + 1; // number of captures
int outlen = pat->re_nsub + 1; // number of captures

// TODO: Could statically allocate 99, and assert that re_nsub is less than
// 99. Would speed up loops.
Expand All @@ -164,7 +164,7 @@ void FindAll(const char* p, const char* s) {
while (true) {
// Necessary so ^ doesn't match in the middle!
int eflags = cur_pos == 0 ? 0 : REG_NOTBOL;
bool match = regexec(&pat, s + cur_pos, outlen, pmatch, eflags) == 0;
bool match = regexec(pat, s + cur_pos, outlen, pmatch, eflags) == 0;

if (!match) {
break;
Expand All @@ -186,7 +186,6 @@ void FindAll(const char* p, const char* s) {
}

free(pmatch);
regfree(&pat);
}

// adjacent matches
Expand Down

0 comments on commit dddb4da

Please sign in to comment.