Skip to content

Commit

Permalink
varnam stemmer working
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinmartinjos committed Jul 30, 2014
1 parent a552051 commit ea4acac
Show file tree
Hide file tree
Showing 15 changed files with 806 additions and 7 deletions.
7 changes: 7 additions & 0 deletions api.h
Expand Up @@ -479,6 +479,13 @@ VARNAM_EXPORT extern int varnam_is_known_word(
varnam *handle,
const char *word);

/* Creates a stemrule in the varnam symbol table. */
int
varnam_create_stemrule(varnam* handle, const char* old_ending, const char* new_ending);

/*
 * Stems `word` by repeatedly peeling endings off its tail and looking them
 * up in the stemrules table. Every stem produced is pushed onto
 * `stem_results` as a vword created with Word().
 *
 * Returns VARNAM_SUCCESS when the whole word has been processed, or
 * VARNAM_ERROR when a stem rule lookup fails.
 *
 * NOTE(review): unlike the neighbouring declarations, these two prototypes
 * are not marked VARNAM_EXPORT extern - confirm whether that is intended.
 */
int
varnam_stem(varnam *handle, const char *word, varray *stem_results);

This comment has been minimized.

Copy link
@navaneeth

navaneeth Aug 5, 2014

Collaborator

I don't think we should expose varnam_stem as a first-class API. Stemming is just an implementation detail, and I don't think clients will be interested in it. So make it a private function and move it to where the stemming results are persisted.


VARNAM_EXPORT extern void
varnam_destroy(varnam *handle);

Expand Down
4 changes: 2 additions & 2 deletions examples/CMakeLists.txt
Expand Up @@ -13,8 +13,8 @@ message ("Generating project ${PROJECT_NAME}")

# Example programs, one executable per source file.
add_executable(transliteration transliteration.c)
add_executable(learning learning.c)
# NOTE(review): stemmer.c has to be committed alongside this file; CMake
# aborts with "Cannot find source file: stemmer.c" when it is missing.
add_executable(stemmer stemmer.c)

# Link every example against the varnam library.
target_link_libraries(transliteration ${VARNAM_LIBRARY_NAME})
target_link_libraries(learning ${VARNAM_LIBRARY_NAME})


target_link_libraries(stemmer ${VARNAM_LIBRARY_NAME})
185 changes: 181 additions & 4 deletions learn.c
Expand Up @@ -10,6 +10,7 @@
#include "api.h"
#include "vtypes.h"
#include "varray.h"
#include "vword.h"
#include "util.h"
#include "result-codes.h"
#include "symbol-table.h"
Expand Down Expand Up @@ -389,10 +390,11 @@ varnam_learn_internal(varnam *handle, const char *word, int confidence)
int
varnam_learn(varnam *handle, const char *word)
{
int rc;
#ifdef _RECORD_EXEC_TIME
V_BEGIN_TIMING
#endif
int rc,i;
varray *stem_results;
#ifdef _RECORD_EXEC_TIME
V_BEGIN_TIMING
#endif

reset_pool (handle);

Expand All @@ -409,6 +411,16 @@ varnam_learn(varnam *handle, const char *word)
return rc;
}


stem_results= varray_init();

This comment has been minimized.

Copy link
@navaneeth

navaneeth Aug 5, 2014

Collaborator

Don't initialize a new array. Use pooled array instances. This improves memory usage. As learn is called very frequently in long-running applications (the varnam server, for example), allocating and deallocating each time is not a good idea. Just use get_pooled_array().

rc = varnam_stem(handle, word, stem_results);
if(rc != VARNAM_SUCCESS)
return rc;
for(i=0;i<=stem_results->index;i++)

This comment has been minimized.

Copy link
@navaneeth

navaneeth Aug 5, 2014

Collaborator

Use brackets {}

varnam_learn_internal(handle, ((vword*)varray_get(stem_results, i))->text, 1);

varray_free(stem_results, *destroy_word);

This comment has been minimized.

Copy link
@navaneeth

navaneeth Aug 5, 2014

Collaborator

No need of this when using pooled array


rc = vwt_end_changes (handle);
if (rc != VARNAM_SUCCESS)
return rc;
Expand Down Expand Up @@ -689,3 +701,168 @@ varnam_is_known_word(varnam* handle, const char* word)
else
return 0;
}

/*
 * Checks whether the stem rule ending in end_buffer has an exception that
 * applies to the word left in word_buffer.
 *
 * The `exception` stored in stem_exceptions for the ending is compared with
 * the last syllable of word_buffer, as reported by vst_get_last_syllable().
 * Only the first matching row is inspected.
 *
 * Returns:
 *   VARNAM_STEMRULE_HIT  - the stored exception equals the last syllable
 *   VARNAM_STEMRULE_MISS - a non-empty exception exists but does not match
 *   VARNAM_SUCCESS       - stem_exceptions has no row for this ending
 *   VARNAM_ERROR         - preparing/binding/stepping the statement failed,
 *                          the last syllable could not be obtained, or the
 *                          stored exception is empty
 *
 * Review note (navaneeth, Aug 5, 2014): if this is a private function it
 * should be marked static and should not carry the varnam_ prefix, which is
 * reserved for functions exposed as API. Name and linkage are left
 * unchanged here so existing callers keep working.
 */
int varnam_check_exception(varnam *handle, strbuf *word_buffer, strbuf *end_buffer)
{
    sqlite3 *db;
    sqlite3_stmt *stmt = NULL;
    /* Review note (navaneeth, Aug 5, 2014): get a pooled string here
       instead of allocating a fresh one on every call. */
    strbuf *syllable = strbuf_init(8);
    const char *exception;
    const char *sql = "select exception from stem_exceptions where stem = ?1";
    int rc;
    int result = VARNAM_ERROR;

    db = handle->internal->db;

    rc = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL);
    if (rc != SQLITE_OK)
    {
        set_last_error(handle, "Failed to initialize statement : %s", sqlite3_errmsg(db));
        goto cleanup;
    }

    /* A NULL destructor means the text is not copied; end_buffer outlives
       the statement, which is finalized before this function returns. */
    rc = sqlite3_bind_text(stmt, 1, strbuf_to_s(end_buffer), -1, NULL);
    if (rc != SQLITE_OK)
    {
        set_last_error(handle, "Failed to initialize statement : %s", sqlite3_errmsg(db));
        goto cleanup;
    }

    rc = vst_get_last_syllable(handle, word_buffer, syllable);
    if (rc != VARNAM_SUCCESS)
    {
        /* This used to return VARNAM_SUCCESS while reporting an error, and
           leaked both the statement and the syllable buffer. */
        set_last_error(handle, "Could not obtain last syllable");
        goto cleanup;
    }

    rc = sqlite3_step(stmt);
    if (rc == SQLITE_ROW)
    {
        /* sqlite3_column_text() always yields a NUL-terminated string, so
           it is safe for strcmp(); sqlite3_column_blob() is not. */
        exception = (const char *) sqlite3_column_text(stmt, 0);
        if (exception != NULL && sqlite3_column_bytes(stmt, 0) != 0)
        {
            if (strcmp(strbuf_to_s(syllable), exception) == 0)
                result = VARNAM_STEMRULE_HIT;
            else
                result = VARNAM_STEMRULE_MISS;
        }
        /* An empty exception keeps the VARNAM_ERROR result, as before. */
    }
    else if (rc == SQLITE_DONE)
    {
        result = VARNAM_SUCCESS;
    }

cleanup:
    /* Single exit: both calls must run on every path.
       sqlite3_finalize(NULL) is a harmless no-op. */
    sqlite3_finalize(stmt);
    strbuf_destroy(syllable);
    return result;
}

/*
 * Searches the stemrules table to see if old_ending constitutes a stem rule.
 *
 * On a match, new_ending is cleared and filled with the replacement ending
 * stored for the rule. A NULL replacement leaves new_ending empty.
 *
 * Returns:
 *   VARNAM_STEMRULE_HIT  - a rule exists; new_ending holds its replacement
 *   VARNAM_STEMRULE_MISS - no rule exists for old_ending
 *   VARNAM_ERROR         - preparing, binding or stepping the statement
 *                          failed; details are stored with set_last_error()
 *
 * Review note (navaneeth, Aug 5, 2014): private functions need to be marked
 * static. Linkage is left unchanged here because other declarations of this
 * function may exist outside the visible part of the file.
 */
int
get_stem(varnam* handle, strbuf* old_ending, strbuf *new_ending)
{
    sqlite3 *db;
    sqlite3_stmt *stmt = NULL;
    const char *replacement;
    const char *sql = "select new_ending from stemrules where old_ending = ?1;";
    int rc;
    int result;

    db = handle->internal->db;

    rc = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL);
    if (rc != SQLITE_OK)
    {
        set_last_error(handle, "Failed to prepare statement : %s", sqlite3_errmsg(db));
        sqlite3_finalize(stmt);
        return VARNAM_ERROR;
    }

    /* The bind result used to be ignored; a failed bind would have run the
       query with a NULL parameter and reported a miss. */
    rc = sqlite3_bind_text(stmt, 1, strbuf_to_s(old_ending), -1, NULL);
    if (rc != SQLITE_OK)
    {
        set_last_error(handle, "Failed to bind parameter : %s", sqlite3_errmsg(db));
        sqlite3_finalize(stmt);
        return VARNAM_ERROR;
    }

    rc = sqlite3_step(stmt);
    if (rc == SQLITE_ROW)
    {
        strbuf_clear(new_ending);
        /* sqlite3_column_text() returns NULL for an SQL NULL value. */
        replacement = (const char *) sqlite3_column_text(stmt, 0);
        if (replacement != NULL)
            strbuf_add(new_ending, replacement);
        result = VARNAM_STEMRULE_HIT;
    }
    else if (rc == SQLITE_DONE)
    {
        result = VARNAM_STEMRULE_MISS;
    }
    else
    {
        /* Read the message before the statement is finalized. */
        set_last_error(handle, "Sqlite error : %s", sqlite3_errmsg(db));
        result = VARNAM_ERROR;
    }

    sqlite3_finalize(stmt);
    return result;
}

/*
 * Stems `word` and pushes every stem found onto stem_results.
 *
 * Endings are peeled off the tail of the word one at a time and accumulated
 * into a suffix. Whenever the suffix matches a stem rule (get_stem) and no
 * exception applies (varnam_check_exception), the suffix is replaced by the
 * rule's new ending and the resulting stem is pushed as a vword created with
 * Word(). Stemming then continues on that stem until nothing is left.
 *
 * Returns VARNAM_SUCCESS when the word has been processed, or VARNAM_ERROR
 * when a stem rule lookup fails. Stems pushed before a failure remain in
 * stem_results.
 *
 * Review notes (navaneeth, Aug 5, 2014):
 *   - All of the string buffers below should come from the string pool.
 *   - Locals should not carry a "buffer" suffix; they are renamed here.
 */
int varnam_stem(varnam *handle, const char *word, varray *stem_results)
{
    int rc;
    int result = VARNAM_SUCCESS;
    strbuf *remaining, *suffix, *new_ending, *previous_suffix;
    char *ending;

    remaining = strbuf_init(8);
    suffix = strbuf_init(8);
    previous_suffix = strbuf_init(8);
    new_ending = strbuf_init(8);
    strbuf_add(remaining, word);

    while (remaining->length > 0)
    {
        /* The next ending of `remaining` has to go to the front of
           `suffix`. Save the current suffix, clear it, add the new ending
           and then append the saved suffix again. */
        strbuf_clear(previous_suffix);
        strbuf_add(previous_suffix, strbuf_to_s(suffix));
        strbuf_clear(suffix);

        ending = strbuf_get_ending(remaining);
        /* NOTE(review): strbuf_get_ending()'s contract is not visible here.
           Without an ending to remove, `remaining` would never shrink and
           the loop would not terminate, so stop stemming instead. */
        if (ending == NULL || ending[0] == '\0')
        {
            free(ending);
            break;
        }

        strbuf_add(suffix, ending);
        strbuf_add(suffix, strbuf_to_s(previous_suffix));
        strbuf_remove_from_last(remaining, ending);

        rc = get_stem(handle, suffix, new_ending);
        if (rc == VARNAM_STEMRULE_HIT)
        {
            /* Only an exception hit suppresses the rule; every other
               result applies it. The hit case used to `continue` past
               free(ending) and leaked it. */
            rc = varnam_check_exception(handle, remaining, suffix);
            if (rc != VARNAM_STEMRULE_HIT)
            {
                strbuf_add(remaining, strbuf_to_s(new_ending));
                /* `remaining` keeps changing in later iterations, so
                   pushing a pointer to its storage would be of no use.
                   A vword is created for each stem to be learned. */
                varray_push(stem_results, Word(handle, (char*)strbuf_to_s(remaining), 0));
                strbuf_clear(suffix);
            }
        }
        else if (rc != VARNAM_STEMRULE_MISS)
        {
            set_last_error(handle, "stemrule query failed");
            result = VARNAM_ERROR;
        }

        free(ending);

        if (result != VARNAM_SUCCESS)
            break;
    }

    /* Single exit, so the buffers are released on the error path too. */
    strbuf_destroy(previous_suffix);
    strbuf_destroy(remaining);
    strbuf_destroy(suffix);
    strbuf_destroy(new_ending);
    return result;
}
2 changes: 2 additions & 0 deletions result-codes.h
Expand Up @@ -18,5 +18,7 @@
#define VARNAM_PARTIAL_RENDERING 5
#define VARNAM_STORAGE_ERROR 6
#define VARNAM_INVALID_CONFIG 7
#define VARNAM_STEMRULE_HIT 8
#define VARNAM_STEMRULE_MISS 9

#endif

3 comments on commit ea4acac

@navaneeth
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Overall good. Some minor corrections which I have commented inline. I have created issue #52 to track the whole merge and putting into production.

@navaneeth
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CMake is failing. Did you forget to commit something?

CMake Error at examples/CMakeLists.txt:16 (add_executable):
  Cannot find source file:

    stemmer.c

  Tried extensions .c .C .c++ .cc .cpp .cxx .m .M .mm .h .hh .h++ .hm .hpp
  .hxx .in .txx

@kevinmartinjos
Copy link
Owner Author

@kevinmartinjos kevinmartinjos commented on ea4acac Aug 6, 2014 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.