Permalink
Browse files

varnam stemmer working

  • Loading branch information...
lonesword committed Jul 27, 2014
1 parent a552051 commit ea4acac4c777243a6e42ec3809e98e3e5cf20fe7
Showing with 806 additions and 7 deletions.
  1. +7 −0 api.h
  2. +2 −2 examples/CMakeLists.txt
  3. +181 −4 learn.c
  4. +2 −0 result-codes.h
  5. +219 −0 schemes/ml
  6. +24 −0 strbuf.c
  7. +164 −1 symbol-table.c
  8. +7 −0 symbol-table.h
  9. +1 −0 tests/CMakeLists.txt
  10. +111 −0 tests/stemmer_tests.c
  11. +1 −0 tests/test-runner.c
  12. +1 −0 util.h
  13. +49 −0 varnam.c
  14. +35 −0 varnamc
  15. +2 −0 varnamruby.rb
7 api.h
@@ -479,6 +479,13 @@ VARNAM_EXPORT extern int varnam_is_known_word(
varnam *handle,
const char *word);

/*Creates a stemrule in the varnam symbol table*/
int
varnam_create_stemrule(varnam* handle, const char* old_ending, const char* new_ending);

int
varnam_stem(varnam *handle, const char *word, varray *stem_results);

This comment has been minimized.

@navaneeth

navaneeth Aug 5, 2014

Collaborator

I don't think we should expose the varnam_stem as a first class API. Stemming is just an implementation detail and I don't think clients will be interested in it. So move this as a private method to where persisting of stemming happens.


VARNAM_EXPORT extern void
varnam_destroy(varnam *handle);

@@ -13,8 +13,8 @@ message ("Generating project ${PROJECT_NAME}")

add_executable(transliteration transliteration.c)
add_executable(learning learning.c)
add_executable(stemmer stemmer.c)

target_link_libraries(transliteration ${VARNAM_LIBRARY_NAME})
target_link_libraries(learning ${VARNAM_LIBRARY_NAME})


target_link_libraries(stemmer ${VARNAM_LIBRARY_NAME})
185 learn.c
@@ -10,6 +10,7 @@
#include "api.h"
#include "vtypes.h"
#include "varray.h"
#include "vword.h"
#include "util.h"
#include "result-codes.h"
#include "symbol-table.h"
@@ -389,10 +390,11 @@ varnam_learn_internal(varnam *handle, const char *word, int confidence)
int
varnam_learn(varnam *handle, const char *word)
{
int rc;
#ifdef _RECORD_EXEC_TIME
V_BEGIN_TIMING
#endif
int rc,i;
varray *stem_results;
#ifdef _RECORD_EXEC_TIME
V_BEGIN_TIMING
#endif

reset_pool (handle);

@@ -409,6 +411,16 @@ varnam_learn(varnam *handle, const char *word)
return rc;
}


stem_results= varray_init();

This comment has been minimized.

@navaneeth

navaneeth Aug 5, 2014

Collaborator

Don't initialize a new array. Use pooled array instances; this improves memory usage. As learn is called very frequently in long-running applications (the varnam server, for example), allocating and deallocating each time is not a good idea. Just use get_pooled_array().

rc = varnam_stem(handle, word, stem_results);
if(rc != VARNAM_SUCCESS)
return rc;
for(i=0;i<=stem_results->index;i++)

This comment has been minimized.

@navaneeth

navaneeth Aug 5, 2014

Collaborator

Use brackets {}

varnam_learn_internal(handle, ((vword*)varray_get(stem_results, i))->text, 1);

varray_free(stem_results, *destroy_word);

This comment has been minimized.

@navaneeth

navaneeth Aug 5, 2014

Collaborator

There is no need for this when using a pooled array.


rc = vwt_end_changes (handle);
if (rc != VARNAM_SUCCESS)
return rc;
@@ -689,3 +701,168 @@ varnam_is_known_word(varnam* handle, const char* word)
else
return 0;
}

/*
 * Checks whether the last syllable of word_buffer is registered as an
 * exception to the stem rule identified by end_buffer.
 *
 * The stem_exceptions table is queried with end_buffer as the stem, and the
 * exception of the first returned row is compared with the syllable produced
 * by vst_get_last_syllable() for word_buffer.
 *
 * Returns:
 *   VARNAM_STEMRULE_HIT  - the syllable equals the stored exception
 *   VARNAM_STEMRULE_MISS - a row exists, but its exception is empty or differs
 *   VARNAM_SUCCESS       - no row exists for this stem
 *   VARNAM_ERROR         - a failure occurred; details via set_last_error()
 *
 * The statement and the temporary syllable buffer are released on every path.
 *
 * Review notes (navaneeth, Aug 5, 2014):
 *   - "Is this a private method? If yes, should be marked as static. And
 *     don't prefix with varnam_. It is used for methods exposed by varnam as
 *     an API." Name and linkage are left untouched here because callers
 *     outside this file cannot be ruled out.
 *   - "Get a pooled string. Like I told about pooled array" (applies to the
 *     syllable buffer allocated below).
 */
int varnam_check_exception(varnam *handle, strbuf *word_buffer, strbuf *end_buffer)
{
    sqlite3 *db;
    sqlite3_stmt *stmt = NULL;
    strbuf *syllable;
    const char *exception;
    const char *sql = "select exception from stem_exceptions where stem = ?1";
    int rc;
    int result = VARNAM_ERROR;

    db = handle->internal->db;
    syllable = strbuf_init(8);

    rc = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL);
    if (rc != SQLITE_OK)
    {
        set_last_error(handle, "Failed to initialize statement : %s", sqlite3_errmsg(db));
        goto cleanup;
    }

    rc = sqlite3_bind_text(stmt, 1, strbuf_to_s(end_buffer), -1, NULL);
    if (rc != SQLITE_OK)
    {
        set_last_error(handle, "Failed to bind statement parameter : %s", sqlite3_errmsg(db));
        goto cleanup;
    }

    rc = vst_get_last_syllable(handle, word_buffer, syllable);
    if (rc != VARNAM_SUCCESS)
    {
        /* Report the failure instead of claiming success after recording an error. */
        set_last_error(handle, "Could not obtain last syllable");
        goto cleanup;
    }

    rc = sqlite3_step(stmt);
    if (rc == SQLITE_ROW)
    {
        /* sqlite3_column_text() yields a NUL-terminated string, which strcmp()
           requires; fetch the text before asking for its byte length. */
        exception = (const char*) sqlite3_column_text(stmt, 0);
        if (exception == NULL || sqlite3_column_bytes(stmt, 0) == 0)
        {
            /* An empty exception cannot match any syllable. */
            result = VARNAM_STEMRULE_MISS;
        }
        else if (strcmp(strbuf_to_s(syllable), exception) == 0)
        {
            result = VARNAM_STEMRULE_HIT;
        }
        else
        {
            result = VARNAM_STEMRULE_MISS;
        }
    }
    else if (rc == SQLITE_DONE)
    {
        result = VARNAM_SUCCESS;
    }
    else
    {
        /* Read the message while the statement is still alive. */
        set_last_error(handle, "Sqlite error : %s", sqlite3_errmsg(db));
    }

cleanup:
    /* sqlite3_finalize() accepts NULL, so this is safe after a failed prepare. */
    sqlite3_finalize(stmt);
    strbuf_destroy(syllable);
    return result;
}

/*
 * Searches the stemrules table to see if old_ending constitutes a stem rule.
 *
 * On a match, new_ending is cleared and filled with the replacement ending
 * stored for that rule (left empty when the stored value is NULL).
 *
 * Returns:
 *   VARNAM_STEMRULE_HIT  - a rule exists; new_ending holds its replacement
 *   VARNAM_STEMRULE_MISS - no rule exists for old_ending
 *   VARNAM_ERROR         - a failure occurred; details via set_last_error()
 *
 * Review note (navaneeth, Aug 5, 2014): "Need to add static for private
 * methods". Linkage is left untouched here because callers outside this file
 * cannot be ruled out.
 */
int
get_stem(varnam* handle, strbuf* old_ending, strbuf *new_ending)
{
    sqlite3 *db;
    sqlite3_stmt *stmt = NULL;
    const char *replacement;
    const char *sql="select new_ending from stemrules where old_ending = ?1;";
    int rc;
    int result = VARNAM_ERROR;

    db = handle->internal->db;

    rc = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL);
    if (rc != SQLITE_OK)
    {
        set_last_error(handle, "Failed to prepare statement : %s", sqlite3_errmsg(db));
        goto cleanup;
    }

    /* A failed bind would otherwise run the query with a NULL parameter and
       silently report a miss. */
    rc = sqlite3_bind_text(stmt, 1, strbuf_to_s(old_ending), -1, NULL);
    if (rc != SQLITE_OK)
    {
        set_last_error(handle, "Failed to bind statement parameter : %s", sqlite3_errmsg(db));
        goto cleanup;
    }

    rc = sqlite3_step(stmt);
    if (rc == SQLITE_ROW)
    {
        replacement = (const char*) sqlite3_column_text(stmt, 0);
        strbuf_clear(new_ending);
        if (replacement != NULL)
        {
            strbuf_add(new_ending, replacement);
        }
        result = VARNAM_STEMRULE_HIT;
    }
    else if (rc == SQLITE_DONE)
    {
        result = VARNAM_STEMRULE_MISS;
    }
    else
    {
        /* Read the message before the statement is finalized. */
        set_last_error(handle, "Sqlite error : %s", sqlite3_errmsg(db));
    }

cleanup:
    /* sqlite3_finalize() accepts NULL, so this is safe after a failed prepare. */
    sqlite3_finalize(stmt);
    return result;
}

/*
 * Produces the stems of word by repeatedly peeling endings off it.
 *
 * Each iteration moves the current ending of the remaining word to the front
 * of an accumulated suffix and asks get_stem() whether that suffix is a stem
 * rule. When it is, and varnam_check_exception() does not report the rule as
 * an exception for the remaining word, the replacement ending is appended to
 * the remaining word and the result is pushed onto stem_results as a vword
 * created with Word(). The loop runs until the remaining word is empty.
 *
 * Only VARNAM_STEMRULE_HIT from varnam_check_exception() suppresses a stem;
 * every other return value from it lets the stem through.
 *
 * Returns VARNAM_SUCCESS, or VARNAM_ERROR (with set_last_error() populated)
 * when the stem rule lookup fails. All temporary buffers are released on
 * every path.
 *
 * Review notes (navaneeth, Aug 5, 2014):
 *   - "Again, all pooled strings." (applies to the strbuf_init calls below)
 *   - "Name the variables without having buffer suffix. Because, the fact
 *     that it is a buffer is understood even without the buffer suffix. Name
 *     it something meaningful."
 */
int varnam_stem(varnam *handle, const char *word, varray *stem_results)
{
    int rc;
    int result = VARNAM_SUCCESS;
    strbuf *word_buffer, *end_buffer, *new_ending, *temp;
    char *ending = NULL;

    word_buffer = strbuf_init(8);
    end_buffer = strbuf_init(8);
    temp = strbuf_init(8);
    new_ending = strbuf_init(8);
    strbuf_add(word_buffer, word);

    while (word_buffer->length > 0)
    {
        /* The next character of word_buffer should go to the beginning of
           end_buffer. For this we copy end_buffer to temp, clear end_buffer,
           add the new ending to end_buffer and append the contents of temp
           back to end_buffer. */
        strbuf_clear(temp);
        strbuf_add(temp, strbuf_to_s(end_buffer));
        strbuf_clear(end_buffer);

        ending = strbuf_get_ending(word_buffer);
        if (ending == NULL)
        {
            /* NOTE(review): assumes a NULL ending means nothing could be
               extracted; continuing would not shrink word_buffer - confirm
               against strbuf_get_ending(). */
            set_last_error(handle, "Could not obtain the ending of the word");
            result = VARNAM_ERROR;
            break;
        }

        strbuf_add(end_buffer, ending);
        strbuf_add(end_buffer, strbuf_to_s(temp));
        strbuf_remove_from_last(word_buffer, ending);

        rc = get_stem(handle, end_buffer, new_ending);
        if (rc == VARNAM_STEMRULE_HIT)
        {
            rc = varnam_check_exception(handle, word_buffer, end_buffer);
            if (rc != VARNAM_STEMRULE_HIT)
            {
                strbuf_add(word_buffer, strbuf_to_s(new_ending));
                /* word_buffer changes in subsequent iterations, so pushing a
                   pointer to its internal buffer would be of no use. A vword
                   is created with Word() for each stem and pushed instead. */
                varray_push(stem_results, Word(handle, (char*)strbuf_to_s(word_buffer), 0));
                strbuf_clear(end_buffer);
            }
        }
        else if (rc != VARNAM_STEMRULE_MISS)
        {
            set_last_error(handle, "stemrule query failed");
            result = VARNAM_ERROR;
        }

        /* Released on every iteration, including when the rule was skipped
           because of an exception. */
        free(ending);
        ending = NULL;

        if (result != VARNAM_SUCCESS)
        {
            break;
        }
    }

    strbuf_destroy(temp);
    strbuf_destroy(word_buffer);
    strbuf_destroy(end_buffer);
    strbuf_destroy(new_ending);
    return result;
}
@@ -18,5 +18,7 @@
#define VARNAM_PARTIAL_RENDERING 5
#define VARNAM_STORAGE_ERROR 6
#define VARNAM_INVALID_CONFIG 7
#define VARNAM_STEMRULE_HIT 8
#define VARNAM_STEMRULE_MISS 9

#endif
Oops, something went wrong.

3 comments on commit ea4acac

@navaneeth

This comment has been minimized.

Copy link
Collaborator

navaneeth replied Aug 5, 2014

Overall good. There are some minor corrections, which I have commented on inline. I have created issue #52 to track the whole merge and putting it into production.

@navaneeth

This comment has been minimized.

Copy link
Collaborator

navaneeth replied Aug 5, 2014

CMake is failing. Did you forget to commit something?

CMake Error at examples/CMakeLists.txt:16 (add_executable):
  Cannot find source file:

    stemmer.c

  Tried extensions .c .C .c++ .cc .cpp .cxx .m .M .mm .h .hh .h++ .hm .hpp
  .hxx .in .txx
@lonesword

This comment has been minimized.

Copy link
Owner Author

lonesword replied Aug 6, 2014

Please sign in to comment.