Olanto Foundation edited this page May 30, 2018

public class Tokenization...

public class Tokenization...

next(): the method that defines what a token is

/**
 * Scans forward to the next token: first skips characters until a letter
 * (or EOF) is seen, then consumes a run of letters, digits and hyphens.
 * Any exception raised while scanning is caught and its stack trace printed.
 * On exit, {@code a.EOFflag} records whether the last value read was EOF.
 *
 * NOTE(review): several expressions are missing from this excerpt (the
 * right-hand sides of the two {@code c =} assignments, the statement that
 * clears the previous token and the statement that appends to the current
 * token), and some closing braces are not shown. Consult the real source
 * file before relying on this listing.
 *
 * @param a parse state; {@code a.poschar} is incremented on every loop
 *          iteration and {@code a.EOFflag} is set at the end
 */
public final void next(DoParse a) {

int c = 0; char r;

try {

// skip characters until one is a letter (or EOF is reached);
// c starts at 0, which is not a letter, so this loop runs at least once

while (!(Character.isLetter((char) c)) && (c != EOF)) { 

     c =;  // fetch the next character (NOTE(review): right-hand side missing in this excerpt)

      a.poschar++; // advance the position by one

}; // clear the previous token (NOTE(review): the clearing statement is missing in this excerpt)

 r = (char) c; // initialise with the first character of the token

 // loop while the current character is a letter, a digit or a hyphen

  while ((Character.isLetter(r)

         || Character.isDigit(r)

         || (char) c == '-') && (c != EOF)) { // collect the word

;  // append to the current token (NOTE(review): the append statement is missing in this excerpt)

          c =; // read the next value (NOTE(review): right-hand side missing in this excerpt)

          a.poschar++; // advance the position by one

          r = (char) c;


    } catch (Exception e) { e.printStackTrace();}

    a.EOFflag = (c == EOF); // record whether the end of file was reached


normaliseWord(): the method that normalises a token

     /**
      * Normalises a token: lowercases it, truncates it to at most
      * {@code WORD_MAXLENGTH} characters and, when {@code WORD_USE_STEMMER}
      * is set, replaces it with the result of {@code Stemmer.stemmingOfW}.
      *
      * @param id index structure; not used by the statements shown here
      * @param w  the token to normalise
      * @return the normalised token
      */
     public final String normaliseWord(IdxStructure id, String w) {

     // convert all characters to lowercase
     // (the no-argument toLowerCase() uses the JVM default locale)

     w = w.toLowerCase();

     // truncate if too long

     if (w.length() > WORD_MAXLENGTH) {w = w.substring(0, WORD_MAXLENGTH);}

     // replace with the stem (only when stemming is enabled)

     if (WORD_USE_STEMMER) {w = Stemmer.stemmingOfW(w);}

     return w;
