Skip to content
Olanto Foundation edited this page May 30, 2018 · 9 revisions

public class Tokenization...

next(): the method that defines what a token is

/**
 * Reads the next token from {@code a.in} into {@code a.cw}.
 *
 * <p>Characters are consumed until a letter is found; that letter starts the
 * token. The token then extends over every following letter, digit or
 * hyphen. {@code a.poschar} is incremented once for every call to
 * {@code a.in.read()}, including the call that returns {@code EOF}.
 *
 * <p>Any exception raised while reading is printed and otherwise ignored;
 * {@code a.EOFflag} is then derived from the last value that was read.
 *
 * @param a parsing state holding the input ({@code in}), the token buffer
 *          ({@code cw}), the character position ({@code poschar}) and the
 *          end-of-file flag ({@code EOFflag})
 */
public final void next(DoParse a) {
    int code = 0; // last value returned by a.in.read()
    try {
        // Skip everything that cannot start a token.
        while (code != EOF && !Character.isLetter((char) code)) {
            code = a.in.read();
            a.poschar++;
        }

        // Discard the previous token before collecting the new one.
        a.cw.setLength(0);

        // Collect the token: letters, digits and hyphens.
        char ch = (char) code;
        while (code != EOF
                && (Character.isLetter(ch) || Character.isDigit(ch) || ch == '-')) {
            a.cw.append(ch);
            code = a.in.read();
            a.poschar++;
            ch = (char) code;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    // Tell the caller whether the input has been exhausted.
    a.EOFflag = (code == EOF);
}

normaliseWord(): the method that normalises a token

     /**
      * Normalises a token before it is used as an index term.
      *
      * <p>The token is lower-cased, cut to at most {@code WORD_MAXLENGTH}
      * characters and, when {@code WORD_USE_STEMMER} is set, replaced by the
      * result of {@code Stemmer.stemmingOfW}.
      *
      * <p>Lower-casing uses {@link java.util.Locale#ROOT} so that the same
      * token always yields the same term, whatever the default locale of the
      * JVM is (the no-argument {@code toLowerCase()} is locale sensitive, e.g.
      * it maps {@code 'I'} to a dotless {@code 'ı'} under a Turkish locale).
      *
      * @param id index structure; not used by this method
      * @param w  the raw token, must not be {@code null}
      * @return the normalised token
      */
     public final String normaliseWord(IdxStructure id, String w) {
         // Locale-independent lower-casing keeps index terms stable across machines.
         String normalised = w.toLowerCase(java.util.Locale.ROOT);

         // Truncate tokens that are too long.
         if (normalised.length() > WORD_MAXLENGTH) {
             normalised = normalised.substring(0, WORD_MAXLENGTH);
         }

         // Replace the token with its stem when stemming is enabled.
         if (WORD_USE_STEMMER) {
             normalised = Stemmer.stemmingOfW(normalised);
         }

         return normalised;
     }