-
Notifications
You must be signed in to change notification settings - Fork 1
Token_annexe
Olanto Foundation edited this page May 30, 2018
·
9 revisions
/**
 * Reads the next token from {@code a.in} into {@code a.cw}.
 *
 * <p>Characters are skipped until a letter is found; the token then consists of that
 * letter and every following letter, digit or hyphen. {@code a.poschar} is incremented
 * once per call to {@code a.in.read()}, including the read that returns {@code EOF}.
 * On return, {@code a.EOFflag} is {@code true} when the end of the input was reached.
 *
 * <p>If reading fails, the stack trace is printed and the failure is treated as end of
 * input ({@code a.EOFflag} becomes {@code true}), so a caller that loops until the flag
 * is set cannot spin forever on a broken source. Characters appended before the failure
 * remain in {@code a.cw}; the token of the previous call never does.
 *
 * @param a parsing state: input source, token buffer, position counter and EOF flag
 */
public final void next(DoParse a) {
    // 0 is not a letter, so the skip loop below always performs at least one read
    int c = 0;
    // clear the previous token before any read, so a failed read cannot leave it behind
    a.cw.setLength(0);
    try {
        // skip everything up to the first letter (or the end of the input)
        while (c != EOF && !Character.isLetter((char) c)) {
            c = a.in.read(); // fetch the next char
            a.poschar++;     // one more position consumed
        }
        // collect the token: letters, digits and hyphens
        while (c != EOF) {
            char r = (char) c;
            boolean wordChar = Character.isLetter(r) || Character.isDigit(r) || r == '-';
            if (!wordChar) {
                break; // the delimiter has been consumed and is not part of the token
            }
            a.cw.append(r);  // add to the current token
            c = a.in.read(); // fetch the next char
            a.poschar++;     // one more position consumed
        }
    } catch (Exception e) {
        e.printStackTrace();
        // a failed read is handled as end of input so that EOFflag gets raised below
        c = EOF;
    }
    a.EOFflag = (c == EOF); // report end of input to the caller
}
/**
 * Normalises a word: lower-cases it, truncates it to {@code WORD_MAXLENGTH} characters
 * and, when {@code WORD_USE_STEMMER} is set, replaces it with the result of
 * {@code Stemmer.stemmingOfW}.
 *
 * @param id not used by this method
 * @param w  the word to normalise; must not be {@code null}
 * @return the normalised word
 */
public final String normaliseWord(IdxStructure id, String w) {
    // Lower-case with a fixed locale: the no-argument toLowerCase() follows the JVM
    // default locale, so the same word could normalise differently on another machine
    // (e.g. 'I' becomes a dotless 'ı' under a Turkish locale).
    w = w.toLowerCase(java.util.Locale.ROOT);
    // truncate if too long
    if (w.length() > WORD_MAXLENGTH) {
        w = w.substring(0, WORD_MAXLENGTH);
    }
    // replace with the stem
    if (WORD_USE_STEMMER) {
        w = Stemmer.stemmingOfW(w);
    }
    return w;
}