/
build2.js
48 lines (45 loc) · 1.39 KB
/
build2.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import { readLines } from "https://deno.land/std/io/mod.ts";
// Map from an inflected word form to its lemma, populated from
// vendor/agid-2016.01.19/infl.txt. Each non-empty line is parsed as
// "<lemma> <pos>: <forms>", where <forms> is split on " | " and then ", ",
// and the first space-delimited token of each entry is the inflected form.
const lemmatizationDict = new Map();
lemmatizationDict.set("an", "a");
// Entries containing any of these marker characters are not added to the map.
const skippedEntryPattern = /[~<!?]/;
let fileReader = await Deno.open("vendor/agid-2016.01.19/infl.txt");
try {
  for await (const line of readLines(fileReader)) {
    if (!line) continue;
    const [toStr, fromStr] = line.split(": ");
    // A line without the ": " separator has no form list; skip it rather
    // than crash on `undefined.split`.
    if (fromStr === undefined) continue;
    // Lemma headers containing "?" are ignored entirely.
    if (toStr.includes("?")) continue;
    const to = toStr.split(" ")[0];
    for (const forms of fromStr.split(" | ")) {
      for (const entry of forms.split(", ")) {
        if (skippedEntryPattern.test(entry)) continue;
        // Later lines win when the same inflected form appears more than once.
        lemmatizationDict.set(entry.split(" ")[0], to);
      }
    }
  }
} finally {
  // The handle returned by Deno.open must be released explicitly.
  fileReader.close();
}
// "danger" is deliberately left unmapped (reason not documented in this file).
lemmatizationDict.delete("danger");
// Aggregate the raw word counts from dist/mGSL.raw.lst by lemma.
// Each non-empty line is parsed as "<word>\t<count>"; words found in
// lemmatizationDict are folded into their lemma, others are kept as-is.
const gsl = new Map();
fileReader = await Deno.open("dist/mGSL.raw.lst");
try {
  for await (const line of readLines(fileReader)) {
    if (!line) continue;
    const [lemma, rawCount] = line.split("\t");
    const count = parseInt(rawCount, 10);
    // A missing or non-numeric count would put NaN into the totals and make
    // the later numeric sort inconsistent, so such lines are skipped.
    if (Number.isNaN(count)) continue;
    const key = lemmatizationDict.has(lemma)
      ? lemmatizationDict.get(lemma)
      : lemma;
    // Always accumulate: a lemma may already hold counts folded in from its
    // inflected forms (or from an earlier duplicate line), and overwriting
    // would silently discard them.
    gsl.set(key, (gsl.get(key) ?? 0) + count);
  }
} finally {
  // The handle returned by Deno.open must be released explicitly.
  fileReader.close();
}
// Rank the [lemma, count] pairs by descending count and write them out as
// tab-separated lines to dist/mGSL.lemmatized.lst.
const mgsl = Array.from(gsl);
mgsl.sort(([, countA], [, countB]) => countB - countA);
const tsv = mgsl
  .map(([lemma, count]) => `${lemma}\t${count}`)
  .join("\n");
Deno.writeTextFileSync("dist/mGSL.lemmatized.lst", tsv);