Major stopword differences with "stopword" package (inaccurate data?) #651

titanism · 2022-06-12T08:22:20Z

Hello @Hugo-ter-Doest - we may need to update the stopwords here, as they are drastically different from the ones in the stopword package.

I ran the following scripts and got the diff output below, which shows how drastic the differences are.

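Script that compares natural to stopword (for each language, it prints the words that are in natural's stopword list but not in stopword's):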

const _ = require('lodash');
const sw = require('stopword');

// One row per language:
//   [label printed before the diff,
//    natural module exposing a `.words` array,
//    getter returning the corresponding list from the stopword package].
// The getter is lazy so each stopword list is read only after natural's
// module for that language has been loaded.
const comparisons = [
  ['EN', 'natural/lib/natural/util/stopwords', () => sw.eng],
  ['ES', 'natural/lib/natural/util/stopwords_es', () => sw.spa],
  ['FA', 'natural/lib/natural/util/stopwords_fa', () => sw.fas],
  ['FR', 'natural/lib/natural/util/stopwords_fr', () => sw.fra],
  ['ID', 'natural/lib/natural/util/stopwords_id', () => sw.ind],
  ['JA', 'natural/lib/natural/util/stopwords_ja', () => sw.jpn],
  ['IT', 'natural/lib/natural/util/stopwords_it', () => sw.ita],
  ['NL', 'natural/lib/natural/util/stopwords_nl', () => sw.nld],
  ['NO', 'natural/lib/natural/util/stopwords_no', () => sw.nob],
  ['PL', 'natural/lib/natural/util/stopwords_pl', () => sw.pol],
  // Portuguese: natural's single list is compared against both stopword variants combined.
  ['PT', 'natural/lib/natural/util/stopwords_pt', () => [...sw.por, ...sw.porBr]],
  ['RU', 'natural/lib/natural/util/stopwords_ru', () => sw.rus],
  ['SV', 'natural/lib/natural/util/stopwords_sv', () => sw.swe],
  ['ZH', 'natural/lib/natural/util/stopwords_zh', () => sw.zho],
];

// For every language, print the words present in natural's list but absent
// from stopword's list.
for (const [label, naturalModule, getStopwordList] of comparisons) {
  const onlyInNatural = _.difference(
    require(naturalModule).words,
    getStopwordList()
  );

  console.log(label);
  console.log(JSON.stringify(onlyInNatural, null, 2));
}

Output:

❯ node test
EN
[
  "above",
  "again",
  "below",
  "cannot",
  "does",
  "doing",
  "during",
  "few",
  "further",
  "its",
  "itself",
  "myself",
  "ours",
  "ourselves",
  "own",
  "see",
  "so",
  "theirs",
  "themselves",
  "until",
  "when",
  "whom",
  "why",
  "yours",
  "yourself",
  "b",
  "c",
  "d",
  "e",
  "f",
  "g",
  "h",
  "j",
  "k",
  "l",
  "m",
  "n",
  "o",
  "p",
  "q",
  "r",
  "s",
  "t",
  "u",
  "v",
  "w",
  "x",
  "y",
  "z",
  "$",
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "0",
  "_"
]
ES
[
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "0",
  "_"
]
FA
[
  "یه",
  "؟",
  "!",
  "٪",
  ".",
  "،",
  "؛",
  ":",
  ";",
  ",",
  "۱",
  "۲",
  "۳",
  "۴",
  "۵",
  "۶",
  "۷",
  "۸",
  "۹",
  "۰"
]
FR
[]
ID
[
  "a",
  "arti",
  "b",
  "bagainamakah",
  "bahwasannya",
  "baik",
  "baiklah",
  "c",
  "d",
  "dua",
  "e",
  "enak",
  "f",
  "g",
  "h",
  "hadap",
  "hai",
  "halo",
  "hallo",
  "hari",
  "helo",
  "hello",
  "i",
  "ibu",
  "j",
  "k",
  "kadar",
  "kali",
  "kena",
  "kerja",
  "khusus",
  "l",
  "laku",
  "langsung",
  "lihat",
  "m",
  "maksud",
  "masuk",
  "mata",
  "mohon",
  "n",
  "nya",
  "nyata",
  "o",
  "orang",
  "p",
  "pak",
  "q",
  "r",
  "rupa",
  "s",
  "salam",
  "sangkut",
  "sekaranglah",
  "t",
  "tuju",
  "u",
  "v",
  "w",
  "x",
  "y",
  "ya",
  "z"
]
JA
[]
IT
[
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "0",
  "_"
]
NL
[
  "a",
  "b",
  "c",
  "d",
  "e",
  "f",
  "g",
  "h",
  "i",
  "j",
  "k",
  "l",
  "m",
  "n",
  "o",
  "p",
  "q",
  "r",
  "s",
  "t",
  "v",
  "w",
  "x",
  "y",
  "z",
  "$",
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "0",
  "_",
  "-"
]
NO
[
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "0",
  "_"
]
PL
[
  "$",
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "0",
  "_"
]
PT
[
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "0",
  "_"
]
RU
[
  "также",
  "другие",
  "являюсь",
  "иди",
  "могу",
  "подойди",
  "мог",
  "делал",
  "делаю",
  "каждый",
  "откуда",
  "иметь",
  "имел",
  "имеет",
  "её",
  "здесь",
  "оно",
  "делать",
  "может быть",
  "самый",
  "должен",
  "сейчас",
  "другая",
  "другое",
  "наше",
  "вне",
  "конец",
  "сказала",
  "также",
  "видел",
  "c",
  "немного",
  "все еще",
  "затем",
  "те",
  "очень",
  "путь",
  "хорошо",
  "который",
  "пока",
  "с кем",
  "хотел бы",
  "твои",
  "б",
  "г",
  "д",
  "е",
  "ё",
  "з",
  "й",
  "л",
  "м",
  "н",
  "o",
  "п",
  "р",
  "т",
  "ф",
  "х",
  "ц",
  "ч",
  "ш",
  "щ",
  "ъ",
  "ы",
  "ь",
  "э",
  "ю",
  "$",
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "0",
  "_"
]
SV
[
  "aderton",
  "adertonde",
  "adjö",
  "aldrig",
  "allas",
  "alltid",
  "alltså",
  "andra",
  "andras",
  "annan",
  "annat",
  "artonde",
  "artonn",
  "bakom",
  "behöva",
  "behövas",
  "behövde",
  "behövt",
  "beslut",
  "beslutat",
  "beslutit",
  "bland",
  "blivit",
  "bort",
  "borta",
  "bra",
  "bäst",
  "bättre",
  "båda",
  "bådas",
  "dag",
  "dagar",
  "dagarna",
  "dagen",
  "del",
  "delen",
  "dess",
  "dessa",
  "dit",
  "dock",
  "därför",
  "efter",
  "eftersom",
  "ej",
  "elfte",
  "elva",
  "enkel",
  "enkelt",
  "enkla",
  "enligt",
  "ettusen",
  "fanns",
  "femte",
  "femtio",
  "femtionde",
  "femton",
  "femtonde",
  "fick",
  "fin",
  "finnas",
  "finns",
  "fjorton",
  "fjortonde",
  "fjärde",
  "fler",
  "flera",
  "flesta",
  "fram",
  "framför",
  "fyrtio",
  "fyrtionde",
  "få",
  "fått",
  "följande",
  "före",
  "förlåt",
  "förra",
  "första",
  "genast",
  "genom",
  "gick",
  "gjorde",
  "gjort",
  "god",
  "goda",
  "godare",
  "godast",
  "gott",
  "gälla",
  "gäller",
  "gällt",
  "gärna",
  "gå",
  "går",
  "gått",
  "gör",
  "göra",
  "haft",
  "heller",
  "hellre",
  "helst",
  "helt",
  "hennes",
  "hit",
  "hundra",
  "hundraen",
  "hundraett",
  "hög",
  "höger",
  "högre",
  "högst",
  "ibland",
  "icke",
  "idag",
  "igår",
  "imorgon",
  "inför",
  "ingenting",
  "inget",
  "innan",
  "inne",
  "inom",
  "inuti",
  "jämfört",
  "kanske",
  "knappast",
  "komma",
  "kommer",
  "kommit",
  "kr",
  "kunde",
  "kunna",
  "kunnat",
  "kvar",
  "legat",
  "ligga",
  "ligger",
  "lika",
  "likställd",
  "likställda",
  "lilla",
  "lite",
  "liten",
  "litet",
  "länge",
  "längre",
  "längst",
  "lätt",
  "lättare",
  "lättast",
  "långsam",
  "långsammare",
  "långsammast",
  "långsamt",
  "långt",
  "mellan",
  "mer",
  "mera",
  "mest",
  "mindre",
  "minst",
  "mittemot",
  "mot",
  "mycket",
  "måste",
  "möjlig",
  "möjligen",
  "möjligt",
  "möjligtvis",
  "nederst",
  "nedersta",
  "nedre",
  "ner",
  "nionde",
  "nittio",
  "nittionde",
  "nitton",
  "nittonde",
  "nog",
  "noll",
  "nr",
  "nummer",
  "nästa",
  "någon",
  "någonting",
  "något",
  "några",
  "nödvändig",
  "nödvändiga",
  "nödvändigt",
  "nödvändigtvis",
  "ofta",
  "oftast",
  "olika",
  "olikt",
  "rakt",
  "redan",
  "rätt",
  "sade",
  "sagt",
  "samma",
  "sedan",
  "senare",
  "senast",
  "sent",
  "sextio",
  "sextionde",
  "sexton",
  "sextonde",
  "sina",
  "sist",
  "sista",
  "siste",
  "sitta",
  "sjunde",
  "sjuttio",
  "sjuttionde",
  "sjutton",
  "sjuttonde",
  "själv",
  "sjätte",
  "skall",
  "skulle",
  "slutligen",
  "små",
  "smått",
  "snart",
  "stor",
  "stora",
  "stort",
  "större",
  "störst",
  "säga",
  "säger",
  "sämre",
  "sämst",
  "sådan",
  "sådana",
  "sådant",
  "tack",
  "tidig",
  "tidigare",
  "tidigast",
  "tidigt",
  "tills",
  "tillsammans",
  "tionde",
  "tjugo",
  "tjugoen",
  "tjugoett",
  "tjugonde",
  "tjugotre",
  "tjugotvå",
  "tjungo",
  "tolfte",
  "tolv",
  "tre",
  "tredje",
  "trettio",
  "trettionde",
  "tretton",
  "trettonde",
  "tvåhundra",
  "under",
  "ursäkt",
  "utan",
  "utanför",
  "ute",
  "varifrån",
  "varit",
  "varje",
  "varken",
  "vars",
  "varsågod",
  "vems",
  "verkligen",
  "vid",
  "vidare",
  "viktig",
  "viktigare",
  "viktigast",
  "viktigt",
  "vilka",
  "vilkas",
  "vänster",
  "vänstra",
  "värre",
  "vår",
  "våra",
  "ännu",
  "även",
  "åtminstone",
  "åttio",
  "åttionde",
  "åttonde",
  "över",
  "övermorgon",
  "överst",
  "övre",
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "0"
]
ZH
[]

The text was updated successfully, but these errors were encountered:

titanism · 2022-06-12T08:24:26Z

And here's the script comparing in reverse (all the words in stopword that are not in natural):

const _ = require('lodash');
const sw = require('stopword');

/**
 * Prints, under the given label, the words that are in a stopword-package
 * list but missing from the matching natural list.
 *
 * @param {string} label - Heading printed before the JSON diff.
 * @param {() => string[]} readStopwordList - Returns the list from the stopword package.
 * @param {string} naturalModule - Path of the natural module exposing `.words`.
 */
function reportMissingFromNatural(label, readStopwordList, naturalModule) {
  // The stopword list is read first, then natural's module is loaded.
  const missing = _.difference(
    readStopwordList(),
    require(naturalModule).words
  );

  console.log(label);
  console.log(JSON.stringify(missing, null, 2));
}

[
  { label: 'EN', list: () => sw.eng, natural: 'natural/lib/natural/util/stopwords' },
  { label: 'ES', list: () => sw.spa, natural: 'natural/lib/natural/util/stopwords_es' },
  { label: 'FA', list: () => sw.fas, natural: 'natural/lib/natural/util/stopwords_fa' },
  { label: 'FR', list: () => sw.fra, natural: 'natural/lib/natural/util/stopwords_fr' },
  { label: 'ID', list: () => sw.ind, natural: 'natural/lib/natural/util/stopwords_id' },
  { label: 'JA', list: () => sw.jpn, natural: 'natural/lib/natural/util/stopwords_ja' },
  { label: 'IT', list: () => sw.ita, natural: 'natural/lib/natural/util/stopwords_it' },
  { label: 'NL', list: () => sw.nld, natural: 'natural/lib/natural/util/stopwords_nl' },
  { label: 'NO', list: () => sw.nob, natural: 'natural/lib/natural/util/stopwords_no' },
  { label: 'PL', list: () => sw.pol, natural: 'natural/lib/natural/util/stopwords_pl' },
  // Portuguese: both stopword variants combined are compared against natural's single list.
  { label: 'PT', list: () => [...sw.por, ...sw.porBr], natural: 'natural/lib/natural/util/stopwords_pt' },
  { label: 'RU', list: () => sw.rus, natural: 'natural/lib/natural/util/stopwords_ru' },
  { label: 'SV', list: () => sw.swe, natural: 'natural/lib/natural/util/stopwords_sv' },
  { label: 'ZH', list: () => sw.zho, natural: 'natural/lib/natural/util/stopwords_zh' },
].forEach(({ label, list, natural }) => {
  reportMissingFromNatural(label, list, natural);
});

Output:

❯ node test
EN
[]
ES
[]
FA
[
  "به",
  "اکنون",
  "اگر",
  "اگرچه",
  "الا",
  "اما",
  "اندر",
  "اینکه",
  "باری",
  "بالعکس",
  "بدون",
  "بر",
  "بلکه",
  "بنابراین",
  "بی",
  "پس",
  "تا",
  "جز",
  "چنانچه",
  "چه",
  "چون",
  "در",
  "را",
  "روی",
  "زیرا",
  "سپس",
  "غیر",
  "که",
  "لیکن",
  "مانند",
  "مثل",
  "مگر",
  "نه",
  "نیز",
  "هرچند",
  "هم",
  "همان",
  "وانگهی",
  "ولی",
  "ولو",
  "همانند",
  "همچو"
]
FR
[]
ID
[
  "rata",
  "tahun",
  "tengah",
  "tinggi",
  "umum",
  "waktu"
]
JA
[]
IT
[]
NL
[]
NO
[]
PL
[]
PT
[
  "adeus",
  "agora",
  "aí",
  "ainda",
  "além",
  "algo",
  "alguém",
  "algum",
  "alguma",
  "algumas",
  "alguns",
  "ali",
  "ampla",
  "amplas",
  "amplo",
  "amplos",
  "ano",
  "anos",
  "ante",
  "antes",
  "apenas",
  "apoio",
  "após",
  "aqui",
  "área",
  "assim",
  "atrás",
  "através",
  "baixo",
  "bastante",
  "bem",
  "boa",
  "boas",
  "bom",
  "bons",
  "breve",
  "cá",
  "cada",
  "catorze",
  "cedo",
  "cento",
  "certamente",
  "certeza",
  "cima",
  "cinco",
  "coisa",
  "coisas",
  "conselho",
  "contra",
  "contudo",
  "custa",
  "dá",
  "dão",
  "daquela",
  "daquelas",
  "daquele",
  "daqueles",
  "dar",
  "debaixo",
  "demais",
  "dentro",
  "desde",
  "dessa",
  "dessas",
  "desse",
  "desses",
  "desta",
  "destas",
  "deste",
  "destes",
  "deve",
  "devem",
  "devendo",
  "dever",
  "deverá",
  "deverão",
  "deveria",
  "deveriam",
  "devia",
  "deviam",
  "dez",
  "dezanove",
  "dezasseis",
  "dezassete",
  "dezoito",
  "dia",
  "diante",
  "disse",
  "disso",
  "disto",
  "dito",
  "diz",
  "dizem",
  "dizer",
  "dois",
  "doze",
  "duas",
  "dúvida",
  "é",
  "embora",
  "enquanto",
  "era",
  "eram",
  "éramos",
  "és",
  "está",
  "estamos",
  "estão",
  "estar",
  "estás",
  "estava",
  "estavam",
  "estávamos",
  "esteja",
  "estejam",
  "estejamos",
  "esteve",
  "estive",
  "estivemos",
  "estiver",
  "estivera",
  "estiveram",
  "estivéramos",
  "estiverem",
  "estivermos",
  "estivesse",
  "estivessem",
  "estivéssemos",
  "estiveste",
  "estivestes",
  "estou",
  "etc",
  "exemplo",
  "faço",
  "falta",
  "favor",
  "faz",
  "fazeis",
  "fazem",
  "fazemos",
  "fazendo",
  "fazer",
  "fazes",
  "feita",
  "feitas",
  "feito",
  "feitos",
  "fez",
  "fim",
  "final",
  "foi",
  "fomos",
  "for",
  "fora",
  "foram",
  "fôramos",
  "forem",
  "forma",
  "formos",
  "fosse",
  "fossem",
  "fôssemos",
  "foste",
  "fostes",
  "fui",
  "geral",
  "grande",
  "grandes",
  "grupo",
  "há",
  "haja",
  "hajam",
  "hajamos",
  "hão",
  "havemos",
  "havia",
  "hei",
  "hoje",
  "hora",
  "horas",
  "houve",
  "houvemos",
  "houver",
  "houvera",
  "houverá",
  "houveram",
  "houvéramos",
  "houverão",
  "houverei",
  "houverem",
  "houveremos",
  "houveria",
  "houveriam",
  "houveríamos",
  "houvermos",
  "houvesse",
  "houvessem",
  "houvéssemos",
  "la",
  "lá",
  "lado",
  "lo",
  "local",
  "logo",
  "longe",
  "lugar",
  "maior",
  "maioria",
  "mal",
  "máximo",
  "meio",
  "menor",
  "menos",
  "mês",
  "meses",
  "mesma",
  "mesmas",
  "mesmos",
  "mil",
  "momento",
  "muita",
  "muitas",
  "nada",
  "naquela",
  "naquelas",
  "naquele",
  "naqueles",
  "nenhum",
  "nenhuma",
  "nessa",
  "nessas",
  "nesse",
  "nesses",
  "nesta",
  "nestas",
  "neste",
  "nestes",
  "ninguém",
  "nível",
  "noite",
  "nome",
  "nova",
  "novas",
  "nove",
  "novo",
  "novos",
  "número",
  "nunca",
  "obra",
  "obrigada",
  "obrigado",
  "oitava",
  "oitavo",
  "oito",
  "onde",
  "ontem",
  "onze",
  "outra",
  "outras",
  "outro",
  "outros",
  "parece",
  "parte",
  "partir",
  "paucas",
  "pequena",
  "pequenas",
  "pequeno",
  "pequenos",
  "per",
  "perante",
  "perto",
  "pode",
  "pude",
  "pôde",
  "podem",
  "podendo",
  "poder",
  "poderia",
  "poderiam",
  "podia",
  "podiam",
  "põe",
  "põem",
  "pois",
  "ponto",
  "pontos",
  "porém",
  "porque",
  "porquê",
  "posição",
  "possível",
  "possivelmente",
  "posso",
  "pouca",
  "poucas",
  "pouco",
  "poucos",
  "primeira",
  "primeiras",
  "primeiro",
  "primeiros",
  "própria",
  "próprias",
  "próprio",
  "próprios",
  "próxima",
  "próximas",
  "próximo",
  "próximos",
  "pude",
  "puderam",
  "quáis",
  "quanto",
  "quantos",
  "quarta",
  "quarto",
  "quatro",
  "quê",
  "quer",
  "quereis",
  "querem",
  "queremas",
  "queres",
  "quero",
  "questão",
  "quinta",
  "quinto",
  "quinze",
  "relação",
  "sabe",
  "sabem",
  "são",
  "segunda",
  "segundo",
  "sei",
  "seis",
  "seja",
  "sejam",
  "sejamos",
  "sempre",
  "sendo",
  "ser",
  "será",
  "serão",
  "serei",
  "seremos",
  "seria",
  "seriam",
  "seríamos",
  "sete",
  "sétima",
  "sétimo",
  "sexta",
  "sexto",
  "si",
  "sido",
  "sim",
  "sistema",
  "sob",
  "sobre",
  "sois",
  "somos",
  "sou",
  "tal",
  "talvez",
  "tampouco",
  "tanta",
  "tantas",
  "tanto",
  "tão",
  "tarde",
  "tem",
  "tém",
  "têm",
  "temos",
  "tendes",
  "tendo",
  "tenha",
  "tenham",
  "tenhamos",
  "tenho",
  "tens",
  "ter",
  "terá",
  "terão",
  "terceira",
  "terceiro",
  "terei",
  "teremos",
  "teria",
  "teriam",
  "teríamos",
  "teve",
  "ti",
  "tido",
  "tinha",
  "tinham",
  "tínhamos",
  "tive",
  "tivemos",
  "tiver",
  "tivera",
  "tiveram",
  "tivéramos",
  "tiverem",
  "tivermos",
  "tivesse",
  "tivessem",
  "tivéssemos",
  "tiveste",
  "tivestes",
  "toda",
  "todas",
  "todavia",
  "todo",
  "todos",
  "trabalho",
  "três",
  "treze",
  "tudo",
  "última",
  "últimas",
  "último",
  "últimos",
  "uns",
  "vai",
  "vais",
  "vão",
  "vários",
  "vem",
  "vêm",
  "vendo",
  "vens",
  "ver",
  "vez",
  "vezes",
  "viagem",
  "vindo",
  "vinte",
  "vir",
  "vós",
  "vossa",
  "vossas",
  "zero"
]
RU
[
  "не",
  "со",
  "то",
  "она",
  "да",
  "же",
  "вы",
  "бы",
  "по",
  "ее",
  "мне",
  "было",
  "вот",
  "меня",
  "еще",
  "нет",
  "ему",
  "теперь",
  "когда",
  "даже",
  "ну",
  "ли",
  "уже",
  "ни",
  "него",
  "вас",
  "нибудь",
  "уж",
  "вам",
  "ведь",
  "потом",
  "себя",
  "ничего",
  "ей",
  "может",
  "тут",
  "есть",
  "надо",
  "ней",
  "тебя",
  "чем",
  "была",
  "сам",
  "чтоб",
  "без",
  "будто",
  "чего",
  "раз",
  "себе",
  "будет",
  "тогда",
  "того",
  "этого",
  "какой",
  "совсем",
  "ним",
  "этом",
  "почти",
  "тем",
  "чтобы",
  "нее",
  "были",
  "куда",
  "всех",
  "сегодня",
  "можно",
  "при",
  "об",
  "хоть",
  "больше",
  "эти",
  "нас",
  "про",
  "всего",
  "них",
  "какая",
  "разве",
  "эту",
  "моя",
  "свою",
  "этой",
  "перед",
  "иногда",
  "лучше",
  "чуть",
  "том",
  "нельзя",
  "такой",
  "им",
  "всегда",
  "конечно",
  "всю",
  "это",
  "лишь"
]
SV
[]
ZH
[]

titanism · 2022-06-12T08:28:37Z

I noticed these differences because the word "hello" is in Natural's "id" stopwords list but not in the stopword.ind list.

❯ node
Welcome to Node.js v16.15.1.
Type ".help" for more information.
> require('stopword').ind.indexOf('hello')
-1
> require('natural/lib/natural/util/stopwords_id').words.indexOf('hello')
267

Hugo-ter-Doest · 2022-07-10T22:03:41Z

Thanks for your comparison. I think it is a matter of taste and application which words are considered stopwords. For now I will leave it as it is. Maybe in the future we will reuse the stopword package.

titanism mentioned this issue Jun 12, 2022

Major stopword differences with "natural" package (inaccurate data?) fergiemcdowall/stopword#286

Closed

titanism changed the title ~~Major stemword differences with "stopword" package (inaccurate data?)~~ Major stopword differences with "stopword" package (inaccurate data?) Jun 12, 2022

Hugo-ter-Doest closed this as completed Jul 10, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Major stopword differences with "stopword" package (inaccurate data?) #651

Major stopword differences with "stopword" package (inaccurate data?) #651

titanism commented Jun 12, 2022

titanism commented Jun 12, 2022

titanism commented Jun 12, 2022

Hugo-ter-Doest commented Jul 10, 2022

Major stopword differences with "stopword" package (inaccurate data?) #651

Major stopword differences with "stopword" package (inaccurate data?) #651

Comments

titanism commented Jun 12, 2022

titanism commented Jun 12, 2022

titanism commented Jun 12, 2022

Hugo-ter-Doest commented Jul 10, 2022