Skip to content

Commit

Permalink
Add multilingual graph topic modeling, closes #511
Browse files (browse the repository at this point in the history)
  • Loading branch information
davidmezzetti committed Jul 29, 2023
1 parent febf636 commit 26634f8
Showing 1 changed file with 2 additions and 5 deletions.
7 changes: 2 additions & 5 deletions src/python/txtai/graph/topics.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,8 +2,6 @@
Topics module
"""

-import re

from ..pipeline import Tokenizer
from ..scoring import ScoringFactory

Expand All @@ -22,8 +20,7 @@ def __init__(self, config):
"""

self.config = config
-self.pattern = re.compile(r"(?u)\b\w\w+\b")
-self.tokenizer = Tokenizer()
+self.tokenizer = Tokenizer(stopwords=True)

# Additional stopwords to ignore when building topic names
self.stopwords = set()
Expand Down Expand Up @@ -109,7 +106,7 @@ def tokenize(self, graph, node):
"""

text = graph.attribute(node, "text")
-return self.pattern.findall(text.lower()) if text else []
+return self.tokenizer(text) if text else []

def topn(self, terms, n):
"""
Expand Down

0 comments on commit 26634f8

Please sign in to comment.