In [2]:
import re

# -------------------------------
# Affix lists (prefixes and suffixes)
# -------------------------------
# Candidate prefixes that remove_affixes() may strip from the start of a word.
# remove_affixes() re-sorts this list longest-first on every call, so the
# order here does not decide between entries of different length.
# NOTE(review): these literals look like mis-decoded UTF-8 Arabic (several
# entries render identically) — confirm the file encoding before relying on them.
prefixes = [
    "ุงู", "ูุงู", "ุจุงู", "ูุงู", "ูุงู", "ูู", "ู", "ู", "ุจ", "ู", "ู", "ุณ"
]

# Candidate suffixes that remove_affixes() may strip from the end of a word.
# remove_affixes() re-sorts this list longest-first on every call.
# NOTE(review): the same literal appears twice (12th and last entry); the
# duplicate looks redundant — confirm and drop one.
# NOTE(review): these literals look like mis-decoded UTF-8 Arabic (several
# entries render identically) — confirm the file encoding before relying on them.
suffixes = [
    "ู", "ูุง", "ู", "ู", "ูู", "ูุง", "ููุง", "ูู", "ูู", "ูู", "ูู", "ุงุช", "ุงู", "ุชุงู", "ุชูู", "ุฉ", "ุงุช"
]

# -------------------------------
# Word clean-up: strip diacritics and non-Arabic characters
# -------------------------------
def normalize_arabic(word: str) -> str:
    """Return *word* with Arabic diacritics and non-Arabic characters removed.

    Two passes are applied in order:

    1. Delete the combining marks U+064B..U+0652 (tanween, short vowels,
       shadda, sukun) and the tatweel/kashida U+0640.
    2. Delete every character outside the Arabic block U+0600..U+06FF
       (Latin letters, ASCII digits, ASCII punctuation, whitespace, ...).

    Args:
        word: The raw token to clean.

    Returns:
        The cleaned token; an empty string when nothing Arabic remains.
    """
    # The marks are spelled as code-point escapes rather than literal
    # characters: combining marks are invisible in most editors and are the
    # first thing to be destroyed when the file is re-encoded.
    word = re.sub(r'[\u064B-\u0652\u0640]', '', word)
    # Characters inside U+0600..U+06FF are kept as-is, which includes Arabic
    # punctuation and Arabic-Indic digits.
    word = re.sub(r'[^\u0600-\u06FF]', '', word)
    return word

# -------------------------------
# Affix stripping
# -------------------------------
def remove_affixes(word):
    """Strip at most one prefix and then at most one suffix from *word*.

    Candidates come from the module-level ``prefixes`` and ``suffixes`` lists
    and are tried longest first.  An affix is removed only when at least three
    characters would remain afterwards; the first candidate that qualifies is
    removed and the remaining candidates are ignored.  The suffix step runs on
    the word as left by the prefix step.
    """
    longest_first = {"key": len, "reverse": True}

    # First prefix (longest first) that matches and leaves >= 3 characters.
    leading = next(
        (
            candidate
            for candidate in sorted(prefixes, **longest_first)
            if len(word) >= len(candidate) + 3 and word.startswith(candidate)
        ),
        None,
    )
    if leading is not None:
        word = word[len(leading):]

    # Same rule for the suffix, evaluated on the already-shortened word.
    trailing = next(
        (
            candidate
            for candidate in sorted(suffixes, **longest_first)
            if len(word) >= len(candidate) + 3 and word.endswith(candidate)
        ),
        None,
    )
    if trailing is not None:
        word = word[:-len(trailing)]

    return word

# -------------------------------
# Common morphological patterns
# -------------------------------
# Templates tried in order by extract_root(); the first template that yields
# an acceptable root wins, so the order of this list matters.
# extract_root() turns the three placeholder letters of each template into
# capture groups and treats every other letter as a literal.
# NOTE(review): these literals look like mis-decoded UTF-8 Arabic (several
# entries render identically) — confirm the file encoding before relying on them.
patterns = [
    "ููุนูู", "ูุงุนู", "ูุนูู", "ูุนููุฉ", "ุงูุชุนู", "ุงุณุชูุนู", "ุชูุงุนู", "ูุนุงูุฉ",
    "ูุนูู", "ูุนูู", "ุชูุนูู", "ุงููุนู", "ุงูุนู", "ููุนู", "ููุนู", "ูุนูู", "ูุนูุฉ"
]

# -------------------------------
# Root extraction by matching against morphological patterns
# -------------------------------
# The three placeholder letters of a template (feh, ain, lam) each become an
# optional single-character capture group.  They are written as code-point
# escapes so they stay distinguishable even if the file is re-encoded.
_ROOT_PLACEHOLDERS = {
    ord("\u0641"): "(.?)",  # feh
    ord("\u0639"): "(.?)",  # ain
    ord("\u0644"): "(.?)",  # lam
}

# Anything that is not one of the accepted root letters: hamza, the alef
# forms, beh..ghain (including teh marbuta), feh..waw and yeh.  Hamza on
# waw/yeh (U+0624, U+0626) and alef maksura (U+0649) are not accepted.
_NON_ROOT_LETTER = re.compile(
    "[^\u0621-\u0623\u0625\u0627-\u063A\u0641-\u0648\u064A]"
)


def extract_root(word, pattern_list=None):
    """Return the root of *word* according to the first template that fits.

    Each template is converted to a regular expression by replacing its
    placeholder letters with capture groups; all other template letters are
    matched literally.  The expression is anchored at the start of *word*
    only, so a template may match a leading part of the word.

    Args:
        word: The token to analyse (normally already normalised and stripped
            of affixes).
        pattern_list: Templates to try, in order.  Defaults to the
            module-level ``patterns`` list.

    Returns:
        The concatenated captured letters of the first template whose capture,
        after discarding non-root characters, is 3 or 4 letters long;
        otherwise *word* unchanged.
    """
    if pattern_list is None:
        pattern_list = patterns

    for pattern in pattern_list:
        # One pass over the template; equivalent to chaining three
        # str.replace calls because the replacement text contains none of
        # the placeholder letters.
        regex = pattern.translate(_ROOT_PLACEHOLDERS)
        match = re.match(regex, word)
        if match:
            root = _NON_ROOT_LETTER.sub("", "".join(match.groups()))
            # Groups are optional, so a match may capture fewer than three
            # letters; such partial captures are rejected here.
            if 3 <= len(root) <= 4:
                return root
    return word

# -------------------------------
# Main entry point: Khoja Stemmer
# -------------------------------
def khoja_stem(word):
    """Stem *word* by running the three stages of this module in sequence.

    The token is first cleaned by ``normalize_arabic``, then passed through
    ``remove_affixes``, and the result is handed to ``extract_root``, whose
    return value is returned unchanged.
    """
    cleaned = normalize_arabic(word)
    stripped = remove_affixes(cleaned)
    return extract_root(stripped)

# -------------------------------
# Demo: run the stemmer on an Arabic paragraph
# -------------------------------
# NOTE(review): the string literals below look like mis-decoded UTF-8
# (Arabic text, an emoji bullet and an arrow) — confirm the file encoding.
# They are left untouched here because they are runtime data.
text = """
ุงููุฏุงุฑุณ ููุชุจูู ุงููุนููุฉ ููุชุจุฉ ุงููุฑุงุกุฉ ูุงูููุชูุจุงุช ูุชุนูู ุงูุทูุงุจ ุงููุชุงุจุฉ ุงูุตุญูุญุฉ.
"""

# Whitespace-split tokens, keeping only those longer than two characters.
# The length test is applied to the raw token, before any normalisation, so
# attached punctuation counts towards the length.
words = [w for w in text.split() if len(w) > 2]

print("๐น Khoja Stemmer Results:\n")
for w in words:
    # Left-align the original token in a 15-character column next to its stem.
    print(f"{w:<15} โ {khoja_stem(w)}")


🔹 Khoja Stemmer Results:

المدارس         → دار
يكتبون          → كتب
المعلمة         → علم
مكتبة           → كتب
القراءة         → قراء
والمكتوبات      → كتب
يتعلم           → تعلم
الطلاب          → طلاب
الكتابة         → كتاب
الصحيحة.        → صحيح
