In [None]:
!pip install -q py_vncorenlp

import os
import py_vncorenlp
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re

In [None]:
# Single location used for both the VnCoreNLP download and the loader below.
VNCORENLP_DIR = '/kaggle/working/vncorenlp'

# Ensure the directory exists before downloading
os.makedirs(VNCORENLP_DIR, exist_ok=True)

# Download VnCoreNLP model
py_vncorenlp.download_model(save_dir=VNCORENLP_DIR)

# Load the segmenter with only the word-segmentation annotator ("wseg")
rdrsegmenter = py_vncorenlp.VnCoreNLP(
    annotators=["wseg"],
    save_dir=VNCORENLP_DIR,
)

# Smoke-test word segmentation on a real Vietnamese sentence.
# The literal was previously stored as mojibake (UTF-8 bytes shown in a
# legacy single-byte encoding), so the segmenter never saw Vietnamese text.
text = "Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây."
output = rdrsegmenter.word_segment(text)

print(output)

In [None]:
# Category names listed in class-index order: a name's position in this
# tuple is the integer id used in both lookup tables below.
_LABEL_NAMES = (
    "gambling", "movies", "ecommerce", "government", "education",
    "technology", "tourism", "health", "finance", "media",
    "nonprofit", "realestate", "services", "industries", "agriculture",
)

# name -> id and id -> name lookup tables
label2id = {name: index for index, name in enumerate(_LABEL_NAMES)}
id2label = dict(enumerate(_LABEL_NAMES))

# NOTE(review): "insert-path" looks like a placeholder for the fine-tuned
# checkpoint location -- replace it before running this cell.
model_path = "insert-path"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and classifier are both loaded from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move the weights to the chosen device and switch to evaluation mode.
model.to(device)
model.eval()

In [None]:
# A '.' or '-' survives only when it sits directly between two word
# characters ("site.com", "v6.7.5", "3.14", "e-mail"); any other occurrence
# is matched here and removed. Both characters are handled in one pass so the
# decision is always made against the untouched neighbours.
_STRAY_DOT_OR_HYPHEN_RE = re.compile(r'(?<!\w)[.-]|[.-](?!\w)')
_UNDERSCORE_RUN_RE = re.compile(r'_+')
# [^\W_] is "any Unicode letter or digit"; [^\W\d_] is "any Unicode letter".
_COMMA_ALNUM_TO_LETTER_RE = re.compile(r'(?<=[^\W_]),(?=[^\W\d_])')
_COMMA_LETTER_TO_DIGIT_RE = re.compile(r'(?<=[^\W\d_]),(?=\d)')
_COMMA_NEXT_TO_NON_DIGIT_RE = re.compile(r',(?=\D)|(?<=\D),')
# Whitelist: word chars, whitespace, and . , / % " - (the hyphen must be
# listed, otherwise the in-word hyphens kept above are stripped again).
_DISALLOWED_CHAR_RE = re.compile(r'[^\w\s.,/%"-]')
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize raw text before word segmentation and tokenization.

    Steps, in order:

    1. Drop every '.' and '-' that is not directly between two word
       characters, so domain dots, version/decimal dots and in-word hyphens
       are kept while sentence-final dots and free-standing dashes are not.
    2. Replace each run of underscores with a single space.
    3. Turn a comma that glues two tokens together (letter/digit followed by
       a letter, or letter followed by a digit) into a space; delete any
       other comma that touches a non-digit. Commas between two digits
       ("1,000") are kept.
    4. Remove every character that is not a word character, whitespace, or
       one of ``. , / % " -``.
    5. Collapse whitespace runs and strip both ends.

    Differences from the previous revision:

    * In-word hyphens are now actually preserved. They used to be protected
      in step 1 and then deleted by the step-4 whitelist, which merged words
      ("mạng-Sòng" -> "mạngSòng").
    * The letter tests in step 3 accept any Unicode letter instead of only
      ASCII lowercase, so "A,B" and "Hà,Nội" become "A B" and "Hà Nội"
      instead of having their halves glued together.
    * No ``<DOMAIN>``/``<HYPHEN>`` placeholder strings are used any more, so
      input that happens to contain those markers is no longer rewritten.

    NOTE(review): if the classifier was trained on text produced by the old
    cleaning (hyphens stripped), confirm that train/inference preprocessing
    still match.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = _STRAY_DOT_OR_HYPHEN_RE.sub('', text)
    text = _UNDERSCORE_RUN_RE.sub(' ', text)

    # Order matters: separating commas become spaces first, then whatever
    # commas remain next to a non-digit are deleted outright.
    text = _COMMA_ALNUM_TO_LETTER_RE.sub(' ', text)
    text = _COMMA_LETTER_TO_DIGIT_RE.sub(' ', text)
    text = _COMMA_NEXT_TO_NON_DIGIT_RE.sub('', text)

    text = _DISALLOWED_CHAR_RE.sub('', text)
    return _WHITESPACE_RUN_RE.sub(' ', text).strip()

In [None]:
def predict_batch(text_list, batch_size=32, max_length=64):
    """Classify raw texts and return their labels and class probabilities.

    Each text is cleaned with ``clean_text``, word-segmented with
    ``rdrsegmenter`` (segmented sentences are joined with single spaces),
    tokenized, and scored by ``model``. Inputs are processed in chunks so
    that memory use is bounded by ``batch_size`` rather than by the length
    of ``text_list``.

    Args:
        text_list: Iterable of raw input strings.
        batch_size: Maximum number of texts sent through the model in one
            forward pass. Must be >= 1.
        max_length: Token length every sequence is truncated/padded to.
            The default (64) matches the previously hard-coded value.

    Returns:
        A tuple ``(predicted_labels, probabilities)``:
        ``predicted_labels`` is a list with one label name (looked up in
        ``id2label``) per input text; ``probabilities`` is a NumPy array of
        softmax scores with one row per input text. For empty input the
        result is ``([], array of shape (0, len(id2label)))`` and neither
        the tokenizer nor the model is called.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Clean + segment every text up front.
    processed_texts = [
        ' '.join(rdrsegmenter.word_segment(clean_text(text)))
        for text in text_list
    ]

    # Nothing to score: return empty results instead of handing an empty
    # batch to the tokenizer/model.
    if not processed_texts:
        return [], torch.empty((0, len(id2label))).numpy()

    probability_chunks = []
    for start in range(0, len(processed_texts), batch_size):
        chunk = processed_texts[start:start + batch_size]

        # Padding to a fixed max_length keeps every chunk the same width.
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Inference only -- no gradients needed.
        with torch.no_grad():
            logits = model(**inputs).logits

        # Move each chunk's scores to the CPU right away so accumulated
        # results do not stay on the device.
        probability_chunks.append(torch.softmax(logits, dim=-1).cpu())

    probabilities = torch.cat(probability_chunks, dim=0)
    predicted_class_ids = torch.argmax(probabilities, dim=-1).tolist()
    predicted_labels = [id2label[idx] for idx in predicted_class_ids]

    return predicted_labels, probabilities.numpy()

In [None]:
# Demo inputs for the classifier (search-result style titles and snippets).
# The literals were previously stored as mojibake (UTF-8 bytes shown in a
# legacy single-byte encoding); they are decoded back to real Vietnamese here
# so the segmenter and model receive meaningful text. Commented-out entries
# are alternative samples that can be re-enabled.
texts = [
    # "Onb🤑Huawei Store51.627.392",
    # "Onb🤑Huawei Store51.627.392 11 thg 7, 2025 — Onb   đặt chất lượng lên hàng đầu với tiêu chí: không gian chơi phải công bằng, thưởng phải đều, trò chơi phải đa dạng.",
    # "tặng code nhận kim cương❤️Đáng tin cậy v6.7.5 11 thg 7, 2025 — \ufe0fTrong phân khúc xe số thể thao phổ thông, Yamaha Exciter 155 VVA nổi bật với động cơ 155cc tích hợp công nghệ van biến thiên VVA, cho công suất ...",
    # "Công ty KO66💰Điện tử trực tuyến",
    # "chung cư dương nội webtretho 🧟 🇲🇽ba sòng bạc hàng ...",
    # "chung cư dương nội webtretho 🧟 🇲🇽ba sòng bạc hàng ... chung cư dương nội webtretho -Thông thường, các nhà thiết kế RFIC sử dụng các công cụ chuyên dụng để thực hiện xác minh tuần tự, đơn miền, ...",
    # "quán nhậu tá lả phạm văn đồng ua vé số trực tuyến",
    # "quán nhậu tá lả phạm văn đồng ua vé số trực tuyến quán nhậu tá lả phạm văn đồng -Thứ trưởng Nguyễn Bá Hoan cho biết, tháng 3 vừa qua, Bộ LĐ-TB-XH đã phối hợp cùng phía Nhật Bản tổ chức thành công kỳ thi ...",
    # "nạp lần đầu nhận gấp đôi-Điện tử trực tuyến",
    # "nạp lần đầu nhận gấp đôi-Điện tử trực tuyến 3 ngày trước — nạp lần đầu nhận gấp đôi   xây dựng uy tín từ chính sự công bằng, minh bạch và hệ thống phần thưởng tự động mỗi ngày.",
    # "tặng code nhận kim cương❤️Đáng tin cậy v6.7.5",
    # "tặng code nhận kim cương❤️Đáng tin cậy v6.7.5 11 thg 7, 2025 — \ufe0fTrong phân khúc xe số thể thao phổ thông, Yamaha Exciter 155 VVA nổi bật với động cơ 155cc tích hợp công nghệ van biến thiên VVA, cho công suất ...",
    # "KUWiN2 Com💷Huawei Store24.123.745",
    # "KUWiN2 Com💷Huawei Store24.123.745 4 thg 7, 2025 — Tại KUWiN2 Com  , mọi người đều có cơ hội như nhau. Hệ thống kiểm tra độc lập, thưởng ngày ổn định và kho game đa dạng khiến nơi đây trở ...",
    # "Nhà🤑Huawei Store51.627.392",
    # "Nhà🤑Huawei Store51.627.392 2 ngày trước — Nhà  Bạn đang tìm kiếm một nơi chơi game minh bạch, công bằng và thưởng thật mỗi ngày? Nhà   chính là sự lựa chọn được tin tưởng bởi cộng ...",
    # "sẽ",
    # "sẽ Cung cấp các dịch vụ và sản phẩm chất lượng của sẽ. Tận hưởng chất lượng và sự hài lòng từ sẽ.\ufe0f",
    # "Khuyến mãi cực hot-Aptoide436.818.3",
    # "thôi rồi ta đã xa nhau",
    # "tặng code nhận thẻ nạp🤑App Store7.37.371",
    # "tặng code nhận kim cương❤️Đáng tin cậy v6.7.5",
    # "Xóc dĩa xanh chín-Điện tử trực tuyến",
    # "share bài viết nhận code-Huawei Store51.627.392",
    # "Tỉ lệ cược💷Huawei Store24.123.745",
    # "KUWiN2 Com💷Huawei Store24.123.745",
    "Sơ đồ website - UBND quận Cái Răng - Cần Thơ",
    "Sơ đồ website - UBND quận Cái Răng - Cần Thơ Thông tin tuyên truyền · Giới thiệu Lễ hội bánh dân gian nam bộ lần thứ XI năm 2024 · Thư kêu gọi hưởng ứng “Chiến dịch tuyên truyền, đấu tranh phòng,\u00a0...",
    "Đánh bạc qua mạng-Sòng bạc thông thường của Việt Nam",
    "Đánh bạc qua mạng-Sòng bạc thông thường của Việt Nam 7 ngày trước — Đánh bạc qua mạng  đem đến cho bạn trải nghiệm cá cược công bằng, được kiểm chứng minh bạch, cùng các trò chơi đa dạng từ kinh điển đến mới\u00a0...",
    "nhận hoàn trả mỗi ngày-Uy Tín và An Toàn",
    "nhận hoàn trả mỗi ngày-Uy Tín và An Toàn 10 thg 8, 2025 — \ufe0fSáng 9/8, Thủ tướng Phạm Minh Chính đến kiểm tra, đôn đốc tiến độ thi công dự án Đường dây 500kV Lào Cai - Vĩnh Yên trên địa bàn tỉnh Phú Thọ.",
    "Thủ môn🔴Đảm bảo đánh giá v7.7.1",
    "3 cây tặng xu miễn phí",
    "tri ân định kỳ🔴Thương hiệu uy tín v7.3.9",
    "Cầu thủ bóng chuyền-Đăng ký sẽ tặng bạn 777K",
    "Sảnh người thật-Thương hiệu uy tín v7.3.9",
]

# Classify the demo samples and print each one next to its predicted label.
# The per-class probabilities are kept in batch_probs but are not printed.
batch_labels, batch_probs = predict_batch(texts)
for sample, label in zip(texts, batch_labels):
    print(f"Text: {sample}\nPredicted: {label}\n")