In [None]:
!pip install -q py_vncorenlp

import py_vncorenlp
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt 
import re
import pandas as pd
import warnings
import os
import random

# Hide one specific, known UserWarning (matched by its message text) so it
# does not flood the training logs.
warnings.filterwarnings(
    action="ignore",
    message="Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.",
    category=UserWarning,
)


def set_global_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    then puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_global_seed(42)

In [None]:
# Class names listed in id order: the position of a name is its integer label id.
_label_names = [
    "gambling",
    "movies",
    "ecommerce",
    "government",
    "education",
    "technology",
    "tourism",
    "health",
    "finance",
    "media",
    "nonprofit",
    "realestate",
    "services",
    "industries",
    "agriculture",
]

# Forward (name -> id) and reverse (id -> name) lookups used by the model config
# and when turning predictions back into readable labels.
label2id = {name: idx for idx, name in enumerate(_label_names)}
id2label = dict(enumerate(_label_names))
num_labels = len(label2id)

model_name = "vinai/phobert-base-v2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model sized for `num_labels` classes, with the label
# maps stored in its config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)
print(f"Model '{model_name}' and tokenizer loaded successfully with {num_labels} labels.")

In [None]:
# One location for the VnCoreNLP files: created if missing, used as the
# download target, then used to load the annotator.
vncorenlp_dir = '/kaggle/working/vncorenlp'
os.makedirs(vncorenlp_dir, exist_ok=True)

# Download VnCoreNLP model
py_vncorenlp.download_model(save_dir=vncorenlp_dir)

# Load the word segmenter only (annotator "wseg").
# NOTE(review): py_vncorenlp may change the process working directory when the
# annotator is loaded - confirm, because later cells use relative paths
# ("./results", "./fine_tuned_phobert", "misclassified_samples.csv").
rdrsegmenter = py_vncorenlp.VnCoreNLP(
    save_dir=vncorenlp_dir,
    annotators=["wseg"],
)

# Smoke test: segment one sample sentence and show the result.
text = "√îng Nguy·ªÖn Kh·∫Øc Ch√∫c ƒëang l√†m vi·ªác t·∫°i ƒê·∫°i h·ªçc Qu·ªëc gia H√† N·ªôi. B√† Lan, v·ª£ √¥ng Ch√∫c, c≈©ng l√†m vi·ªác t·∫°i ƒë√¢y."
output = rdrsegmenter.word_segment(text)

print(output)

In [None]:
# Load the CSV and keep its single 'train' split.
dataset = load_dataset('csv', data_files='insert-path')['train']
print('dataset loaded!')


def encode_labels(examples):
    """Replace the string labels of a batch with their integer ids.

    Args:
        examples: Batch dict whose 'label' entry is a list of label names.

    Returns:
        The same batch dict with 'label' rewritten as ids looked up in
        ``label2id``. A label missing from ``label2id`` raises ``KeyError``.
    """
    examples['label'] = list(map(label2id.__getitem__, examples['label']))
    return examples


dataset = dataset.map(encode_labels, batched=True)
print("Labels encoded to numerical IDs.")

# Filled as a side effect of `preprocess_function`: one entry per example with
# the number of non-padding tokens (counted after truncation to 64).
token_lengths = []


def preprocess_function(examples):
    """Word-segment and tokenize a batch of texts.

    Args:
        examples: Batch dict whose 'text' entry is a list of raw strings.

    Returns:
        The tokenizer output for the batch, truncated and padded to 64 tokens.

    Side effects:
        Appends the non-padding token count of every example to the
        module-level ``token_lengths`` list.
    """
    # PhoBERT input is word-segmented first; segments are joined with spaces.
    segmented_texts = []
    for raw_text in examples['text']:
        segmented_texts.append(' '.join(rdrsegmenter.word_segment(raw_text)))

    tokenized_inputs = tokenizer(
        segmented_texts,
        max_length=64,
        padding='max_length',
        truncation=True,
    )

    # Record how many tokens of each row are real (not the pad token).
    pad_id = tokenizer.pad_token_id
    token_lengths.extend(
        sum(1 for token_id in input_ids if token_id != pad_id)
        for input_ids in tokenized_inputs['input_ids']
    )

    return tokenized_inputs


# The Trainer expects the target column to be called 'labels'; the raw text
# column is dropped once it has been tokenized.
tokenized_dataset = (
    dataset.map(preprocess_function, batched=True, load_from_cache_file=False)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_dataset.set_format("torch")

features = tokenized_dataset
labels = [label.item() for label in tokenized_dataset['labels']]

# Stratified 90/10 split performed on row indices, so each subset can be
# selected from the tokenized dataset afterwards.
all_indices = range(len(labels))
train_indices, test_indices = train_test_split(
    all_indices,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)

train_dataset = features.select(train_indices)
eval_dataset = features.select(test_indices)

# Summary of the non-padding token counts collected during tokenization.
max_length, min_length = max(token_lengths), min(token_lengths)
avg_length = sum(token_lengths) / len(token_lengths)

print("Token length stats before padding:")
print(f"  üîπ Max length: {max_length}")
print(f"  üîπ Min length: {min_length}")
print(f"  üîπ Avg length: {avg_length:.2f} tokens")
print(f"Dataset tokenized and split into {len(train_dataset)} training examples and {len(eval_dataset)} evaluation examples.")

# Balanced class weights, computed from the training split only.
labels = [label.item() for label in train_dataset['labels']]
present_classes = np.unique(labels)
balanced_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=labels)

# The loss indexes the weight vector by label id, so it must have exactly
# `num_labels` entries. Scatter the computed weights into a full-length vector;
# a class with no training examples keeps a neutral weight of 1.0 instead of
# shifting every later class by one position.
full_weights = np.ones(num_labels, dtype=np.float64)
full_weights[present_classes] = balanced_weights
class_weights = torch.tensor(full_weights, dtype=torch.float).to(device)

# Emphasize 
# scale_factor = 1.0
# class_weights[0] *= scale_factor

In [None]:
# Histogram of the per-example non-padding token counts (measured after
# truncation), with the truncation limit marked by a dashed line.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(token_lengths, bins=50, color='skyblue', edgecolor='black')
ax.set_title("Token Length Distribution (Before Padding)")
ax.set_xlabel("Token Length")
ax.set_ylabel("Number of Samples")
ax.axvline(x=64, color='red', linestyle='--', label='max_length=64')
ax.legend()
plt.show()

In [None]:
# # After evaluation is finished, re-train on the whole dataset to maximise learning (intentional data leakage)
# train_dataset = tokenized_dataset
# print(f"Training on the entire dataset with {len(train_dataset)} examples.")

In [None]:
class FocalLoss(nn.Module):
    """Multi-class focal loss with optional class weights and label smoothing.

    For every sample the label-smoothed cross-entropy ``ce`` is computed, the
    confidence is estimated as ``pt = exp(-ce)``, and the loss is
    ``w[target] * (1 - pt) ** gamma * ce``.

    Args:
        class_weights: Optional 1-D tensor indexed by class id, or ``None``
            for no weighting.
        gamma: Focusing exponent; ``0`` disables the focal modulation.
        reduction: ``'mean'``, ``'sum'`` or ``'none'`` (per-sample losses).
        label_smoothing: Mass spread uniformly over all classes.

    Raises:
        ValueError: If ``reduction`` is not one of the supported values.
    """

    def __init__(self, class_weights=None, gamma=2.0, reduction='mean', label_smoothing=0.2):
        super().__init__()
        if reduction not in ('mean', 'sum', 'none'):
            raise ValueError(f"reduction must be 'mean', 'sum' or 'none', got {reduction!r}")
        self.class_weights = class_weights
        self.gamma = gamma
        self.reduction = reduction
        self.label_smoothing = label_smoothing

    def forward(self, logits, targets):
        """Return the focal loss for ``logits`` (batch, classes) and integer ``targets`` (batch,)."""
        num_classes = logits.size(1)
        smoothed_labels = F.one_hot(targets, num_classes).float()
        smoothed_labels = smoothed_labels * (1 - self.label_smoothing) + self.label_smoothing / num_classes

        log_probs = F.log_softmax(logits, dim=1)
        ce_loss = -(smoothed_labels * log_probs).sum(dim=1)

        # The modulating factor must come from the UNWEIGHTED cross-entropy:
        # exp(-w * ce) is not a probability estimate, so weighting first would
        # distort (1 - pt) ** gamma for every class whose weight is not 1.
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.class_weights is not None:
            # Align device/dtype with the loss so indexing never mixes devices.
            weights = self.class_weights.to(device=focal_loss.device, dtype=focal_loss.dtype)
            focal_loss = focal_loss * weights[targets]

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

class CustomTrainer(Trainer):
    """Trainer that optimises a user-supplied loss instead of the model's own.

    Args:
        focal_loss: Callable ``loss(logits, labels)``. When ``None`` (the
            default) the parent class's loss computation is used unchanged.
        *args, **kwargs: Forwarded to ``Trainer.__init__``.
    """

    def __init__(self, *args, focal_loss=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal_loss = focal_loss

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the training loss for one batch.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # No custom loss configured: defer to the stock implementation rather
        # than failing on `None(...)`. Must happen before "labels" is popped.
        if self.focal_loss is None:
            return super().compute_loss(model, inputs, return_outputs=return_outputs, **kwargs)

        # Labels are removed so the model does not compute its built-in loss.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.focal_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Loss used by the custom trainer: class-weighted focal loss with label smoothing.
focal_loss = FocalLoss(class_weights=class_weights, gamma=2.0, label_smoothing=0.2)

# Evaluation runs once per epoch. Checkpoints are saved on the same schedule and
# the best one (highest weighted F1) is reloaded when training ends; without
# this, early stopping would leave the model at its LAST epoch, several epochs
# past the best score, and that is what would get saved and evaluated.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",          # must match eval_strategy for load_best_model_at_end
    save_total_limit=1,             # cap disk usage; the best checkpoint is still retained
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=42,
    report_to="none"
)

def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, argmax taken over
            axis 1) and ``label_ids`` (true class ids).

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    # zero_division=0 gives the same value as the default for a class that is
    # never predicted (or never present), but without a warning on every
    # evaluation.
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average='weighted', zero_division=0),
        "recall": recall_score(labels, preds, average='weighted', zero_division=0),
        "f1": f1_score(labels, preds, average='weighted', zero_division=0)
    }

In [None]:
# # Reload model if retraining
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label=id2label, label2id=label2id)
# model.to(device)

In [None]:
# Stop when the monitored metric has not improved by more than 0.001 for five
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.001,
)

# Trainer wired with the custom focal loss and the evaluation metrics.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    focal_loss=focal_loss,
)

In [None]:
# Run fine-tuning with the trainer configured above; progress messages bracket
# the (long-running) call.
print("Starting training...")
trainer.train()
print("Training complete!")

In [None]:
# Persist the trainer's model and the tokenizer side by side in one directory.
model_save_path = "./fine_tuned_phobert"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Fine-tuned model saved to {model_save_path}")

In [None]:
# # Empty the working directory if retraining
# import os
# import shutil

# working_dir = "/kaggle/working/"
# print(f"Attempting to clear the directory: {working_dir}")

# if os.path.exists(working_dir):
#     for item in os.listdir(working_dir):
#         item_path = os.path.join(working_dir, item)
#         try:
#             if os.path.isfile(item_path) or os.path.islink(item_path):
#                 os.unlink(item_path)
#             elif os.path.isdir(item_path):
#                 shutil.rmtree(item_path)
#         except Exception as e:
#             print(f"Error removing {item_path}: {e}")
#     print(f"Contents of {working_dir} cleared.")
# else:
#     print(f"Directory {working_dir} does not exist.")

# if not os.listdir(working_dir):
#     print(f"Directory {working_dir} is now empty.")
# else:
#     print(f"Directory {working_dir} still contains items: {os.listdir(working_dir)}")

In [None]:
# Evaluate on the held-out split: called without arguments, so the trainer uses
# the `eval_dataset` it was constructed with (the 10% split from above).
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

In [None]:
# # Plot the graphs
# log_history = trainer.state.log_history

# epochs = []
# train_losses = []
# eval_losses = []
# eval_accuracies = []
# eval_f1_scores = []
# eval_precisions = []
# eval_recalls = []

# for log in log_history:
#     if 'loss' in log and 'learning_rate' in log and 'epoch' in log:
#         train_losses.append(log['loss'])
#         epochs.append(log['epoch'])
#     elif 'eval_loss' in log:
#         eval_losses.append(log['eval_loss'])
#         eval_accuracies.append(log['eval_accuracy'])
#         eval_f1_scores.append(log['eval_f1'])
#         eval_precisions.append(log['eval_precision'])
#         eval_recalls.append(log['eval_recall'])

# eval_epochs = [log['epoch'] for log in log_history if 'eval_loss' in log]

# # Plotting Loss
# plt.figure(figsize=(12, 5))
# plt.subplot(1, 2, 1)
# plt.plot(epochs[:len(train_losses)], train_losses, label='Training Loss')
# plt.plot(eval_epochs, eval_losses, label='Validation Loss')
# plt.title('Loss over Epochs')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)

# # Plotting Accuracy and F1-score
# plt.subplot(1, 2, 2)
# plt.plot(eval_epochs, eval_accuracies, label='Validation Accuracy')
# plt.plot(eval_epochs, eval_f1_scores, label='Validation F1-score (Weighted)')
# plt.plot(eval_epochs, eval_precisions, label='Validation Precision (Weighted)')
# plt.plot(eval_epochs, eval_recalls, label='Validation Recall (Weighted)')
# plt.title('Metrics over Epochs')
# plt.xlabel('Epoch')
# plt.ylabel('Score')
# plt.legend()
# plt.grid(True)

# plt.tight_layout() 
# plt.show()

In [None]:
# Save misclassified samples to CSV

import pandas as pd
from torch.utils.data import DataLoader
from tqdm import tqdm

# Get predictions on eval_dataset
predictions_output = trainer.predict(eval_dataset)

# Convert logits to predicted class and confidence (highest softmax probability)
logits = predictions_output.predictions
probs = torch.softmax(torch.tensor(logits), dim=1)
predicted_labels = torch.argmax(probs, dim=1)
confidences = torch.max(probs, dim=1).values

# Get ground truth labels
true_labels = predictions_output.label_ids

# The raw text column was dropped after tokenization, so the text is rebuilt by
# decoding the input ids. This is the segmented (and possibly truncated) model
# input, not the exact source string.
original_texts = [tokenizer.decode(inputs['input_ids'], skip_special_tokens=True) for inputs in eval_dataset]

# Map IDs to label names
true_label_names = [id2label[label] for label in true_labels]
predicted_label_names = [id2label[label.item()] for label in predicted_labels]

# Keep only the rows where the prediction disagrees with the ground truth
misclassified = [
    {
        "original_text": text,
        "true_label": true,
        "predicted_label": pred,
        "confidence": round(conf.item(), 4)
    }
    for text, true, pred, conf in zip(original_texts, true_label_names, predicted_label_names, confidences)
    if true != pred
]

# Save to CSV. Columns are given explicitly so the file still has a header row
# when there are no misclassified samples.
df_misclassified = pd.DataFrame(
    misclassified,
    columns=["original_text", "true_label", "predicted_label", "confidence"],
)
df_misclassified.to_csv("misclassified_samples.csv", index=False, encoding="utf-8-sig")
print(f"Saved {len(df_misclassified)} misclassified samples to misclassified_samples.csv")

# Inference

In [None]:
def clean_text(text):
    """Normalise punctuation in a text snippet while keeping meaningful symbols.

    Dots and hyphens that sit between two word characters (domains, decimals,
    hyphenated words) are kept; all other dots and hyphens are dropped. Runs of
    underscores become a single space. A comma separating letters/digits
    without a space becomes a space, commas between digits are kept, and other
    commas are removed. Remaining punctuation except ``. , / % " -`` is
    stripped and whitespace is collapsed.

    Args:
        text: Input string, or a missing value (``None``/NaN).

    Returns:
        The cleaned string; missing values are returned unchanged.
    """
    if pd.isna(text):
        return text

    #text = text.lower()

    # === Protect domain dots, decimal dots, and in-word hyphens with placeholders ===
    text = re.sub(r'(\w)\.(?=\w)', r'\1<DOMAIN>', text)      # domain dots
    text = re.sub(r'(\d)\.(?=\d)', r'\1<DECIMAL>', text)     # decimal dots
    text = re.sub(r'(\w)-(?=\w)', r'\1<HYPHEN>', text)       # hyphen inside words/domains

    # === Remove remaining (unprotected) dots and hyphens ===
    text = text.replace('.', '')
    text = text.replace('-', '')

    # === Replace one or more underscores with a single space ===
    text = re.sub(r'_+', ' ', text)

    # === Restore preserved characters ===
    text = text.replace('<DOMAIN>', '.')
    text = text.replace('<DECIMAL>', '.')
    text = text.replace('<HYPHEN>', '-')

    # === Handle commas ===
    # [^\W_] is "any letter or digit" and [^\W\d_] is "any letter", in any case
    # or script. ASCII-only [a-z] missed uppercase and accented letters, so
    # e.g. "tachi,Nen" had its comma deleted and the two words glued together.
    text = re.sub(r'(?<=[^\W_]),(?=[^\W\d_])', ' ', text)   # letter/digit -> letter
    text = re.sub(r'(?<=[^\W\d_]),(?=\d)', ' ', text)       # letter -> digit
    text = re.sub(r',(?=\D)|(?<=\D),', '', text)            # remove other commas

    # === Remove unwanted punctuation (keep quotes, %, /, and the hyphens protected above) ===
    # '-' has to be in the keep-set, otherwise the hyphens restored a few lines
    # earlier are deleted again here.
    text = re.sub(r'[^\w\s\.,/%"-]', '', text)

    # === Normalize spaces ===
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
model.eval()

# Testing manually
def predict_text_class(text_input: str):
    """Predict the class label of a single text.

    The text is word-segmented and tokenized the same way as the training data
    (truncated/padded to 64 tokens), then run through the model without
    gradient tracking.

    Args:
        text_input: Raw text to classify.

    Returns:
        The predicted label name, looked up in ``id2label``.
    """
    # Optional cleaning step (currently disabled):
    # text_input = clean_text(text_input)

    # Apply Vietnamese word segmentation (same as training). Uses the function
    # argument, not a global variable that happens to be called `text`.
    segmented_text = ' '.join(rdrsegmenter.word_segment(text_input))

    # Tokenize segmented text
    inputs = tokenizer(
        segmented_text,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=64
    )

    # Move tensors to the correct device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # The arg-max over the class dimension is the predicted class id.
    predicted_class_id = torch.argmax(outputs.logits, dim=-1).item()
    predicted_label = id2label[predicted_class_id]

    return predicted_label


example_texts = [
    "H·ªá th·ªëng nha khoa T√¢m ƒê·ª©c Smile d·∫´n ƒë·∫ßu v·ªÅ c√°c d·ªãch v·ª• c·∫•y gh√©p Implant, rƒÉng s·ª© th·∫©m m·ªπ, ni·ªÅng rƒÉng uy t√≠n h√†ng ƒë·∫ßu, ∆∞u ƒë√£i l√™n ƒë·∫øn 60%.",
    "Nh√† Xe M·ªπ Duy√™n - V·∫≠n chuy·ªÉn h√†nh kh√°ch v√† h√†ng h√≥a chuy√™n tuy·∫øn H·ªì Ch√≠ Minh ƒëi S√≥c TrƒÉng v√† ng∆∞·ª£c l·∫°i Nh·∫≠n v√© Nh·∫≠n m√£ v√©, x√°c nh·∫≠n v√† l√™n xe",
    "C√¥ng ty t∆∞ v·∫•n du h·ªçc M·ªπ c·∫≠p nh·∫≠t m·ªõi nh·∫•t v·ªÅ ƒëi·ªÅu ki·ªán, chi ph√≠, h·ªì s∆° xin visa du h·ªçc m·ªπ, c∆° h·ªôi ƒë·ªãnh c∆∞. C√¥ng ty du h·ªçc √Å - √Çu 24 nƒÉm kinh nghi·ªám, ƒë·ªânh cao uy t√≠n v√† ch·∫•t l∆∞·ª£ng.",
    "m√°y rung c·∫ßm tay t·∫∑ng b·∫°n g√°i",
    "thd cybersecurity",
    "Vay C·∫ßm C·ªë, Gi·∫£i Ng√¢n 15p T·ªïng h·ª£p c√°c b√†i vi·∫øt t·ª´ c∆° quan b√°o ch√≠ truy·ªÅn th√¥ng uy t√≠n, ph·∫£n √°nh ho·∫°t ƒë·ªông, th√†nh t·ª±u n·ªïi b·∫≠t v√† chi·∫øn l∆∞·ª£c ph√°t tri·ªÉn c·ªßa F88.",
    "ƒê∆∞·ªùng d√¢y ƒë√°nh b·∫°c, n·ªó h·ªß, c√° ƒë·ªô b√≥ng ƒë√° r·ª≠a ti·ªÅn l√™n ƒë·∫øn 1.000 t·ª∑ ƒë·ªìng b·ªã tri·ªát ph√°",
    "Samsung Galaxy Z Fold7 | Foldable meets Ultra Sleek | Samsung Australia",
    "ƒë∆∞·ªùng v√†o tim em √¥i bƒÉng gi√°",
    "Ngh·ªã ƒë·ªãnh s·ªë 46/2017/Nƒê-CP ng√†y 21/4/2017 c·ªßa Ch√≠nh ph·ªß quy ƒë·ªãnh v·ªÅ ho·∫°t ƒë·ªông ƒë·∫ßu t∆∞ gi√°o d·ª•c trong c√°c ch∆∞∆°ng tr√¨nh ƒë√†o t·∫°o m·∫ßm non, ph·ªï th√¥ng, ƒë·∫°i h·ªçc; ...",
    "5 ng√†y tr∆∞·ªõc ‚Äî tachi,N·ªÅn t·∫£ng x·ªï s·ªë tr·ª±c tuy·∫øn v·ªõi t·ª∑ l·ªá hoa h·ªìng cho ƒë·∫°i l√Ω cao nh·∫•t. H·ª£p t√°c v√† ki·∫øm thu nh·∫≠p th·ª• ƒë·ªông c√πng ch√∫ng t√¥i.",
    "Th√¥ng b√°o tuy·ªÉn sinh ƒëi h·ªçc t·∫°i CƒÉm-pu-chia di·ªán Hi·ªáp ƒë·ªãnh nƒÉm 2025 ¬∑ TH√îNG B√ÅO TUY·ªÇN SINH ƒêI H·ªåC T·∫†I MA-R·ªêC NƒÇM 2025 ¬∑ TH√îNG B√ÅO TUY·ªÇN SINH ƒêI H·ªåC T·∫†I M√îNG C·ªî ...",
    "7 ng√†y tr∆∞·ªõc ‚Äî qq188bet! N·ªÅn t·∫£ng x·ªï s·ªë tr·ª±c tuy·∫øn v·ªõi t·ª∑ l·ªá hoa h·ªìng cho ƒë·∫°i l√Ω cao nh·∫•t. H·ª£p t√°c v√† ki·∫øm thu nh·∫≠p th·ª• ƒë·ªông c√πng ch√∫ng t√¥i. Tr·ªü th√†nh ƒë·ªëi t√°c ...",
    "b·ªën ƒë√¥i th√¥ng ch·∫∑t ƒë∆∞·ª£c g√¨? c√°ch d√πng b·ªën ƒë√¥i th√¥ng hi·ªáu qu·∫£. 19 thg 10, 2024 - b·ªën ƒë√¥i th√¥ng l√† m·ªôt t·ªï h·ª£p b√†i ƒë·∫∑c bi·ªát trong c√°c tr√≤ ch∆°i b√†i mi·ªÅn nam, ƒë·∫∑c bi·ªát l√† ti·∫øn l√™n. khi m·ªôt ng∆∞·ªùi ch∆°i s·ªü h·ªØu b·ªën ƒë√¥i b√†i gi·ªëng nhau ...",
    "ƒêƒÉng nh·∫≠p VIC",
    "GGBet",
    "K·∫øt qu·∫£ CCHC",
    "S·∫ßu ri√™ng Ri6",
    "V√∫ s·ªØa t√≠m",
    "Tr√† C2",
    "C∆°n_s·ªët_v√†ng_bit C∆°n_s·ªët_v√†ng_bit-ƒê∆∞·ª£c thi ƒë·∫•u tr√™n s√¢n nh√† ·ªü tr·∫≠n chung k·∫øt l∆∞·ª£t v·ªÅ ƒë·ªôi tuy·ªÉn Th√°i Lan nh·∫≠p cu·ªôc v·ªõi tinh th·∫ßn th·∫ßn r·∫•t cao. ƒê·ªôi ch·ªß s√¢n Rajamangala c·∫ßm",
    "kwin congan.khanhhoa.gov.vn",
    "nohu69 congan.khanhhoa.gov.vn",
    "C∆°n_s·ªët_v√†ng_bit",
    "+85 s·∫£n ph·∫©m s√†n g·ªó Malaysia si√™u ch·ªãu n∆∞·ªõc gi√° t·ªët nh·∫•t",
    "S√†n g·ªó Malaysia ch√≠nh h√£ng, ch·∫•t l∆∞·ª£ng, gi√° c·∫£ c·∫°nh tranh ƒë∆∞·ª£c ph√¢n ph·ªëi b·ªüi JANHOME l√† h·ªá th·ªëng b√°n l·∫ª s√†n g·ªó, s√†n nh·ª±a, gi·∫•y d√°n t∆∞·ªùng, v·∫≠t li·ªáu n·ªôi th·∫•t ...",
    "+85 s·∫£n ph·∫©m s√†n g·ªó Malaysia si√™u ch·ªãu n∆∞·ªõc gi√° t·ªët nh·∫•t S√†n g·ªó Malaysia ch√≠nh h√£ng, ch·∫•t l∆∞·ª£ng, gi√° c·∫£ c·∫°nh tranh ƒë∆∞·ª£c ph√¢n ph·ªëi b·ªüi JANHOME l√† h·ªá th·ªëng b√°n l·∫ª s√†n g·ªó, s√†n nh·ª±a, gi·∫•y d√°n t∆∞·ªùng, v·∫≠t li·ªáu n·ªôi th·∫•t ...",
    "gi·ªçng_n·ªØ_tr·∫ßm",
    "gi·ªçng_n·ªØ_tr·∫ßm-Ch·ªß t·ªãch HƒêQT B·ªánh vi·ªán th·∫©m m·ªπ Sao H√†n chia s·∫ª r·∫±ng, th·∫©m m·ªπ hay ph·∫´u thu·∫≠t th·∫©m m·ªπ th√¨ y√™u c·∫ßu, mong mu·ªën ƒë·∫ßu ti√™n ch·∫Øc ch·∫Øn ph·∫£i ƒë·∫πp.",
    "gi·ªçng_n·ªØ_tr·∫ßm gi·ªçng_n·ªØ_tr·∫ßm-Ch·ªß t·ªãch HƒêQT B·ªánh vi·ªán th·∫©m m·ªπ Sao H√†n chia s·∫ª r·∫±ng, th·∫©m m·ªπ hay ph·∫´u thu·∫≠t th·∫©m m·ªπ th√¨ y√™u c·∫ßu, mong mu·ªën ƒë·∫ßu ti√™n ch·∫Øc ch·∫Øn ph·∫£i ƒë·∫πp.",
    "H·ªá th·ªëng QLVBDH: Trang ch·ªß",
    "ƒêƒÉng nh·∫≠p ƒêƒÉng nh·∫≠p. Chuy·ªÉn t·ªõi trang ƒë·∫ßy ƒë·ªß. ƒêƒÉng nh·∫≠p h·ªá th·ªëng. Ph√≠m chuy·ªÉn ch·ªØ hoa ƒëang b·∫≠t. ƒêƒÉng nh·∫≠p. ƒêƒÉng nh·∫≠p qua h·ªá th·ªëng x√°c th·ª±c TP. C·∫ßn Th∆°.",
    "H·ªá th·ªëng QLVBDH: Trang ch·ªß ƒêƒÉng nh·∫≠p ƒêƒÉng nh·∫≠p. Chuy·ªÉn t·ªõi trang ƒë·∫ßy ƒë·ªß. ƒêƒÉng nh·∫≠p h·ªá th·ªëng. Ph√≠m chuy·ªÉn ch·ªØ hoa ƒëang b·∫≠t. ƒêƒÉng nh·∫≠p. ƒêƒÉng nh·∫≠p qua h·ªá th·ªëng x√°c th·ª±c TP. C·∫ßn Th∆°.",
    "ƒêƒÉng nh·∫≠p VIC",
    "T√™n ƒëƒÉng nh·∫≠p : M·∫≠t kh·∫©u : ƒêƒÉng nh·∫≠p. Tho√°t. VIC 6.5 ƒê∆∞·ª£c ph√°t tri·ªÉn b·ªüi c√¥ng ty CINOTEC 282 L√™ Quang ƒê·ªãnh, Ph∆∞·ªùng 11, Qu·∫≠n B√¨nh Th·∫°nh, TP HCM."
    "ƒêƒÉng nh·∫≠p VIC T√™n ƒëƒÉng nh·∫≠p : M·∫≠t kh·∫©u : ƒêƒÉng nh·∫≠p. Tho√°t. VIC 6.5 ƒê∆∞·ª£c ph√°t tri·ªÉn b·ªüi c√¥ng ty CINOTEC 282 L√™ Quang ƒê·ªãnh, Ph∆∞·ªùng 11, Qu·∫≠n B√¨nh Th·∫°nh, TP HCM."
    "ƒê·ªëi t∆∞·ª£ng b·∫£o tr·ª£",
    "Trung t√¢m B·∫£o tr·ª£ X√£ h·ªôi l√† n∆°i qu·∫£n l√Ω chƒÉm s√≥c, nu√¥i d∆∞·ª°ng ƒëi·ªÅu tr·ªã ƒë·ªëi t∆∞·ª£ng B·∫£o tr·ª£ theo quy ƒë·ªãnh c·ªßa nh√† n∆∞·ªõc - ƒê·ªãa ch·ªâ: Khu v·ª±c B√¨nh H√≤a A, ...",
    "ƒê·ªëi t∆∞·ª£ng b·∫£o tr·ª£ Trung t√¢m B·∫£o tr·ª£ X√£ h·ªôi l√† n∆°i qu·∫£n l√Ω chƒÉm s√≥c, nu√¥i d∆∞·ª°ng ƒëi·ªÅu tr·ªã ƒë·ªëi t∆∞·ª£ng B·∫£o tr·ª£ theo quy ƒë·ªãnh c·ªßa nh√† n∆∞·ªõc - ƒê·ªãa ch·ªâ: Khu v·ª±c B√¨nh H√≤a A, ...",
    "ƒêƒÉng k√Ω doanh nghi·ªáp qua m·∫°ng ƒëi·ªán t·ª≠",
    "H√¨nh I.1.2. Bi·ªÉu m·∫´u nh·∫≠p th√¥ng tin ƒëƒÉng k√Ω t√†i kho·∫£n. Qu√Ω doanh nghi·ªáp nh·∫≠p ƒë·∫ßy ƒë·ªß c√°c tr∆∞·ªùng th√¥ng tin c·ªßa bi·ªÉu m·∫´u theo H√¨nh.",
    "ƒêƒÉng k√Ω doanh nghi·ªáp qua m·∫°ng ƒëi·ªán t·ª≠ H√¨nh I.1.2. Bi·ªÉu m·∫´u nh·∫≠p th√¥ng tin ƒëƒÉng k√Ω t√†i kho·∫£n. Qu√Ω doanh nghi·ªáp nh·∫≠p ƒë·∫ßy ƒë·ªß c√°c tr∆∞·ªùng th√¥ng tin c·ªßa bi·ªÉu m·∫´u theo H√¨nh.",
    "ƒêƒÉng nh·∫≠p h·ªá th·ªëng",
    "ƒêƒÉng nh·∫≠p h·ªá th·ªëng. L∆∞u t√†i kho·∫£n ƒëƒÉng nh·∫≠p. ƒêƒÉng nh·∫≠p. Qu√™n m·∫≠t kh·∫©u.",
"ƒêƒÉng nh·∫≠p h·ªá th·ªëng ƒêƒÉng nh·∫≠p h·ªá th·ªëng. L∆∞u t√†i kho·∫£n ƒëƒÉng nh·∫≠p. ƒêƒÉng nh·∫≠p. Qu√™n m·∫≠t kh·∫©u.",
    "Li√™n ƒëo√†n Taekwondo TP C·∫ßn Th∆° tƒÉng c∆∞·ªùng chuy·ªÉn ƒë·ªïi ...",
    "M·ª•c ti√™u c·ªßa Li√™n ƒëo√†n Taekwondo TP C·∫ßn Th∆° trong nƒÉm 2024 l√† ƒë·∫©y m·∫°nh chuy·ªÉn ƒë·ªïi s·ªë, x√¢y d·ª±ng m√¥ h√¨nh qu·∫£n l√Ω ph√π h·ª£p ƒëi·ªÅu ki·ªán, xu th·∫ø v√† quy ƒë·ªãnh c·ªßa Li√™n ...",
    "Li√™n ƒëo√†n Taekwondo TP C·∫ßn Th∆° tƒÉng c∆∞·ªùng chuy·ªÉn ƒë·ªïi ... M·ª•c ti√™u c·ªßa Li√™n ƒëo√†n Taekwondo TP C·∫ßn Th∆° trong nƒÉm 2024 l√† ƒë·∫©y m·∫°nh chuy·ªÉn ƒë·ªïi s·ªë, x√¢y d·ª±ng m√¥ h√¨nh qu·∫£n l√Ω ph√π h·ª£p ƒëi·ªÅu ki·ªán, xu th·∫ø v√† quy ƒë·ªãnh c·ªßa Li√™n ...",
    "M·∫™U CHUY·ªÜN ‚Äúƒê√îI D√âP B√ÅC H·ªí‚Äù",
    "3 thg 10, 2023 ‚Äî M·ªôt anh nhanh tay gi√†nh l·∫•y chi·∫øc d√©p, gi∆° l√™n nh∆∞ng ng·ªõ ra, l√∫ng t√∫ng. Anh b√™n c·∫°nh li·∫øc th·∫•y, ‚Äúv∆∞·ª£t v√¢y‚Äù ch·∫°y bi·∫øn‚Ä¶ B√°c ph·∫£i gi·ª•c:‚Äú∆† k√¨a, ng·∫Øm ...",
    "M·∫™U CHUY·ªÜN ‚Äúƒê√îI D√âP B√ÅC H·ªí‚Äù 3 thg 10, 2023 ‚Äî M·ªôt anh nhanh tay gi√†nh l·∫•y chi·∫øc d√©p, gi∆° l√™n nh∆∞ng ng·ªõ ra, l√∫ng t√∫ng. Anh b√™n c·∫°nh li·∫øc th·∫•y, ‚Äúv∆∞·ª£t v√¢y‚Äù ch·∫°y bi·∫øn‚Ä¶ B√°c ph·∫£i gi·ª•c:‚Äú∆† k√¨a, ng·∫Øm ...",
    "·∫¢nh h∆∞·ªüng c·ªßa c√°c lo·∫°i th·ª©c ƒÉn ƒë·∫øn sinh tr∆∞·ªüng v√† t·ªâ l·ªá ...",
    "6 thg 12, 2022 ‚Äî l√† m·ªôt trong nh·ªØng lo√†i ch√¢n b·ª•ng n∆∞·ªõc ng·ªçt ƒë∆∞·ª£c t√¨m th·∫•y trong ao n∆∞·ªõc ng·ªçt, v≈©ng, b·ªÉ, h·ªì, ƒë·∫ßm l·∫ßy, ru·ªông l√∫a v√† ƒë√¥i khi ·ªü s√¥ng su·ªëi. Hi·ªán nay, ...",
    "·∫¢nh h∆∞·ªüng c·ªßa c√°c lo·∫°i th·ª©c ƒÉn ƒë·∫øn sinh tr∆∞·ªüng v√† t·ªâ l·ªá ... 6 thg 12, 2022 ‚Äî l√† m·ªôt trong nh·ªØng lo√†i ch√¢n b·ª•ng n∆∞·ªõc ng·ªçt ƒë∆∞·ª£c t√¨m th·∫•y trong ao n∆∞·ªõc ng·ªçt, v≈©ng, b·ªÉ, h·ªì, ƒë·∫ßm l·∫ßy, ru·ªông l√∫a v√† ƒë√¥i khi ·ªü s√¥ng su·ªëi. Hi·ªán nay, ...",
    "L·ªãch s·ª≠ h√¨nh th√†nh",
    "Ng√†y 01 th√°ng 01 nƒÉm 2004, t·ªânh C·∫ßn Th∆° ƒë∆∞·ª£c chia t√°ch th√†nh 02 ƒë∆°n v·ªã h√†nh ch√≠nh l√† TP. C·∫ßn Th∆° v√† t·ªânh H·∫≠u Giang. B·∫£o t√†ng t·ªânh C·∫ßn ƒë·ªïi t√™n cho ph√π h·ª£p v·ªõi ...",
    "L·ªãch s·ª≠ h√¨nh th√†nh Ng√†y 01 th√°ng 01 nƒÉm 2004, t·ªânh C·∫ßn Th∆° ƒë∆∞·ª£c chia t√°ch th√†nh 02 ƒë∆°n v·ªã h√†nh ch√≠nh l√† TP. C·∫ßn Th∆° v√† t·ªânh H·∫≠u Giang. B·∫£o t√†ng t·ªânh C·∫ßn ƒë·ªïi t√™n cho ph√π h·ª£p v·ªõi ...",
    "H·ªôi C·ª±u chi·∫øn binh th√†nh ph·ªë C·∫ßn Th∆°",
    "Th√¥ng tin li√™n h·ªá. H·ªôi C·ª±u chi·∫øn binh - Th√†nh ph·ªë C·∫ßn Th∆° ƒê·ªãa ch·ªâ : 22 Tr·∫ßn VƒÉn Ho√†i, P.Xu√¢n Kh√°nh, Q.Ninh Ki·ªÅu, TP C·∫ßn Th∆° ƒêi·ªán tho·∫°i: (0710) 3832735",
    "H·ªôi C·ª±u chi·∫øn binh th√†nh ph·ªë C·∫ßn Th∆° Th√¥ng tin li√™n h·ªá. H·ªôi C·ª±u chi·∫øn binh - Th√†nh ph·ªë C·∫ßn Th∆° ƒê·ªãa ch·ªâ : 22 Tr·∫ßn VƒÉn Ho√†i, P.Xu√¢n Kh√°nh, Q.Ninh Ki·ªÅu, TP C·∫ßn Th∆° ƒêi·ªán tho·∫°i: (0710) 3832735",
    "vƒÉn h√≥a ƒê·ªãa ƒëi·ªÉm Chi·∫øn th·∫Øng √îng ƒê∆∞a nƒÉm 1960",
"DI T√çCH L·ªäCH S·ª¨ - VƒÇN H√ìA ƒê·ªäA ƒêI·ªÇM CHI·∫æN TH·∫ÆNG √îNG ƒê∆ØA NƒÇM 1960 ... Di t√≠ch l·ªãch s·ª≠ - vƒÉn h√≥a ƒê·ªãa ƒëi·ªÉm Chi·∫øn th·∫Øng √îng ƒê∆∞a nƒÉm 1960 t·ªça l·∫°c t·∫°i ·∫•p ƒê·ªãnh Kh√°nh A, ...",
    "vƒÉn h√≥a ƒê·ªãa ƒëi·ªÉm Chi·∫øn th·∫Øng √îng ƒê∆∞a nƒÉm 1960 DI T√çCH L·ªäCH S·ª¨ - VƒÇN H√ìA ƒê·ªäA ƒêI·ªÇM CHI·∫æN TH·∫ÆNG √îNG ƒê∆ØA NƒÇM 1960 ... Di t√≠ch l·ªãch s·ª≠ - vƒÉn h√≥a ƒê·ªãa ƒëi·ªÉm Chi·∫øn th·∫Øng √îng ƒê∆∞a nƒÉm 1960 t·ªça l·∫°c t·∫°i ·∫•p ƒê·ªãnh Kh√°nh A, ...",
    "∆Ø·ªõc ao c·ªßa thi·∫øu nhi qua b√†i h√°t",
    "∆Ø·ªöC AO C·ª¶A THI·∫æU NHI QUA B√ÄI H√ÅT ‚ÄúEM M∆† G·∫∂P B√ÅC H·ªí‚Äù C·ª¶A NH·∫†C Sƒ® XU√ÇN GIAO. Nh·∫°c sƒ© Xu√¢n Giao qu√™ g·ªëc ·ªü Nh∆∞ Qu·ª≥nh, VƒÉn L√¢m, H∆∞ng Y√™n, sinh nƒÉm 1932 t·∫°i H·∫£i ...",
    "∆Ø·ªõc ao c·ªßa thi·∫øu nhi qua b√†i h√°t ∆Ø·ªöC AO C·ª¶A THI·∫æU NHI QUA B√ÄI H√ÅT ‚ÄúEM M∆† G·∫∂P B√ÅC H·ªí‚Äù C·ª¶A NH·∫†C Sƒ® XU√ÇN GIAO. Nh·∫°c sƒ© Xu√¢n Giao qu√™ g·ªëc ·ªü Nh∆∞ Qu·ª≥nh, VƒÉn L√¢m, H∆∞ng Y√™n, sinh nƒÉm 1932 t·∫°i H·∫£i ...",
    "B·∫¢NG THANH TO√ÅN PH·ª§ C·∫§P C√ÅN B·ªò C√îNG ƒêO√ÄN",
    "B·∫¢NG THANH TO√ÅN PH·ª§ C·∫§P C√ÅN B·ªò C√îNG ƒêO√ÄN ¬∑ C√°c bi·ªÉu m·∫´u t√†i ch√≠nh C√¥ng ƒëo√†n c∆° s·ªü ¬∑ M·∫´u h∆∞·ªõng d·∫´n C√¥ng ƒëo√†n c∆° s·ªü ¬∑ M·∫´u bi·ªÉu d·ª± to√°n, quy·∫øt to√°n t√†i ch√≠nh Cƒê ...",
    "B·∫¢NG THANH TO√ÅN PH·ª§ C·∫§P C√ÅN B·ªò C√îNG ƒêO√ÄN B·∫¢NG THANH TO√ÅN PH·ª§ C·∫§P C√ÅN B·ªò C√îNG ƒêO√ÄN ¬∑ C√°c bi·ªÉu m·∫´u t√†i ch√≠nh C√¥ng ƒëo√†n c∆° s·ªü ¬∑ M·∫´u h∆∞·ªõng d·∫´n C√¥ng ƒëo√†n c∆° s·ªü ¬∑ M·∫´u bi·ªÉu d·ª± to√°n, quy·∫øt to√°n t√†i ch√≠nh Cƒê ...",
    "dung tin co ay tap 2 S√≤ng b·∫°c th√¥ng th∆∞·ªùng c·ªßa Vi·ªát Nam",
    "dung tin co ay tap 2 -Xu√°¬∫¬•t hi√°¬ª‚Ä°n c√É¬πng ki√°¬ª∆íu t√É¬≥c layer vu√°¬ª't ng√Ü¬∞√°¬ª¬£c v√°¬ª‚Ä∫i ph√°¬∫¬ßn t√É¬≥c t√°¬ª¬´ hai mang tai √Ñ'√°¬ª u √Ñ'√Ü¬∞√°¬ª¬£c h√°¬∫¬•t ng√Ü¬∞√°¬ª¬£c ra sau v√É ...",
    "dung tin co ay tap 2 S√≤ng b·∫°c th√¥ng th∆∞·ªùng c·ªßa Vi·ªát Nam dung tin co ay tap 2 -Xu√°¬∫¬•t hi√°¬ª‚Ä°n c√É¬πng ki√°¬ª∆íu t√É¬≥c layer vu√°¬ª't ng√Ü¬∞√°¬ª¬£c v√°¬ª‚Ä∫i ph√°¬∫¬ßn t√É¬≥c t√°¬ª¬´ hai mang tai √Ñ'√°¬ª u √Ñ'√Ü¬∞√°¬ª¬£c h√°¬∫¬•t ng√Ü¬∞√°¬ª¬£c ra sau v√É ...",
    "Th√¥ng tin truy n√£, ƒë√¨nh n√£",
    "Th√¥ng tin truy n√£, ƒë√¨nh n√£ ¬∑ Th√¥ng b√°o ¬∑ Th√¥ng tin truy n√£ ¬∑ Li√™n k·∫øt webiste ¬∑ ThƒÉm d√≤ √Ω ki·∫øn ¬∑ S·ªë l∆∞·ª£t truy c·∫≠p. Trong ng√†y: T·∫•t c·∫£:.","C·ªù b·∫°c"
    "Th√¥ng tin truy n√£, ƒë√¨nh n√£ Th√¥ng tin truy n√£, ƒë√¨nh n√£ ¬∑ Th√¥ng b√°o ¬∑ Th√¥ng tin truy n√£ ¬∑ Li√™n k·∫øt webiste ¬∑ ThƒÉm d√≤ √Ω ki·∫øn ¬∑ S·ªë l∆∞·ª£t truy c·∫≠p. Trong ng√†y: T·∫•t c·∫£:.","C·ªù b·∫°c"
    "+79 s·∫£n ph·∫©m s√†n g·ªó Florton ch√≠nh h√£ng, ch·∫•t l∆∞·ª£ng, gi√° r·∫ª",
    "S√ÄN G·ªñ FLORTON ch·∫•t l∆∞·ª£ng, gi√° r·∫ª, ƒë·∫°t ti√™u chu·∫©n Ch√¢u √Çu ƒë∆∞·ª£c cung c·∫•p b·ªüi JANHOME l√† h·ªá th·ªëng b√°n h√†ng t·∫°i kho cung c·∫•p v·∫≠t li·ªáu s√†n g·ªó gi·∫•y d√°n t∆∞·ªùng ...",
"+79 s·∫£n ph·∫©m s√†n g·ªó Florton ch√≠nh h√£ng, ch·∫•t l∆∞·ª£ng, gi√° r·∫ª S√ÄN G·ªñ FLORTON ch·∫•t l∆∞·ª£ng, gi√° r·∫ª, ƒë·∫°t ti√™u chu·∫©n Ch√¢u √Çu ƒë∆∞·ª£c cung c·∫•p b·ªüi JANHOME l√† h·ªá th·ªëng b√°n h√†ng t·∫°i kho cung c·∫•p v·∫≠t li·ªáu s√†n g·ªó gi·∫•y d√°n t∆∞·ªùng ...",
    "Trang ch·ªß - C·∫ßn Th∆°",
    "B·ªô C√¥ng Th∆∞∆°ng v·ª´a ban h√†nh Th√¥ng t∆∞ quy ƒë·ªãnh vi·ªác nh·∫≠p kh·∫©u m·∫∑t h√†ng g·∫°o v√† l√° thu·ªëc l√° kh√¥ c√≥ xu·∫•t x·ª© t·ª´ Campuchia theo h·∫°n ng·∫°ch thu·∫ø quan nƒÉm 2023 v√† 2024.",
    "Trang ch·ªß - C·∫ßn Th∆° B·ªô C√¥ng Th∆∞∆°ng v·ª´a ban h√†nh Th√¥ng t∆∞ quy ƒë·ªãnh vi·ªác nh·∫≠p kh·∫©u m·∫∑t h√†ng g·∫°o v√† l√° thu·ªëc l√° kh√¥ c√≥ xu·∫•t x·ª© t·ª´ Campuchia theo h·∫°n ng·∫°ch thu·∫ø quan nƒÉm 2023 v√† 2024.",
    "Login",
    "???login.label.loginheading.left??? ???login.label.userid??? ???login.label.password??? Help. Product documentation ¬∑ Product wiki ¬∑ Media gallery ...",
    "Login ???login.label.loginheading.left??? ???login.label.userid??? ???login.label.password??? Help. Product documentation ¬∑ Product wiki ¬∑ Media gallery ...",
    "ƒêƒÉng k√Ω t∆∞ v·∫•n t·ª´ QR code",
    "ƒêƒÉng k√Ω t∆∞ v·∫•n t·ª´ QR code. Xo√° D√°n (Paste)",
    "ƒêƒÉng k√Ω t∆∞ v·∫•n t·ª´ QR code ƒêƒÉng k√Ω t∆∞ v·∫•n t·ª´ QR code. Xo√° D√°n (Paste)",
    "GGBET",
    "ggbet",
    "+79 s·∫£n ph·∫©m s√†n g·ªó Florton ch√≠nh h√£ng, ch·∫•t l∆∞·ª£ng, gi√° r·∫ª",
    "S√ÄN G·ªñ FLORTON ch·∫•t l∆∞·ª£ng, gi√° r·∫ª, ƒë·∫°t ti√™u chu·∫©n Ch√¢u √Çu ƒë∆∞·ª£c cung c·∫•p b·ªüi JANHOME l√† h·ªá th·ªëng b√°n h√†ng t·∫°i kho cung c·∫•p v·∫≠t li·ªáu s√†n g·ªó gi·∫•y d√°n t∆∞·ªùng ...",
    "+79 s·∫£n ph·∫©m s√†n g·ªó Florton ch√≠nh h√£ng, ch·∫•t l∆∞·ª£ng, gi√° r·∫ª S√ÄN G·ªñ FLORTON ch·∫•t l∆∞·ª£ng, gi√° r·∫ª, ƒë·∫°t ti√™u chu·∫©n Ch√¢u √Çu ƒë∆∞·ª£c cung c·∫•p b·ªüi JANHOME l√† h·ªá th·ªëng b√°n h√†ng t·∫°i kho cung c·∫•p v·∫≠t li·ªáu s√†n g·ªó gi·∫•y d√°n t∆∞·ªùng ...",
    "k√™nh-xoilac",
    "k√™nh-xoilac-Ng√†y 3.1, HƒêND H.Qu·∫ø S∆°n kh√≥a XII (nhi·ªám k·ª≥ 2021 - 2026) t·ªï ch·ª©c k·ª≥ h·ªçp chuy√™n ƒë·ªÅ th·ª© 15 ƒë·ªÉ b·∫ßu c√°c ch·ª©c danh l√£nh ƒë·∫°o ch·ªß ch·ªët sau s√°p nh·∫≠p ...",
    "k√™nh-xoilac k√™nh-xoilac-Ng√†y 3.1, HƒêND H.Qu·∫ø S∆°n kh√≥a XII (nhi·ªám k·ª≥ 2021 - 2026) t·ªï ch·ª©c k·ª≥ h·ªçp chuy√™n ƒë·ªÅ th·ª© 15 ƒë·ªÉ b·∫ßu c√°c ch·ª©c danh l√£nh ƒë·∫°o ch·ªß ch·ªët sau s√°p nh·∫≠p ...",
    "c√°ch lai t·∫°o m√†u l√¥ng g√† 0209",
    "c√°ch lai t·∫°o m√†u l√¥ng g√† 0209-K·∫ø ti·∫øp l√† cung ƒëi·ªán Minos t·ªça l·∫°c ·ªü Crete, (Hy L·∫°p), ƒë∆∞·ª£c cho l√† x√¢y d·ª±ng v√†o kho·∫£ng nƒÉm 1700 tr∆∞·ªõc C√¥ng nguy√™n, ...",
    "c√°ch lai t·∫°o m√†u l√¥ng g√† 0209 c√°ch lai t·∫°o m√†u l√¥ng g√† 0209-K·∫ø ti·∫øp l√† cung ƒëi·ªán Minos t·ªça l·∫°c ·ªü Crete, (Hy L·∫°p), ƒë∆∞·ª£c cho l√† x√¢y d·ª±ng v√†o kho·∫£ng nƒÉm 1700 tr∆∞·ªõc C√¥ng nguy√™n, ..."
]

# Classify every sample and print the input next to its predicted label,
# followed by a blank line.
for text in example_texts:
    predicted_label = predict_text_class(text)
    print(f"  Input: '{text}'", f"  Predicted: {predicted_label}", "", sep="\n")

# results = []

# for i, text in enumerate(example_texts):
#     predicted_label = predict_text_class(text)
#     results.append(f"Input: '{text}'\nPredicted: {predicted_label}\n")

# # Save to file
# with open("predictions.txt", "w", encoding="utf-8") as f:
#     f.write("\n".join(results))

# print("Predictions saved to predictions.txt")