In [7]:
# This cell is no longer needed and can be deleted.

In [2]:
!pip install -q --upgrade transformers accelerate bitsandbytes peft trl sentencepiece datasets scikit-learn pandas

[0m

In [5]:
# Standard imports
import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    EarlyStoppingCallback,
    logging as hf_logging,
)
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import random
import re
import os
import json # For saving/loading datasets
import textwrap
from huggingface_hub import login


# Set verbosity for Hugging Face libraries
hf_logging.set_verbosity_error() # Only show errors


In [None]:
# login(token='YOUR_HF_TOKEN_HERE')


BASE_DRIVE_PATH = "/home/user/" # ADJUST YOUR GDRIVE PATH HERE

DATA_FILE_PATH = os.path.join(BASE_DRIVE_PATH, "Dataset_ChongPha.csv") # YOUR CSV FILENAME
SUMMARY_COLUMN = 'summary'
COMMENT_COLUMN = 'comment_clean'
LABEL_COLUMN = 'label'
POSSIBLE_LABELS = ["PHAN_DONG", "KHONG_PHAN_DONG", "KHONG_LIEN_QUAN"]
# For robust matching of LLM outputs
POSSIBLE_LABELS_NORMALIZED_FOR_MATCHING = sorted([label.replace("_", "") for label in POSSIBLE_LABELS], key=len, reverse=True)
POSSIBLE_LABELS_VARIANTS_FOR_MATCHING = []
for p_label in POSSIBLE_LABELS:
    POSSIBLE_LABELS_VARIANTS_FOR_MATCHING.append(p_label.upper())
    POSSIBLE_LABELS_VARIANTS_FOR_MATCHING.append(p_label.upper().replace("_", ""))
POSSIBLE_LABELS_VARIANTS_FOR_MATCHING = sorted(list(set(POSSIBLE_LABELS_VARIANTS_FOR_MATCHING)), key=len, reverse=True)


In [7]:
# 1.3. Model Configuration
MODEL_ID = "Viet-Mistral/Vistral-7B-Chat"
TOKENIZER_ID = "Viet-Mistral/Vistral-7B-Chat"

# 1.4. Zero-Shot Testing Configuration
NUM_SAMPLES_TO_TEST_ZERO_SHOT = 9999 # Reduce for quicker initial testing, increase for more thorough
MAX_INPUT_LENGTH_ZERO_SHOT = 400  # Max length of the prompt fed to the model for zero-shot
                                  # Adjust based on analysis of your formatted prompts later
MAX_NEW_TOKENS_ZERO_SHOT = 20     # Max tokens for the model to generate (should be enough for the label)

# 1.5. Fine-tuning Configuration (QLoRA)
OUTPUT_DIR = os.path.join(BASE_DRIVE_PATH, "vistral7b") # Where to save fine-tuned model
MAX_SEQ_LENGTH_SFT = 1024          # Max sequence length for SFTTrainer (prompt + completion)
                                  # This will be refined after analyzing token lengths of formatted data.
                                  # Start with a conservative value.

print("Configurations set.")

Configurations set.


In [8]:
SYSTEM_PROMPT_CONTENT = """Bạn là một chuyên gia phân tích nội dung mạng xã hội có chuyên môn cao về phân loại bình luận chính trị. Nhiệm vụ của bạn là phân tích "Bình luận" dựa trên ngữ cảnh của "Tóm tắt bài đăng" và phân loại chính xác vào một trong ba nhóm.

**HƯỚNG DẪN PHÂN TÍCH:**
- Đọc kỹ tóm tắt bài đăng để hiểu ngữ cảnh
- Phân tích ý định, thái độ và nội dung của bình luận
- Chú ý đến ngôn ngữ ẩn ý, châm biếm, và các biểu hiện gián tiếp

**CÁC NHÃN PHÂN LOẠI (CHỈ CHỌN MỘT NHÃN):**

**PHAN_DONG** - Bình luận thể hiện:
• Công kích, xuyên tạc, hoặc phản đối Đảng, Nhà nước, chính quyền
• Phủ nhận hoặc bóp méo các sự kiện lịch sử quan trọng (30/4, Điện Biên Phủ, Cách mạng Tháng 8...)
• Xúc phạm các lãnh đạo, biểu tượng quốc gia (Bác Hồ, quốc kỳ, quốc ca...)
• Tuyên truyền thông tin sai lệch về lịch sử chính trị Việt Nam
• Tuyên truyền chống phá chế độ xã hội chủ nghĩa
• Đồng tình, ủng hộ quan điểm phản động trong bài đăng
• Kích động bạo lực, thù hận đối với Nhà nước

**KHONG_PHAN_DONG** - Bình luận thể hiện:
• Phản biện ôn hòa, góp ý xây dựng về chính sách
• Ủng hộ, bảo vệ Đảng, Nhà nước, chính quyền
• Phê phán các quan điểm phản động
• Nêu thông tin đúng sự thật về lịch sử chính trị Việt Nam
• Thảo luận trung lập về vấn đề chính trị-xã hội
• Đề xuất cải thiện trong khuôn khổ pháp luật
• Bày tỏ lòng yêu nước một cách tích cực

**KHONG_LIEN_QUAN** - Bình luận:
• Không đề cập đến chính trị, chính sách, lãnh đạo
• Spam, quảng cáo, emoji đơn thuần
• Hỏi thông tin cá nhân, trò chuyện phiếm
• Bàn về thể thao, giải trí, đời sống không liên quan chính trị
• Bình luận kỹ thuật, hướng dẫn không có ý nghĩa chính trị

**LƯU Ý QUAN TRỌNG:**
- Phân biệt rõ phê bình xây dựng và chống phá
- Chú ý ngôn ngữ ẩn ý, mỉa mai, châm biếm
- Xem xét mức độ gay gắt và ý định của người bình luận
- Đánh giá trong bối cảnh tổng thể của bài đăng

**ĐỊNH DẠNG TRẢ LỜI:**
Chỉ trả lời chính xác một trong ba nhãn: PHAN_DONG, KHONG_PHAN_DONG, hoặc KHONG_LIEN_QUAN"""


def create_user_content(summary_text: str, comment_text: str) -> str:
    return f"""TÓM TẮT BÀI ĐĂNG:
"{summary_text}"

BÌNH LUẬN CẦN PHÂN LOẠI:
"{comment_text}"
"""

In [None]:
SYSTEM_PROMPT_CONTENT = """Phân loại bình luận mạng xã hội Việt Nam vào một trong ba nhóm: PHAN_DONG, KHONG_PHAN_DONG, hoặc KHONG_LIEN_QUAN. Phải xem xét cả ngữ cảnh của bài đăng gốc khi đánh giá bình luận.

TIÊU CHÍ PHÂN LOẠI:

PHAN_DONG:
• Kêu gọi/chống đối trực tiếp Đảng, Nhà nước, chính quyền Việt Nam
• Xuyên tạc có chủ đích các sự kiện lịch sử quan trọng (30/4, Điện Biên Phủ...)
• Xúc phạm nghiêm trọng lãnh đạo, biểu tượng quốc gia (Bác Hồ, quốc kỳ...)
• Lan truyền thông tin sai lệch về chính trị Việt Nam với mục đích chống phá
• Kích động bạo lực, thù hận đối với chế độ xã hội chủ nghĩa
• Ủng hộ lực lượng thù địch hoặc quan điểm phản động trong bài đăng gốc
• Phủ nhận tính chính danh của chế độ hiện tại

KHONG_PHAN_DONG:
• Phản biện chính sách cụ thể không nhằm phủ nhận chế độ
• Góp ý xây dựng về đường lối, chính sách của Đảng, Nhà nước
• Ủng hộ, bảo vệ chính quyền hoặc các chính sách đúng đắn
• Thảo luận khách quan về vấn đề chính trị-xã hội
• Phê phán rõ ràng các quan điểm phản động trong bài đăng gốc
• Bày tỏ tinh thần yêu nước, đoàn kết dân tộc
• Tranh luận gay gắt nhưng không kêu gọi lật đổ hay bạo lực

KHONG_LIEN_QUAN:
• Hoàn toàn không đề cập đến chính trị, chính sách
• Spam, quảng cáo, emoji không mang hàm ý chính trị
• Nội dung về đời sống cá nhân, thông tin riêng tư
• Trao đổi về giải trí, thể thao không liên quan chính trị
• Bình luận không liên quan đến nội dung chính trị trong bài đăng gốc
• Câu hỏi hoặc nhận xét trung tính không mang tính chính trị

HƯỚNG DẪN XỬ LÝ TRƯỜNG HỢP ĐẶC BIỆT:
• Khi bình luận có cả đặc điểm PHAN_DONG và KHONG_PHAN_DONG, ưu tiên xem xét mức độ nghiêm trọng và ý đồ chống phá
• Phản biện gay gắt nhưng mang tính xây dựng vẫn là KHONG_PHAN_DONG
• Bình luận mỉa mai, châm biếm cần đánh giá dựa trên nội dung thực chất, không chỉ hình thức
• Bình luận liên quan đến bài đăng chính trị nhưng nội dung hoàn toàn trung lập là KHONG_LIEN_QUAN"""

def create_user_content(summary_text: str, comment_text: str) -> str:
    return f"""TÓM TẮT BÀI ĐĂNG:
"{summary_text}"

BÌNH LUẬN CẦN PHÂN LOẠI:
"{comment_text}"
"""

In [9]:
# 2.2. Load Raw Data
try:
    df_full = pd.read_csv(DATA_FILE_PATH)
    print(f"Successfully loaded data from {DATA_FILE_PATH}. Shape: {df_full.shape}")
    if not all(col in df_full.columns for col in [SUMMARY_COLUMN, COMMENT_COLUMN, LABEL_COLUMN]):
        raise ValueError(f"Missing one or more required columns: {SUMMARY_COLUMN}, {COMMENT_COLUMN}, {LABEL_COLUMN}")

    df_full[SUMMARY_COLUMN] = df_full[SUMMARY_COLUMN].fillna('').astype(str)
    df_full[COMMENT_COLUMN] = df_full[COMMENT_COLUMN].fillna('').astype(str)
    df_full[LABEL_COLUMN] = df_full[LABEL_COLUMN].astype(str).str.strip().str.upper() # Normalize labels

    # Validate labels
    invalid_labels = df_full[~df_full[LABEL_COLUMN].isin(POSSIBLE_LABELS)]
    if not invalid_labels.empty:
        print(f"Warning: Found {len(invalid_labels)} rows with invalid labels. Examples: {invalid_labels[LABEL_COLUMN].unique()[:5]}")
        print("These rows will be filtered out.")
        df_full = df_full[df_full[LABEL_COLUMN].isin(POSSIBLE_LABELS)]
        print(f"Shape after filtering invalid labels: {df_full.shape}")

    if df_full.empty:
        raise ValueError("Dataset is empty after loading or filtering. Please check your data.")

except FileNotFoundError:
    print(f"ERROR: Data file not found at {DATA_FILE_PATH}.")
    df_full = pd.DataFrame() # Avoid further errors
except ValueError as ve:
    print(f"ERROR during data loading: {ve}")
    df_full = pd.DataFrame()
except Exception as e:
    print(f"An unexpected error occurred during data loading: {e}")
    df_full = pd.DataFrame()


Successfully loaded data from /home/user/Dataset_ChongPha.csv. Shape: (18912, 4)


In [None]:
#Split Data into Train, Validation, and Test sets (80%, 10%, 10%)
if not df_full.empty:
    print("\nSplitting data into Train (80%), Validation (10%), Test (10%)...")
    df_train, df_temp = train_test_split(
        df_full,
        test_size=0.2,  
        random_state=42,
        stratify=df_full[LABEL_COLUMN]
    )

    df_val, df_test = train_test_split(
        df_temp,
        test_size=0.5,
        random_state=42,
        stratify=df_temp[LABEL_COLUMN]
    )

    print(f"Data split complete:")
    print(f"  Training set:   {len(df_train)} samples ({len(df_train)/len(df_full)*100:.1f}%)")
    print(f"  Validation set: {len(df_val)} samples ({len(df_val)/len(df_full)*100:.1f}%)")
    print(f"  Test set:       {len(df_test)} samples ({len(df_test)/len(df_full)*100:.1f}%)")

    print("\nLabel distribution in Training set:")
    print(df_train[LABEL_COLUMN].value_counts(normalize=True).sort_index())
    print("\nLabel distribution in Validation set:")
    print(df_val[LABEL_COLUMN].value_counts(normalize=True).sort_index())
    print("\nLabel distribution in Test set:")
    print(df_test[LABEL_COLUMN].value_counts(normalize=True).sort_index())

# <<< NEW: Oversample the minority class ('PHAN_DONG') in the training set (df_train) >>>
if not df_train.empty:
    print(f"\nOriginal df_train label distribution:")
    print(df_train[LABEL_COLUMN].value_counts(normalize=True).sort_index())

    # Identify minority and majority classes
    label_counts = df_train[LABEL_COLUMN].value_counts()
    majority_class_count = label_counts.max()
    minority_class = "PHAN_DONG" # Your specific minority class
    minority_class_count = label_counts.get(minority_class, 0)

    if minority_class_count > 0 and minority_class_count < majority_class_count:
        print(f"Oversampling minority class '{minority_class}' from {minority_class_count} to match majority (or a higher target)...")
        # Simple oversampling: duplicate minority samples
        # You can choose a target count, e.g., match the majority, or a fraction of it.
        # Let's aim to bring it closer to the next class or a significant portion of majority.
        # For simplicity, let's try to increase its count, e.g., to be similar to KHONG_PHAN_DONG
        target_count_for_minority = label_counts.get("KHONG_PHAN_DONG", majority_class_count) # Or a fixed number

        df_minority = df_train[df_train[LABEL_COLUMN] == minority_class]

        # Calculate how many times to replicate
        if minority_class_count > 0: # Avoid division by zero
            num_replications = (target_count_for_minority // minority_class_count) -1
            if num_replications < 0: num_replications = 0 # Should not happen if target > current

            df_oversampled_minority = pd.concat([df_minority] * num_replications, ignore_index=True)

            # Add remaining samples to get closer to target_count
            remaining_samples_needed = target_count_for_minority % minority_class_count
            if remaining_samples_needed > 0 and not df_minority.empty:
                df_oversampled_minority = pd.concat([df_oversampled_minority, df_minority.sample(n=remaining_samples_needed, replace=True, random_state=42)], ignore_index=True)

            df_train_oversampled = pd.concat([df_train, df_oversampled_minority], ignore_index=True)

            # Shuffle the oversampled training data
            df_train = df_train_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)
            print(f"\nOversampled df_train label distribution:")
            print(df_train[LABEL_COLUMN].value_counts(normalize=True).sort_index())
    else:
        print(f"Minority class '{minority_class}' not found or no oversampling needed/possible.")
# <<< END OF OVERSAMPLING SECTION >>>



Splitting data into Train (70%), Validation (10%), Test (10%)...
Data split complete:
  Training set:   15129 samples (80.0%)
  Validation set: 1891 samples (10.0%)
  Test set:       1892 samples (10.0%)

Label distribution in Training set:
label
KHONG_LIEN_QUAN    0.528984
KHONG_PHAN_DONG    0.356600
PHAN_DONG          0.114416
Name: proportion, dtype: float64

Label distribution in Validation set:
label
KHONG_LIEN_QUAN    0.528821
KHONG_PHAN_DONG    0.356425
PHAN_DONG          0.114754
Name: proportion, dtype: float64

Label distribution in Test set:
label
KHONG_LIEN_QUAN    0.529070
KHONG_PHAN_DONG    0.356765
PHAN_DONG          0.114165
Name: proportion, dtype: float64

Original df_train label distribution:
label
KHONG_LIEN_QUAN    0.528984
KHONG_PHAN_DONG    0.356600
PHAN_DONG          0.114416
Name: proportion, dtype: float64
Oversampling minority class 'PHAN_DONG' from 1731 to match majority (or a higher target)...

Oversampled df_train label distribution:
label
KHONG_LIEN_QUAN

In [11]:
num_samples_for_zs = min(NUM_SAMPLES_TO_TEST_ZERO_SHOT, len(df_test))
if num_samples_for_zs > 0 :
    df_zero_shot_evaluation_set = df_test.sample(n=num_samples_for_zs, random_state=42)
    print(f"\nUsing {len(df_zero_shot_evaluation_set)} samples from the new Test set for zero-shot baseline evaluation.")
    if not df_zero_shot_evaluation_set.empty:
        print("Label distribution in zero-shot evaluation sample:")
        print(df_zero_shot_evaluation_set[LABEL_COLUMN].value_counts(normalize=True).sort_index())
else:
    print("Test set is too small or NUM_SAMPLES_TO_TEST_ZERO_SHOT is 0, skipping zero-shot sample creation.")
    df_zero_shot_evaluation_set = pd.DataFrame() # Đảm bảo là DataFrame rỗng


Using 1892 samples from the new Test set for zero-shot baseline evaluation.
Label distribution in zero-shot evaluation sample:
label
KHONG_LIEN_QUAN    0.529070
KHONG_PHAN_DONG    0.356765
PHAN_DONG          0.114165
Name: proportion, dtype: float64


In [19]:
# --- Enhanced Sequence Length Analysis for SFT Optimization ---
def analyze_sequence_lengths_comprehensive(df, tokenizer, system_prompt, max_samples=1000):
    """
    Comprehensive analysis of sequence lengths for SFT training optimization.
    """
    print("=" * 60)
    print("COMPREHENSIVE SEQUENCE LENGTH ANALYSIS")
    print("=" * 60)

    # Sample data for analysis
    sample_size = min(len(df), max_samples)
    df_sample = df.sample(n=sample_size, random_state=42)
    print(f"Analyzing {sample_size} samples from {len(df)} total samples...")

    # Initialize containers
    prompt_lengths = []
    completion_lengths = []
    total_lengths = []

    # Component length analysis
    system_lengths = []
    user_content_lengths = []
    assistant_lengths = []

    print("\nAnalyzing individual components...")

    for idx, row in df_sample.iterrows():
        try:
            # Create conversation components
            user_content = create_user_content(row[SUMMARY_COLUMN], row[COMMENT_COLUMN])
            assistant_content = row[LABEL_COLUMN]

            # Create full conversation
            conversation = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content}
            ]

            # Tokenize individual components
            system_tokens = tokenizer.encode(system_prompt, add_special_tokens=False)
            user_tokens = tokenizer.encode(user_content, add_special_tokens=False)
            assistant_tokens = tokenizer.encode(assistant_content, add_special_tokens=False)

            # Store component lengths
            system_lengths.append(len(system_tokens))
            user_content_lengths.append(len(user_tokens))
            assistant_lengths.append(len(assistant_tokens))

            # Full conversation tokenization
            full_input = tokenizer.apply_chat_template(
                conversation,
                add_generation_prompt=False,
                tokenize=True,
                return_tensors="pt"
            )

            # Prompt only (system + user + generation prompt)
            prompt_only = tokenizer.apply_chat_template(
                conversation[:-1],  # Exclude assistant
                add_generation_prompt=True,
                tokenize=True,
                return_tensors="pt"
            )

            prompt_length = prompt_only.shape[1]
            completion_length = len(assistant_tokens)
            total_length = full_input.shape[1]

            prompt_lengths.append(prompt_length)
            completion_lengths.append(completion_length)
            total_lengths.append(total_length)

        except Exception as e:
            print(f"Error processing sample {idx}: {e}")
            continue

    # Convert to numpy arrays for analysis
    prompt_lengths = np.array(prompt_lengths)
    completion_lengths = np.array(completion_lengths)
    total_lengths = np.array(total_lengths)
    system_lengths = np.array(system_lengths)
    user_content_lengths = np.array(user_content_lengths)
    assistant_lengths = np.array(assistant_lengths)

    return {
        'prompt_lengths': prompt_lengths,
        'completion_lengths': completion_lengths,
        'total_lengths': total_lengths,
        'system_lengths': system_lengths,
        'user_content_lengths': user_content_lengths,
        'assistant_lengths': assistant_lengths
    }

def print_length_statistics(lengths, name):
    """Print detailed statistics for length arrays."""
    print(f"\n{name.upper()} STATISTICS:")
    print(f"  Count: {len(lengths)}")
    print(f"  Mean: {np.mean(lengths):.1f}")
    print(f"  Median: {np.median(lengths):.1f}")
    print(f"  Std Dev: {np.std(lengths):.1f}")
    print(f"  Min: {np.min(lengths)}")
    print(f"  Max: {np.max(lengths)}")
    print(f"  Percentiles:")
    percentiles = [50, 75, 90, 95, 98, 99]
    for p in percentiles:
        print(f"    {p}th: {np.percentile(lengths, p):.0f}")

def recommend_max_seq_length(analysis_results, safety_margin=50):
    """Recommend optimal MAX_SEQ_LENGTH_SFT based on analysis."""
    total_lengths = analysis_results['total_lengths']

    print("\n" + "=" * 60)
    print("MAX_SEQ_LENGTH_SFT RECOMMENDATIONS")
    print("=" * 60)

    # Different strategies
    strategies = {
        'Conservative (95th percentile)': np.percentile(total_lengths, 95),
        'Balanced (98th percentile)': np.percentile(total_lengths, 98),
        'Aggressive (99th percentile)': np.percentile(total_lengths, 99),
        'Maximum (100th percentile)': np.max(total_lengths)
    }

    for strategy, value in strategies.items():
        recommended = int(value + safety_margin)
        coverage = np.mean(total_lengths <= recommended) * 100
        print(f"  {strategy}: {recommended} tokens (covers {coverage:.1f}% of data)")

    # Model-specific recommendations
    print(f"\n  MODEL-SPECIFIC CONSIDERATIONS:")
    print(f"    - Vistral-7B context window: ~4096 tokens")
    print(f"    - Memory efficient: 512-1024 tokens")
    print(f"    - Balanced performance: 1024-2048 tokens")
    print(f"    - Maximum context: 2048-4096 tokens")

    # Final recommendation
    recommended_balanced = int(np.percentile(total_lengths, 98) + safety_margin)
    recommended_safe = min(recommended_balanced, 1024)  # Cap for 7B model

    print(f"\n  🎯 RECOMMENDED FOR YOUR SETUP:")
    print(f"    MAX_SEQ_LENGTH_SFT = {recommended_safe}")
    print(f"    (98th percentile + {safety_margin} margin, capped for 7B model)")

    return recommended_safe

# Run comprehensive analysis
if not df_train.empty:
    print("\nStarting comprehensive sequence length analysis...")

    # Create tokenizer for analysis (if not already available)
    if 'tokenizer_sft' not in locals():
        temp_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID, trust_remote_code=True)
        if temp_tokenizer.pad_token is None:
            temp_tokenizer.pad_token = temp_tokenizer.eos_token
    else:
        temp_tokenizer = tokenizer_sft

    # Analyze training data
    analysis_results = analyze_sequence_lengths_comprehensive(
        df_train,
        temp_tokenizer,
        SYSTEM_PROMPT_CONTENT,
        max_samples=1000
    )

    # Print detailed statistics
    print_length_statistics(analysis_results['system_lengths'], "System Prompt")
    print_length_statistics(analysis_results['user_content_lengths'], "User Content")
    print_length_statistics(analysis_results['assistant_lengths'], "Assistant Response")
    print_length_statistics(analysis_results['prompt_lengths'], "Full Prompt")
    print_length_statistics(analysis_results['completion_lengths'], "Completion")
    print_length_statistics(analysis_results['total_lengths'], "Total Sequence")

    # Get recommendation
    recommended_max_seq = recommend_max_seq_length(analysis_results)

    # Update MAX_SEQ_LENGTH_SFT if needed
    if recommended_max_seq != MAX_SEQ_LENGTH_SFT:
        print(f"\n📝 UPDATING MAX_SEQ_LENGTH_SFT:")
        print(f"    From: {MAX_SEQ_LENGTH_SFT}")
        print(f"    To: {recommended_max_seq}")
        MAX_SEQ_LENGTH_SFT = recommended_max_seq

    # Clean up temporary tokenizer
    if 'tokenizer_sft' not in locals():
        del temp_tokenizer

else:
    print("❌ Cannot perform sequence length analysis: Training data is empty")


Starting comprehensive sequence length analysis...


tokenizer_config.json:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/597k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.15M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

COMPREHENSIVE SEQUENCE LENGTH ANALYSIS
Analyzing 1000 samples from 18793 total samples...

Analyzing individual components...

SYSTEM PROMPT STATISTICS:
  Count: 1000
  Mean: 598.0
  Median: 598.0
  Std Dev: 0.0
  Min: 598
  Max: 598
  Percentiles:
    50th: 598
    75th: 598
    90th: 598
    95th: 598
    98th: 598
    99th: 598

USER CONTENT STATISTICS:
  Count: 1000
  Mean: 166.2
  Median: 157.0
  Std Dev: 51.9
  Min: 85
  Max: 436
  Percentiles:
    50th: 157
    75th: 191
    90th: 228
    95th: 257
    98th: 311
    99th: 346

ASSISTANT RESPONSE STATISTICS:
  Count: 1000
  Mean: 7.2
  Median: 8.0
  Std Dev: 1.3
  Min: 5
  Max: 8
  Percentiles:
    50th: 8
    75th: 8
    90th: 8
    95th: 8
    98th: 8
    99th: 8

FULL PROMPT STATISTICS:
  Count: 1000
  Mean: 776.2
  Median: 767.0
  Std Dev: 51.9
  Min: 695
  Max: 1046
  Percentiles:
    50th: 767
    75th: 801
    90th: 838
    95th: 867
    98th: 921
    99th: 956

COMPLETION STATISTICS:
  Count: 1000
  Mean: 7.2
  Median: 8.

In [12]:
# 2.4. Prepare Dataset for Fine-tuning (Conversational Format)
# This function will be used to map your DataFrame to the Hugging Face Dataset format
def create_conversation_for_sft(sample):
    user_content = create_user_content(sample[SUMMARY_COLUMN], sample[COMMENT_COLUMN])
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_CONTENT},
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": sample[LABEL_COLUMN]} # The ground truth label
        ]
    }

In [13]:
train_hf_dataset = Dataset.from_pandas(df_train)
val_hf_dataset = Dataset.from_pandas(df_val)
# df_test cũng sẽ được dùng để tạo test_hf_dataset cho đánh giá cuối cùng của model đã fine-tune
train_sft_formatted = train_hf_dataset.map(
    create_conversation_for_sft,
    remove_columns=train_hf_dataset.column_names
)
val_sft_formatted = val_hf_dataset.map(
    create_conversation_for_sft,
    remove_columns=val_hf_dataset.column_names
)
from datasets import DatasetDict
raw_datasets_sft = DatasetDict({
    'train': train_sft_formatted,
    'validation': val_sft_formatted # Đổi tên key cho tập validation
})
print(f"\nDataset for SFT prepared:")
print(f"SFT Training set size:   {len(raw_datasets_sft['train'])}")
print(f"SFT Validation set size: {len(raw_datasets_sft['validation'])}")
# ... (phần in ví dụ dữ liệu SFT giữ nguyên) ...

Map:   0%|          | 0/18793 [00:00<?, ? examples/s]

Map:   0%|          | 0/1891 [00:00<?, ? examples/s]


Dataset for SFT prepared:
SFT Training set size:   18793
SFT Validation set size: 1891


In [33]:
# Replace the existing interactive testing section with this updated version
# --- Phase 3: Zero-Shot Testing (Baseline) ---
bnb_config_zs = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
        bnb_4bit_use_double_quant=False,
    )

model_for_zs = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     quantization_config=bnb_config_zs,
     device_map="auto",
     trust_remote_code=True
 )
model_for_zs.eval()

tokenizer_for_zs = AutoTokenizer.from_pretrained(TOKENIZER_ID, trust_remote_code=True)
if tokenizer_for_zs.pad_token is None:
     tokenizer_for_zs.pad_token = tokenizer_for_zs.eos_token
print("Model and Tokenizer for Zero-Shot loaded.")

# Interactive Testing on Full Test Dataset
print("\n--- Interactive Testing on Full Test Dataset ---")

if 'ft_model' in locals() and 'ft_tokenizer' in locals() and not df_test.empty:
    print(f"Running interactive testing on {len(df_test)} test samples...")
    
    correct_predictions = 0
    total_predictions = 0
    all_actual_labels = []
    all_predicted_labels = []
    
    for idx, row in df_test.iterrows():
        # Reset conversation for each test sample
        conversation = [{"role": "system", "content": SYSTEM_PROMPT_CONTENT}]
        
        # Get the test data
        post = row[SUMMARY_COLUMN]
        comment = row[COMMENT_COLUMN]
        actual_label = row[LABEL_COLUMN]
        
        # Create user content and add to conversation
        user_content = create_user_content(post, comment)
        conversation.append({"role": "user", "content": user_content})
        
        # Tokenize and generate response
        input_ids = tokenizer_for_zs.apply_chat_template(
            conversation, 
            return_tensors="pt",
            add_generation_prompt=True
        ).to(ft_model.device)

        out_ids = model_for_zs.generate(
            input_ids=input_ids,
            max_new_tokens=768,
            do_sample=True,
            top_p=0.95,
            top_k=40,
            temperature=0.1,
            repetition_penalty=1.05,
            pad_token_id=ft_tokenizer.eos_token_id,
            eos_token_id=ft_tokenizer.eos_token_id
        )
        
        # Decode the assistant response
        assistant_response = tokenizer_for_zs.batch_decode(
            out_ids[:, input_ids.size(1):], 
            skip_special_tokens=True
        )[0].strip()
        
        # Extract predicted label using your existing logic
        extracted_label = "NULL"
        normalized_response = re.sub(r'[^\w_]', '', assistant_response.upper().replace(" ", "_"))
        normalized_response = re.sub(r'_+', '_', normalized_response)
        
        for variant in POSSIBLE_LABELS_VARIANTS_FOR_MATCHING:
            if variant in normalized_response:
                if variant in POSSIBLE_LABELS_NORMALIZED_FOR_MATCHING:
                    original_label_index = POSSIBLE_LABELS_NORMALIZED_FOR_MATCHING.index(variant)
                    extracted_label = POSSIBLE_LABELS[original_label_index]
                else:
                    for pl_original in POSSIBLE_LABELS:
                        if pl_original.upper() == variant:
                            extracted_label = pl_original
                            break
                break
        
        # Track results
        all_actual_labels.append(actual_label)
        all_predicted_labels.append(extracted_label)
        total_predictions += 1
        
        if extracted_label == actual_label:
            correct_predictions += 1
        
        # Print detailed results for ALL samples (removed the <= 10 condition)
        print(f"\nSample {total_predictions}:")
        print(f"  Assistant Response: {assistant_response}")
        print(f"  Predicted: {extracted_label} | Actual: {actual_label} | {'✓' if extracted_label == actual_label else '✗'}")
        
        # Print progress every 50 samples
        if total_predictions % 50 == 0:
            current_accuracy = correct_predictions / total_predictions
            print(f"\n=== Progress Update ===")
            print(f"Processed {total_predictions} samples. Current accuracy: {current_accuracy:.4f}")
            print("=" * 40)
    
    # Final results
    final_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    print(f"\n=== Final Test Results ===")
    print(f"Total samples processed: {total_predictions}")
    print(f"Correct predictions: {correct_predictions}")
    print(f"Test Accuracy: {final_accuracy:.4f}")
    
    # Classification report
    if all_actual_labels and all_predicted_labels:
        print("\nTest Classification Report:")
        print(classification_report(
            all_actual_labels, 
            all_predicted_labels, 
            labels=POSSIBLE_LABELS, 
            target_names=POSSIBLE_LABELS, 
            zero_division=0
        ))
    
else:
    if 'ft_model' not in locals() or 'ft_tokenizer' not in locals():
        print("Fine-tuned model or tokenizer not available. Please run the fine-tuning section first.")
    if df_test.empty:
        print("Test dataset is empty.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Tokenizer for Zero-Shot loaded.

--- Interactive Testing on Full Test Dataset ---
Running interactive testing on 1892 test samples...

Sample 1:
  Assistant Response: KHONG_LIEN_QUAN
  Predicted: KHONG_LIEN_QUAN | Actual: KHONG_LIEN_QUAN | ✓

Sample 2:
  Assistant Response: KHONG_LIEN_QUAN
  Predicted: KHONG_LIEN_QUAN | Actual: KHONG_LIEN_QUAN | ✓

Sample 3:
  Assistant Response: KHONG_PHAN_DONG
  Predicted: KHONG_PHAN_DONG | Actual: KHONG_LIEN_QUAN | ✗

Sample 4:
  Assistant Response: KHONG_LIEN_QUAN
  Predicted: KHONG_LIEN_QUAN | Actual: KHONG_LIEN_QUAN | ✓

Sample 5:
  Assistant Response: PHAN_DONG
  Predicted: PHAN_DONG | Actual: PHAN_DONG | ✓

Sample 6:
  Assistant Response: PHAN_DONG
  Predicted: PHAN_DONG | Actual: KHONG_LIEN_QUAN | ✗

Sample 7:
  Assistant Response: PHAN_DONG
  Predicted: PHAN_DONG | Actual: PHAN_DONG | ✓

Sample 8:
  Assistant Response: KHONG_LIEN_QUAN
  Predicted: KHONG_LIEN_QUAN | Actual: KHONG_PHAN_DONG | ✗

Sample 9:
  Assistant Response: PHAN_DO

In [26]:
torch.cuda.empty_cache()

In [15]:
 # --- Phase 4: Fine-tune LLM using SFTTrainer ---
print("\n--- Phase 4: Fine-tuning LLM with SFTTrainer ---")
model_sft = None
tokenizer_sft = None
trainer_sft = None

if raw_datasets_sft and 'train' in raw_datasets_sft and len(raw_datasets_sft['train']) > 0:
    print(f"Loading base model for SFT: {MODEL_ID}")
    bnb_config_sft = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
    )
    try:
        model_sft = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, quantization_config=bnb_config_sft, device_map="auto",
            trust_remote_code=True
        )
        print("Base model for SFT loaded.")
        tokenizer_sft = AutoTokenizer.from_pretrained(TOKENIZER_ID, trust_remote_code=True)
        if tokenizer_sft.pad_token is None:
            tokenizer_sft.pad_token = tokenizer_sft.eos_token
            print(f"Set tokenizer_sft.pad_token to tokenizer_sft.eos_token: {tokenizer_sft.eos_token}")
        if tokenizer_sft.chat_template is None: # Qwen2 should have this. Fallback if not.
            print("Warning: tokenizer_sft.chat_template is None. Attempting to set a generic ChatML-like template.")
            tokenizer_sft.chat_template = "{% for message in messages %}{% if message['role'] == 'system' %}{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}{% elif message['role'] == 'user' %}{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}{% endfor %}"


        model_sft = prepare_model_for_kbit_training(model_sft)
        print("Model prepared for k-bit training.")
        peft_config = LoraConfig(
            r=32,
            lora_alpha=64,
            lora_dropout=0.02,
            target_modules=[
                  "q_proj",
                  "k_proj",
                  "v_proj",
                  "o_proj",
                  "up_proj",
                  "down_proj",
                  "gate_proj"
              ],
            bias="none",
            task_type="SEQ_CLS"
        )
        print("LoRA config defined.")

        # Dynamic MAX_SEQ_LENGTH_SFT adjustment
        if 'train' in raw_datasets_sft and len(raw_datasets_sft['train']) > 0:
            print("\nAnalyzing token lengths for SFT dataset formatting to set MAX_SEQ_LENGTH_SFT...")
            temp_sft_token_lengths = []
            # Analyze a subset for speed, e.g., first 1000 or 10%
            num_samples_for_len_analysis = min(len(raw_datasets_sft['train']), 1000)
            for i in range(num_samples_for_len_analysis):
                messages = raw_datasets_sft['train'][i]['messages']
                try:
                    formatted_input_ids = tokenizer_sft.apply_chat_template(
                        messages, add_generation_prompt=False, tokenize=True, return_tensors="pt"
                    )
                    temp_sft_token_lengths.append(formatted_input_ids.shape[1])
                except Exception as e: pass # Ignore errors for individual samples during length analysis

            if temp_sft_token_lengths:
                p98_sft_len = int(np.percentile(temp_sft_token_lengths, 98))
                model_abs_max = tokenizer_sft.model_max_length if hasattr(tokenizer_sft, 'model_max_length') and tokenizer_sft.model_max_length < 30000 else 4096
                candidate_max_len = min(p98_sft_len + 20, model_abs_max, 1024) # Add buffer, cap reasonably (e.g. 1024 for 7B)
                if candidate_max_len > MAX_SEQ_LENGTH_SFT:
                    print(f"Adjusting MAX_SEQ_LENGTH_SFT from {MAX_SEQ_LENGTH_SFT} to {candidate_max_len} based on 98th percentile of SFT data.")
                    MAX_SEQ_LENGTH_SFT = candidate_max_len
                else:
                    print(f"Current MAX_SEQ_LENGTH_SFT ({MAX_SEQ_LENGTH_SFT}) is sufficient or data is shorter (98th percentile: {p98_sft_len}).")
            else: print("Could not analyze SFT token lengths. Using default MAX_SEQ_LENGTH_SFT.")
        else: print("No training data for SFT length analysis.")

        early_stopping_callback = EarlyStoppingCallback(
            early_stopping_patience=2,
            early_stopping_threshold=0.01
        )
        training_args_sft = TrainingArguments(
            output_dir=OUTPUT_DIR,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=64,
            gradient_accumulation_steps=1,
            optim="paged_adamw_8bit",
            learning_rate=3e-5,
            num_train_epochs=2,
            warmup_ratio=0.15,
            weight_decay=0.02,
            fp16=True, # Default to FP16, BF16 handled below if supported
            logging_steps=100,
            eval_strategy="steps",
            eval_steps=300,
            save_steps=600,
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=True,
            report_to="tensorboard",
            push_to_hub=False,
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={'use_reentrant':False}
        )

        print("TrainingArguments defined.")

        trainer_sft = SFTTrainer(
            model=model_sft,
            args=training_args_sft,
            train_dataset=raw_datasets_sft["train"],
            eval_dataset=raw_datasets_sft["validation"],
            peft_config=peft_config,
            processing_class=tokenizer_sft,
            callbacks=[early_stopping_callback]
        )
        print("SFTTrainer initialized.")
        print("\nStarting SFT fine-tuning...")
        trainer_sft.train()
        print("SFT Fine-tuning finished.")
        print(f"Saving LoRA adapter to {OUTPUT_DIR}...")
        trainer_sft.save_model(OUTPUT_DIR)
        tokenizer_sft.save_pretrained(OUTPUT_DIR)
        print("LoRA adapter and tokenizer saved.")
    except Exception as e:
        print(f"An error occurred during SFT fine-tuning setup or training: {e}")
        import traceback; traceback.print_exc()
else:
    print("Skipping SFT fine-tuning as formatted training dataset is not available or empty.")



--- Phase 4: Fine-tuning LLM with SFTTrainer ---
Loading base model for SFT: Viet-Mistral/Vistral-7B-Chat


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model for SFT loaded.
Model prepared for k-bit training.
LoRA config defined.

Analyzing token lengths for SFT dataset formatting to set MAX_SEQ_LENGTH_SFT...
Current MAX_SEQ_LENGTH_SFT (1024) is sufficient or data is shorter (98th percentile: 919).
TrainingArguments defined.


Tokenizing train dataset:   0%|          | 0/18793 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/18793 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1891 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1891 [00:00<?, ? examples/s]

SFTTrainer initialized.

Starting SFT fine-tuning...
{'loss': 0.6767, 'grad_norm': 0.7160813212394714, 'learning_rate': 1.677966101694915e-05, 'num_tokens': 2517102.0, 'mean_token_accuracy': 0.8504091376066207, 'epoch': 0.17006802721088435}
{'loss': 0.1401, 'grad_norm': 0.21353553235530853, 'learning_rate': 2.933933933933934e-05, 'num_tokens': 5030349.0, 'mean_token_accuracy': 0.9727194851636887, 'epoch': 0.3401360544217687}
{'loss': 0.1294, 'grad_norm': 0.2509133219718933, 'learning_rate': 2.6336336336336335e-05, 'num_tokens': 7546689.0, 'mean_token_accuracy': 0.9749470943212509, 'epoch': 0.5102040816326531}
{'eval_loss': 0.12593400478363037, 'eval_runtime': 116.9319, 'eval_samples_per_second': 16.172, 'eval_steps_per_second': 0.257, 'eval_num_tokens': 7546689.0, 'eval_mean_token_accuracy': 0.9756657719612122, 'epoch': 0.5102040816326531}
An error occurred during SFT fine-tuning setup or training: "The `metric_for_best_model` training argument is set to 'eval_f1', which is not found i

Traceback (most recent call last):
  File "/venv/main/lib/python3.10/site-packages/transformers/trainer.py", line 3202, in _determine_best_metric
    metric_value = metrics[metric_to_check]
KeyError: 'eval_f1'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/tmp/ipykernel_1736/156904884.py", line 117, in <module>
    trainer_sft.train()
  File "/venv/main/lib/python3.10/site-packages/transformers/trainer.py", line 2237, in train
    return inner_training_loop(
  File "/venv/main/lib/python3.10/site-packages/transformers/trainer.py", line 2660, in _inner_training_loop
    self._maybe_log_save_evaluate(
  File "/venv/main/lib/python3.10/site-packages/transformers/trainer.py", line 3134, in _maybe_log_save_evaluate
    is_new_best_metric = self._determine_best_metric(metrics=metrics, trial=trial)
  File "/venv/main/lib/python3.10/site-packages/transformers/trainer.py", line 3204, in _determine_best_metric
    raise KeyError

In [16]:
        training_args_sft = TrainingArguments(
            output_dir=OUTPUT_DIR,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=64,
            gradient_accumulation_steps=1,
            optim="paged_adamw_8bit",
            learning_rate=3e-5,
            num_train_epochs=2,
            warmup_ratio=0.15,
            weight_decay=0.02,
            fp16=True, # Default to FP16, BF16 handled below if supported
            logging_steps=100,
            eval_strategy="steps",
            eval_steps=300,
            save_steps=600,
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=True,
            report_to="tensorboard",
            push_to_hub=False,
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={'use_reentrant':False}
        )

        print("TrainingArguments defined.")

        trainer_sft = SFTTrainer(
            model=model_sft,
            args=training_args_sft,
            train_dataset=raw_datasets_sft["train"],
            eval_dataset=raw_datasets_sft["validation"],
            peft_config=peft_config,
            processing_class=tokenizer_sft,
            callbacks=[early_stopping_callback]
        )
        print("SFTTrainer initialized.")
        print("\nStarting SFT fine-tuning...")
        trainer_sft.train()
        print("SFT Fine-tuning finished.")
        print(f"Saving LoRA adapter to {OUTPUT_DIR}...")
        trainer_sft.save_model(OUTPUT_DIR)
        tokenizer_sft.save_pretrained(OUTPUT_DIR)
        print("LoRA adapter and tokenizer saved.")

TrainingArguments defined.




Tokenizing train dataset:   0%|          | 0/18793 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/18793 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1891 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1891 [00:00<?, ? examples/s]

SFTTrainer initialized.

Starting SFT fine-tuning...
{'loss': 0.676, 'grad_norm': 0.7796154618263245, 'learning_rate': 1.677966101694915e-05, 'num_tokens': 2517102.0, 'mean_token_accuracy': 0.8506419092416764, 'epoch': 0.17006802721088435}
{'loss': 0.14, 'grad_norm': 0.22205282747745514, 'learning_rate': 2.933933933933934e-05, 'num_tokens': 5030349.0, 'mean_token_accuracy': 0.9727446383237839, 'epoch': 0.3401360544217687}
{'loss': 0.1293, 'grad_norm': 0.2101927250623703, 'learning_rate': 2.6336336336336335e-05, 'num_tokens': 7546689.0, 'mean_token_accuracy': 0.9749638479948044, 'epoch': 0.5102040816326531}
{'eval_loss': 0.12593789398670197, 'eval_runtime': 117.0227, 'eval_samples_per_second': 16.159, 'eval_steps_per_second': 0.256, 'eval_num_tokens': 7546689.0, 'eval_mean_token_accuracy': 0.9756904721260071, 'epoch': 0.5102040816326531}
{'loss': 0.1283, 'grad_norm': 0.25549882650375366, 'learning_rate': 2.3333333333333336e-05, 'num_tokens': 10064048.0, 'mean_token_accuracy': 0.97513140

In [30]:
# Replace the existing interactive testing section with this updated version

# Interactive Testing on Full Test Dataset
print("\n--- Interactive Testing on Full Test Dataset ---")

if 'ft_model' in locals() and 'ft_tokenizer' in locals() and not df_test.empty:
    print(f"Running interactive testing on {len(df_test)} test samples...")
    
    correct_predictions = 0
    total_predictions = 0
    all_actual_labels = []
    all_predicted_labels = []
    
    for idx, row in df_test.iterrows():
        # Reset conversation for each test sample
        conversation = [{"role": "system", "content": SYSTEM_PROMPT_CONTENT}]
        
        # Get the test data
        post = row[SUMMARY_COLUMN]
        comment = row[COMMENT_COLUMN]
        actual_label = row[LABEL_COLUMN]
        
        # Create user content and add to conversation
        user_content = create_user_content(post, comment)
        conversation.append({"role": "user", "content": user_content})
        
        # Tokenize and generate response
        input_ids = ft_tokenizer.apply_chat_template(
            conversation, 
            return_tensors="pt",
            add_generation_prompt=True
        ).to(ft_model.device)

        out_ids = ft_model.generate(
            input_ids=input_ids,
            max_new_tokens=768,
            do_sample=True,
            top_p=0.95,
            top_k=40,
            temperature=0.1,
            repetition_penalty=1.05,
            pad_token_id=ft_tokenizer.eos_token_id,
            eos_token_id=ft_tokenizer.eos_token_id
        )
        
        # Decode the assistant response
        assistant_response = ft_tokenizer.batch_decode(
            out_ids[:, input_ids.size(1):], 
            skip_special_tokens=True
        )[0].strip()
        
        # Extract predicted label using your existing logic
        extracted_label = "NULL"
        normalized_response = re.sub(r'[^\w_]', '', assistant_response.upper().replace(" ", "_"))
        normalized_response = re.sub(r'_+', '_', normalized_response)
        
        for variant in POSSIBLE_LABELS_VARIANTS_FOR_MATCHING:
            if variant in normalized_response:
                if variant in POSSIBLE_LABELS_NORMALIZED_FOR_MATCHING:
                    original_label_index = POSSIBLE_LABELS_NORMALIZED_FOR_MATCHING.index(variant)
                    extracted_label = POSSIBLE_LABELS[original_label_index]
                else:
                    for pl_original in POSSIBLE_LABELS:
                        if pl_original.upper() == variant:
                            extracted_label = pl_original
                            break
                break
        
        # Track results
        all_actual_labels.append(actual_label)
        all_predicted_labels.append(extracted_label)
        total_predictions += 1
        
        if extracted_label == actual_label:
            correct_predictions += 1
        
        # Print detailed results for ALL samples (removed the <= 10 condition)
        print(f"\nSample {total_predictions}:")
        print(f"  Assistant Response: {assistant_response}")
        print(f"  Predicted: {extracted_label} | Actual: {actual_label} | {'✓' if extracted_label == actual_label else '✗'}")
        
        # Print progress every 50 samples
        if total_predictions % 50 == 0:
            current_accuracy = correct_predictions / total_predictions
            print(f"\n=== Progress Update ===")
            print(f"Processed {total_predictions} samples. Current accuracy: {current_accuracy:.4f}")
            print("=" * 40)
    
    # Final results
    final_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    print(f"\n=== Final Test Results ===")
    print(f"Total samples processed: {total_predictions}")
    print(f"Correct predictions: {correct_predictions}")
    print(f"Test Accuracy: {final_accuracy:.4f}")
    
    # Classification report
    if all_actual_labels and all_predicted_labels:
        print("\nTest Classification Report:")
        print(classification_report(
            all_actual_labels, 
            all_predicted_labels, 
            labels=POSSIBLE_LABELS, 
            target_names=POSSIBLE_LABELS, 
            zero_division=0
        ))
    
else:
    if 'ft_model' not in locals() or 'ft_tokenizer' not in locals():
        print("Fine-tuned model or tokenizer not available. Please run the fine-tuning section first.")
    if df_test.empty:
        print("Test dataset is empty.")


--- Interactive Testing on Full Test Dataset ---
Running interactive testing on 1892 test samples...

=== Progress Update ===
Processed 50 samples. Current accuracy: 0.7800

=== Progress Update ===
Processed 100 samples. Current accuracy: 0.7600

=== Progress Update ===
Processed 150 samples. Current accuracy: 0.7867

=== Progress Update ===
Processed 200 samples. Current accuracy: 0.7950

=== Progress Update ===
Processed 250 samples. Current accuracy: 0.8120

=== Progress Update ===
Processed 300 samples. Current accuracy: 0.7867

=== Progress Update ===
Processed 350 samples. Current accuracy: 0.7829

=== Progress Update ===
Processed 400 samples. Current accuracy: 0.7925

=== Progress Update ===
Processed 450 samples. Current accuracy: 0.7933

=== Progress Update ===
Processed 500 samples. Current accuracy: 0.7920

=== Progress Update ===
Processed 550 samples. Current accuracy: 0.7836

=== Progress Update ===
Processed 600 samples. Current accuracy: 0.7817

=== Progress Update ==

# Live predicting

In [None]:
conversation = [{"role": "system", "content": SYSTEM_PROMPT_CONTENT }]
while True:
    post = input("Nội dung bài viết: ")
    comment = input("Bình luận cần đánh giá: ")
    if post.lower() == "reset":
        conversation = [{"role": "system", "content": SYSTEM_PROMPT_CONTENT }]
        print("The chat history has been cleared!")
        continue
    
    conversation.append({"role": "user", "content": create_user_content(post,comment) })
    input_ids = ft_tokenizer.apply_chat_template(conversation, return_tensors="pt").to(ft_model.device)

    out_ids = ft_model.generate(
        input_ids=input_ids,
        max_new_tokens=768,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        temperature=0.1,
        repetition_penalty=1.05,
    )
    assistant = ft_tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()
    print("Assistant: ", assistant)
    conversation.append({"role": "assistant", "content": assistant })