In [169]:
# Define __file__ by hand (presumably because it is not set when this code runs
# as notebook cells -- confirm). Being a relative name, it is resolved against
# the current working directory by the Path(__file__).resolve() call further down.
__file__ = "__init__.py"

In [170]:
import sys, os, shutil, warnings
import json5
from pathlib import Path
import pandas as pd
from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    TrainerCallback
)

# Silence every warning category for the whole process.
warnings.filterwarnings("ignore")

# Project root = the parent of the directory that contains __file__.
project_root = Path(__file__).resolve().parents[1]
# Put the project root on the import path; the `utils` import below presumably
# relies on this -- confirm against the repository layout.
sys.path.append(str(project_root))

from utils.qgene import generate_text

# from nlp.nlp_model import NLPModel

In [171]:
# Locations used by this module, given relative to the project root and turned
# into absolute paths when the `paths` mapping is built.
_PROJECT_RELATIVE_LOCATIONS = (
    ("processed", "data/storage/processed"),
    ("qfragments", "intents/qfragments.json"),
    ("questions", "intents/questions.csv"),
    ("trained_questions", "intents/trained_questions.csv"),
    ("models", "models/t5-small"),
    ("results", "training/results"),
)

paths = {
    name: os.path.abspath(f"{project_root}/{relative}")
    for name, relative in _PROJECT_RELATIVE_LOCATIONS
}

# The final-model and checkpoint directories must exist before anything is saved.
for _output_key in ("models", "results"):
    os.makedirs(paths[_output_key], exist_ok=True)

In [None]:
class TokenizerSaver(TrainerCallback):
    """Trainer callback that writes the tokenizer into each checkpoint directory."""

    def __init__(self, tokenizer):
        # Tokenizer whose files are written on every save event.
        self.tokenizer = tokenizer

    def on_save(self, args, state, control, **kwargs):
        """Save the tokenizer under ``<output_dir>/checkpoint-<global_step>``.

        Returns the ``control`` object unchanged.
        """
        checkpoint_name = "checkpoint-" + str(state.global_step)
        destination = Path(args.output_dir).joinpath(checkpoint_name)
        self.tokenizer.save_pretrained(destination)
        return control

class NLPModel:
    """Fine-tune a T5 model that maps a question to a structured spec string.

    Construction loads the question fragments, extends the accumulated question
    set with freshly generated rows, picks the starting weights (latest valid
    checkpoint or the stock ``t5-small``) and tokenizes a train/eval split.
    ``_nlptraining`` then runs the Hugging Face ``Trainer`` and stores the final
    model and tokenizer under ``paths["models"]``.
    """

    # (label written into the target string, column read from the question row)
    _TARGET_FIELDS = (
        ("BRAND", "brand"),
        ("PRICE", "price"),
        ("RAM", "ram"),
        ("GPU", "gpu"),
        ("CPU", "cpu"),
        ("DISPLAY", "display"),
        ("REFRESH_RATE", "refresh rate"),
    )

    # Files written by the tokenizer's save_pretrained that a checkpoint must contain.
    _TOKENIZER_FILES = (
        "tokenizer_config.json",
        "special_tokens_map.json",
        "spiece.model",
    )

    # At least one of these weight files must be present in a usable checkpoint.
    _WEIGHT_FILES = ("pytorch_model.bin", "model.safetensors")

    def __init__(self, datanum: int = 1000) -> None:
        """Load data, model and tokenizer, then build the tokenized datasets.

        Args:
            datanum: Number of new questions requested from ``generate_text``.
        """
        # Load data (the context manager closes the file handle after parsing).
        with open(paths["qfragments"], encoding="utf-8") as fragments_file:
            self.qfragments = json5.load(fragments_file)
        self.questions = self._dataset(datanum)

        # Model setup
        self.model_checkpoint = self._load_checkpoint()
        self.tokenizer = T5Tokenizer.from_pretrained(self.model_checkpoint)
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_checkpoint)
        self.data_collator = DataCollatorForSeq2Seq(self.tokenizer, model=self.model)

        # Initialize datasets
        self._prepare_datasets()

    def _dataset(self, datanum: int = 1000) -> pd.DataFrame:
        """Return previously stored questions plus ``datanum`` new ones.

        The combined frame is written back to ``paths["trained_questions"]``,
        so the stored question set grows with every call.
        """
        if os.path.exists(paths["trained_questions"]):
            old_data = pd.read_csv(paths["trained_questions"])
            new_questions = pd.concat([old_data, generate_text(datanum)])
        else:
            new_questions = generate_text(datanum)

        new_questions.to_csv(paths["trained_questions"], index=False)
        return new_questions

    def _rows_preprocessor(self, row: pd.Series) -> dict:
        """Turn one question row into a seq2seq ``input_text``/``target_text`` pair.

        The target is a single line of ``LABEL: value;`` entries separated by one
        space. Every field, PRICE included, ends with ``;`` so the output can be
        split unambiguously, and no newlines or indentation leak into the target.
        """
        target_text = " ".join(
            f"{label}: {row[column]};" for label, column in self._TARGET_FIELDS
        )
        return {
            "input_text": f"{row['question']}",
            "target_text": target_text,
        }

    def _tokenize_function(self, examples):
        """Tokenize a batch of examples and attach loss-ready labels.

        Args:
            examples: Batch mapping with ``input_text`` and ``target_text`` lists.

        Returns:
            The encoder tokenization with an added ``labels`` entry.
        """
        tokenized = self.tokenizer(
            examples["input_text"],
            max_length=512,
            truncation=True,
            padding="max_length",
        )
        labels = self.tokenizer(
            text_target=examples["target_text"],
            max_length=128,
            truncation=True,
            padding="max_length",
        )
        # Labels are padded to a fixed length here, so the padding positions must
        # be set to -100 explicitly; otherwise the loss is also computed on them.
        pad_id = self.tokenizer.pad_token_id
        tokenized["labels"] = [
            [token_id if token_id != pad_id else -100 for token_id in sequence]
            for sequence in labels["input_ids"]
        ]
        return tokenized

    def _prepare_datasets(self):
        """Build the tokenized 80/20 train/eval split (fixed seed 42)."""
        processed_data = self.questions.apply(self._rows_preprocessor, axis=1)
        df_processed = pd.DataFrame(list(processed_data))
        full_dataset = Dataset.from_pandas(df_processed)
        split_datasets = full_dataset.train_test_split(test_size=0.2, seed=42)
        self.tokenized_train = split_datasets["train"].map(
            self._tokenize_function,
            batched=True,
            remove_columns=split_datasets["train"].column_names,
        )

        self.tokenized_eval = split_datasets["test"].map(
            self._tokenize_function,
            batched=True,
            remove_columns=split_datasets["test"].column_names,
        )

    @staticmethod
    def _checkpoint_step(ckpt_path):
        """Return the step number of ``checkpoint-<step>``, or None if not numeric."""
        suffix = ckpt_path.name.split("-")[-1]
        return int(suffix) if suffix.isdigit() else None

    @staticmethod
    def _is_valid_checkpoint(ckpt_path: Path) -> bool:
        """Tell whether a directory holds config, weights and all tokenizer files."""
        has_config = (ckpt_path / "config.json").exists()
        # generation_config.json alone does not count: it carries no weights.
        has_weights = any(
            (ckpt_path / name).exists() for name in NLPModel._WEIGHT_FILES
        )
        has_tokenizer = all(
            (ckpt_path / name).exists() for name in NLPModel._TOKENIZER_FILES
        )
        return has_config and has_weights and has_tokenizer

    def _load_checkpoint(self, resume_checkpoint: str = "auto") -> str:
        """Choose the location the model and tokenizer are loaded from.

        Args:
            resume_checkpoint: ``"auto"`` selects the valid checkpoint with the
                highest step number in ``paths["results"]``. Any other value is
                tried first as a path and then as a directory name inside
                ``paths["results"]``.

        Returns:
            The checkpoint directory as a string, or ``"t5-small"`` when no
            valid checkpoint is found.
        """
        results_dir = Path(paths["results"])

        if resume_checkpoint == "auto":
            # Newest first; directories without a numeric suffix sort as step 0.
            checkpoint_dirs = sorted(
                results_dir.glob("checkpoint-*"),
                key=lambda ckpt: self._checkpoint_step(ckpt) or 0,
                reverse=True,
            )
            for ckpt_dir in checkpoint_dirs:
                if ckpt_dir.is_dir() and self._is_valid_checkpoint(ckpt_dir):
                    print(f"Auto-selected valid checkpoint: {ckpt_dir}")
                    return str(ckpt_dir)

            print("No valid checkpoints found in auto mode, using default t5-small")
            return "t5-small"

        manual_ckpt = Path(resume_checkpoint)
        if manual_ckpt.is_dir() and self._is_valid_checkpoint(manual_ckpt):
            return str(manual_ckpt)

        # The results directory is consulted even when the given path does not
        # exist, so a bare name such as "checkpoint-50" can be resolved.
        print(f"Manual checkpoint {manual_ckpt} is invalid, checking in results directory...")
        in_results = results_dir / resume_checkpoint
        if in_results.is_dir() and self._is_valid_checkpoint(in_results):
            return str(in_results)

        print(f"Checkpoint {resume_checkpoint} is invalid or corrupted. Reverting to t5-small")
        return "t5-small"

    def _del_checkpoint(self, keep: int = 3) -> None:
        """Delete all but the ``keep`` highest-numbered checkpoint directories.

        Only directories named ``checkpoint-<digits>`` inside ``paths["results"]``
        are considered.

        Args:
            keep: Number of most recent checkpoints to leave on disk.

        Raises:
            ValueError: If ``keep`` is negative.
        """
        if keep < 0:
            raise ValueError("keep must be non-negative")

        checkpoints = sorted(
            (
                ckpt
                for ckpt in Path(paths["results"]).glob("checkpoint-*")
                if ckpt.is_dir() and self._checkpoint_step(ckpt) is not None
            ),
            key=self._checkpoint_step,
        )
        # Explicit end index: a plain [:-keep] slice would be empty for keep == 0.
        for checkpoint in checkpoints[: max(len(checkpoints) - keep, 0)]:
            shutil.rmtree(checkpoint)
            print(f"Removed old checkpoint: {checkpoint}")

    def _nlptraining(
        self,
        early_stopping_patience: int = 3,
        resume_checkpoint: str = None,
        batch_size: int = 16,
        learning_rate: float = 3e-5,
        num_train_epochs: int = 1,
    ) -> None:
        """Fine-tune the model and save it with its tokenizer to ``paths["models"]``.

        Args:
            early_stopping_patience: Passed to ``EarlyStoppingCallback``.
            resume_checkpoint: Stored in ``TrainingArguments.resume_from_checkpoint``.
            batch_size: Per-device training batch size.
            learning_rate: Learning rate passed to ``TrainingArguments``.
            num_train_epochs: Number of training epochs.
        """
        training_args = TrainingArguments(
            output_dir=str(paths["results"]),
            overwrite_output_dir=True,
            report_to="none",
            logging_strategy="steps",
            evaluation_strategy="steps",
            save_strategy="steps",
            logging_steps=25,
            eval_steps=25,
            save_steps=25,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            weight_decay=0.01,
            gradient_accumulation_steps=2,
            fp16=True,
            load_best_model_at_end=True,
            # The tracked metric is the evaluation loss, where lower is better.
            # greater_is_better=True would make best-model selection and early
            # stopping favour the highest loss.
            metric_for_best_model="loss",
            greater_is_better=False,
            eval_accumulation_steps=1,
            # NOTE(review): train() below is called without resume_from_checkpoint,
            # so this value does not appear to restore optimizer/scheduler state;
            # the weights were already loaded in __init__. Confirm this is intended.
            resume_from_checkpoint=resume_checkpoint,
            save_safetensors=False,
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.tokenized_train,
            eval_dataset=self.tokenized_eval,
            data_collator=self.data_collator,
            callbacks=[
                EarlyStoppingCallback(early_stopping_patience=early_stopping_patience),
                TokenizerSaver(self.tokenizer),
            ],
        )

        self.trainer.train()
        self.model.save_pretrained(paths["models"])
        self.tokenizer.save_pretrained(paths["models"])

# Build the wrapper with 1000 newly generated questions, then run fine-tuning.
model = NLPModel(datanum=1000)
model._nlptraining(resume_checkpoint="auto")

In [None]:
# import sys, os, shutil, warnings, re
# import json5
# from pathlib import Path
# import pandas as pd
# from datasets import Dataset
# from transformers import (
#     T5Tokenizer,
#     T5ForConditionalGeneration,
#     Trainer,
#     TrainingArguments,
#     DataCollatorForSeq2Seq,
#     EarlyStoppingCallback,
#     TrainerCallback
# )
# import torch
# warnings.filterwarnings("ignore")

# project_root = Path(__file__).resolve().parents[1]
# sys.path.append(str(project_root))

# from utils.qgene import generate_text

# paths = {
#     "processed": os.path.abspath(f"{project_root}/data/storage/processed"),
#     "qfragments": os.path.abspath(f"{project_root}/intents/qfragments.json"),
#     "questions": os.path.abspath(f"{project_root}/intents/questions.csv"),
#     "trained_questions": os.path.abspath(f"{project_root}/intents/trained_questions.csv"),
#     "models": os.path.abspath(f"{project_root}/models/t5-small"),
#     "results": os.path.abspath(f"{project_root}/training/results"),
# }

# os.makedirs(paths["models"], exist_ok=True) 
# os.makedirs(paths["results"], exist_ok=True)

# class TokenizerSaver(TrainerCallback):
#     def __init__(self, tokenizer):
#         self.tokenizer = tokenizer

#     def on_save(self, args, state, control, **kwargs):
#         ckpt_dir = Path(args.output_dir) / f"checkpoint-{state.global_step}"
#         self.tokenizer.save_pretrained(ckpt_dir)
#         return control

# class NLPModel:
#     def __init__(self, datanum: int = 1000, mode: str = "train") -> None:
#         """
#         Nếu mode == "train": tạo dataset, load checkpoint (nếu có) và thiết lập model để huấn luyện.
#         Nếu mode == "usage": load model đã lưu từ paths["models"] và chỉ sử dụng hàm inference.
#         """
#         self.mode = mode
#         # Xác định device (GPU nếu có)
#         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
#         if self.mode == "train":
#             # Load dữ liệu
#             self.qfragments = json5.load(open(paths["qfragments"]))
#             self.questions = self._dataset(datanum)
            
#             # Huấn luyện: load checkpoint (từ checkpoint đã lưu hay revert về t5-small)
#             self.model_checkpoint = self._loading(resume_checkpoint="auto")
#             self.tokenizer = T5Tokenizer.from_pretrained(self.model_checkpoint)
#             self.model = T5ForConditionalGeneration.from_pretrained(self.model_checkpoint)
#             self.model.to(self.device)
#             if self.model_checkpoint != "t5-small":
#                 self.checkpoint_used = self.model_checkpoint
#                 print(f"Tiếp tục huấn luyện từ checkpoint: {self.checkpoint_used}")
#             else:
#                 self.checkpoint_used = None
#                 print("Huấn luyện từ đầu (không có checkpoint hợp lệ)")
#         elif self.mode == "usage":
#             # Chế độ sử dụng: load model và tokenizer đã lưu
#             self.tokenizer = T5Tokenizer.from_pretrained(paths["models"])
#             self.model = T5ForConditionalGeneration.from_pretrained(paths["models"])
#             self.model.to(self.device)
#             print("Đang sử dụng model đã lưu.")
#         else:
#             raise ValueError("Mode phải là 'train' hoặc 'usage'.")

#         # Các bước sau chỉ áp dụng cho chế độ train
#         if self.mode == "train":
#             self.data_collator = DataCollatorForSeq2Seq(self.tokenizer, model=self.model)
#             self._prepare_datasets()
            
#         # Thêm các token đặc biệt (áp dụng cho cả 2 mode)
#         special_tokens = ["BRAND:", "PRICE:", "RAM:", "GPU:", "CPU:", "DISPLAY:", "REFRESH_RATE:", ";", "none"]
#         self.tokenizer.add_tokens(special_tokens)
#         self.model.resize_token_embeddings(len(self.tokenizer))

#     def _dataset(self, datanum: int = 1000) -> pd.DataFrame:
#         if os.path.exists(paths["trained_questions"]):
#             old_data = pd.read_csv(paths["trained_questions"])
#             new_questions = pd.concat([old_data, generate_text(datanum)])
#         else:
#             new_questions = generate_text(datanum)
        
#         new_questions.to_csv(paths["trained_questions"], index=False)
#         return new_questions
    
#     def _rows_preprocessor(self, row: pd.Series) -> dict:
#         return {
#             "input_text": f"Extract component: {row['question']}",
#             "target_text": (
#                 f"BRAND: {row['brand']}; "
#                 f"PRICE: {row['price']}; "
#                 f"RAM: {row['ram']}; "
#                 f"GPU: {row['gpu']}; "
#                 f"CPU: {row['cpu']}; "
#                 f"DISPLAY: {row['display']}; "
#                 f"REFRESH_RATE: {row['refresh rate']}"
#             ).replace("  ", " ").strip(),
#         }

#     def _tokenize_function(self, examples):
#         tokenized = self.tokenizer(
#             examples["input_text"],
#             max_length=512,
#             truncation=True,
#             padding="max_length",
#         )
#         with self.tokenizer.as_target_tokenizer():
#             labels = self.tokenizer(
#                 examples["target_text"],
#                 max_length=128,
#                 truncation=True,
#                 padding="max_length",
#             )
#         tokenized["labels"] = labels["input_ids"]
#         return tokenized

#     def _prepare_datasets(self):
#         processed_data = self.questions.apply(self._rows_preprocessor, axis=1)
#         df_processed = pd.DataFrame(list(processed_data))
#         full_dataset = Dataset.from_pandas(df_processed)
#         split_datasets = full_dataset.train_test_split(test_size=0.2, seed=42)
#         self.tokenized_train = split_datasets["train"].map(
#             self._tokenize_function,
#             batched=True,
#             remove_columns=split_datasets["train"].column_names,
#         )
#         self.tokenized_eval = split_datasets["test"].map(
#             self._tokenize_function,
#             batched=True,
#             remove_columns=split_datasets["test"].column_names,
#         )

#     def _loading(self, resume_checkpoint: str = "auto") -> str:
#         """
#         Hàm _loading tương tự như _load_checkpoint cũ, dùng để kiểm tra và load checkpoint hợp lệ.
#         """
#         def is_valid_checkpoint(ckpt_path: Path) -> bool:
#             tokenizer_files = {
#                 "tokenizer_config.json",
#                 "special_tokens_map.json",
#                 "spiece.model"
#             }
#             model_files = {
#                 "config.json",
#                 "pytorch_model.bin",      
#                 "model.safetensors",      
#                 "generation_config.json"    
#             }
#             has_model_files = any((ckpt_path / f).exists() for f in model_files if f != "config.json")
#             has_config = (ckpt_path / "config.json").exists()
#             has_tokenizer = all((ckpt_path / f).exists() for f in tokenizer_files)
#             return has_config and has_model_files and has_tokenizer

#         if resume_checkpoint == "auto":
#             checkpoint_dirs = sorted(
#                 Path(paths["results"]).glob("checkpoint-*"), 
#                 key=lambda x: int(x.name.split("-")[-1]) if x.name.split("-")[-1].isdigit() else 0,
#                 reverse=True
#             )
#             for ckpt_dir in checkpoint_dirs:
#                 if ckpt_dir.is_dir() and is_valid_checkpoint(ckpt_dir):
#                     print(f"Auto-selected valid checkpoint: {ckpt_dir}")
#                     return str(ckpt_dir)
#             print("No valid checkpoints found in auto mode, using default t5-small")
#             return "t5-small"

#         manual_ckpt = Path(resume_checkpoint)
#         if manual_ckpt.exists():
#             if manual_ckpt.is_dir() and is_valid_checkpoint(manual_ckpt):
#                 return str(manual_ckpt)
#             print(f"Manual checkpoint {manual_ckpt} is invalid, checking in results directory...")
#             manual_ckpt = Path(paths["results"]) / resume_checkpoint
#             if manual_ckpt.exists() and manual_ckpt.is_dir() and is_valid_checkpoint(manual_ckpt):
#                 return str(manual_ckpt)
#         print(f"Checkpoint {resume_checkpoint} is invalid or corrupted. Reverting to t5-small")
#         return "t5-small"

#     def _del_checkpoint(self) -> None:
#         checkpoint_dirs = list(Path(paths["results"]).glob("checkpoint-*"))
#         checkpoints = sorted(
#             [
#                 ckpt
#                 for ckpt in checkpoint_dirs
#                 if ckpt.is_dir() and ckpt.name.split("-")[-1].isdigit()
#             ],
#             key=lambda x: int(x.name.split("-")[-1]),
#         )
#         if len(checkpoints) > 3:
#             for checkpoint in checkpoints[:-3]:
#                 shutil.rmtree(checkpoint)
#                 # Không cần thông báo khi xóa checkpoint

#     def _nlptraining(
#         self,
#         early_stopping_patience: int = 3,
#         resume_checkpoint: str = None,
#         batch_size: int = 16,
#         learning_rate: float = 3e-5,
#         num_train_epochs: int = 1,
#     ) -> None:
#         training_args = TrainingArguments(
#             output_dir=str(paths["results"]),
#             overwrite_output_dir=True,
#             report_to="none",          
#             logging_strategy="steps", 
#             evaluation_strategy="steps",   
#             save_strategy="steps",                           
#             logging_steps=25,              
#             eval_steps=25,                 
#             save_steps=25,           
#             learning_rate=learning_rate,
#             per_device_train_batch_size=batch_size,
#             num_train_epochs=num_train_epochs,
#             weight_decay=0.01,
#             gradient_accumulation_steps=2,
#             fp16=True,
#             load_best_model_at_end=True,
#             greater_is_better=True,
#             eval_accumulation_steps=1,
#             resume_from_checkpoint=resume_checkpoint,
#             save_safetensors=False,
#         )

#         self.trainer = Trainer(
#             model=self.model,
#             args=training_args,
#             train_dataset=self.tokenized_train,
#             eval_dataset=self.tokenized_eval,
#             data_collator=self.data_collator,
#             callbacks=[
#                 EarlyStoppingCallback(early_stopping_patience=early_stopping_patience),
#                 TokenizerSaver(self.tokenizer),  
#             ],
#         )
            
#         self.trainer.train()
#         self.model.save_pretrained(paths["models"])
#         self.tokenizer.save_pretrained(paths["models"])
#         # Sau khi huấn luyện, xóa các checkpoint cũ nếu cần
#         self._del_checkpoint()

#     def inference(self, text: str) -> dict:
#         """
#         Nhận vào 1 chuỗi văn bản và trả về dict chứa các component:
#         { 'cpu': '...', 'gpu': '...', 'display': '...', 'ram': '...', 'brand': '...', 'price': '...', 'refresh_rate': '...' }
#         Nếu một component không được phát hiện thì giá trị là None.
#         """
#         self.model.eval()  # Đưa model vào chế độ eval
#         input_text = "Extract component: " + text
#         inputs = self.tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
#         inputs = inputs.to(self.device)  # Chuyển tensor đầu vào sang cùng device với model
        
#         outputs = self.model.generate(
#             inputs,
#             max_length=128,
#             num_beams=5,
#             early_stopping=True
#         )
#         output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
#         pattern = r"([A-Z_]+):\s*([^;]+)"
#         matches = re.findall(pattern, output_text)
        
#         components = ["BRAND", "PRICE", "RAM", "GPU", "CPU", "DISPLAY", "REFRESH_RATE"]
#         result = { comp.lower(): None for comp in components }
        
#         for key, value in matches:
#             key = key.strip().upper()
#             value = value.strip() if value.strip() else None
#             if key in components:
#                 result[key.lower()] = value
#         return result

# # Ví dụ sử dụng cho chế độ train:
# if __name__ == "__main__":
#     # Chạy chế độ train:
#     model_train = NLPModel(300, mode="train")
#     model_train._nlptraining(resume_checkpoint="auto")
    
#     # Chạy chế độ usage (sau khi đã có model đã lưu)
#     model_usage = NLPModel(mode="usage")
#     sample_text = "recommend me a laptop have rtx 3070, ram 16GB, about 2000 USD"
#     inference_result = model_usage.inference(sample_text)
#     print("Inference result:", inference_result)
