In [1]:
# Notebook kernels do not define __file__; set a placeholder so later cells can
# build paths with Path(__file__). Being a relative name, it resolves against
# the kernel's current working directory.
__file__ = "__init__.py"

In [2]:
import os, sys, re
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
import torch.cuda.amp as amp
import pandas as pd
from pathlib import Path
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Optional

# Project root = parent of the directory that contains `__file__`
# (parents[0] is the containing directory, parents[1] is one level above it).
project_root = Path(__file__).resolve().parents[1]
# Make project-local packages importable from this notebook.
sys.path.append(os.fspath(project_root))

from utils.ncomp import rlst, srlst, clst, glst, rrlst, dtlst, sslst

# Absolute paths to the processed-data directory and to the cleaned dataset.
paths = dict(
    processed=os.path.abspath(f"{project_root}/data/storage/processed"),
    odata=os.path.abspath(f"{project_root}/data/storage/processed/final_cleaning.csv"),
)
# Cleaned dataset, loaded once at module level.
odata = pd.read_csv(paths["odata"])

In [3]:
# ---------------------------
# Neural network with LayerNorm and Dropout, plus inference optimizations (pruning, TorchScript, FP16)
# ---------------------------
class NeuralNetwork(nn.Module):
    """
    Feed-forward network with Layer Normalization and Dropout.

    Layout: (Linear -> LayerNorm -> ReLU -> Dropout) twice, followed by a final
    Linear layer producing ``num_classes`` outputs.

    Args:
        input_size: Number of input features.
        hidden_size: Width of both hidden layers.
        num_classes: Number of output features.
        dropout_rate: Dropout probability used after each hidden layer.
    """
    def __init__(self, input_size, hidden_size, num_classes, dropout_rate=0.3):
        super(NeuralNetwork, self).__init__()
        # Remember the constructor arguments: __getstate__ serializes them so
        # that __setstate__ can rebuild an identical architecture.
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate

        self.l1 = nn.Linear(input_size, hidden_size)
        self.ln1 = nn.LayerNorm(hidden_size)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run ``x`` through both hidden stages and the output layer."""
        out = self.dropout1(self.relu(self.ln1(self.l1(x))))
        out = self.dropout2(self.relu(self.ln2(self.l2(out))))
        return self.l3(out)

    def optimize(self, pruning_amount=0.3, half_precision: Optional[bool] = None):
        """
        Prepare the model for inference and return a TorchScript version of it.

        This mutates ``self`` in place (pruning, eval mode, optional FP16 cast)
        before compiling it.

        Args:
            pruning_amount: Amount passed to ``prune.l1_unstructured`` for the
                ``weight`` of every Linear layer.
            half_precision: Whether to cast the parameters to FP16. ``None``
                (default) enables it only when the parameters live on a CUDA
                device, because an FP16 model cannot consume the float32
                inputs it receives on CPU.

        Returns:
            The scripted module produced by ``torch.jit.script``.
        """
        # Prune every Linear layer, then make the pruning permanent so the
        # mask/original-weight bookkeeping does not end up in the script.
        for module in list(self.modules()):
            if isinstance(module, nn.Linear):
                prune.l1_unstructured(module, name='weight', amount=pruning_amount)
                prune.remove(module, 'weight')

        # Inference only from here on: switch Dropout off before compiling,
        # otherwise the scripted model keeps randomly dropping activations.
        self.eval()

        if half_precision is None:
            half_precision = any(param.is_cuda for param in self.parameters())
        if half_precision:
            self.half()

        # Compile the model with TorchScript.
        return torch.jit.script(self)

    def __getstate__(self):
        """Serialize hyperparameters plus ``state_dict`` instead of the whole module object."""
        return {
            'input_size': self.input_size,
            'hidden_size': self.hidden_size,
            'num_classes': self.num_classes,
            'dropout_rate': self.dropout_rate,
            'state_dict': self.state_dict()
        }

    def __setstate__(self, state):
        """Rebuild the module from the saved hyperparameters, then restore its weights."""
        self.__init__(state['input_size'], state['hidden_size'], state['num_classes'], state['dropout_rate'])
        self.load_state_dict(state['state_dict'])

In [4]:
class ComponentExtractor:
    """
    Extract hardware components (brand, GPU, CPU, RAM, ...) from a free-text query.

    For every component type the query is compared against a list of known
    values using an ensemble of sentence-embedding models (cosine similarity).
    When the best ensemble score reaches the type's threshold the match is
    standardized against the dataset; otherwise fuzzy string matching is used
    as a fallback.

    Args:
        model_names: SentenceTransformer model names, one per ensemble member.
        thresholds: Per-component-type score thresholds (key ``"default"`` is
            the fallback). ``None`` uses the built-in defaults.
        fuzzy_config: Per-component-type fuzzy settings, each a dict with
            ``"score_cutoff"`` and ``"scorer"``. ``None`` uses the defaults.
        similarity_weights: Weights (keys ``"cosine"`` and ``"fuzzy"``) used by
            ``standardize_component``. ``None`` uses the defaults.
        use_nn: Whether embeddings are passed through ``NeuralNetwork`` before
            being compared.
        embedding_dim: Common embedding dimension; models with a different
            output size get a linear projection to this size.
        hidden_size: Hidden width of the embedding network.
        ensemble_weights: One weight per model. ``None`` uses ``[0.6, 0.4]``.
    """
    def __init__(
        self,
        model_names=("all-mpnet-base-v2", "all-MiniLM-L6-v2"),
        thresholds: Dict[str, float] = None,
        fuzzy_config: Dict[str, dict] = None,
        similarity_weights: dict = None,
        use_nn: bool = True,
        embedding_dim: int = 384,
        hidden_size: int = 256,
        ensemble_weights: list = None
        ) -> None:
        # Device selection: GPU when available, CPU otherwise.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load the dataset, the embedding models and the known component lists.
        self.odata = pd.read_csv(paths["odata"])
        self.models = [SentenceTransformer(name) for name in model_names]
        self.components = self._load_components()
        self.df_components = pd.DataFrame.from_dict(self.components, orient="index").transpose()
        self.df_components.columns = [col.lower() for col in self.df_components.columns]

        # Precompute the embeddings of every known component for every model,
        # projected (when needed) into a shared space of size embedding_dim.
        self.desired_embedding_dim = embedding_dim
        self.model_projections = self._create_model_projections()
        self.embeddings = self._precompute_all_embeddings()
        # NOTE: weights are paired with models via zip(), so extra weights or
        # extra models are silently ignored when the lengths differ.
        self.ensemble_weights = ensemble_weights if ensemble_weights is not None else [0.6, 0.4]

        # Optional neural network that refines the embeddings.
        self.use_nn = use_nn
        if self.use_nn:
            net = NeuralNetwork(
                input_size=self.desired_embedding_dim,
                hidden_size=hidden_size,
                num_classes=self.desired_embedding_dim,
                dropout_rate=0.3
            ).to(self.device)
            # The network is used for inference only: disable Dropout before it
            # is compiled so that scores are deterministic.
            net.eval()
            # Pruning, TorchScript compilation and half precision.
            self.embedding_net = net.optimize(pruning_amount=0.3)
            if self.device.type != "cuda":
                # FP16 weights cannot consume the float32 embeddings on CPU.
                self.embedding_net = self.embedding_net.float()
        else:
            self.embedding_net = None

        # Score thresholds per component type.
        # NOTE(review): scores compared against these thresholds are cosine
        # similarities (<= 1), so the value 50 for "gpu"/"cpu" can never be
        # reached and those types always fall through to fuzzy matching. An
        # earlier comment mentioned 0.8 (GPU) and 0.75 (CPU) - confirm intent.
        default_thresholds = {"gpu": 50, "cpu": 50, "default": 0.6}
        self.thresholds = thresholds if thresholds is not None else default_thresholds

        # Default fuzzy-matching configuration.
        default_fuzzy = {
            "brand": {"score_cutoff": 100, "scorer": fuzz.token_sort_ratio},
            "gpu": {"score_cutoff":60, "scorer": fuzz.token_sort_ratio},
            "cpu": {"score_cutoff": 60, "scorer": fuzz.token_sort_ratio},
            "default": {"score_cutoff": 75, "scorer": fuzz.WRatio}
        }
        self.fuzzy_config = fuzzy_config if fuzzy_config is not None else default_fuzzy

        # Weights combining cosine similarity and fuzzy matching.
        default_sim_weights = {"cosine": 0.3, "fuzzy": 0.7}
        self.similarity_weights = similarity_weights if similarity_weights is not None else default_sim_weights

    def _load_components(self) -> dict:
        """Build the known-value list of every component type from odata and the ncomp helpers."""
        components = {
            # Missing brands are dropped so that NaN never reaches .lower().
            "brand": [str(br).lower() for br in self.odata["BRAND"].dropna().unique()],
            "gpu": sorted(glst(), key=len, reverse=False),
            "cpu": sorted(clst(), key=len, reverse=False),
            "ram": sorted(rlst(), key=len, reverse=False),
            "resolution": sorted(srlst(), key=len, reverse=True),
            "refresh rate": sorted(rrlst(), key=len, reverse=False),
            "display type": sorted(dtlst(), key=len, reverse=False),
            "screen size": sorted(sslst(), key=len, reverse=False),
        }
        return components

    def _create_model_projections(self) -> List[Optional[nn.Linear]]:
        """
        Create one projection per model mapping its embedding size to the shared size.

        Returns:
            A list aligned with ``self.models``: ``None`` when the model already
            outputs ``desired_embedding_dim`` features, otherwise a freshly
            initialized (untrained) ``nn.Linear`` on ``self.device``.
        """
        projections = []
        dummy_text = self._clean_text("dummy")
        for model in self.models:
            # Encode a dummy sentence only to discover the model's output size.
            current_dim = model.encode(dummy_text, convert_to_tensor=True).shape[-1]
            if current_dim != self.desired_embedding_dim:
                projections.append(nn.Linear(current_dim, self.desired_embedding_dim).to(self.device))
            else:
                projections.append(None)
        return projections

    def _embed(self, model, proj: Optional[nn.Linear], texts) -> torch.Tensor:
        """Encode ``texts`` (a string or list of strings) and project the result into the shared space."""
        # Move to self.device first: that is where the projection lives.
        emb = model.encode(texts, convert_to_tensor=True).to(self.device)
        if proj is not None:
            # Inference only - do not record an autograd graph.
            with torch.no_grad():
                emb = proj(emb)
        return emb

    def _refine(self, emb: torch.Tensor) -> torch.Tensor:
        """Pass ``emb`` through the embedding network when ``use_nn`` is set, else return it unchanged."""
        if not self.use_nn:
            return emb
        on_cuda = self.device.type == "cuda"
        # Mixed precision is only enabled on GPU.
        with torch.no_grad(), torch.amp.autocast(device_type=self.device.type, enabled=on_cuda):
            return self.embedding_net(emb)

    def _precompute_all_embeddings(self) -> List[Dict[str, torch.Tensor]]:
        """
        Embed every known component with every model.

        Returns:
            A list aligned with ``self.models``; each entry maps a component
            type to a ``(len(values), desired_embedding_dim)`` tensor on
            ``self.device``.
        """
        embeddings = []
        for model, proj in zip(self.models, self.model_projections):
            model_emb = {}
            for comp_type, comp_list in self.components.items():
                if not comp_list:
                    # Nothing to encode; keep a correctly shaped placeholder.
                    model_emb[comp_type] = torch.empty(
                        (0, self.desired_embedding_dim), device=self.device
                    )
                    continue
                cleaned = [self._clean_text(text) for text in comp_list]
                model_emb[comp_type] = self._embed(model, proj, cleaned)
            embeddings.append(model_emb)
        return embeddings

    def _clean_text(self, text: str=None) -> str:
        """Lower-case ``text``, turn ``-`` and ``/`` into spaces, drop other punctuation and collapse whitespace."""
        text = str(text)
        text = text.lower()
        text = re.sub(r"[\-/]", " ", text)
        text = re.sub(r"[^a-z0-9\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def _ensemble_scores(self, scores_list: list) -> torch.Tensor:
        """Return the weighted sum of per-model score tensors using ``self.ensemble_weights``."""
        weighted = sum(weight * score for weight, score in zip(self.ensemble_weights, scores_list))
        return weighted

    def _fuzzy_match(self, candidates: list, query: str, comp_type: str) -> Optional[str]:
        """
        Return the candidate with the highest fuzzy score against ``query``.

        The scorer and cutoff come from ``self.fuzzy_config[comp_type]`` (or its
        ``"default"`` entry). Returns ``None`` when no candidate reaches the cutoff.
        """
        # Fall back to an empty config when neither the type nor "default" is configured.
        config = self.fuzzy_config.get(comp_type, self.fuzzy_config.get("default")) or {}
        cutoff = config.get("score_cutoff", 60)
        scorer = config.get("scorer") or fuzz.WRatio

        best_candidate = None
        best_score = 0
        for cand in candidates:
            score = scorer(query, cand)
            if score > best_score:
                best_score = score
                best_candidate = cand
        if best_score >= cutoff:
            return best_candidate
        return None

    def standardize_component(self, comp_type: str, candidate: str) -> str:
        """
        Map ``candidate`` to the closest canonical value found in the dataset.

        Canonical values are the unique non-null entries of the odata column
        named ``comp_type.upper()``. Each one is scored with a weighted mix of
        cosine similarity (first model, unprojected) and ``fuzz.ratio``.

        Returns:
            The best canonical name when its score reaches the threshold for
            ``comp_type``; otherwise ``candidate`` unchanged.
        """
        candidate_clean = self._clean_text(candidate)
        column = comp_type.upper()
        if column in self.odata.columns:
            canonical_list = self.odata[column].dropna().unique().tolist()
        else:
            canonical_list = [candidate]

        best_score = 0.0
        best_name = candidate
        model = self.models[0]
        emb_candidate = model.encode(candidate_clean, convert_to_tensor=True).to(self.device)
        for name in canonical_list:
            name_clean = self._clean_text(name)
            emb_name = model.encode(name_clean, convert_to_tensor=True).to(self.device)
            cosine_sim = torch.cosine_similarity(emb_candidate, emb_name, dim=0).item()
            fuzzy_score = fuzz.ratio(candidate_clean, name_clean) / 100.0
            weighted_score = (self.similarity_weights["cosine"] * cosine_sim +
                            self.similarity_weights["fuzzy"] * fuzzy_score)
            # Debug: print the candidate's score against this canonical name.
            print(f"[DEBUG] Candidate '{candidate}' vs '{name}': cosine={cosine_sim:.3f}, fuzzy={fuzzy_score:.3f}, weighted={weighted_score:.3f}")
            if weighted_score > best_score:
                best_score = weighted_score
                best_name = name

        threshold = self.thresholds.get(comp_type, self.thresholds.get("default", 0.60))
        print(f"[DEBUG] Best candidate for '{candidate}' in {comp_type} is '{best_name}' with score {best_score:.3f} (threshold {threshold})")
        return best_name if best_score >= threshold else candidate

    def extract_components(self, query: str) -> dict:
        """
        Extract one value per component type from ``query``.

        Returns:
            A dict keyed by component type. Each value is the standardized
            embedding match, the fuzzy-match fallback, or ``None`` when nothing
            matched (or the type has no known values).
        """
        processed_question = self._clean_text(query)
        # The query embedding (and its refined form) does not depend on the
        # component type, so compute it once per model.
        refined_queries = [
            self._refine(self._embed(model, proj, processed_question).unsqueeze(0)).squeeze(0)
            for model, proj in zip(self.models, self.model_projections)
        ]

        result = {}
        for comp_type, known_values in self.components.items():
            if not known_values:
                result[comp_type] = None
                continue

            scores_all = []
            for emb_dict, refined_query in zip(self.embeddings, refined_queries):
                refined_known = self._refine(emb_dict[comp_type])
                scores = torch.cosine_similarity(refined_known, refined_query.unsqueeze(0), dim=1)
                scores_all.append(scores)

            combined_scores = self._ensemble_scores(scores_all)
            best_score, best_idx = torch.max(combined_scores, dim=0)
            threshold = self.thresholds.get(comp_type, self.thresholds.get("default", 0.60))
            candidate = known_values[best_idx.item()] if best_score.item() >= threshold else None

            if candidate:
                # Standardize the name against odata with the combined similarity.
                result[comp_type] = self.standardize_component(comp_type, candidate)
            else:
                # Embedding match too weak: fall back to fuzzy matching on the raw query.
                result[comp_type] = self._fuzzy_match(known_values, query, comp_type)
        return result

In [None]:
# extractor = ComponentExtractor()