In [58]:
# NOTE(review): presumably a workaround for notebooks not defining __file__ — confirm.
# Because the value is a bare relative name, a later Path(__file__).resolve() resolves
# it against the current working directory.
__file__ = "__init__.py"

In [75]:
import os, sys
# from train_ner import *
from pathlib import Path
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np  
from torch.utils.data import Dataset, DataLoader
from rapidfuzz import fuzz
from nltk.tokenize import word_tokenize

In [60]:
# Project root: two levels up from __file__ (parents[0] is the directory that
# contains __file__, parents[1] is that directory's parent).
project_root = Path(__file__).resolve().parents[1]

# Make the project-local `utils` package importable. The membership guard keeps the
# cell idempotent: re-running it no longer appends a duplicate entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from utils.ncomp import rlst, srlst, clst, glst, rrlst, dtlst, sslst
from utils.nlp_utils import tokenize, stem, bag_of_words

# Absolute locations of the data, configuration and model artefacts used by this notebook.
paths = {
    "processed": os.path.abspath(f"{project_root}/data/storage/processed"),
    "odata": os.path.abspath(f"{project_root}/data/storage/processed/final_cleaning.csv"),
    "config" : os.path.abspath(f"{project_root}/config/model_config.json"),
    "models": os.path.abspath(f"{project_root}/models"),
}

In [None]:
class NeuralNetwork(nn.Module):
    """Feed-forward classifier: three linear layers with a ReLU after the first two."""

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # input -> hidden -> hidden -> class scores
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output of the last linear layer; no activation is applied to it."""
        hidden = self.relu(self.l1(x))
        hidden = self.relu(self.l2(hidden))
        return self.l3(hidden)

class ChatDataset(Dataset):
    """Dataset that pairs the feature at each index with the label at the same index."""

    def __init__(self, X_train, Y_train):
        # The sample count is taken from the features once, at construction time.
        self.n_samples = len(X_train)
        self.x_data, self.y_data = X_train, Y_train

    def __len__(self):
        return self.n_samples

    def __getitem__(self, index):
        features = self.x_data[index]
        target = self.y_data[index]
        return features, target

In [62]:
def build_training_data():
    """
    Build the (text, label) pairs used to train the canonical-vs-alias classifier.

    Canonical names are the distinct non-null values of the relevant columns of the
    CSV at ``paths["odata"]``, converted to lower-case strings. Candidate names come
    from the per-component helpers (``glst``, ``clst``, ...); per the original notes
    these return canonical names mixed with aliases.

    A candidate that also appears among the canonical names of its component's column
    is labelled canonical (1); every other candidate is labelled alias (0).

    Returns:
        tuple[list[str], list[int]]: the lower-cased candidate names and their labels,
        in the iteration order of the component map.
    """
    # The CSV holds only the canonical name of each component.
    odata = pd.read_csv(paths["odata"])

    # CSV column that stores the canonical names of each component type.
    col_map = {
        "brand": "BRAND",
        "gpu": "GPU",
        "cpu": "CPU",
        "ram": "RAM",
        "resolution": "RESOLUTION",
        "refresh rate": "REFRESH RATE",
        "display type": "DISPLAY TYPE",
        "screen size": "SCREEN SIZE"
    }

    # Callables that produce the candidate names (canonical + alias) per component type.
    func_map = {
        "brand": lambda: [str(br).lower() for br in odata["BRAND"].dropna().unique().tolist()],
        "gpu": glst,
        "cpu": clst,
        "ram": rlst,
        "resolution": srlst,
        "refresh rate": rrlst,
        "display type": dtlst,
        "screen size": sslst
    }

    X_text = []
    Y_label = []
    for comp, func in func_map.items():
        if comp in col_map:
            canonical_values = odata[col_map[comp]].dropna().unique().tolist()
        else:
            canonical_values = []
        # A set gives constant-time membership tests in the inner loop.
        canonical_names = {str(value).lower() for value in canonical_values}

        for name in func():
            # Normalise candidates exactly like the canonical values (str + lower) so
            # that a non-string candidate is compared instead of raising.
            name_lower = str(name).lower()
            X_text.append(name_lower)
            Y_label.append(1 if name_lower in canonical_names else 0)
    return X_text, Y_label


In [63]:
def build_vocabulary(tokenized_sentences):
    """
    Build the vocabulary from tokenized sentences.

    Every token of every sentence is passed through ``stem``; the distinct results
    are returned as a sorted list.
    """
    stems = {stem(token) for tokens in tokenized_sentences for token in tokens}
    return sorted(stems)

In [64]:
def train_model(num_epochs=1000, learning_rate=0.001, hidden_size=8, batch_size=4):
    """
    Train the canonical-vs-alias classifier on bag-of-words features.

    The training pairs come from ``build_training_data()``; each name is tokenized
    with ``tokenize`` and encoded with ``bag_of_words`` against the vocabulary
    returned by ``build_vocabulary``.

    Args:
        num_epochs: number of passes over the training set.
        learning_rate: learning rate handed to the Adam optimizer.
        hidden_size: hidden-layer size handed to ``NeuralNetwork``.
        batch_size: mini-batch size handed to the ``DataLoader`` (default 4).

    Returns:
        tuple: ``(model, vocabulary)`` — the trained network and the vocabulary
        list its input features are indexed by.

    Raises:
        ValueError: if ``build_training_data()`` yields no samples.
    """
    X_text, Y_label = build_training_data()
    if not X_text:
        # Fail early with a clear message instead of building a zero-input network.
        raise ValueError("build_training_data() returned no samples; nothing to train on.")

    tokenized_sentences = [tokenize(sentence) for sentence in X_text]
    vocabulary = build_vocabulary(tokenized_sentences)
    input_size = len(vocabulary)
    output_size = 2  # two classes: canonical (1) and alias (0)

    print(f"Vocabulary size: {input_size}")
    print(f"Training samples: {len(X_text)}")

    # Force float32 so the features always match the dtype of the nn.Linear weights,
    # whatever numeric dtype bag_of_words produces.
    X_train = np.array(
        [bag_of_words(tokens, vocabulary) for tokens in tokenized_sentences],
        dtype=np.float32,
    )
    Y_train = np.array(Y_label)

    X_train = torch.from_numpy(X_train)
    Y_train = torch.from_numpy(Y_train).long()  # CrossEntropyLoss expects int64 targets

    dataset = ChatDataset(X_train, Y_train)
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

    model = NeuralNetwork(input_size, hidden_size, output_size)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        running_loss = 0.0
        for words, labels in dataloader:
            outputs = model(words)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so the
            # epoch figure is a true per-sample mean even when the last batch is short.
            running_loss += loss.item() * labels.size(0)

        if (epoch + 1) % 100 == 0:
            # Report the mean loss over the whole epoch rather than the loss of
            # whichever mini-batch happened to come last.
            epoch_loss = running_loss / len(dataset)
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

    print("Huấn luyện hoàn tất.")
    return model, vocabulary

In [65]:
# Seed PyTorch's global RNG before training: weight initialisation and the
# DataLoader's shuffling both draw from it, so without a seed every kernel restart
# produces a different model.
torch.manual_seed(42)
model, vocab = train_model(num_epochs=100, learning_rate=0.001, hidden_size=16)

Vocabulary size: 266
Training samples: 965
Epoch [100/100], Loss: 0.0000
Huấn luyện hoàn tất.


In [76]:
def get_canonical_name(input_name: str, canonical_list: list, similarity_threshold: int = 80) -> str:
    """
    Map ``input_name`` (an alias) to the closest entry of ``canonical_list``.

    Similarity is ``fuzz.token_sort_ratio`` computed on the lower-cased strings. The
    best-scoring entry (the earliest one on ties) is returned with its original
    casing when its score reaches ``similarity_threshold``; otherwise ``input_name``
    is returned unchanged.
    """
    # Sentinel entry: keeps input_name as the fallback when the list is empty and
    # mirrors the initial "best score" of -1.
    scored = [(-1, input_name)]
    scored.extend(
        (fuzz.token_sort_ratio(input_name.lower(), candidate.lower()), candidate)
        for candidate in canonical_list
    )
    # max() returns the first maximal element, so earlier entries win ties.
    top_score, top_candidate = max(scored, key=lambda pair: pair[0])
    return top_candidate if top_score >= similarity_threshold else input_name

def predict_canonical(model, vocab, test_sentence: str, canonical_list: list, similarity_threshold: int = 80):
    """
    Classify ``test_sentence`` with ``model`` and resolve it to a canonical name.

    The sentence is featurised with the same pipeline ``train_model`` uses
    (``tokenize`` on the lower-cased text, then ``bag_of_words`` against ``vocab``)
    so the features seen at inference match the ones seen during training.

    Args:
        model: trained classifier; called with a float32 tensor of shape (1, len(vocab)).
        vocab: vocabulary list returned by ``train_model``.
        test_sentence: the name to resolve.
        canonical_list: canonical names to match against.
        similarity_threshold: minimum score for ``get_canonical_name`` to accept a match.

    Returns:
        The result of ``get_canonical_name(test_sentence, canonical_list, similarity_threshold)``.
    """
    tokens = tokenize(test_sentence.lower())
    features = np.asarray(bag_of_words(tokens, vocab), dtype=np.float32)
    test_tensor = torch.from_numpy(features).unsqueeze(0)  # add the batch dimension

    # Pure inference: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(test_tensor)
    _, predicted = torch.max(output, dim=1)
    _predicted_label = predicted.item()  # 1: canonical, 0: alias

    # NOTE(review): both labels are resolved through the same lookup. An alias needs
    # mapping, and a predicted-canonical name is still mapped so the value comes back
    # in the exact formatting of the canonical list. The predicted label therefore
    # does not change the result yet.
    return get_canonical_name(test_sentence, canonical_list, similarity_threshold)



In [79]:
# Canonical GPU names: the distinct non-null values of the CSV's GPU column.
odata = pd.read_csv(paths["odata"])
canonical_gpu = odata["GPU"].dropna().unique().tolist()

# Resolve a lower-case alias against the canonical GPU names. If the printed name
# equals the input, no canonical entry reached the similarity threshold.
test_sentence = "rtx 4070"
canonical_name = predict_canonical(model, vocab, test_sentence, canonical_gpu, similarity_threshold=80)
print(f"Input: {test_sentence} => Tên chuẩn: {canonical_name}")

Input: rtx 4070 => Tên chuẩn: rtx 4070


In [70]:
# Display the loaded frame for inspection (bare last expression -> rich display).
odata

Unnamed: 0,PRICE,BRAND,DEVICE,WARRANTY PERIOD,TYPE,WEIGHT,WIDTH,HEIGHT,THICKNESS,SCREEN SIZE,...,GPU TEXTURE RATE,GPU PIXEL RATE,GPU RENDER OUTPUT UNITS (ROPS),GPU TEXTURE MAPPING UNITS (TMUS),GPU FLOATING-POINT PERFORMANCE,CPU PASSMARK RESULT,CPU PASSMARK RESULT (SINGLE),CPU GEEKBENCH 6 RESULT (MULTI),CPU GEEKBENCH 6 RESULT (SINGLE),GPU PASSMARK (G3D) RESULT
0,1893.53,Acer,Acer ConceptD 5 (2023) 16 Inches Intel Core i7...,1,"Gaming, Productivity",2.40,3.58,2.62,0.19,16.0,...,248.20,112.80,80,176,15.88,26130,3582,13360,2531,18479
1,1752.17,Acer,Acer Nitro 16 (2023) AMD Ryzen 5 7640HS 4.3GHz...,1,"Gaming, Productivity",2.80,3.60,2.79,0.27,16.0,...,204.80,113.76,48,80,12.13,22983,3630,9150,2290,17148
2,1611.15,Acer,Acer Nitro 16 (2023) AMD Ryzen 7 7840HS 3.8GHz...,1,"Gaming, Productivity",2.70,3.60,2.79,0.27,16.0,...,204.80,113.76,48,80,12.13,28905,3784,11004,2367,17148
3,1743.39,Acer,Acer Nitro 16 (2023) AMD Ryzen 7 7840HS 3.8GHz...,1,"Gaming, Productivity",2.80,3.60,2.79,0.27,16.0,...,227.52,113.76,48,96,14.56,28905,3784,11004,2367,17710
4,1665.49,Acer,Acer Nitro 16 (2023) AMD Ryzen 9 7940HS 4GHz /...,1,"Gaming, Productivity",2.80,3.60,2.79,0.27,16.0,...,313.20,104.40,48,144,20.04,30388,3904,11560,2460,19574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,1726.15,XMG,XMG Neo 17 (2024) 17 Inches Intel Core i9-1490...,2,"Gaming, Productivity",2.80,3.81,2.72,0.27,17.0,...,227.52,113.76,48,96,14.56,45715,4300,15988,2711,17710
545,2229.76,XMG,XMG Pro 15 (2023) 15.6 Inches Intel Core i9-13...,2,"Gaming, Productivity",2.40,3.58,2.40,0.24,15.6,...,313.20,104.40,48,144,20.04,44115,4121,15722,2712,19574
546,1747.64,XMG,XMG Pro 16 Studio (2024) 16 Inches Intel Core ...,2,"Gaming, Productivity",2.30,3.59,2.73,0.19,16.0,...,227.52,113.76,48,96,14.56,45715,4300,15988,2711,17710
547,1662.40,XMG,XMG Pro 17 (2023) 17.3 Inches Intel Core i9-13...,2,"Gaming, Productivity",2.80,3.96,2.62,0.24,17.3,...,313.20,104.40,48,144,20.04,44115,4121,15722,2712,19574


In [68]:
# model = HybridModel(categorical_columns, numerical_columns, embedding_dim=8)
# if "laptop_evaluator.pth" not in os.listdir(paths["models"]):
#     print("Model not found. Training a new model...")
#     model_train()

# model.load_state_dict(
#     torch.load(
#         os.path.join(paths["models"], "laptop_evaluator.pth"), weights_only=False
#     )
# )
# model.eval()

# def Searching(query, top_k=5):
#     filtered = df[df["DEVICE"].str.contains(query, case=False)].copy()
    
#     if filtered.empty:
#         return filtered
#     numerical_input = torch.tensor(filtered[numerical_columns].values, dtype=torch.float)
#     categorical_input = torch.tensor(filtered[categorical_columns].values, dtype=torch.long)

#     model.eval()
#     with torch.no_grad():
#         outputs = model(numerical_input, categorical_input)
#         predicted_scores = outputs.squeeze().numpy()

#     filtered["PREDICTED_SCORE"] = predicted_scores
#     sorted_filtered = filtered.sort_values(
#         by=["PREDICTED_SCORE", "CPU PASSMARK RESULT", "GPU PASSMARK (G3D) RESULT"],
#         ascending=[False, False, False]
#     ).head(top_k)

#     return sorted_filtered

In [69]:
# recommendations = Searching("RTX 3060", top_k=10)
# for idx, laptop in recommendations.iterrows():
#     print(f"{laptop['DEVICE']} - Score: {laptop['SCORE']:.1f}")