In [None]:
# import thư viện
import os, json, joblib
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBRanker

# Lấy đường dẫn tới thư mục chứa mã nguồn
__file__ = os.getcwd()
base_dir = Path(__file__).resolve().parents[0]
storage_dir = os.path.join(base_dir, "data", "storage")
intents_dir = os.path.join(base_dir, "intents")
paths = {"processed": os.path.join(storage_dir, "processed"),}
dataset = pd.read_csv(os.path.join(paths["processed"], "final_preprocessing.csv"))
os.makedirs("Models", exist_ok=True)
with open(os.path.join(intents_dir, "config.json"), "r") as file:
    config = json.load(file)

# Lọc các cột sử dụng
df = dataset[config["usage_columns"]].copy()
numerical_features = []

# Label Encoding cho BRAND
brand_encoder = LabelEncoder()
df["BRAND_encoded"] = brand_encoder.fit_transform(df["BRAND"])
joblib.dump(brand_encoder, "Models/brand_encoder.pkl")

# Tạo từ điển ánh xạ GPU và CPU sang ID
gpu_to_id = {gpu: idx for idx, gpu in enumerate(df["GPU"].unique())}
cpu_to_id = {cpu: idx for idx, cpu in enumerate(df["CPU"].unique())}

# Ánh xạ GPU và CPU sang ID
df["GPU_id"] = df["GPU"].map(gpu_to_id)
df["CPU_id"] = df["CPU"].map(cpu_to_id)

# Lưu từ điển ánh xạ GPU và CPU sang ID
joblib.dump(gpu_to_id, "Models/gpu_to_id.pkl")
joblib.dump(cpu_to_id, "Models/cpu_to_id.pkl")

# Tính điểm số
df["SCORE"] = 0.5 * df["CPU PASSMARK RESULT"] + 0.5 * df["GPU PASSMARK (G3D) RESULT"]

# Xây dựng mô hình
class LaptopRanker(nn.Module):
    def __init__(self, num_brands, num_gpus, num_cpus, embedding_dim=8):
        super().__init__()
        # Embedding layers
        self.brand_embedding = nn.Embedding(num_brands, embedding_dim)
        self.gpu_embedding = nn.Embedding(num_gpus, embedding_dim)
        self.cpu_embedding = nn.Embedding(num_cpus, embedding_dim)
        # Fully connected layers
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim * 3 + len(numerical_features), 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        )

    def forward(self, brand_ids, gpu_ids, cpu_ids, numerical_features):
        # Embedding lookup
        brand_emb = self.brand_embedding(brand_ids)
        gpu_emb = self.gpu_embedding(gpu_ids)
        cpu_emb = self.cpu_embedding(cpu_ids)
        # Kết hợp các features
        x = torch.cat([brand_emb, gpu_emb, cpu_emb, numerical_features], dim=1)
        return self.fc(x)

# Chuẩn hóa dữ liệu
for feature in config["usage_columns"]:
    if feature not in ["DEVICE", "BRAND", "GPU", "CPU"]:
        numerical_features.append(feature)
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])
final_features = numerical_features + ["BRAND_encoded", "GPU_id", "CPU_id"]

# Chuẩn bị dữ liệu
X = df[final_features]
y = df["SCORE"]  # Hoặc kết hợp thêm metrics khác

# Define groups (for simplicity, assuming each row is a separate group)
group = np.ones(len(X), dtype=int)

# Train model
model = XGBRanker(objective="rank:pairwise", tree_method="hist", random_state=42)
model.fit(X, y, group=group)

# output theo que query
def rank_laptops(query, top_k=5):
    # Lọc laptop theo query (vd: "gaming", "content creation")
    filtered = df[df["DEVICE"].str.contains(query, case=False)]

    # Dự đoán điểm số
    filtered["PREDICTED_SCORE"] = model.predict(filtered[final_features])

    # Sắp xếp theo ưu tiên
    return filtered.sort_values(
        by=["PREDICTED_SCORE", "CPU PASSMARK RESULT", "GPU PASSMARK (G3D) RESULT"],
        ascending=[False, False, False],
    ).head(top_k)

# Get recommendations for gaming laptops under $3000
recommendations = rank_laptops("RTX 3080", top_k=5)
for index, laptop in recommendations.iterrows():
    print(f"{laptop['DEVICE']} - Score: {laptop['SCORE']:.1f}")


In [None]:
import os, json5, joblib, re
from configloading import load
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from xgboost import XGBRanker
from sklearn.metrics import mean_absolute_error

__file__ = os.getcwd()

base_dir = Path(__file__).resolve().parents[0]
storage_dir = os.path.join(base_dir, "data", "storage")
intents_dir = os.path.join(base_dir, "intents")
paths = {
    "processed": os.path.join(storage_dir, "processed"),
    "config": os.path.join(intents_dir, "config.json"),
    "intents": os.path.join(intents_dir, "intents.json"),
    }

dataset = pd.read_csv(os.path.join(paths["processed"], "final_preprocessing.csv"))
os.makedirs("Models", exist_ok=True)

In [28]:
encoder_features, numerical_features = load(JsonFilePath=paths["config"])
usage_columns = encoder_features + numerical_features
df = dataset[usage_columns].copy()

encoder = LabelEncoder()
for feature in encoder_features:
    if feature != "DEVICE":
        df[f"{feature}_encoded"] = encoder.fit_transform(df.loc[:, feature])
        joblib.dump(encoder, "Models/brand_encoder.pkl")

# # Chuẩn hóa các features numericaL
# for feature in usage_columns:
#     if feature != "DEVICE":
#         numerical_features.append(feature)
# scaler = StandardScaler()
# df[numerical_features] = scaler.fit_transform(
#     df[usage_columns].loc[:, numerical_features]
# )
df

Unnamed: 0,BRAND,DEVICE,CPU,GPU,TYPE,RESOLUTION,DISPLAY TYPE,HAS A TOUCH SCREEN,PRICE,WEIGHT,...,GPU FLOATING-POINT PERFORMANCE,CPU PASSMARK RESULT,GPU PASSMARK (G3D) RESULT,BRAND_encoded,CPU_encoded,GPU_encoded,TYPE_encoded,RESOLUTION_encoded,DISPLAY TYPE_encoded,HAS A TOUCH SCREEN_encoded
0,Acer,Acer ConceptD 5 (2023) 16 Inches Intel Core i7...,Intel Core i7 12700H,Nvidia GeForce RTX 3070 Ti,"Gaming, Productivity",3072 x 1920,"LED-backlit, IPS",No,37848000.0,2.40,...,15.88,26130,18479,0,42,18,0,7,12,0
1,Acer,Acer Nitro 16 (2023) AMD Ryzen 5 7640HS 4.3GHz...,AMD Ryzen 5 7640HS,Nvidia GeForce RTX 4050,"Gaming, Productivity",1920 x 1200,"LCD, LED-backlit, IPS",No,33246000.0,2.80,...,12.13,22983,17148,0,1,22,0,1,10,0
2,Acer,Acer Nitro 16 (2023) AMD Ryzen 7 7840HS 3.8GHz...,AMD Ryzen 7 7840HS,Nvidia GeForce RTX 4050,"Gaming, Productivity",2560 x 1600,"LCD, IPS",No,32319000.0,2.70,...,12.13,28905,17148,0,8,22,0,4,7,0
3,Acer,Acer Nitro 16 (2023) AMD Ryzen 7 7840HS 3.8GHz...,AMD Ryzen 7 7840HS,Nvidia GeForce RTX 4060,"Gaming, Productivity",1920 x 1200,"LCD, LED-backlit, IPS",No,32139000.0,2.80,...,14.56,28905,17710,0,8,23,0,1,10,0
4,Acer,Acer Nitro 16 (2023) AMD Ryzen 9 7940HS 4GHz /...,AMD Ryzen 9 7940HS,Nvidia GeForce RTX 4070,"Gaming, Productivity",2560 x 1600,"LCD, LED-backlit, IPS",No,33124000.0,2.80,...,20.04,30388,19574,0,17,24,0,4,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,XMG,XMG Neo 17 (2024) 17 Inches Intel Core i9-1490...,Intel Core i9 14900HX,Nvidia GeForce RTX 4060,"Gaming, Productivity",2560 x 1600,IPS,No,33907000.0,2.80,...,14.56,45715,17710,12,69,23,0,4,0,0
545,XMG,XMG Pro 15 (2023) 15.6 Inches Intel Core i9-13...,Intel Core i9 13900HX,Nvidia Geforce RTX 4070,"Gaming, Productivity",2560 x 1440,IPS,No,44053000.0,2.40,...,20.04,44115,19574,12,65,27,0,3,0,0
546,XMG,XMG Pro 16 Studio (2024) 16 Inches Intel Core ...,Intel Core i9 14900HX,Nvidia GeForce RTX 4060,"Gaming, Productivity",2560 x 1600,IPS,No,32677000.0,2.30,...,14.56,45715,17710,12,69,23,0,4,0,0
547,XMG,XMG Pro 17 (2023) 17.3 Inches Intel Core i9-13...,Intel Core i9 13900HX,Nvidia GeForce RTX 4070,"Gaming, Productivity",2560 x 1440,IPS,No,32356000.0,2.80,...,20.04,44115,19574,12,65,24,0,3,0,0


In [None]:
# dataset[dataset["CPU"].str.contains("i9 13", case=False)]["CPU"].unique()
# dataset["CPU"].unique()

array(['Intel Core i9 13900HX', 'Intel Core i9 13980HX',
       'Intel Core i9 13900H', 'Intel Core i9 13905H',
       'Intel Core i9 13900HK', 'Intel Core i9 13950HX'], dtype=object)

In [None]:
def extract_cpu_info(text):
    cpu_info = {"brand": None, "family": None, "generation": None, "suffix": None}
    text = text.lower().replace("-", " ").replace("  ", " ").strip()
    # Pattern cho Intel Core và Ultra
    intel_pattern = r"""
        (?:intel\s+)?                   # Brand (optional)
        (core\s+(?:i\d+|ultra\s+\d+))   # Family (Core iX/Core Ultra X)
        \s+                             # Separator
        (\d{3,5})                       # SKU (3-5 digits)
        ([a-z]{1,2})                    # Suffix (1-2 letters)
    """
    match = re.search(intel_pattern, text, re.VERBOSE | re.IGNORECASE)
    if match:
        # Xác định brand
        cpu_info["brand"] = "intel"
        # Xử lý family
        family = match.group(1).replace("core ", "").strip()
        cpu_info["family"] = f"Core {family}"
        # Xử lý generation từ SKU
        sku = match.group(2)
        cpu_info["generation"] = int(sku[:2])  # Luôn lấy 2 số đầu
        # Xử lý suffix
        cpu_info["suffix"] = match.group(3).upper()

    return cpu_info


# Test cases
test_cases = [
    "Intel Core i9-13980HX",
    "Intel Core i7 11800H",
    "Core i5 1240P",
    "Intel Core Ultra 9 185H",
    "Core Ultra 7 165U",
    "Intel Core i7 1185G7",
    "Intel Core i7 1165G7",
]

for text in test_cases:
    print(f"{text}: {extract_cpu_info(text)}")

Intel Core i9-13980HX: {'brand': 'intel', 'family': 'Core i9', 'generation': 13, 'suffix': 'HX'}
Intel Core i7 11800H: {'brand': 'intel', 'family': 'Core i7', 'generation': 11, 'suffix': 'H'}
Core i5 1240P: {'brand': 'intel', 'family': 'Core i5', 'generation': 12, 'suffix': 'P'}
Intel Core Ultra 9 185H: {'brand': 'intel', 'family': 'Core ultra 9', 'generation': 18, 'suffix': 'H'}
Core Ultra 7 165U: {'brand': 'intel', 'family': 'Core ultra 7', 'generation': 16, 'suffix': 'U'}
Intel Core i7 1185G7: {'brand': 'intel', 'family': 'Core i7', 'generation': 11, 'suffix': 'G'}
Intel Core i7 1165G7: {'brand': 'intel', 'family': 'Core i7', 'generation': 11, 'suffix': 'G'}


In [None]:
category_features = [
    "BRAND", "DEVICE", "CPU", "GPU", "TYPE", "RESOLUTION",
    "DISPLAY TYPE", "HAS A TOUCH SCREEN"
    ]
for feature in category_features:
    load(JsonFilePath=paths["config"], Categorical=feature)

numerical_features = [
    "PRICE", "WEIGHT", "WIDTH", "HEIGHT", "THICKNESS", "SCREEN SIZE",'REFRESH RATE',
    'BATTERY SIZE', 'RAM', 'RAM SPEED','CPU CORES', 'CPU THREADS', 'CPU SPEED', 
    'TURBO CLOCK SPEED','CPU CLOCK MULTIPLIER', 'VRAM OF GPU', 'GPU CLOCK SPEED',
    'GPU TURBO', 'SEMICONDUCTOR SIZE', 'GPU MEMORY SPEED','GPU NUMBER OF TRANSISTORS', 
    'GPU THERMAL DESIGN POWER (TDP)', "GPU MEMORY BUS WIDTH", "GPU EFFECTIVE MEMORY SPEED",
    "GPU MAXIMUM MEMORY BANDWIDTH", "GPU SHADING UNITS", "GPU TEXTURE RATE","GPU PIXEL RATE",
    "GPU RENDER OUTPUT UNITS (ROPS)", "GPU TEXTURE MAPPING UNITS (TMUS)",
    "GPU FLOATING-POINT PERFORMANCE", "CPU PASSMARK RESULT", "GPU PASSMARK (G3D) RESULT"
    ]
for feature in numerical_features:
    load(JsonFilePath=paths["config"], Numerical=feature)

In [7]:


df = pd.DataFrame(
    {"BRAND": ["ASUS", "DELL", "HP", "ASUS"], "CPU": ["i7", "i5", "i9", "i7"]}
)

df_encoded = pd.get_dummies(df, columns=["BRAND", "CPU"])
print(df_encoded)

   BRAND_ASUS  BRAND_DELL  BRAND_HP  CPU_i5  CPU_i7  CPU_i9
0        True       False     False   False    True   False
1       False        True     False    True   False   False
2       False       False      True   False   False    True
3        True       False     False   False    True   False


In [None]:


df["BRAND"] = LabelEncoder().fit_transform(df["BRAND"])
df["CPU"] = LabelEncoder().fit_transform(df["CPU"])
print(df)

   BRAND  CPU
0      0    1
1      1    0
2      2    2
3      0    1


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.DataFrame(
    {
        "DESCRIPTION": [
            "ASUS gaming laptop with RTX 4080",
            "DELL business laptop with i7",
        ]
    }
)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["DESCRIPTION"]).toarray()

df_tfidf = pd.DataFrame(X, columns=vectorizer.get_feature_names_out())
print(df_tfidf)

       4080      asus  business      dell    gaming        i7  laptop  \
0  0.446656  0.446656  0.000000  0.000000  0.446656  0.000000  0.3178   
1  0.000000  0.000000  0.499221  0.499221  0.000000  0.499221  0.3552   

        rtx    with  
0  0.446656  0.3178  
1  0.000000  0.3552  


In [11]:


# # Tạo các features GPU và CPU theo các brand cụ thể
# def create_device_ids(row):
#     device = row["DEVICE"].strip().lower()
#     parts = device.split("_")
#     gpu_id = int(parts[-1]) if len(parts) > 0 else -1
#     cpu_id = int(parts[-2]) if len(parts) >= 2 else -1

#     # Tính ID cho GPU và CPU
#     # Ví dụ: "nvidia-pow Jackson" có GPU là nvidia, CPU là nvidia-pow
#     unique_gpus = df[usage_columns].loc[:, "BRAND"].unique()

#     for gpu in unique_gpus:
#         if gpu in parts:
#             gpu_id = list(df[usage_columns].loc[:, "BRAND"].str.contains(gpu))
#             break

#     unique_cpus = df[usage_columns].loc[:, "BRAND"].unique()

#     for cpu in unique_cpus:
#         if cpu in parts:
#             cpu_id = list(df[usage_columns].loc[:, "BRAND"].str.contains(cpu))
#             break
#     return gpu_id, cpu_id


# # Tạo các features GPU và CPU
# df[["GPU_ID", "CPU_ID"]] = df.apply(lambda row: create_device_ids(row), axis=1)
# df["GPU_ID"] = df["GPU_ID"].astype(int)
# df["CPU_ID"] = df["CPU_ID"].astype(int)

# final_features = numerical_features + ["BRAND_encoded", "GPU_ID", "CPU_ID"]


# # Define groups
# def create_groups(row):
#     return 0 if "NaN" in row["Predicted"] else 1


# groups = df.apply(create_groups, axis=1).values

# # Chuẩn bị dữ liệu cho model
# X = df[final_features]
# y = df["SCORE"].values.reshape(-1, 1)


# # Define loss function and optimizer
# class LossFunction(nn.Module):
#     def __init__(self):
#         super().__init__()

#     def forward(self, y_pred, y_true):
#         mask = y_true != -1
#         y_pred = y_pred[mask]
#         y_true = y_true[mask]
#         return mean_absolute_error(y_pred, y_true)


# criterion = LossFunction()


# # Define model
# class LaptopRanker(nn.Module):
#     def __init__(self, num_brands, num_gpus, num_cpus, embedding_dim=8):
#         super().__init__()
#         self.brand_embedding = nn.Embedding(num_brands, embedding_dim)
#         self.cpu_embedding = nn.Embedding(num_cpus, embedding_dim)
#         self.gpu_embedding = nn.Embedding(num_gpus, embedding_dim)

#         # Linear regression layer
#         self.fc = nn.Sequential(
#             nn.Linear(embedding_dim * 3 + len(final_features) - 2, 10),
#             nn.ReLU(),
#             nn.Linear(10, 1),
#         )

#     def forward(self, brand_ids, gpu_ids, cpu_ids, numerical_features):
#         # Get embeddings
#         brand_emb = self.brand_embedding(brand_ids).squeeze()
#         gpu_emb = self.cpu_embedding(gpu_ids).squeeze()
#         cpu_emb = self.gpu_embedding(cpu_ids).squeeze()

#         # Concatenate all features
#         x = torch.cat([brand_emb, gpu_emb, cpu_emb, numerical_features], dim=1)

#         # Forward pass through fully connected layer
#         output = self.fc(x)
#         return output


# # Initialize model
# num_brands = len(df[usage_columns].loc[:, "BRAND"].unique())
# num_gpus = df["GPU_ID"].max() + 1 if not pd.isna(df["GPU_ID"]).all() else 0
# num_cpus = df["CPU_ID"].max() + 1 if not pd.isna(df["CPU_ID"]).all() else 0

# model = LaptopRanker(num_brands, num_gpus, num_cpus)

# # Train model
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# loss_fn = nn.MSELoss()

# best_loss = float("inf")
# no_improvement_epochs = 0
# num_epochs = 20

# for epoch in range(num_epochs):
#     # Forward pass
#     y_pred = model(
#         df["GPU_ID"].values, df["CPU_ID"].values, df[["BRAND_encoded"]].values, X
#     )
#     loss = loss_fn(y_pred, torch.tensor(y, dtype=torch.float32))

#     # Backward pass and optimize
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()

#     if (epoch + 1) % 5 == 0:
#         print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

#     # Early stopping
#     if loss < best_loss - 1e-6 and no_improvement_epochs < 2:
#         best_loss = loss.item()
#         no_improvement_epochs += 1
#     else:
#         no_improvement_epochs = 0

# print(f"Best validation loss: {best_loss:.4f}")


# # Function to rank laptops based on query
# def rank_laptops(query, top_k=5):
#     # Get the predicted scores for each laptop with the query
#     filtered = df[df["DEVICE"].str.contains(query, case=False)]

#     if not filtered.empty:
#         filtered["PREDICTED_SCORE"] = (
#             model(
#                 torch.tensor(filtered["GPU_ID"].values).long(),
#                 torch.tensor(filtered["CPU_ID"].values).long(),
#                 torch.tensor(filtered[["BRAND_encoded"]].values).long(),
#                 torch.tensor(filtered[final_features].values),
#             )
#             .detach()
#             .numpy()
#         )
#     else:
#         return pd.DataFrame(columns=final_features + ["PREDICTED_SCORE"])

#     # Sort by the criteria
#     sorted_df = filtered.sort_values(
#         by=["PREDICTED_SCORE", "CPU PASSMARK RESULT", "GPU PASSMARK (G3D) RESULT"],
#         ascending=[False, False, False],
#     ).head(top_k)

#     return sorted_df


# # Load và save các thông tin cần thiết
# joblib.dump(scaler, "Models/scaler.pkl")
# joblib.dump(model, "Models/laptop_ranker.pkl")

# print("Training complete!")

In [12]:
# # Chuyển đổi dữ liệu sang tensor
# brand_ids = torch.tensor(df["BRAND_encoded"].values, dtype=torch.long)
# gpu_ids = torch.tensor(df["GPU_id"].values, dtype=torch.long)
# cpu_ids = torch.tensor(df["CPU_id"].values, dtype=torch.long)
# numerical_tensor = torch.tensor(df[numerical_features].values, dtype=torch.float32)
# target = torch.tensor(df["SCORE"].values, dtype=torch.float32)

# # Khởi tạo mô hình
# num_brands = len(brand_encoder.classes_)
# num_gpus = len(gpu_to_id)
# num_cpus = len(cpu_to_id)
# model = LaptopRanker(num_brands, num_gpus, num_cpus)

# # Loss và optimizer
# criterion = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# # Huấn luyện
# for epoch in range(10000):
#     optimizer.zero_grad()
#     outputs = model(brand_ids, gpu_ids, cpu_ids, numerical_tensor)
#     loss = criterion(outputs.squeeze(), target)
#     loss.backward()
#     optimizer.step()

#     if (epoch + 1) % 10 == 0:
#         print(f"Epoch [{epoch+1}/100], Loss: {loss.item():.4f}")