In [1]:
file_input_path = "data_eReport_map_1308_(122k_remove).xlsx"
file_out_put = "group_e_report.xlsx"

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.sparse import lil_matrix  # Import sparse matrix
df = pd.read_excel(file_input_path)


## Sử dụng dưới 100k dòng

def normalized(txt):
    if txt is None:
        return ''
    else:
        return str(txt).lower().strip()

print("Read file")
# Load the updated data with text descriptions
file_input_path = file_input_path  
file_out_put = file_out_put  
test_data = pd.read_excel(file_input_path)
test_data["product_name"] = test_data["product_name"].apply(normalized)
print("Load model")

# Initialize the tokenizer and model for inflat-e5-large-v2
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large-v2")
model = AutoModel.from_pretrained("intfloat/e5-large-v2").to("cuda" if torch.cuda.is_available() else "cpu")

class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

def collate_fn(batch):
    return tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=300)

def preprocess_texts(dataloader, model, device):
    model.eval()
    features = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Processing Batches'):
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            features.append(embeddings)

    return np.concatenate(features, axis=0)

# Extract features
print("Embedding...")
texts = test_data["product_name"].tolist()
dataset = TextDataset(texts)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
features = preprocess_texts(dataloader, model, device)

# Optionally, reduce dimensionality with PCA
pca = PCA(n_components=300)
features_reduced = pca.fit_transform(features)

# Standardize features
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features_reduced)

# Group similar vectors using cosine similarity with sparse matrix
def group_similar_vectors_cosine_sparse(all_embeddings, threshold=0.75, batch_size=1000):
    embeddings = torch.tensor(all_embeddings, dtype=torch.float32)
    embedding_count = len(all_embeddings)
    similarity_matrix = lil_matrix((embedding_count, embedding_count))
    print(similarity_matrix.shape)

    # Compute similarity matrix in batches
    for i in tqdm(range(0, embedding_count, batch_size), desc='Computing Similarity Matrix'):
        for j in range(0, embedding_count, batch_size):
            batch_embeddings_i = embeddings[i:i + batch_size]
            batch_embeddings_j = embeddings[j:j + batch_size]
            similarity = F.cosine_similarity(batch_embeddings_i.unsqueeze(1), batch_embeddings_j.unsqueeze(0), dim=-1)
            similarity = similarity.numpy()
            for m in range(similarity.shape[0]):
                for n in range(similarity.shape[1]):
                    if similarity[m, n] >= threshold:
                        similarity_matrix[i + m, j + n] = similarity[m, n]

    group_ids = [-1] * embedding_count
    current_group_id = 0

    for i in tqdm(range(embedding_count), desc='Grouping Vectors'):
        if group_ids[i] == -1:
            group_ids[i] = current_group_id
            for j in range(i + 1, embedding_count):
                if group_ids[j] == -1 and similarity_matrix[i, j] >= threshold:
                    group_ids[j] = current_group_id
            current_group_id += 1

    return group_ids

# Use the sparse matrix-based cosine similarity grouping function
group_ids = group_similar_vectors_cosine_sparse(features_standardized, threshold=0.92, batch_size=1000)

# Create a new DataFrame to store results
result_df = test_data.copy()
result_df['group_text'] = group_ids

# Save to Excel
result_df.to_excel(file_out_put, index=False)

print("Finished processing and saved results to", file_out_put)


Read file
Load model
Embedding...


Processing Batches: 100%|██████████| 11247/11247 [06:37<00:00, 28.29it/s]


(359879, 359879)


Computing Similarity Matrix:   0%|          | 1/360 [16:41<99:51:13, 1001.32s/it]


KeyboardInterrupt: 

In [4]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.sparse import lil_matrix, coo_matrix

# Optimized Filtering
df = pd.read_excel(file_input_path)
categories = ["Nhà cửa & Đời sống", "Thời Trang Nữ", "Sắc Đẹp", "Điện Thoại & Phụ Kiện", 
              "Mẹ & Bé", "Thời Trang Nam", "Thực phẩm và đồ uống", "Phụ Kiện Thời Trang", 
              "Thiết Bị Điện Gia Dụng"]
df = df[(df['remove_edited'] != "x") & (df['category_name'].isin(categories))]
print(df.shape)

def normalized(txt):
    return str(txt).lower().strip() if txt is not None else ''

# print("Read file")
# test_data = pd.read_excel(file_input_path)
# test_data["name"] = test_data["name"].apply(normalized)
# print("Load model")

tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-large-v2")
model = AutoModel.from_pretrained("intfloat/e5-large-v2").to("cuda" if torch.cuda.is_available() else "cpu")

class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

def collate_fn(batch):
    return tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=300)

def preprocess_texts(dataloader, model, device):
    model.eval()
    features = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Processing Batches'):
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            features.append(embeddings)

    return np.concatenate(features, axis=0)

# Embedding
print("Embedding...")
texts = test_data["name"].tolist()
dataset = TextDataset(texts)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, pin_memory=True, num_workers=4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
features = preprocess_texts(dataloader, model, device)

# PCA Reduction
pca = PCA(n_components=300)
features_reduced = pca.fit_transform(features)

# Standardize Features
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features_reduced)

def group_similar_vectors_cosine_sparse(all_embeddings, threshold=0.75, batch_size=1000):
    embeddings = torch.tensor(all_embeddings, dtype=torch.float32)
    embedding_count = len(all_embeddings)
    similarity_matrix = lil_matrix((embedding_count, embedding_count))

    # Compute similarity matrix in batches
    for i in tqdm(range(0, embedding_count, batch_size), desc='Computing Similarity Matrix'):
        batch_embeddings_i = embeddings[i:i + batch_size]
        for j in range(i, embedding_count, batch_size):
            batch_embeddings_j = embeddings[j:j + batch_size]
            similarity = F.cosine_similarity(batch_embeddings_i.unsqueeze(1), batch_embeddings_j.unsqueeze(0), dim=-1)
            similarity = similarity.numpy()
            indices = np.argwhere(similarity >= threshold)
            for index in indices:
                similarity_matrix[i + index[0], j + index[1]] = similarity[index[0], index[1]]
                if i != j:
                    similarity_matrix[j + index[1], i + index[0]] = similarity[index[0], index[1]]

    group_ids = np.full(embedding_count, -1, dtype=int)
    current_group_id = 0

    for i in tqdm(range(embedding_count), desc='Grouping Vectors'):
        if group_ids[i] == -1:
            group_ids[i] = current_group_id
            connected_indices = similarity_matrix[i].nonzero()[1]
            group_ids[connected_indices] = current_group_id
            current_group_id += 1

    return group_ids

group_ids = group_similar_vectors_cosine_sparse(features_standardized, threshold=0.8, batch_size=1000)

result_df = test_data.copy()
result_df['group_text'] = group_ids

result_df.to_excel(file_out_put, index=False)
print("Finished processing and saved results to", file_out_put)


(161093, 20)
Embedding...


Processing Batches:   0%|          | 0/11247 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already

KeyboardInterrupt: 

In [10]:
import numpy as np
import faiss
from hdbscan import HDBSCAN

# Giả định danh sách tên sản phẩm
lst_product_name = [
    "Apple iPhone 13",
    "Apple iPhone 14",
    "Samsung Galaxy S21",
    "Samsung Galaxy S22",
    "Google Pixel 6",
    "Google Pixel 7",
    "Sony Xperia 5",
    "Sony Xperia 1"
]

# Giả định embedding (thủ công) với các vector có 4 chiều
# Các sản phẩm cùng hãng sẽ có các embedding gần nhau hơn
lst_embedings_product_name = np.array([
    [1.0, 0.8, 0.9, 0.7],  # Apple iPhone 13
    [1.0, 0.85, 0.88, 0.68], # Apple iPhone 14
    [0.2, 0.1, 0.3, 0.2],   # Samsung Galaxy S21
    [0.25, 0.15, 0.28, 0.18], # Samsung Galaxy S22
    [0.6, 0.9, 0.4, 0.7],   # Google Pixel 6
    [0.58, 0.88, 0.42, 0.72], # Google Pixel 7
    [0.05, 0.2, 0.1, 0.3],  # Sony Xperia 5
    [0.07, 0.18, 0.12, 0.28]  # Sony Xperia 1
])

# In ra các embedding để xem xét
print("Embeddings:")
for name, emb in zip(lst_product_name, lst_embedings_product_name):
    print(f"{name}: {emb}")

# Xây dựng FAISS index
d = lst_embedings_product_name.shape[1]  # số chiều của embedding (4)
index = faiss.IndexFlatL2(d)     # dùng chỉ số L2 (chuẩn Euclidean)
index.add(lst_embedings_product_name)     # thêm các embedding vào index

# Sử dụng HDBSCAN để phân cụm
clusterer = HDBSCAN(min_cluster_size=2)  # Số lượng sản phẩm trong cụm tối thiểu là 2
labels = clusterer.fit_predict(lst_embedings_product_name)


# Tạo danh sách các nhóm sản phẩm
grouped_products = {}
for label, product_name in zip(labels, lst_product_name):
    if label not in grouped_products:
        grouped_products[label] = []
    grouped_products[label].append(product_name)

# Hiển thị kết quả phân cụm
print("\nPhân cụm sản phẩm:")
for label, products in grouped_products.items():
    print(f"Group {label}:")
    for product in products:
        print(f"- {product}")


Embeddings:
Apple iPhone 13: [1.  0.8 0.9 0.7]
Apple iPhone 14: [1.   0.85 0.88 0.68]
Samsung Galaxy S21: [0.2 0.1 0.3 0.2]
Samsung Galaxy S22: [0.25 0.15 0.28 0.18]
Google Pixel 6: [0.6 0.9 0.4 0.7]
Google Pixel 7: [0.58 0.88 0.42 0.72]
Sony Xperia 5: [0.05 0.2  0.1  0.3 ]
Sony Xperia 1: [0.07 0.18 0.12 0.28]
clusterer HDBSCAN(min_cluster_size=2)
labels [0 0 1 1 0 0 1 1]

Phân cụm sản phẩm:
Group 0:
- Apple iPhone 13
- Apple iPhone 14
- Google Pixel 6
- Google Pixel 7
Group 1:
- Samsung Galaxy S21
- Samsung Galaxy S22
- Sony Xperia 5
- Sony Xperia 1


In [12]:
import psutil

# Kiểm tra tổng dung lượng RAM và dung lượng RAM khả dụng
mem = psutil.virtual_memory()
total_memory = mem.total / (1024 ** 3)  # Đổi sang GB
available_memory = mem.available / (1024 ** 3)  # Đổi sang GB

print(f"Total memory: {total_memory:.2f} GB")
print(f"Available memory: {available_memory:.2f} GB")


Total memory: 31.36 GB
Available memory: 22.11 GB


In [4]:
tesst_txt = "kem olay dưỡng ẩm"
print( tesst_txt.replace("olay ","")      )

kem dưỡng ẩm
