# In [None]:
import re

import faiss
import numpy as np
import pandas as pd
from FlagEmbedding import BGEM3FlagModel
from tqdm import tqdm

## Text normalization helper
def normalized(text):
    """Return ``text`` as a lower-cased string without surrounding whitespace.

    The value is passed through ``str()`` first, so non-string inputs
    (numbers, ``None``, NaN, ...) are normalized from their string form.
    """
    as_text = str(text)
    lowered = as_text.lower()
    return lowered.strip()

def unique_brand(df):
    """Normalize the ``brand`` column and return its distinct values.

    The ``brand`` column of ``df`` is overwritten in place with the output of
    ``normalized`` for every cell; the return value is whatever
    ``Series.unique()`` yields for that rewritten column.
    """
    cleaned_brands = df['brand'].apply(normalized)
    df['brand'] = cleaned_brands
    return df['brand'].unique()

# Read the data file. This has to happen before the brand list is built,
# because `unique_brand` reads (and rewrites) the `brand` column of `df`.
df = pd.read_excel('/content/tay_trang_clean_1_group_normalied.xlsx',sheet_name = "Sheet1")

# Collect the brands from the whole sheet, before sampling, so brand removal
# is not limited to the brands that happen to fall in the sample.
lst_brand = unique_brand(df = df)

# One pre-compiled alternation over all brands:
#   * empty brand strings are skipped (an empty brand would otherwise make the
#     removal strip every space from every name);
#   * longer brands come first so a brand that is a prefix of another one
#     cannot shadow it;
#   * a brand only matches when it does not continue a preceding word and is
#     followed by whitespace or the end of the text.
_brands_by_length = sorted((brand for brand in lst_brand if brand), key=len, reverse=True)
_brand_pattern = (
    re.compile(
        r"(?<!\w)(?:"
        + "|".join(re.escape(brand) for brand in _brands_by_length)
        + r")(?:\s+|$)"
    )
    if _brands_by_length
    else None
)


def remove_brand_from_lst_name(text):
    """Strip every known brand name out of an already-normalized product name.

    Args:
        text: Product name, expected to have gone through ``normalized``
            (the brands in ``lst_brand`` are lower-cased and stripped).

    Returns:
        ``text`` with each standalone occurrence of a brand, together with the
        whitespace that follows it, removed. Surrounding whitespace left over
        after a removal is stripped. If there are no brands, ``text`` is
        returned unchanged.
    """
    if _brand_pattern is None:
        return text
    return _brand_pattern.sub("", text).strip()


# Work on a random sample of at most 2000 rows; capping at len(df) avoids the
# ValueError that `sample` raises when asked for more rows than exist.
df = df.sample(min(2000, len(df)))

# Load the BGE M3 model
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

# Number of product names sent to the model per loop iteration
batch = 20

# Per-batch embedding arrays, stacked into one matrix further down
lst_embeddings = []

# Normalize the product names and strip the brand out of them
lst_name = df['product_name'].apply(normalized).apply(remove_brand_from_lst_name).to_list()

# Positions (within lst_name / df) of the names that were actually encoded
encoded_positions = []

# Encode the names batch by batch
for start in tqdm(range(0, len(lst_name), batch), desc="Processing Batches"):
    batch_text = lst_name[start:start + batch]
    try:
        # Only the dense vectors of the model output are used
        embeddings = model.encode(batch_text, batch_size=10, max_length=8192)['dense_vecs']
    except Exception as e:
        # Best effort: report the failure and go on with the next batch
        print(f"Error encoding batch: {e}")
        continue
    lst_embeddings.append(embeddings)
    encoded_positions.extend(range(start, start + len(batch_text)))

if not lst_embeddings:
    raise RuntimeError("No product names could be encoded; cannot build the FAISS index.")

# If a batch was skipped, drop its rows so that row i of the embedding matrix
# (and therefore FAISS id i) still refers to df.iloc[i].
if len(encoded_positions) != len(lst_name):
    df = df.iloc[encoded_positions].copy()
    lst_name = [lst_name[pos] for pos in encoded_positions]

# Stack the per-batch arrays into one 2D float32 matrix
all_embeddings = np.vstack(lst_embeddings).astype('float32')

# Vector size (number of dimensions)
d = all_embeddings.shape[1]

# L2-normalize in place so inner product can be used as cosine similarity
faiss.normalize_L2(all_embeddings)

# Build an inner-product FAISS index and move it to GPU 0
index = faiss.IndexFlatIP(d)
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index)

# Add the embeddings to the index
index.add(all_embeddings)

# Report how many vectors the index holds
print(f"Number of embeddings in the index: {index.ntotal}")

# Query the FAISS index
def query(text, threshold, k=10):
    """Return the indexed products most similar to ``text``.

    The text goes through the same preprocessing as the indexed names
    (``normalized`` followed by ``remove_brand_from_lst_name``), so the query
    and the index contents are compared on equal terms.

    Args:
        text: Free-text product name to look up.
        threshold: Minimum similarity score a hit must reach to be returned.
        k: Maximum number of neighbours requested from the index.

    Returns:
        A list of ``(product_name, score)`` tuples in the order the index
        returned them; ``product_name`` is read from ``df['product_name']``.
    """
    cleaned_text = remove_brand_from_lst_name(normalized(text))

    # Encode the query and L2-normalize it, as was done for the indexed vectors
    query_vector = model.encode([cleaned_text], batch_size=1, max_length=8192)['dense_vecs']
    query_vector = np.array(query_vector).astype('float32')
    faiss.normalize_L2(query_vector)

    # Search the FAISS index
    D, I = index.search(query_vector, k)

    # Column-first lookup avoids building a full row Series per hit
    product_names = df['product_name']

    # FAISS fills missing neighbours with id -1; skip them so they are never
    # resolved to the last row of df.
    return [
        (product_names.iloc[idx], score)
        for idx, score in zip(I[0], D[0])
        if idx != -1 and score >= threshold
    ]

# Lookup table: product name -> index label of its row in df.
# When a name occurs more than once, the last row with that name wins.
dictionary_name_id = dict(zip(df['product_name'], df.index))

# Every product starts without a group and with a score of 0.0
df['group_txt'] = "no_group"
df['scores'] = 0.0

# Assign products to groups from their similarity-search results
def group_txt(threshold):
    """Label each product with the first query name that retrieves it.

    Every product name in ``df`` is used as a query. Each hit returned by
    ``query`` that still carries the ``"no_group"`` placeholder receives the
    querying name as its ``group_txt`` and the hit's score as its ``scores``.
    Rows that already have a group are left untouched.

    Args:
        threshold: Minimum score, forwarded to ``query``.
    """
    product_names = df['product_name']
    for name in tqdm(product_names, desc="Grouping products"):
        for product_name, score in query(name, threshold):
            row_label = dictionary_name_id[product_name]

            # First assignment wins: skip rows that already have a group
            if df.at[row_label, 'group_txt'] != "no_group":
                continue

            df.at[row_label, 'group_txt'] = name
            df.at[row_label, 'scores'] = score
            # NOTE: an alternative rule (re-assign a grouped row when a later
            # query scores higher) was sketched here earlier and is disabled.

# Run the grouping pass; 0.85 is an example threshold value
group_txt(threshold=0.85)

from sklearn.preprocessing import LabelEncoder

# Turn the group names into integer codes
label_encoder = LabelEncoder()
encoded_groups = label_encoder.fit_transform(df['group_txt'])
df['group_txt'] = encoded_groups

# Show how many products ended up in each group
print(df['group_txt'].value_counts())



# Cell output: ModuleNotFoundError: No module named 'pandas'