In [None]:
# Path of the Excel workbook that the script reads with pd.read_excel.
file_input_path = "/content/tu_nhua_20_10_raw_AI.xlsx"
# Path of the Excel report written at the end of the script.
file_out_put = "group_e_report.xlsx"

In [None]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.sparse import lil_matrix  # Import sparse matrix
# NOTE(review): `df` is not referenced anywhere else in the visible code, and
# the same workbook is read again into `test_data` further down. Confirm
# whether this extra read is still needed.
df = pd.read_excel(file_input_path)


## Use with fewer than 100k rows

def normalized(txt):
    """Return *txt* lower-cased and trimmed, with bracketed segments removed.

    Every segment that opens with ``[`` or ``(`` and closes at the nearest
    ``]`` or ``)`` is deleted together with the whitespace that follows it.

    Args:
        txt: Value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned string; ``""`` for ``None`` or NaN.
    """
    # NaN is the only float that differs from itself. Missing values must be
    # caught here, otherwise they would be turned into the text "nan".
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ""
    # Coerce before the regex: re.sub raises TypeError on non-string input.
    cleaned_text = re.sub(r'[\[\(].*?[\]\)]\s*', '', str(txt))
    return cleaned_text.lower().strip()

print("Read file")
# Load the input workbook and clean the product names before embedding.
test_data = pd.read_excel(file_input_path)
# Missing cells become "" and non-string cells are stringified first, so a
# single empty or numeric product name cannot abort the whole run.
test_data["product_name"] = test_data["product_name"].apply(
    lambda name: normalized("" if pd.isna(name) else str(name))
)
# Debug aid: uncomment to run on the first 1000 rows only.
# test_data = test_data.iloc[0:1000]
print("Load model")

# Initialize the tokenizer and model for intfloat/multilingual-e5-large;
# the model is placed on the GPU when one is available.
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large")
model = AutoModel.from_pretrained("intfloat/multilingual-e5-large").to("cuda" if torch.cuda.is_available() else "cpu")

class TextDataset(Dataset):
    """Map-style dataset that serves items from an in-memory sequence."""

    def __init__(self, texts) -> None:
        # The sequence is stored as given (no copy) and indexed on demand.
        self.texts = texts

    def __len__(self) -> int:
        """Return how many items the wrapped sequence holds."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the item stored under ``idx`` in the wrapped sequence."""
        return self.texts[idx]

def collate_fn(batch):
    """Tokenize one batch of texts with the module-level ``tokenizer``.

    The batch is padded, truncated to at most 300 tokens, and returned as
    PyTorch tensors.
    """
    encoded = tokenizer(
        batch,
        max_length=300,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    return encoded

def preprocess_texts(dataloader, model, device):
    """Embed every batch from *dataloader* and stack the pooled vectors.

    Args:
        dataloader: Iterable yielding mappings of tensor name to tensor
            (the output of ``collate_fn``).
        model: Encoder whose output exposes ``last_hidden_state``. It is
            switched to eval mode; it is not moved to *device* here.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        ``numpy.ndarray`` holding the pooled embeddings of all batches,
        concatenated along axis 0 in iteration order.

    Raises:
        ValueError: If *dataloader* yields no batches.
    """
    model.eval()
    features = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Processing Batches'):
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            # Pooling over dim=1 assumes (batch, seq, hidden) - TODO confirm
            # for encoders other than the one loaded in this script.
            hidden = outputs.last_hidden_state
            attention_mask = inputs.get("attention_mask")
            if attention_mask is None:
                # No mask supplied: fall back to a plain mean over dim 1.
                pooled = hidden.mean(dim=1)
            else:
                # Average over real tokens only. A plain mean would also
                # average the padding positions, making a text's embedding
                # depend on the longest text that shares its batch.
                weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
                token_counts = weights.sum(dim=1).clamp(min=1.0)
                pooled = (hidden * weights).sum(dim=1) / token_counts
            features.append(pooled.cpu().numpy())

    if not features:
        raise ValueError("dataloader produced no batches; nothing to embed")
    return np.concatenate(features, axis=0)

# Extract features: embed every cleaned product name in batches of 32.
print("Embedding...")
texts = test_data["product_name"].tolist()
dataset = TextDataset(texts)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
features = preprocess_texts(dataloader, model, device)

# Reduce dimensionality with PCA. PCA rejects n_components larger than
# min(n_samples, n_features), so the target of 300 is clamped to keep small
# inputs from raising.
n_components = min(300, features.shape[0], features.shape[1])
pca = PCA(n_components=n_components)
features_reduced = pca.fit_transform(features)

# Standardize features (zero mean, unit variance per component).
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features_reduced)

# Group similar vectors using cosine similarity with sparse matrix
def group_similar_vectors_cosine_sparse(all_embeddings, threshold=0.75, batch_size=1000):
    """Greedily group row vectors whose cosine similarity reaches *threshold*.

    Rows are visited in order. A row that has no group yet starts a new one,
    and every later ungrouped row whose cosine similarity to that row is at
    least *threshold* joins it. Membership is decided only against the row
    that started the group, so the result depends on row order.

    Args:
        all_embeddings: 2-D array-like with one vector per row.
        threshold: Minimum cosine similarity for two rows to share a group.
        batch_size: Number of rows per similarity tile; each dense tile holds
            at most ``batch_size * batch_size`` values.

    Returns:
        A list with one integer group id per row, numbered from 0 in order of
        first appearance. Empty input yields an empty list.

    Raises:
        ValueError: If *batch_size* is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    embedding_count = len(all_embeddings)
    if embedding_count == 0:
        return []

    embeddings = torch.as_tensor(all_embeddings, dtype=torch.float32)
    # With unit-length rows, cosine similarity is a plain dot product, so a
    # tile is one matmul instead of a (batch, batch, dim) broadcast.
    unit_rows = F.normalize(embeddings, p=2, dim=1, eps=1e-8)

    # Boolean adjacency: an entry is stored only where similarity >= threshold.
    # Storing the decision rather than the value keeps absent entries (which
    # read as 0) from passing the test when threshold <= 0.
    similarity_matrix = lil_matrix((embedding_count, embedding_count), dtype=bool)
    print(similarity_matrix.shape)

    # Compute the thresholded similarity matrix tile by tile.
    for i in tqdm(range(0, embedding_count, batch_size), desc='Computing Similarity Matrix'):
        batch_embeddings_i = unit_rows[i:i + batch_size]
        # Grouping only looks at columns to the right of the current row, so
        # tiles strictly below the diagonal are never read and are skipped.
        for j in range(i, embedding_count, batch_size):
            batch_embeddings_j = unit_rows[j:j + batch_size]
            similarity = (batch_embeddings_i @ batch_embeddings_j.T).numpy()
            rows, cols = np.nonzero(similarity >= threshold)
            if rows.size:
                similarity_matrix[rows + i, cols + j] = True

    group_ids = np.full(embedding_count, -1, dtype=np.int64)
    current_group_id = 0
    # lil_matrix.rows[i] lists the column indices stored for row i.
    neighbour_lists = similarity_matrix.rows

    for i in tqdm(range(embedding_count), desc='Grouping Vectors'):
        if group_ids[i] != -1:
            continue
        group_ids[i] = current_group_id
        candidates = np.asarray(neighbour_lists[i], dtype=np.intp)
        # Rows up to and including i already have a group at this point, so
        # the "still ungrouped" filter leaves only later rows.
        still_free = candidates[group_ids[candidates] == -1]
        group_ids[still_free] = current_group_id
        current_group_id += 1

    return group_ids.tolist()

# Run the cosine-similarity grouping on the standardized embeddings.
group_ids = group_similar_vectors_cosine_sparse(
    features_standardized,
    threshold=0.75,
    batch_size=1000,
)

# Attach each row's group id to a copy of the input table.
result_df = test_data.assign(group_text=group_ids)

# Write the annotated table to the output workbook without the index column.
result_df.to_excel(file_out_put, index=False)

print("Finished processing and saved results to", file_out_put)
