In [None]:
# Pipeline settings (edit these before running the notebook).
# Directory that receives the downloaded product thumbnails.
folder_image = 'images'
# Source workbook that is read with pandas at the start of the download cell.
file_input_path = "tay_da_chet_co_the_1208_AI_AT.xlsx"
# Workbook written after the download step (rows without an image removed).
file_input_path_updated = 'file_output.xlsx'
# Name reserved for the final grouped output; not referenced by the visible cells.
file_output = "tdc_grouped_images_cosine.xlsx"

In [None]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
from io import BytesIO
import requests
from tqdm import tqdm

# Read the prediction spreadsheet and print its column names as a quick
# sanity check (later cells read 'url_thumbnail' and 'product_base_id').
test_file_path = file_input_path
test_data = pd.read_excel(test_file_path)
print(test_data.columns)

# Create the folder that stores the downloaded images; exist_ok=True makes
# this a no-op when the folder is already there, so the cell can be re-run.
test_image_folder = folder_image
os.makedirs(test_image_folder, exist_ok=True)

# Image download helper
def download_image(row):
    """Download one product thumbnail and store it in ``test_image_folder``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``'url_thumbnail'`` (image URL) and ``'product_base_id'``
            (used as the file name).

    Returns:
        The path of the saved ``<product_base_id>.jpg`` file, or ``None`` when
        the request, decoding or saving fails (the error is printed, not raised).
    """
    img_url = row['url_thumbnail']
    target_path = os.path.join(test_image_folder, str(row['product_base_id']) + ".jpg")
    try:
        reply = requests.get(img_url, timeout=10)
        # Turn HTTP error statuses into exceptions so they count as failures.
        reply.raise_for_status()
        # Decode from memory and force RGB before writing to disk.
        Image.open(BytesIO(reply.content)).convert("RGB").save(target_path)
    except Exception as err:
        # Best effort: report the failure and let the caller drop this row.
        print(f"Error downloading image {img_url}: {err}")
        return None
    return target_path

# Download every image into the folder using a pool of 30 worker threads.
print("Downloading test images...")
row_records = [record for _, record in test_data.iterrows()]
with ThreadPoolExecutor(max_workers=30) as executor:
    # executor.map yields results in input order, so paths line up with rows.
    download_results = executor.map(download_image, row_records)
    image_paths = list(
        tqdm(download_results, total=len(test_data), desc='Downloading Test Images')
    )

# Drop the rows whose image could not be downloaded (download_image returned
# None), then persist the filtered table.
test_data['image_path'] = image_paths
test_data = test_data.dropna(subset=['image_path'])
test_data = test_data.reset_index(drop=True)
test_data.to_excel(file_input_path_updated)

print("Finished downloading group images.")


In [None]:
## Bản oke nhất
import pandas as pd
import numpy as np
import cv2
from tensorflow.keras.applications.efficientnet import EfficientNetB7, preprocess_input
from tqdm import tqdm
import faiss

# Build the EfficientNetB7 feature extractor: ImageNet weights, classification
# head removed (include_top=False) and average pooling requested for the output.
model = EfficientNetB7(
    include_top=False,
    weights='imagenet',
    pooling='avg',
)

def preprocess_image(img_path):
    """Load an image file and turn it into a single-image model input batch.

    The image is read with OpenCV, converted from BGR to RGB, resized to
    224x224, given a leading batch axis and passed through ``preprocess_input``.

    Args:
        img_path: Path of the image file on disk.

    Returns:
        The preprocessed array with a leading batch dimension of 1, or ``None``
        if any step fails (the error is printed instead of raised).
    """
    try:
        # cv2.imread yields BGR channel order; the model pipeline expects RGB.
        rgb_image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
        resized = cv2.resize(rgb_image, (224, 224))
        # Add the batch axis so several results can be concatenated later.
        return preprocess_input(resized[np.newaxis, ...])
    except Exception as err:
        print(f"Error processing image from path {img_path}: {err}")
        return None

# Feature extraction.
# Images are pushed through the model in batches. Images that cannot be
# preprocessed are skipped, so the row positions that actually produced a
# feature vector are recorded; test_data is then filtered to those rows so that
# features[i] always describes test_data.iloc[i]. Without this, a single
# unreadable image would leave fewer feature rows than table rows and the later
# label assignment would fail (or pair labels with the wrong products).
features = []
valid_positions = []  # positional row indices of test_data that have features
batch_size = 32
for idx in tqdm(range(0, len(test_data), batch_size), desc='Processing Images in Batches'):
    batch_paths = test_data['image_path'].iloc[idx:idx + batch_size]

    batch_imgs = []
    for offset, img_path in enumerate(batch_paths):
        if pd.isnull(img_path):
            continue
        img = preprocess_image(img_path)
        if img is not None:
            batch_imgs.append(img)
            valid_positions.append(idx + offset)

    if batch_imgs:
        # Each preprocessed image already carries a batch axis of 1, so
        # concatenating stacks them into one model input batch.
        batch_features = model.predict(np.concatenate(batch_imgs), verbose=0)
        features.extend(batch_features)

if not features:
    raise ValueError("No image features could be extracted; check the 'image_path' column.")

if len(valid_positions) != len(test_data):
    print(f"Dropping {len(test_data) - len(valid_positions)} rows whose image could not be processed.")
    test_data = test_data.iloc[valid_positions].reset_index(drop=True)

# Stack into a 2-D matrix: one flattened feature row per remaining table row.
features = np.array(features)
features = features.reshape(features.shape[0], -1)

# Prepare the data for FAISS: float32 rows scaled to unit L2 norm, so that an
# inner product between two rows equals their cosine similarity.
features_normalized = features.astype('float32')
row_norms = np.linalg.norm(features_normalized, axis=1, keepdims=True)
# An all-zero feature row has norm 0; dividing by it would put NaN into the
# matrix handed to FAISS. Divide such rows by 1 instead so they stay all-zero.
row_norms[row_norms == 0] = 1.0
features_normalized /= row_norms

# Initialize the index for clustering.
# Roughly sqrt(N) clusters, where N is the number of vectors actually being
# clustered (taken from the feature matrix itself), and never fewer than one.
n_vectors, dim = features_normalized.shape
n_clusters = max(1, int(np.sqrt(n_vectors)))
index_flat = faiss.IndexFlatL2(dim)
kmeans = faiss.Clustering(dim, n_clusters)
kmeans.train(features_normalized, index_flat)

# Assign each vector to a cluster: a 1-nearest-neighbour search against
# index_flat (used by the clustering above) gives one cluster id per vector.
# The result has one column, i.e. shape (N, 1).
_, cluster_assignments = index_flat.search(features_normalized, 1)

print("FAISS with Cosine Similarity within clusters")
# 0 means "not assigned yet"; real group ids start at 1.
group_labels = np.zeros(len(features_normalized), dtype=int)
current_group = 1

# For each cluster, create a new FAISS index and search for similar items.
cosine_threshold = 0.96  # minimum cosine similarity to join a seed's group
flat_assignments = np.asarray(cluster_assignments).ravel()
for cluster_id in tqdm(range(n_clusters), desc="Processing clusters..."):
    cluster_indices = np.where(flat_assignments == cluster_id)[0]
    if len(cluster_indices) == 0:
        continue

    if len(cluster_indices) == 1:
        # A cluster with a single member is its own group. Leaving it at 0
        # would lump every such item together under one shared label.
        group_labels[cluster_indices[0]] = current_group
        current_group += 1
        continue

    cluster_vectors = features_normalized[cluster_indices]

    # Inner-product index for this cluster (cosine similarity on unit vectors).
    index = faiss.IndexFlatIP(cluster_vectors.shape[1])
    index.add(cluster_vectors)
    n_neighbors = min(3000, len(cluster_indices))

    # Greedy grouping: every still-unlabelled item becomes the seed of a new
    # group and pulls in the unlabelled neighbours that are similar enough.
    for i in range(len(cluster_indices)):
        if group_labels[cluster_indices[i]] != 0:
            continue

        # The seed always belongs to its own group, even if its self-similarity
        # is below the threshold (e.g. an all-zero feature row).
        group_labels[cluster_indices[i]] = current_group

        distances, indices = index.search(cluster_vectors[i:i + 1], k=n_neighbors)

        for idx, sim in zip(indices[0], distances[0]):
            # idx < 0 marks an empty result slot; skip it defensively.
            if idx >= 0 and group_labels[cluster_indices[idx]] == 0 and sim >= cosine_threshold:
                group_labels[cluster_indices[idx]] = current_group

        current_group += 1

# Attach the group labels to the original DataFrame as 'cluster_label'.
# assumes group_labels has exactly one entry per row of test_data, in row
# order — TODO confirm no rows were dropped during feature extraction
test_data['cluster_label'] = group_labels

# Save the result to a new Excel file (index=False leaves the row index out).
# NOTE(review): the output name is hard-coded here; the `file_output` setting
# from the first cell is not used — confirm which name is intended.
output_file_path = "clustered_data_v2.xlsx"
test_data.to_excel(output_file_path, index=False)
print(f"Kết quả đã được lưu vào {output_file_path}")
