In [7]:
import pandas as pd

def read_file():
    """Extract the distinct domains of the parquet dataset into ``logos.txt``.

    Reads the ``domain`` column of ``logos.snappy.parquet``, writes every distinct
    non-null domain on its own line of ``logos.txt`` (sorted, so the file is the
    same on every run) and prints how many domains, unique domains and duplicates
    were found.

    Returns:
        None. The results are the ``logos.txt`` file and the printed summary.
    """
    df = pd.read_parquet("logos.snappy.parquet", engine="pyarrow")

    # Missing values are not domains and cannot be written as text, so drop them first.
    domains = df["domain"].dropna()

    total_domains = len(domains)
    # A set has no stable order; sorting keeps logos.txt reproducible between runs.
    unique_domains = sorted(set(domains))
    num_duplicates = total_domains - len(unique_domains)

    with open("logos.txt", "w") as file:
        file.writelines(f"{domain}\n" for domain in unique_domains)

    print(f"Total domains: {total_domains}")
    print(f"Unique domains: {len(unique_domains)}")
    print(f"Duplicates found: {num_duplicates}")

In [8]:
import requests
import os
import time
from tqdm import tqdm

In [9]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import queue
from pathlib import Path


# Number of logos saved so far by all workers; guarded by `lock`.
success = 0
lock = threading.Lock()
# Serialises appends to log.txt so lines written by different threads do not interleave.
error_lock = threading.Lock()

class LogoDownloaderWorker(threading.Thread):
    """Worker thread that downloads logos from the Clearbit logo endpoint.

    Each worker takes domains from ``task_queue`` until the queue stays empty for
    one second, saves the logo as ``logos/<domain with dots as underscores>.png``
    and appends every domain that could not be downloaded to ``log.txt``.
    """

    def __init__(self, task_queue):
        """Store the queue of domain names this worker consumes."""
        super().__init__()
        self.task_queue = task_queue

    def _record_failure(self, domain):
        """Append ``domain`` to ``log.txt`` so a later step can retry it."""
        with error_lock:
            with open("log.txt", "a") as file:
                file.write(f"{domain}\n")

    def download_logo(self, domain, output_path):
        """Download the logo of ``domain`` and write it to ``output_path``.

        Args:
            domain: Domain name appended to the Clearbit logo URL.
            output_path: File the response body is written to.

        Returns:
            True when the logo was saved; False when the request failed or did
            not answer with HTTP 200 (the domain is then recorded in ``log.txt``).

        Raises:
            OSError: If the image or the log file cannot be written.
        """
        global success
        url = f"https://logo.clearbit.com/{domain}"

        try:
            response = requests.get(url, timeout=5)
        except requests.RequestException:
            self._record_failure(domain)
            return False

        if response.status_code != 200:
            self._record_failure(domain)
            return False

        with open(output_path, "wb") as file:
            file.write(response.content)
        with lock:
            success += 1
        return True

    def run(self):
        """Process domains from the queue until it has been empty for one second."""
        while True:
            try:
                domain = self.task_queue.get(timeout=1)
            except queue.Empty:
                break

            try:
                output_path = os.path.join("logos", f"{domain.replace('.', '_')}.png")
                self.download_logo(domain, output_path)
            except Exception as exc:
                # One bad domain (e.g. an unwritable file) must not kill the worker.
                print(f"[ERROR] Unexpected failure for {domain}: {exc}")
            finally:
                # Always acknowledge the task; otherwise task_queue.join() never returns.
                self.task_queue.task_done()

        
def master_thread(num_workers = 20):
    """Download a logo for every domain listed in ``logos.txt`` using worker threads.

    Args:
        num_workers: Number of ``LogoDownloaderWorker`` threads to start (at least 1).

    Raises:
        ValueError: If ``num_workers`` is smaller than 1; with no workers the
            queued domains would never be processed.
        FileNotFoundError: If ``logos.txt`` does not exist.
    """
    if num_workers < 1:
        raise ValueError("num_workers must be at least 1")

    os.makedirs("logos", exist_ok=True)

    with open("logos.txt") as file:
        # Skip blank lines so an empty string is never queued as a domain.
        domains = [line.strip() for line in file if line.strip()]

    task_queue = queue.Queue()
    for domain in domains:
        task_queue.put(domain)

    workers = [LogoDownloaderWorker(task_queue) for _ in range(num_workers)]
    for worker in workers:
        worker.start()

    with tqdm(total=len(domains), desc = "Downloading logos: ") as progress_bar:
        # Progress is the number of domains already taken off the queue. Counting
        # files in "logos" would include leftovers from earlier runs and would
        # never advance for domains whose download fails.
        while any(worker.is_alive() for worker in workers):
            handled = len(domains) - task_queue.qsize()
            progress_bar.update(handled - progress_bar.n)
            time.sleep(1)

        # Workers only exit once the queue is drained, so joining them is enough.
        for worker in workers:
            worker.join()

        progress_bar.update(len(domains) - task_queue.qsize() - progress_bar.n)
    

In [None]:
import os
import queue
import threading
from urllib.parse import urljoin, urlparse

import cairosvg
import requests
from bs4 import BeautifulSoup

class HTMLLogoScraperWorker(threading.Thread):
    """Worker thread that looks for a logo in a site's own HTML.

    Each worker takes domains from ``task_queue``, fetches ``https://<domain>`` and
    downloads the first ``<img>`` whose ``src``, ``alt`` or ``class`` mentions
    "logo". The image is stored as ``logos/<domain with dots as underscores>.png``;
    SVG logos are converted to PNG with cairosvg.
    """

    def __init__(self, task_queue):
        """Store the queue of domain names this worker consumes."""
        super().__init__()
        self.task_queue = task_queue

    @staticmethod
    def _resolve_logo_url(page_url, src):
        """Return the absolute http(s) URL of an ``<img src>``, or None if unusable.

        Handles relative paths with or without a leading slash and
        protocol-relative (``//host/path``) sources; empty sources and
        non-HTTP schemes such as ``data:`` are rejected.
        """
        if not src:
            return None
        absolute = urljoin(page_url, src.strip())
        if urlparse(absolute).scheme not in ("http", "https"):
            return None
        return absolute

    @staticmethod
    def _is_svg(url, content_type):
        """Tell whether a response is an SVG, by URL path extension or Content-Type."""
        # Look at the path only, so a query string ("logo.svg?v=2") does not hide the extension.
        extension = os.path.splitext(urlparse(url).path)[1].lower()
        return extension == ".svg" or "image/svg+xml" in content_type.lower()

    def find_and_download_logo(self, domain):
        """Fetch the home page of ``domain`` and download the first logo-like image.

        Nothing is downloaded when the page cannot be fetched, does not answer
        with HTTP 200, or contains no usable ``<img>`` that mentions "logo".
        """
        try:
            response = requests.get(f"https://{domain}", timeout=5)
        except requests.RequestException:
            print(f"[ERROR] No logo in html source {domain}")
            return

        if response.status_code != 200:
            return

        soup = BeautifulSoup(response.text, "html.parser")

        for img in soup.find_all("img"):
            src = img.get("src") or ""
            alt = (img.get("alt") or "").lower()
            css_classes = "".join(img.get("class") or "").lower()

            if "logo" not in src.lower() and "logo" not in alt and "logo" not in css_classes:
                continue

            # Resolve against the final page URL so redirects (e.g. to "www.") are honoured.
            logo_url = self._resolve_logo_url(response.url, src)
            if logo_url is None:
                # No downloadable source on this tag; keep looking at the next image.
                continue

            self.download_logo(logo_url, domain)
            return

    def download_logo(self, url, domain):
        """Download ``url`` and save it as the PNG logo of ``domain``.

        SVG responses are converted to PNG; any other response body is written
        unchanged. Failures are reported on stdout and never raised.
        """
        try:
            response = requests.get(url, timeout=5)
            if response.status_code != 200:
                return

            content_type = response.headers.get("Content-Type", "")

            os.makedirs("logos", exist_ok=True)
            png_path = os.path.join("logos", f"{domain.replace('.', '_')}") + ".png"

            if self._is_svg(url, content_type):
                # Convert straight from memory: no temporary .svg file to clean up on failure.
                cairosvg.svg2png(bytestring=response.content, write_to=png_path)
            else:
                with open(png_path, "wb") as file:
                    file.write(response.content)

        except requests.RequestException:
            print(f"[ERROR] Eșec la descărcarea logo-ului pentru {domain}")
        except Exception as e:
            print(f"[ERROR] Eroare la procesarea SVG pentru {domain}: {e}")

    def run(self):
        """Process domains from the queue until it has been empty for one second."""
        while True:
            try:
                domain = self.task_queue.get(timeout=1)
            except queue.Empty:
                break

            try:
                self.find_and_download_logo(domain)
            except Exception as exc:
                # Malformed markup or similar must not kill the worker thread.
                print(f"[ERROR] Unexpected failure for {domain}: {exc}")
            finally:
                # Always acknowledge the task; otherwise task_queue.join() never returns.
                self.task_queue.task_done()



In [None]:
import os
def scrape_html_logos(num_workers=10):
    """Retry the domains listed in ``log.txt`` by scraping their HTML for a logo.

    The distinct lines of ``log.txt`` are queued and handed to ``num_workers``
    ``HTMLLogoScraperWorker`` threads; the call returns once every queued domain
    has been processed and all workers have stopped. When ``log.txt`` is missing
    or empty a message is printed and nothing else happens.
    """
    log_path = "log.txt"

    if not os.path.exists(log_path):
        print("No failed domains to process.")
        return

    with open(log_path) as log_file:
        lines = log_file.read().splitlines()
    # The log is append-only, so the same domain can appear several times.
    failed_domains = list(set(lines))

    if len(failed_domains) == 0:
        print("No failed domains to process.")
        return

    print(f"Retrying {len(failed_domains)} domains by parsing HTML...")

    task_queue = queue.Queue()
    for failed in failed_domains:
        task_queue.put(failed)

    workers = []
    for _ in range(num_workers):
        workers.append(HTMLLogoScraperWorker(task_queue))
    for thread in workers:
        thread.start()

    # Wait for every domain to be acknowledged, then for the threads to exit.
    task_queue.join()
    for thread in workers:
        thread.join()


In [None]:
# Write the unique domains to logos.txt, then download their logos with 20 worker threads.
read_file()
master_thread(20)

In [None]:
# Retry the domains recorded in log.txt by searching each site's HTML for a logo image.
scrape_html_logos()

In [14]:
import os
import numpy as np
import faiss
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import cv2
import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained ResNet-50 with its last child module dropped, so a forward pass
# yields features instead of class scores. eval() switches to inference mode.
model = torch.nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-1])
model = model.to(device)
model.eval()

# Per-channel statistics applied by Normalize below.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Array -> PIL image -> 224x224 -> tensor -> normalised tensor.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

In [16]:
def extract_features(image_path):
    """Return the feature vector of one image, or None if it cannot be loaded.

    The image is read with OpenCV, converted from BGR to RGB, passed through the
    module-level ``transform`` and fed to the module-level ``model`` on ``device``.

    Args:
        image_path: Path of the image file.

    Returns:
        A flattened NumPy array with the model output, or None (after printing a
        warning) when the file is missing or OpenCV cannot decode it.
    """
    if not os.path.exists(image_path):
        print(f"Warning: File not found - {image_path}")
        return None

    img = cv2.imread(image_path)
    if img is None:
        print(f"Warning: Unable to read image - {image_path}")
        return None

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    batch = transform(img).unsqueeze(0).to(device)

    # Inference only: without no_grad() every call would build an autograd graph
    # that is thrown away immediately, wasting memory and time.
    with torch.no_grad():
        features = model(batch)

    return features.cpu().numpy().flatten()


In [None]:
image_folder = "logos"
# Sorted because os.listdir() returns entries in arbitrary order; a fixed order
# keeps the clustering input (and therefore the result) reproducible.
candidate_paths = sorted(
    os.path.join(image_folder, img)
    for img in os.listdir(image_folder)
    if img.endswith(".png")
)

# Keep each path paired with its features. Dropping failed extractions from the
# feature list alone would shift every later label onto the wrong image path.
extracted = [(path, extract_features(path)) for path in candidate_paths]
image_paths = [path for path, feat in extracted if feat is not None]
feature_list = [feat for _, feat in extracted if feat is not None]

if not feature_list:
    raise RuntimeError(f"No readable PNG images found in '{image_folder}'")

feature_array = np.array(feature_list, dtype=np.float32)

d = feature_array.shape[1]
# The index is left empty on purpose: k-means training fills it with the
# centroids, so the search below returns the nearest centroid of each image.
index = faiss.IndexFlatL2(d)

# k-means cannot produce more clusters than there are images.
num_clusters = min(150, len(feature_list))
clustering = faiss.Clustering(d, num_clusters)
clustering.train(feature_array, index)

_, labels = index.search(feature_array, 1)

labels = labels.flatten()

# Map each cluster id to the image paths assigned to it.
clusters = {i: [] for i in range(num_clusters)}
for i, label in enumerate(labels):
    clusters[int(label)].append(image_paths[i])

In [18]:
def plot_cluster(cluster_id, num_images=10):
    """Show up to ``num_images`` images of one cluster side by side.

    Args:
        cluster_id: Key into the module-level ``clusters`` mapping.
        num_images: Maximum number of images to show; also the number of
            columns of the subplot grid.

    Images that OpenCV cannot read are reported and skipped, leaving their
    slot in the row empty.
    """
    images = clusters[cluster_id][:num_images]

    plt.figure(figsize=(15, 5))
    for i, img_path in enumerate(images):
        img = cv2.imread(img_path)
        if img is None:
            # imread signals failure by returning None; cvtColor would raise on it.
            print(f"Warning: Unable to read image - {img_path}")
            continue

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(1, num_images, i + 1)
        plt.imshow(img)
        plt.axis("off")

    plt.show()

In [None]:
def save_cluster_images_to_directory(output_dir, num_images=10):
    """Write up to ``num_images`` images of every cluster below ``output_dir``.

    For each entry of the module-level ``clusters`` mapping a sub-directory
    ``cluster_<id>`` is created and its first ``num_images`` images are saved
    there as ``<id>_image_<n>.png``. Missing or unreadable source images are
    reported and skipped.

    Args:
        output_dir: Root directory of the export; created if necessary.
        num_images: Maximum number of images saved per cluster.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists() first.
    os.makedirs(output_dir, exist_ok=True)

    for cluster_id, images in clusters.items():
        cluster_dir = os.path.join(output_dir, f"cluster_{cluster_id}")
        os.makedirs(cluster_dir, exist_ok=True)

        print(f"Salvăm imagini pentru Cluster {cluster_id} - {len(images)} imagini")

        for i, img_path in enumerate(images[:num_images]):
            if not os.path.exists(img_path):
                print(f"[Eroare] Imaginea nu a fost găsită: {img_path}")
                continue

            img = cv2.imread(img_path)
            if img is None:
                print(f"[Eroare] OpenCV nu a putut încărca imaginea: {img_path}")
                continue

            output_image_path = os.path.join(cluster_dir, f"{cluster_id}_image_{i + 1}.png")

            # imread already returns BGR, which is what imwrite expects, so no
            # colour conversion is needed. imwrite reports failure by returning False.
            if not cv2.imwrite(output_image_path, img):
                print(f"[Eroare] Imaginea nu a putut fi salvată: {output_image_path}")

    print(f"Imaginile au fost salvate în directorul {output_dir}")



In [None]:
# Save up to 80 images of every cluster under output_clusters/cluster_<id>/.
output_directory = "output_clusters"
save_cluster_images_to_directory(output_directory, num_images=80)