In [None]:
!pip install keras-facenet

# Mount Google Drive so files under /content/drive become readable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

import cv2
import os

# Settings for the CelebA pass: target square size and the image folder on Drive.
img_size = 160  # Size needed for FaceNet model
dataset_path = '/content/drive/MyDrive/celeba-dataset/celeba-dataset/img_align_celeba/img_align_celeba'

def load_images(dataset_path, img_size):
    """Load every readable image in ``dataset_path`` resized to a square.

    Files are visited in sorted filename order so the position of an image in
    the returned list is reproducible (``os.listdir`` alone yields entries in
    arbitrary order).  Entries OpenCV cannot decode (sub-directories,
    non-image files, corrupt images) are skipped with a message instead of
    crashing ``cv2.resize`` on ``None``.

    Note that a skipped entry shifts the positions of all later images, so
    position ``i`` only corresponds to the ``i``-th sorted filename when every
    file in the directory is readable.

    Args:
        dataset_path: Directory containing the image files.
        img_size: Target width and height in pixels.

    Returns:
        list: The resized images, in sorted filename order.
    """
    images = []
    for img_name in sorted(os.listdir(dataset_path)):
        img = cv2.imread(os.path.join(dataset_path, img_name))
        if img is None:
            # cv2.imread reports failure by returning None rather than raising.
            print(f"Skipping unreadable file: {img_name}")
            continue
        images.append(cv2.resize(img, (img_size, img_size)))
    return images

# Read and resize the whole dataset up front.
images = load_images(dataset_path, img_size)

from keras_facenet import FaceNet
import numpy as np

embedder = FaceNet()

# One embedder call per image; each call returns a batch of size one, so keep
# only its first (and only) row.
# NOTE(review): the images come from cv2.imread, i.e. BGR channel order —
# confirm that is what the embedder should receive.
embeddings = list(map(lambda image: embedder.embeddings([image])[0], images))

# Persist the embeddings next to the notebook for the later cells.
np.save('celeba_embeddings.npy', embeddings)


In [None]:
# !pip install keras-facenet
# from google.colab import drive
# drive.mount('/content/drive')

import requests
import os
import cv2
import numpy as np
from PIL import Image
from io import BytesIO
import dlib
from keras_facenet import FaceNet
from sklearn.metrics.pairwise import cosine_similarity  # Add this import

# Edge length used when resizing images (size needed for the FaceNet model).
img_size = 160

# dlib's frontal face detector, shared by the validation step.
detector = dlib.get_frontal_face_detector()

# FaceNet model used to turn images into embeddings.
embedder = FaceNet()

# Validated images are written here; create the folder if it is missing.
save_dir = '/content/drive/MyDrive/valid_images'
os.makedirs(save_dir, exist_ok=True)

def validate_image(img_cv, img_size=160, blur_threshold=100):
    """Decide whether a BGR image is a usable, sharp face photo.

    The image is resized to ``img_size`` x ``img_size`` and then checked in
    three steps: at least one face must be detected, every detected face box
    must lie fully inside the frame, and the variance of the Laplacian must
    reach ``blur_threshold``.

    Args:
        img_cv: Image array in OpenCV BGR channel order.
        img_size: Edge length the image is resized to before checking.
        blur_threshold: Minimum Laplacian variance; lower values are treated
            as blurry.  Defaults to 100, the value previously hard-coded.

    Returns:
        tuple[bool, str]: ``(is_valid, message)`` where ``message`` explains
        the decision.
    """
    # Guard against failed decodes so cv2.resize is never handed None/empty data.
    if img_cv is None or getattr(img_cv, "size", 0) == 0:
        return False, "Image is empty"

    img_resized = cv2.resize(img_cv, (img_size, img_size))

    # dlib detectors are documented for RGB (or grayscale) arrays, whereas
    # OpenCV images are BGR, so swap the channels before detecting.
    img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
    faces = detector(img_rgb, 1)

    if len(faces) == 0:
        return False, "No face detected"

    height, width = img_resized.shape[:2]
    for face in faces:
        # A box that spills over any edge means the face is partly cut off.
        if (face.left() < 0 or face.top() < 0 or
                face.right() > width or face.bottom() > height):
            return False, "Face is not centered properly"

    # Variance of the Laplacian is used as the sharpness measure.
    gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)
    laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
    if laplacian_var < blur_threshold:
        return False, "Image is blurry"

    return True, "Image is valid"

def load_existing_embeddings(embeddings_path):
    """Return the array stored at ``embeddings_path``, or an empty array.

    Args:
        embeddings_path: Path of a ``.npy`` file.

    Returns:
        numpy.ndarray: The loaded array when the file exists; otherwise an
        empty 1-D array (``np.array([])``).
    """
    if not os.path.exists(embeddings_path):
        return np.array([])

    # NOTE(review): allow_pickle=True lets a crafted .npy file run arbitrary
    # code on load — only point this at files you produced yourself.
    stored = np.load(embeddings_path, allow_pickle=True)
    return stored

def download_and_validate_images(query, num_images=10000, pose_similarity_threshold=0.8,
                                 identity_similarity_threshold=0.7, access_key=None,
                                 request_timeout=30):
    """Download Unsplash search results and keep those that pass validation.

    Each candidate is downloaded, checked with ``validate_image``, embedded
    with the module-level ``embedder`` and compared (cosine similarity)
    against the embeddings in ``celeba_embeddings.npy``.  Images whose best
    similarity reaches ``pose_similarity_threshold`` are written to
    ``save_dir`` as ``image_<n>.jpg``.  When the search ends, the stored
    embeddings followed by the newly accepted ones are saved to
    ``embeddings.npy``.

    Args:
        query: Search phrase sent to the Unsplash search endpoint.
        num_images: Stop once this many images have been accepted.
        pose_similarity_threshold: Minimum best cosine similarity against the
            stored embeddings for an image to be kept.
            NOTE(review): the name says "pose", but the value compared is a
            FaceNet embedding similarity — confirm this is the intended test.
        identity_similarity_threshold: Currently unused; kept so existing
            callers keep working.
        access_key: Unsplash API access key.  When omitted, the
            ``UNSPLASH_ACCESS_KEY`` environment variable is used, so the
            credential does not live in the notebook.
        request_timeout: Seconds to wait for each HTTP request.

    Raises:
        ValueError: If no access key is supplied or configured.
    """
    if access_key is None:
        access_key = os.environ.get('UNSPLASH_ACCESS_KEY')
    if not access_key:
        raise ValueError(
            "Unsplash access key missing: pass access_key=... or set the "
            "UNSPLASH_ACCESS_KEY environment variable"
        )

    search_url = 'https://api.unsplash.com/search/photos'
    total_downloaded = 0
    page = 1
    new_embeddings = []  # embeddings of the images accepted in this run

    # Reference embeddings: loaded once, used for filtering and for the final merge.
    celeba_embeddings = np.load('celeba_embeddings.npy')

    while total_downloaded < num_images:
        try:
            # Passing params lets requests URL-encode the query text.
            response = requests.get(
                search_url,
                params={
                    'query': query,
                    'client_id': access_key,
                    'per_page': 30,
                    'page': page,
                },
                timeout=request_timeout,
            )
        except requests.RequestException as e:
            print(f"Failed to fetch images: {e}")
            break

        if response.status_code != 200:
            print("Failed to fetch images")
            break

        results = response.json().get('results', [])
        if not results:
            print("No more images found.")
            break

        print(f"Found {len(results)} images on page {page}.")

        for img_data in results:
            if total_downloaded >= num_images:
                break

            try:
                img_url = img_data['urls']['regular']  # regular-size rendition
                img_response = requests.get(img_url, timeout=request_timeout)
                img_response.raise_for_status()

                # Force 3-channel RGB: grayscale/RGBA/CMYK files would
                # otherwise break the RGB->BGR conversion below.
                img = Image.open(BytesIO(img_response.content)).convert('RGB')
                img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

                is_valid, message = validate_image(img_cv)
                if not is_valid:
                    continue

                embedding = embedder.embeddings([img_cv])[0]

                # Keep the image only if it is close enough to some stored embedding.
                pose_similarities = cosine_similarity([embedding], celeba_embeddings)[0]
                max_pose_similarity = np.max(pose_similarities)
                if max_pose_similarity < pose_similarity_threshold:
                    continue

                img_path = os.path.join(save_dir, f'image_{total_downloaded}.jpg')
                if not cv2.imwrite(img_path, img_cv):
                    # imwrite reports failure through its return value.
                    raise IOError(f"Could not write {img_path}")

                new_embeddings.append(embedding)
                total_downloaded += 1

            except Exception as e:
                # Best effort: report the problem and move on to the next result.
                print(f"Error processing image {total_downloaded}: {e}")

        page += 1

    # Append the new embeddings to the stored ones; concatenating an empty
    # list with a 2-D array would raise, so handle that case explicitly.
    if new_embeddings:
        all_embeddings = np.concatenate(
            (celeba_embeddings, np.asarray(new_embeddings)), axis=0
        )
    else:
        all_embeddings = celeba_embeddings

    np.save('embeddings.npy', all_embeddings)
    print("All embeddings saved to embeddings.npy")

# Example run: collect up to 10,000 validated images for this search phrase.
download_and_validate_images(query='people faces portrait', num_images=10000)

def find_similar(new_image, embeddings, top_n):
    """Rank stored embeddings by cosine similarity to ``new_image``.

    Args:
        new_image: Image array passed to the module-level ``embedder``.
        embeddings: Stored embeddings, one row per image.  A single 1-D
            embedding is treated as one row.
        top_n: Number of best matches to return.  Python ints and NumPy
            integer scalars are accepted; must not be negative.

    Returns:
        tuple: ``(indices, similarities)`` — row indices of the best matches
        in descending similarity order, and their similarity scores.

    Raises:
        ValueError: If ``top_n`` is not an integer or is negative, or if
            ``embeddings`` is empty.
    """
    # np.integer covers values such as np.int64 that are not Python ints.
    if not isinstance(top_n, (int, np.integer)):
        raise ValueError(f"Expected top_n to be an int, got {type(top_n)}")
    if top_n < 0:
        # A negative slice bound would silently drop the *worst* matches instead.
        raise ValueError(f"Expected top_n to be non-negative, got {top_n}")
    top_n = int(top_n)

    embeddings = np.asarray(embeddings)
    if embeddings.size == 0:
        raise ValueError("No embeddings to compare against")
    embeddings = np.atleast_2d(embeddings)

    new_embedding = embedder.embeddings([new_image])[0]
    similarities = cosine_similarity([new_embedding], embeddings)[0]

    # Sort ascending, reverse for best-first, then keep the leading top_n.
    indices = np.argsort(similarities)[::-1][:top_n]
    return indices, similarities[indices]

# Load the query image and rank every stored embedding against it.
query_path = '/content/drive/MyDrive/valid_images/image_0.jpg'
new_img = cv2.imread(query_path)
if new_img is None:
    # cv2.imread returns None instead of raising when the file is missing or
    # unreadable; fail here with a clear message rather than inside cv2.resize.
    raise FileNotFoundError(f"Could not read query image: {query_path}")
new_img_resized = cv2.resize(new_img, (img_size, img_size))
embeddings = np.load('embeddings.npy')  # Load embeddings from file

top_n = 10  # Set this to the desired number of similar images
indices, similarities = find_similar(new_img_resized, embeddings, top_n)

# Report the best matches, most similar first.
for idx, similarity in zip(indices, similarities):
    print(f"Image {idx} is {similarity * 100:.2f}% similar")


Found 30 images on page 1.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━

In [None]:


def show_images(indices, similarities, dataset_path, top_n):
    """Display the matched images with their similarity scores.

    Each index ``idx`` is mapped to the file ``<idx + 1, zero-padded to six
    digits>.jpg`` inside ``dataset_path`` (CelebA naming, e.g. ``000001.jpg``).
    Indices with no such file are reported and skipped.

    Args:
        indices: Zero-based image indices, best match first.
        similarities: Similarity scores aligned with ``indices``.
        dataset_path: Directory holding the numbered ``.jpg`` files.
        top_n: Maximum number of images to show; ``None`` shows all pairs.
    """
    # No pyplot import is visible at notebook level, so bring it in here;
    # without it every plt.* call below raises NameError.
    import matplotlib.pyplot as plt

    pairs = list(zip(indices, similarities))
    if top_n is not None:
        pairs = pairs[:max(int(top_n), 0)]

    for idx, similarity in pairs:
        img_path = os.path.join(dataset_path, f"{idx + 1:06d}.jpg")  # CelebA filenames are like 000001.jpg
        img = cv2.imread(img_path)

        if img is None:
            # Report the same 1-based number used for the filename and the title.
            print(f"Image {idx + 1} not found at {img_path}.")
            continue

        # OpenCV loads BGR; matplotlib expects RGB.
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Display the image and its similarity score
        plt.figure(figsize=(5, 5))
        plt.imshow(img_rgb)
        plt.title(f"Image {idx + 1} ({similarity * 100:.2f}% similar)")
        plt.axis('off')
        plt.show()
# Render the best matches, resolving indices against the CelebA image folder.
dataset_path = '/content/drive/MyDrive/celeba-dataset/celeba-dataset/img_align_celeba/img_align_celeba'
show_images(indices=indices, similarities=similarities, dataset_path=dataset_path, top_n=top_n)
