In [5]:
import os
import pickle
import gdown
import tqdm
from PIL import Image
import numpy as np


### Download the lfwa dataset

<ul>
    <li>Labeled Faces in the wild</li>
    <li>It contains pictures of 5,749 people</li>
    <li>Total number of images: 13,233</li>
    <li>In the lfwa version of the dataset, all images are greyscale and aligned, which helps improve the performance of the model
    </li>
</ul>

In [None]:
# Google Drive link to the zipped lfwa dataset
url = 'https://drive.google.com/u/0/uc?id=1p1wjaqpTh_5RHfJu4vUh8JJCdKwYMHCp&export=download'

# Download the archive into the current directory.
# The output name is given explicitly so the file is always saved as
# 'lfwa.zip' (the name the unzip step expects) instead of depending on
# whatever file name the server reports.
gdown.download(url, output='lfwa.zip')

In [None]:
# Extract the downloaded archive into the current working directory.
# NOTE(review): the loading code in this notebook reads images from
# ./dataset/lfw2/ -- confirm the archive unpacks to that layout.
!unzip lfwa.zip

<ul>
    <li>train.txt --> contains 2200 image pairs: 1100 pairs show the same person and 1100 pairs show different people
    </li>
    <li>test.txt --> contains 1000 image pairs: 500 pairs show the same person and 500 pairs show different people</li>
</ul>

In [29]:
# Split names; each one maps to <datapath>/splits/<name>.txt
train, test = 'train', 'test'

# Target size (in pixels) every image is resized to
width = height = 105

# Size of the trailing axis added when an image array is reshaped
cells = 1

# Root folder of the dataset
datapath = './dataset/'

In [40]:

def _open_image(path):
    """
    Open the image at `path`, resize it and return it as a numpy array.

    The image is resized to (width, height) -- the module-level settings,
    105x105 like in the paper (the dataset contains 250x250 images).

    path - location of an image file PIL can open (the dataset uses .jpg files).

    Returns a new numpy array of dtype float64 holding the resized pixel values.
    """
    # Image.open is lazy and keeps the underlying file open; the context
    # manager guarantees the handle is released even if resizing fails.
    with Image.open(path) as image:
        # resize() returns an independent in-memory image, so it stays valid
        # after the source file has been closed.
        resized = image.resize((width, height))
    # astype always returns a fresh float64 copy, so one conversion is enough.
    return np.asarray(resized).astype('float64')

def convert_image_to_array(person, image_num, data_path, predict=False):
    """
    Given a person, image number and datapath, returns a numpy array which represents the image.

    person - name of the person; also the name of the folder under <data_path>/lfw2/.
    image_num - index of the image (string or int). It is zero-padded to four digits to build
                the file name <person>_<NNNN>.jpg.
    data_path - root folder that contains the 'lfw2' directory.
    predict - whether this function is called for prediction. When False (the default) the
              array is reshaped to (height, width, cells) so it carries an explicit trailing
              axis; when True the array is returned exactly as produced by _open_image.

    Returns the image as a numpy array.
    """
    max_zeros = 4
    # zfill pads on the left without ever truncating, so an over-long number
    # cannot silently resolve to a different image file; str() lets callers
    # pass the index as an int as well.
    image_num = str(image_num).zfill(max_zeros)
    image_path = os.path.join(data_path, 'lfw2', person, f'{person}_{image_num}.jpg')
    image_data = _open_image(image_path)
    if not predict:
        # Arrays built from PIL images are laid out rows first, i.e.
        # (height, width), so the reshape must keep that axis order.
        image_data = image_data.reshape(height, width, cells)
    return image_data

def load(set_name,datapath):
    """
    Reads the image pairs listed in <datapath>/splits/<set_name>.txt and writes them, together
    with their labels, into <datapath>/<set_name>.pickle.

    set_name - name of the split file without extension (train or test).
    datapath - root folder of the dataset; it holds the 'splits' folder and the 'lfw2' image
               folder, and the pickle file is written into it.

    Each line of the split file is split on whitespace:
      4 fields: person_a, image_num_a, person_b, image_num_b -> class 0 (non-identical)
      3 fields: person, image_num_a, image_num_b             -> class 1 (identical)
    Lines with any other number of fields are skipped.

    The pickle contains [[x_first, x_second], y, names], where x_first / x_second are the lists
    of image arrays, y the list of labels and names the list of parsed line fields.
    """
    file_path = os.path.join(datapath, 'splits', f'{set_name}.txt')
    output_path = os.path.join(datapath,f'{set_name}.pickle')
    print(file_path)
    print('Loading dataset...')
    x_first = []
    x_second = []
    y = []
    names = []
    # Read all lines up front so the progress bar knows the total.
    with open(file_path, 'r') as file:
        lines = file.readlines()
    for line in tqdm.tqdm(lines):
        fields = line.split()
        if len(fields) == 4:  # Class 0 - non-identical
            first_person, first_image_num, second_person, second_image_num = fields
            label = 0
        elif len(fields) == 3:  # Class 1 - identical
            first_person, first_image_num, second_image_num = fields
            second_person = first_person
            label = 1
        else:
            # Not a pair description, nothing to load for this line.
            continue
        names.append(fields)
        # Images are read from the dataset root passed by the caller rather
        # than from a hard-coded folder, so the parameter is honoured.
        x_first.append(convert_image_to_array(person=first_person,
                                              image_num=first_image_num,
                                              data_path=datapath))
        x_second.append(convert_image_to_array(person=second_person,
                                               image_num=second_image_num,
                                               data_path=datapath))
        y.append(label)
    print('Done loading dataset')
    with open(output_path, 'wb') as f:
        pickle.dump([[x_first, x_second], y, names], f)

In [41]:
# Build the pickle file for the training split.
# Prerequisite: train.txt and test.txt must already be inside the splits folder of the dataset.
load(set_name=train, datapath=datapath)

./dataset/splits/train.txt
Loading dataset...


100%|█████████████████████████████████████| 2201/2201 [00:01<00:00, 1128.33it/s]


Done loading dataset


In [42]:
# Build the pickle file for the test split.
load(set_name=test, datapath=datapath)

./dataset/splits/test.txt
Loading dataset...


100%|█████████████████████████████████████| 1001/1001 [00:00<00:00, 1104.46it/s]


Done loading dataset
