In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [None]:
# Colab-only helper module; this cell will not run outside Google Colab.
from google.colab import files
# Upload the dataset archive into the working directory. The next cell expects
# a file named "lfw-funneled.tgz.zip" to be present after this call.
uploaded = files.upload()


In [None]:
!unzip "lfw-funneled.tgz.zip" -d lfw_funneled



Archive:  lfw-funneled.tgz.zip
  inflating: lfw_funneled/lfw-funneled.tgz  


In [None]:
!tar -xvzf lfw_funneled/lfw-funneled.tgz -C lfw_funneled


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
lfw_funneled/Guillaume_Depardieu/Guillaume_Depardieu_0001.jpg
lfw_funneled/Delphine_Chuillot/
lfw_funneled/Delphine_Chuillot/Delphine_Chuillot_0001.jpg
lfw_funneled/Paul_LeClerc/
lfw_funneled/Paul_LeClerc/Paul_LeClerc_0001.jpg
lfw_funneled/Eddie_Jordan/
lfw_funneled/Eddie_Jordan/Eddie_Jordan_0001.jpg
lfw_funneled/Lionel_Richie/
lfw_funneled/Lionel_Richie/Lionel_Richie_0001.jpg
lfw_funneled/Lionel_Richie/Lionel_Richie_0002.jpg
lfw_funneled/Jose_Viegas_Filho/
lfw_funneled/Jose_Viegas_Filho/Jose_Viegas_Filho_0001.jpg
lfw_funneled/Jose_Viegas_Filho/Jose_Viegas_Filho_0002.jpg
lfw_funneled/Roger_Mahony/
lfw_funneled/Roger_Mahony/Roger_Mahony_0001.jpg
lfw_funneled/Clay_Aiken/
lfw_funneled/Clay_Aiken/Clay_Aiken_0001.jpg
lfw_funneled/Clay_Aiken/Clay_Aiken_0002.jpg
lfw_funneled/Clay_Aiken/Clay_Aiken_0003.jpg
lfw_funneled/Clay_Aiken/Clay_Aiken_0004.jpg
lfw_funneled/Clay_Aiken/Clay_Aiken_0005.jpg
lfw_funneled/Clay_Aiken/Clay_Aiken_00

In [None]:
import os

# Root folder produced by the extraction step; each person has a sub-folder here.
base_path = "lfw_funneled/lfw_funneled"

# Read the directory listing once and report its size plus a short preview.
base_entries = os.listdir(base_path)
print("number ", len(base_entries))
print(" 5 first:", base_entries[:5])


number  5760
 5 first: ['Brian_Campbell', 'Joan_Dangerfield', 'Calista_Flockhart', 'Brad_Banks', 'Juan_Carlos_Ortega']


In [None]:
# Preprocessing pipeline applied to every image as it is loaded.
IMAGE_SIZE = (128, 128)  # (height, width) every image is resized to

# Normalize expects one mean/std entry per channel. The images are loaded via
# ImageFolder, whose default loader yields 3-channel RGB images (per the
# torchvision docs), so the statistics are spelled out for all three channels
# instead of relying on a 1-element tuple being broadcast.
CHANNEL_MEAN = (0.5, 0.5, 0.5)
CHANNEL_STD = (0.5, 0.5, 0.5)

transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),                     # resize to a fixed size
    transforms.ToTensor(),                             # PIL image -> tensor
    transforms.Normalize(CHANNEL_MEAN, CHANNEL_STD),   # (x - mean) / std per channel
])

In [None]:
# Build the labelled image dataset: ImageFolder is pointed at the extracted
# root and applies `transform` to every image it loads.
data_dir = "lfw_funneled/lfw_funneled"
dataset = datasets.ImageFolder(data_dir, transform=transform)

In [None]:
# Train / test split: 80% of the samples for training, the remainder for testing.
# A seeded generator is passed to random_split so the same samples land in the
# same split on every run; without it the split changes on each execution and
# results are not reproducible.
# NOTE(review): the split is per image, not per identity. With 5749 classes
# over 13233 images, many identities will likely appear in only one of the two
# splits -- confirm this is intended.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# Wrap both splits in DataLoaders; shuffling is enabled for training only.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Sanity report: number of classes and the size of each split.
for label, size in (
    ("classes:", len(dataset.classes)),
    ("training:", len(train_dataset)),
    ("testing:", len(test_dataset)),
):
    print(label, size)

classes: 5749
training: 10586
testing: 2647


In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Root folder for the exploratory cells that follow; same location as
# `base_path` (the extracted dataset directory defined earlier).
data_path = base_path

In [None]:
# One entry per person folder under the dataset root.
# Only sub-directories are kept: the raw listing of this folder reported 5760
# entries while ImageFolder found 5749 class folders, so some entries are not
# person folders and would break any later os.listdir() on them.
# Sorted so that the order (and e.g. persons[0]) is the same on every run.
persons = sorted(
    entry
    for entry in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, entry))
)
print("Total persons:", len(persons))


In [None]:
# Count images per person: number of directory entries in each person's folder.
# Entries of `persons` that are not directories are skipped, because calling
# os.listdir() on a plain file raises NotADirectoryError.
image_counts = {
    p: len(os.listdir(os.path.join(data_path, p)))
    for p in persons
    if os.path.isdir(os.path.join(data_path, p))
}

# Two-column table (Person, Image_Count) used by the plots that follow.
df = pd.DataFrame(list(image_counts.items()), columns=["Person", "Image_Count"])
print(df.head())

In [None]:
# Histogram of the per-person image counts (30 bins, no density curve).
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df["Image_Count"], bins=30, kde=False, ax=ax)
ax.set_title("Distribution of Images per Person")
ax.set_xlabel("Number of Images")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
import random

def show_samples(person, n=5):
    """Display up to `n` randomly chosen images from one person's folder.

    Args:
        person: Name of the sub-folder of `data_path` to sample from.
        n: Maximum number of images to show; also the number of subplot
            columns in the figure.

    Returns:
        None. Draws a matplotlib figure, or prints a short message and draws
        nothing when `person` is not a directory or no image could be read.
    """
    folder = os.path.join(data_path, person)
    if not os.path.isdir(folder):
        # The dataset root also contains entries that are not person folders.
        print(f"Skipping {person!r}: not a directory")
        return

    # Sorted so that a seeded `random` yields the same picks on every run
    # (os.listdir order is arbitrary).
    image_files = sorted(os.listdir(folder))
    samples = random.sample(image_files, min(n, len(image_files)))

    # Load first and drop unreadable files: cv2.imread returns None instead of
    # raising, and passing None to cv2.cvtColor would crash.
    images = []
    for img_file in samples:
        img = cv2.imread(os.path.join(folder, img_file))
        if img is None:
            continue
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))  # OpenCV loads BGR; matplotlib expects RGB

    if not images:
        print(f"No readable images found for {person!r}")
        return

    plt.figure(figsize=(15, 5))
    for i, img in enumerate(images):
        plt.subplot(1, n, i + 1)
        plt.imshow(img)
        plt.axis("off")
    plt.suptitle(f"Samples for {person}", fontsize=16)
    plt.show()

# Example: show samples for the first entry that is an actual person folder.
# `persons[0]` is not used directly because the listing may contain entries
# that are not directories, and indexing would fail on an empty listing.
example_person = next(
    (p for p in persons if os.path.isdir(os.path.join(data_path, p))),
    None,
)
if example_person is not None:
    show_samples(example_person)


In [None]:
# Collect the pixel height and width of every readable image in the dataset.
heights, widths = [], []

for person in persons:
    folder = os.path.join(data_path, person)
    if not os.path.isdir(folder):
        # Not a person folder; os.listdir() would raise NotADirectoryError.
        continue
    for img_file in os.listdir(folder):
        img = cv2.imread(os.path.join(folder, img_file))
        if img is not None:  # cv2.imread returns None for files it cannot decode
            h, w = img.shape[:2]
            heights.append(h)
            widths.append(w)

# One point per image: width on x, height on y.
plt.figure(figsize=(10,5))
sns.scatterplot(x=widths, y=heights, alpha=0.5)
plt.title("Image Width vs Height")
plt.xlabel("Width")
plt.ylabel("Height")
plt.show()

if widths:
    print("Average Width:", np.mean(widths))
    print("Average Height:", np.mean(heights))
else:
    # np.mean([]) would return nan and emit a RuntimeWarning.
    print("No readable images found under", data_path)


In [None]:
# Bar chart of the 20 persons with the most images.
# The axes are created explicitly and handed to DataFrame.plot via `ax=`.
# Calling plt.figure(...) first and then df.plot(...) without `ax` makes pandas
# open a second figure: the 12x6 one stays empty and the chart is drawn at the
# default size.
fig, ax = plt.subplots(figsize=(12, 6))
top_persons = df.sort_values("Image_Count", ascending=False).head(20)
top_persons.plot(
    x="Person", y="Image_Count", kind="bar", color="skyblue", legend=False, ax=ax
)
ax.set_title("Top 20 Persons with Most Images")
ax.set_ylabel("Number of Images")
ax.set_xlabel("Person")
ax.tick_params(axis="x", labelrotation=75)  # long names: rotate so they do not overlap
plt.show()
