In [1]:
import os
import torch
import copy
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.auto import tqdm
from facenet_pytorch import MTCNN, InceptionResnetV1

# Initialize MTCNN for face detection
mtcnn = MTCNN(keep_all=True)

# Load pre-trained FaceNet model
resnet = InceptionResnetV1(pretrained='casia-webface').eval()

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 111M/111M [00:00<00:00, 208MB/s]  


In [2]:
path = os.path.join("..","in","newspapers")

In [4]:
results = []
for newspaper in os.listdir(path)[:1]:

    paper_path = sorted(os.listdir(os.path.join(path, newspaper)))

    for page in tqdm(paper_path[:10], position=0, leave=True):
        try:
            img = Image.open(os.path.join(path, newspaper, page))
        except OSError as image:
            print(f"Error processing image: {image}")

        boxes, _ = mtcnn.detect(img)
        results.append([page, boxes])

print(results)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:16<00:00,  1.65s/it]

[['JDG-1826-02-16-a-p0001.jpg', None], ['JDG-1826-02-16-a-p0002.jpg', None], ['JDG-1826-02-16-a-p0003.jpg', None], ['JDG-1826-02-16-a-p0004.jpg', None], ['JDG-1826-04-20-a-p0001.jpg', None], ['JDG-1826-04-20-a-p0002.jpg', None], ['JDG-1826-04-20-a-p0003.jpg', None], ['JDG-1826-04-20-a-p0004.jpg', None], ['JDG-1826-06-15-a-p0001.jpg', None], ['JDG-1826-06-15-a-p0002.jpg', None]]





In [6]:
results_ = results.copy()

In [7]:
# count pages w/ faces and sum of all faces

sum_of_faces = 0
face_page_sum = 0

for obj in results_:
    if obj[1] is None:
        obj[1] = 0
    else:
        sum_of_faces += len(obj[1])
        face_page_sum += 1
        obj[1] = len(obj[1])

print(f'Faces detected: {sum_of_faces}')
print(f'Pages with faces: {face_page_sum}')

Faces detected: 152
Pages with faces: 98


In [8]:
df = pd.DataFrame(results_, columns=['Pages', 'Faces freq.'])

In [86]:
# extract year from file name, calculate decade 

df['Year'] = df['Pages'].str.extract(r'-(\d{4})-').astype(int)

df['Decade'] = (df['Year'] // 10) * 10

df_sorted = df.sort_values('Decade')

grouped_df = df.groupby('Decade').agg({
    'Faces freq.':'sum',
    'Pages': 'count'
}).reset_index()

grouped_df['% of pages'] = (grouped_df['Faces freq.'] / grouped_df['Pages']) * 100
grouped_df['% of pages'] = grouped_df['% of pages'].round(2)
grouped_df

Unnamed: 0,Decade,Faces freq.,Pages,% of pages
0,1820,0,26,0.0
1,1830,1,54,1.85
2,1840,1,50,2.0
3,1850,2,48,4.17
4,1860,4,48,8.33
5,1870,5,46,10.87
6,1880,6,50,12.0
7,1890,1,52,1.92
8,1900,11,54,20.37
9,1910,6,50,12.0
