# Face Data Collection

### Imports

In [1]:
import pandas as pd
import yfinance as yf
import os
import numpy as np
import pickle
import face_recognition

### Constants

In [50]:
# Face dataset locations (under the 'lfw-dataset' directory), resolved against the
# working directory at the time this cell runs.
# CSV read by filter_out_people; it is expected to provide 'name' and 'images' columns.
ALL_NAMES_CSV = os.path.join(os.getcwd(), 'lfw-dataset/lfw_allnames.csv')
# Root folder of the images; files are looked up as <name>/<name>_NNNN.jpg beneath it.
IMAGE_BASE_DIR = os.path.join(os.getcwd(), 'lfw-dataset/lfw-deepfunneled/lfw-deepfunneled/')
# Destination CSV for the table of computed face embeddings.
FACE_DATA_CSV = os.path.join(os.getcwd(), 'face_data.csv')

In [3]:
def filter_out_people(min_images=5):
    """Load the all-names CSV and keep the rows with enough images.

    Parameters
    ----------
    min_images : int, default 5
        Smallest value of the 'images' column a row may have to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of the CSV at ALL_NAMES_CSV whose 'images' value is at
        least ``min_images``. Row labels from the CSV are left untouched.
    """
    all_people = pd.read_csv(ALL_NAMES_CSV)
    has_enough_images = all_people['images'] >= min_images
    return all_people.loc[has_enough_images]

In [4]:
# Keep only the people listed with at least 5 images.
people_names_df = filter_out_people(5)

In [5]:
# (rows, columns) of the filtered table.
people_names_df.shape

(423, 2)

In [6]:
# Sum of the 'images' column over the kept rows.
people_names_df['images'].sum()

5985

In [16]:
def collect_data_paths(people_names_df, base_dir=None):
    """Expand a per-person image count into one row per image file path.

    For every row of ``people_names_df`` the file names
    ``<name>_0001.jpg`` ... ``<name>_NNNN.jpg`` are generated (NNNN being the
    row's 'images' value) and joined as ``<base_dir>/<name>/<file name>``.
    The files themselves are not opened or checked for existence.

    Parameters
    ----------
    people_names_df : pandas.DataFrame
        Must provide a 'name' column and an 'images' column whose values
        can be converted with ``int``. The frame is not modified.
    base_dir : str, optional
        Folder that contains one sub-folder per name. When omitted, the
        notebook-level IMAGE_BASE_DIR is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'name' and 'file_path', one row per generated path, in the
        input's row order, with a fresh 0..n-1 index. A row whose 'images'
        value is 0 contributes no output rows.
    """
    if base_dir is None:
        # Resolved at call time so the default follows the notebook constant.
        base_dir = IMAGE_BASE_DIR

    person_names = people_names_df['name'].tolist()
    image_counts = people_names_df['images'].tolist()

    # Building the flat records directly avoids the list-valued column plus
    # explode(), which turns an empty list into a row with a missing path.
    records = [
        (name, os.path.join(base_dir, name, "{}_{:04d}.jpg".format(name, number)))
        for name, count in zip(person_names, image_counts)
        for number in range(1, int(count) + 1)
    ]
    return pd.DataFrame(records, columns=['name', 'file_path'])

In [17]:
# Build one (name, file_path) row per expected image file of the kept people.
image_path_df = collect_data_paths(people_names_df)

In [28]:
# Display the generated (name, file_path) table.
image_path_df

Unnamed: 0,name,file_path
0,Abdullah_Gul,/Users/david/ColumbiaCS/AA/w4995-project1/data...
1,Abdullah_Gul,/Users/david/ColumbiaCS/AA/w4995-project1/data...
2,Abdullah_Gul,/Users/david/ColumbiaCS/AA/w4995-project1/data...
3,Abdullah_Gul,/Users/david/ColumbiaCS/AA/w4995-project1/data...
4,Abdullah_Gul,/Users/david/ColumbiaCS/AA/w4995-project1/data...
...,...,...
5980,Zinedine_Zidane,/Users/david/ColumbiaCS/AA/w4995-project1/data...
5981,Zinedine_Zidane,/Users/david/ColumbiaCS/AA/w4995-project1/data...
5982,Zinedine_Zidane,/Users/david/ColumbiaCS/AA/w4995-project1/data...
5983,Zinedine_Zidane,/Users/david/ColumbiaCS/AA/w4995-project1/data...


In [85]:
def encode_from_filepath(filepath):
    """Compute a face encoding for the image stored at ``filepath``.

    The file is loaded with ``face_recognition.load_image_file`` and handed
    to ``face_recognition.face_encodings``.

    Returns
    -------
    The first element of the list returned by ``face_encodings``, or None
    when that list is empty.
    """
    found = face_recognition.face_encodings(
        face_recognition.load_image_file(filepath)
    )
    # Only the first encoding is used, even if several were returned.
    return found[0] if len(found) >= 1 else None

def collect_encode_images(image_path_df):
    """Attach a face encoding to every row and drop rows without one.

    Parameters
    ----------
    image_path_df : pandas.DataFrame
        Must provide a 'file_path' column; each value is passed to
        ``encode_from_filepath``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with an added 'embedding' column, restricted to
        the rows whose embedding is not null. The index is reset without
        ``drop``, so the previous index is kept as a regular column.
    """
    with_embeddings = image_path_df.copy()
    encodings = with_embeddings['file_path'].apply(encode_from_filepath)
    with_embeddings['embedding'] = encodings
    has_encoding = with_embeddings['embedding'].notnull()
    return with_embeddings.loc[has_encoding].reset_index()

In [None]:
# Encode every listed image; rows for which no encoding was produced are dropped.
embedding_df = collect_encode_images(image_path_df)

In [None]:
# (rows, columns) of the embedding table.
embedding_df.shape

(5965, 4)

In [None]:
# Write the embedding table to FACE_DATA_CSV.
# NOTE(review): the 'embedding' values appear to be array objects (elsewhere in
# this notebook they are subtracted and passed to np.linalg.norm), so the CSV
# would hold their text form rather than numeric columns — confirm that whatever
# reads this file parses them back as intended.
embedding_df.to_csv(FACE_DATA_CSV)

In [None]:
# Print the distance from the embedding at index label 0 to the embeddings at
# index labels 1, 2 and 3, one value per line.
for other_label in (1, 2, 3):
    print(np.linalg.norm(embedding_df['embedding'][0]-embedding_df['embedding'][other_label]))

0.4681664639667913
0.4363866329084555
0.44419430000228627


In [93]:
# Distance between the embeddings at index labels 0 and 5960.
# NOTE(review): presumably these two rows belong to different people, as a
# contrast to the comparisons among the first rows — confirm.
print(np.linalg.norm(embedding_df['embedding'][0]-embedding_df['embedding'][5960]))

0.8551450599176703
