# Data Processing

Importing required libraries

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import mtcnn
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow as tf
import imutils
import logging
tf.get_logger().setLevel(logging.ERROR)
import glob
import cv2
import pandas as pd
# Haar-cascade frontal-face detector, loaded from the XML file that ships with
# OpenCV (cv2.data.haarcascades). detect_and_crop_faces uses it to locate faces.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

### Function to crop the Images and prepare Training Dataset

In [3]:
# Filenames for which no crop was written: either no face was detected or the
# source image could not be read.
not_found_faces = []


def detect_and_crop_faces(source_path, filename, margin, dtype, datasetfor):
    """Detect the largest face in an image, crop it with a margin and save it.

    The image is resized to a width of 200 px before detection. When no face
    is detected, the resized image is written to ``<datasetfor>/Rejected`` and
    ``filename`` is appended to ``not_found_faces``. Output directories are
    created on demand.

    Parameters
    ----------
    source_path : str
        Path of the image to read.
    filename : str
        File name used for the image that is written out.
    margin : int or float
        Extra border around the detected face box, as a percentage of the
        box's width/height, added on every side and clipped to the image.
    dtype : str
        Sub-directory of ``datasetfor`` that receives the crop (the notebook
        passes "Train" or "Test").
    datasetfor : str
        Root output directory.

    Returns
    -------
    None
    """
    image = cv2.imread(source_path, 1)
    if image is None:
        # cv2.imread returns None (it does not raise) when the file is missing
        # or cannot be decoded; record it and skip instead of aborting the run.
        not_found_faces.append(filename)
        return None

    image = imutils.resize(image, width=200)
    img_h, img_w = image.shape[:2]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.2, 10, minSize=(50, 50))

    if len(faces) == 0:
        rejected_dir = os.path.join(datasetfor, 'Rejected')
        # cv2.imwrite does not create missing directories, so make sure it exists.
        os.makedirs(rejected_dir, exist_ok=True)
        cv2.imwrite(os.path.join(rejected_dir, filename), image)
        not_found_faces.append(filename)
        return None

    # Keep the detection with the largest area (first one wins on ties).
    X, Y, w, h = max(faces, key=lambda box: box[2] * box[3])

    # Grow the box by `margin` percent on each side, clipped to the image bounds.
    p = margin / 100
    y1 = max(0, Y - int(p * h))
    y2 = min(Y + h + int(p * h), img_h)
    x1 = max(0, X - int(p * w))
    x2 = min(X + w + int(p * w), img_w)

    destination_dir = os.path.join(datasetfor, dtype)
    os.makedirs(destination_dir, exist_ok=True)
    cv2.imwrite(os.path.join(destination_dir, filename), image[y1:y2, x1:x2])

This function takes the path of the original image, the filename of the image, the margin, the split type (i.e. Train/Test), and the output dataset directory (`datasetfor`).

Margin here is the extra border kept around the detected face, expressed as a percentage of the face box's width and height and added on every side, so that important data like hair is preserved.

Note: The same code snippet is executed for the Smile data and the Wavy Hair data. The Smile data doesn't require the extra margin, so the margin is passed as zero. For the Wavy Hair model, I've used a margin of 30% so that important information is preserved. For both variants, the `datasetfor` argument specifies the directory in which the cropped images are saved.

In [4]:
#Running the Function for all the images of the dataset

# glob returns paths in arbitrary order; sorting makes the index-based
# train/test split below reproducible from run to run.
files = sorted(glob.glob("img_align_celeba//img_align_celeba//*.jpg"))
for i, path in enumerate(tqdm(files)):
    # Take the file name from the path itself rather than a fixed character offset.
    filename = os.path.basename(path)
    # The first 140000 images form the training split, the remainder the test split.
    split = "Train" if i < 140000 else "Test"
    # Margin 0: the smile data uses the bare face box.
    detect_and_crop_faces(path, filename, 0, split, "Smile_Data")

  0%|          | 0/202599 [00:00<?, ?it/s]

In [2]:
#Preparing the DataFrame for the model
# Load the attribute table: one row per image_id and one column per attribute
# (see the preview a few cells below).
data = pd.read_csv("list_attr_celeba.csv")

In [8]:
# Recode every -1 in the attribute table as 0. Rebinding to the returned frame
# (instead of inplace=True) keeps the cell's effect explicit and chainable.
data = data.replace(-1, 0)

In [7]:
# Preview the first 10 rows of the attribute table.
data.head(10)

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,0,1,1,0,0,0,0,0,0,...,0,1,1,0,1,0,1,0,0,1
1,000002.jpg,0,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
2,000003.jpg,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
3,000004.jpg,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,1,1,0,1
4,000005.jpg,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
5,000006.jpg,0,1,1,0,0,0,1,0,0,...,0,0,0,1,1,0,1,0,0,1
6,000007.jpg,1,0,1,1,0,0,1,1,1,...,0,0,1,0,0,0,0,0,0,1
7,000008.jpg,1,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
8,000009.jpg,0,1,1,0,0,1,1,0,0,...,0,1,0,0,1,0,1,0,0,1
9,000010.jpg,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1


In [14]:
# Collect the cropped training images. The pattern is built with os.path.join so
# it matches on any OS (hard-coded backslashes only match on Windows), and the
# result is sorted because glob returns paths in arbitrary order.
file_train = sorted(glob.glob(os.path.join("Smile_Data", "Train", "*.jpg")))
map_df = pd.DataFrame({"image_id": file_train})

In [15]:
def func(x):
    """Return the last component (file name) of the path string ``x``.

    Both backslash and forward-slash separators are accepted, so the result
    is the same whichever OS produced the path. A string without any
    separator is returned unchanged.
    """
    return x.replace("\\", "/").rsplit("/", 1)[-1]
# Reduce each path to its file name so image_id can be matched against the
# image_id column of the attribute table.
map_df["image_id"] = map_df["image_id"].apply(func)

In [16]:
# Inner-join the train-split file names with the attribute table on image_id,
# keeping only the images present in both.
map_df = pd.merge(map_df, data, how="inner", on="image_id")

In [20]:
# Keep only the file name and the Smiling label, recode any -1 as 0,
# and write the result to smile_data.csv without the index column.
df = map_df[["image_id", "Smiling"]].replace(-1, 0)
df.to_csv("smile_data.csv", index=False)

In [21]:
# Preview the first rows of the Smiling label table that was just saved.
df.head()

Unnamed: 0,image_id,Smiling
0,000001.jpg,1
1,000002.jpg,1
2,000005.jpg,0
3,000006.jpg,0
4,000007.jpg,0


#### Generating the Dataset for Wavy Hair - 30% margin face-cropped Dataset

In [11]:
# glob returns paths in arbitrary order; sorting makes the index-based
# train/test split below reproducible from run to run.
files = sorted(glob.glob("img_align_celeba//img_align_celeba//*.jpg"))
for i, path in enumerate(tqdm(files)):
    # Take the file name from the path itself rather than a fixed character offset.
    filename = os.path.basename(path)
    # The first 140000 images form the training split, the remainder the test split.
    split = "Train" if i < 140000 else "Test"
    # 30% margin so the crop keeps the hair around the face.
    detect_and_crop_faces(path, filename, 30, split, "Cropped_Data")

In [4]:
#Preparing the DataFrame for the model
# Reload the attribute table from list_attr_celeba.csv for the Wavy Hair / gender labels.
data = pd.read_csv("list_attr_celeba.csv")

In [5]:
# Recode every -1 in the attribute table as 0. Rebinding to the returned frame
# (instead of inplace=True) keeps the cell's effect explicit and chainable.
data = data.replace(-1, 0)

In [11]:
# Collect the 30%-margin training crops. The pattern is built with os.path.join
# so it matches on any OS (hard-coded backslashes only match on Windows), and
# the result is sorted because glob returns paths in arbitrary order.
file_train = sorted(glob.glob(os.path.join("Cropped_Data", "Train", "*.jpg")))
map_df = pd.DataFrame({"image_id": file_train})

In [12]:
def func(x):
    """Return the last component (file name) of the path string ``x``.

    Both backslash and forward-slash separators are accepted, so the result
    is the same whichever OS produced the path. A string without any
    separator is returned unchanged.
    """
    return x.replace("\\", "/").rsplit("/", 1)[-1]
# Reduce each path to its file name so image_id can be matched against the
# image_id column of the attribute table.
map_df["image_id"] = map_df["image_id"].apply(func)

In [13]:
# Inner-join the train-split file names with the attribute table on image_id,
# keeping only the images present in both.
map_df = pd.merge(map_df, data, how="inner", on="image_id")

In [9]:
# Keep only the file name and the Wavy_Hair label and write them to
# wavy_data.csv without the index column.
df = map_df[["image_id", "Wavy_Hair"]].copy()
df.to_csv("wavy_data.csv", index=False)

In [12]:
# Preview the first rows of the Wavy_Hair label table that was just saved.
df.head()

Unnamed: 0,image_id,Wavy_Hair
0,000001.jpg,0
1,000002.jpg,0
2,000005.jpg,0
3,000006.jpg,1
4,000007.jpg,0


In [14]:
# Keep only the file name and the Male label and write them to
# gender_data.csv without the index column.
df = map_df[["image_id", "Male"]].copy()
df.to_csv("gender_data.csv", index=False)

In [15]:
# Preview the first rows of the Male (gender) label table that was just saved.
df.head()

Unnamed: 0,image_id,Male
0,000001.jpg,0
1,000002.jpg,0
2,000005.jpg,0
3,000006.jpg,0
4,000007.jpg,1
