# Data preprocessing

## Goal
The goal of this script is to prepare the data used to train the siamese neural network:
- Group all portraits together
- Crop faces from the other pictures using a face detection algorithm
- Create the training and test samples
- Create pairs of pictures:
    - Training pairs are pairs of pictures from the train sample
    - Test pairs are pairs of pictures, one from the test sample and one from the train sample

In [1]:
import os
import pandas as pd
from random import shuffle
from math import floor
from PIL import Image
import numpy as np
from shutil import copy

In [2]:
# Folder receiving every face picture used for training and testing
dataWork = "D:/Project/DeepLearning/buddhaStyle/Data/workData/"

# Folder holding the original pictures, with one sub-folder per kind of picture
# ("face/" holds the portraits; "debout/" and "tailleur/" presumably hold
# standing and seated statues -- TODO confirm)
dataRoot = "D:/Project/DeepLearning/buddhaStyle/Data/original/"
dataPortrait, dataDebout, dataTailleur = (
    dataRoot + subFolder for subFolder in ("face/", "debout/", "tailleur/")
)

## Data distribution
Check the distribution of the portraits across periods.

In [3]:
# Names of all the entries found in the portrait folder
data = os.listdir(dataPortrait)

In [4]:
# The period of a picture is the part of its file name before the first underscore
periodLabels = [name.split('_')[0] for name in data]
# Single-column table ('Period') with one row per picture
data2 = pd.DataFrame({'Period': periodLabels})

In [5]:
# Number of pictures per period
data2.groupby(['Period'])['Period'].count()

Period
Chiang%20Saen%20Kindom     1
Dvaravati%20Period        31
Lanna%20Kingdom            1
Lopburi%20Period           3
Srivichai%20Kingdom        3
Sukhothai%20Period        30
Uthong%20Art               7
Name: Period, dtype: int64

* The sample is very unbalanced: some styles have very few faces.
* Let's try to use a face detection model on the other pictures in order to detect and crop their faces.

## Face detection from other pictures

In [6]:
# Copy every portrait from dataPortrait to dataWork.
# makedirs(exist_ok=True) also creates the missing parent folders and does not
# fail when the folder already exists.
os.makedirs(dataWork, exist_ok=True)

for file in os.listdir(dataPortrait):
    source = os.path.join(dataPortrait, file)
    # os.listdir also returns sub-folders, which shutil.copy cannot copy
    if os.path.isfile(source):
        copy(source, os.path.join(dataWork, file))

In [7]:
# Face detector (MTCNN with its default settings); it is passed to extractFace
# below to locate the faces in the other pictures
from mtcnn.mtcnn import MTCNN
detector = MTCNN()

In [8]:
def extractFace(inputPath, outputPath, detector, margin=50):
    '''Crop every face detected in the pictures of a folder and save each crop.

    Parameters:
        inputPath: folder containing the pictures to analyse.
        outputPath: folder where the cropped faces are saved.
        detector: object with a ``detect_faces(array)`` method returning a list
            of dictionaries, each holding a ``'box'`` entry
            ``(x, y, width, height)``.
        margin: number of pixels added on each side of the detected box
            (default 50).

    A crop is saved under the name of its source picture. When that name is
    already taken (portrait with the same name, second face of a picture,
    previous run) it is saved as ``<name>_<i>.png`` with the first free ``i``.
    '''

    def createTuple(corner, size):
        '''Return the (left, upper, right, lower) box of the face, enlarged by
        `margin` pixels on each side and clamped to the picture size.'''
        # The clamp keeps the crop inside the picture: a box reaching outside
        # would be padded with black pixels by Image.crop.
        return (max(corner[0] - margin, 0),
                max(corner[1] - margin, 0),
                min(corner[0] + corner[2] + margin, size[0]),
                min(corner[1] + corner[3] + margin, size[1]))

    def freePath(fileName):
        '''Return a path inside outputPath that is not used by another file.'''
        candidate = os.path.join(outputPath, fileName)
        # splitext only removes the extension of the file name, so a dot in
        # the folder path is harmless and the suffixes do not pile up.
        stem = os.path.splitext(candidate)[0]
        i = 0
        while os.path.isfile(candidate):
            candidate = '{}_{}.png'.format(stem, i)
            i += 1
        return candidate

    for file in os.listdir(inputPath):
        # loop through the pictures of the folder
        source = os.path.join(inputPath, file)
        if not os.path.isfile(source):
            continue  # a sub-folder cannot be opened as a picture

        # `with` closes the file; convert() returns an independent RGB copy so
        # the detector always receives a 3-channel array
        # (NOTE(review): MTCNN is expected to take RGB input -- confirm)
        with Image.open(source) as picture:
            img = picture.convert('RGB')
        array = np.asarray(img)

        # list of detected faces, each one described by a dictionary
        faces = detector.detect_faces(array)

        for face in faces:
            box = createTuple(face['box'], img.size)
            if box[0] >= box[2] or box[1] >= box[3]:
                continue  # empty box once clamped: nothing to save
            img.crop(box).save(freePath(file))
    print( inputPath + ' has been treated')

In [9]:
# Extract the faces of both remaining picture folders into the work folder
sourceFolders = (dataDebout, dataTailleur)
for folder in sourceFolders:
    extractFace(folder, dataWork, detector)

D:/Project/DeepLearning/buddhaStyle/Data/original/debout/ has been treated
D:/Project/DeepLearning/buddhaStyle/Data/original/tailleur/ has been treated


In [10]:
# Distribution of the periods once the cropped faces are in the work folder
data = os.listdir(dataWork)

# The period is the part of the file name before the first underscore
periodLabels = [name.split('_')[0] for name in data]
data2 = pd.DataFrame({'Period': periodLabels})
data2.groupby(['Period'])['Period'].count()

Period
Chiang%20Saen%20Kindom     2
Dvaravati%20Period        50
Lanna%20Kingdom           11
Lopburi%20Period          18
Rattanakosin%20Period     25
Srivichai%20Kingdom       12
Sukhothai%20Period        76
Uthong%20Art              21
Name: Period, dtype: int64

There are not enough pictures of Chiang Saen art: those pictures are only going to be used in the test set, to check that their faces are not recognized as another style.

## Create test & train samples

In [11]:
# Pictures of the work folder. The pair lists written further down in this
# notebook (train_all.txt, train.txt, test.txt) are stored in the same folder:
# text files are skipped so they are not grouped as pictures when this cell
# is run again.
files = [elt for elt in os.listdir(dataWork) if not elt.lower().endswith('.txt')]

# Dictionary {style: [file names]}; the style is the part of the file name
# before the first underscore
data = dict()
for elt in files:
    data.setdefault(elt.split('_')[0], []).append(elt)


In [12]:
# Share of each style's pictures that goes to the train sample
trainRatio = 0.8
tr, test = dict(), dict()

for key, pictures in data.items():
    if key == 'Chiang%20Saen%20Kindom':
        # this style is kept entirely for the test sample
        test[key] = pictures
        continue

    # shuffle works in place, so the list stored in data[key] is shuffled too
    shuffle(pictures)
    n = floor(trainRatio * len(pictures))
    tr[key], test[key] = pictures[:n], pictures[n:]


## Create Pairs

**Training pairs:**
- Create a text file with all pairs of pictures

In [13]:

from itertools import combinations

# Gather the train pictures of every style and keep a random 60% of them
train = []
for k in tr.keys():
    train.extend(tr[k])

shuffle(train)
train = train[:floor(len(train)*.6)  ]

header = 'photo1,photo2,y\n'

# One line per pair of distinct pictures. y is 1 when both pictures have the
# same style (part of the file name before the first underscore), 0 otherwise.
# combinations() never pairs a picture with itself, so no trivial
# "same picture" positive pair is generated.
lines = [
    '{},{},{}\n'.format(first, second, int(first.split('_')[0] == second.split('_')[0]))
    for first, second in combinations(train, 2)
]

# `with` closes the file even when a write fails
with open(dataWork + 'train_all.txt', 'w') as file:
    file.write(header)
    file.writelines(lines)

# Randomly select 60% of those pairs
shuffle(lines)
lines = lines[:floor(0.6*len(lines))]

with open(dataWork + 'train.txt', 'w') as file:
    file.write(header)
    file.writelines(lines)

**Test pairs:**
- Create pairs with one picture from the train sample and one from the test sample
- For each picture of the test set, the script below creates 2 pairs per style.


In [14]:
# Gather the test pictures of every style
t = list()
for key in  test.keys():
    t.extend(test[key])

# Pair every test picture with two randomly chosen train pictures of each style.
# `with` closes the file even when a write fails.
with open(dataWork + 'test.txt','w') as file:
    file.write('photo1,photo2,y\n')
    for tt in t:
        for k in tr.keys():
            d = tr[k]
            shuffle(d)

            # d[:2] holds the first two shuffled pictures, or fewer when the
            # style has less than two train pictures: indexing d[1] directly
            # raises an IndexError for a style with a single train picture.
            for other in d[:2]:
                # y is 1 when both pictures have the same style, 0 otherwise
                y = int(tt.split('_')[0] == other.split('_')[0])
                file.write('{},{},{}\n'.format(tt,other,y)  )

In [15]:
# Preview the first 5 test pairs
pd.read_csv(dataWork + 'test.txt').head(5)

Unnamed: 0,photo1,photo2,y
0,Chiang%20Saen%20Kindom_954.jpg,Dvaravati%20Period_175.jpg,0
1,Chiang%20Saen%20Kindom_954.jpg,Dvaravati%20Period_143.jpg,0
2,Chiang%20Saen%20Kindom_954.jpg,Lanna%20Kingdom_968.jpg,0
3,Chiang%20Saen%20Kindom_954.jpg,Lanna%20Kingdom_974.jpg,0
4,Chiang%20Saen%20Kindom_954.jpg,Lopburi%20Period_261.jpg,0


In [16]:
# Preview the first 5 training pairs
pd.read_csv(dataWork + 'train.txt').head(5)

Unnamed: 0,photo1,photo2,y
0,Sukhothai%20Period_647.jpg,Rattanakosin%20Period_368.jpg,0
1,Sukhothai%20Period_644.jpg,Sukhothai%20Period_688.jpg,1
2,Sukhothai%20Period_644.jpg,Sukhothai%20Period_794.jpg,1
3,Uthong%20Art_950.jpg,Sukhothai%20Period_906.jpg,0
4,Lopburi%20Period_319.jpg,Dvaravati%20Period_36.jpg,0
