# VGG Face - 2
* Download dataset.
* Align dataset images.


# Mount Google Drive.

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read from and write to paths under this mount point.
drive.mount('/content/drive')

# Set the root directory.

In [0]:
import os

# Make /content/ the working directory: the relative paths used by later
# cells (downloaded archives, train/, test/) resolve against it.
root_dir = '/content/'
os.chdir(root_dir)

# List what is already present in the working directory.
!ls -al

# Download train and test dataset splits.

In [0]:
import requests
import getpass
import sys

In [0]:
# Endpoints of the VGG Face 2 download site. Archives are selected through the
# `fname` query parameter; the download cell derives the local file name from
# the text after the '='.
# NOTE(review): these are plain-HTTP URLs, so the credentials posted to
# login_url are not encrypted in transit - check whether the site offers HTTPS.
_vgg_face2_site = 'http://zeus.robots.ox.ac.uk/vgg_face2'
_get_file_prefix = _vgg_face2_site + '/get_file?fname='

login_url = _vgg_face2_site + '/login/'
test_dataset_url = _get_file_prefix + 'vggface2_test.tar.gz'
train_dataset_url = _get_file_prefix + 'vggface2_train.tar.gz'

In [0]:
# Ask for the account credentials interactively so they are never stored in
# the notebook; getpass keeps the password from being echoed.
print('Please enter your VGG Face 2 credentials:')
user_string = input('    User: ')
password_string = getpass.getpass(prompt='    Password: ')

### Use this code block to download the train dataset split.

In [0]:
# Select the train archive for the download cell below.
# Run either this cell or the test-split cell that follows; if both are
# executed in order, the later assignment wins.
dataset_url = train_dataset_url

### OR

### Use this code block to download the test dataset split.

In [0]:
# Select the test archive for the download cell below.
# This overrides any earlier assignment of dataset_url.
dataset_url = test_dataset_url

In [0]:
# Form fields for the login request; the CSRF token is added to this dict
# once the login page has been fetched.
payload = dict(username=user_string, password=password_string)

In [0]:
# Use one session for every request so the cookies set by the login page are
# sent back automatically with the login POST and the download.
session = requests.session()
r = session.get(login_url)
# Stop on an HTTP error here; otherwise a failed page load would surface
# below as a misleading "Unable to locate CSRF token." error.
r.raise_for_status()

# The CSRF cookie is accepted under either of these names; the first one
# present in the session's cookie jar is used.
for csrf_cookie_name in ('csrftoken', 'csrf'):
    if csrf_cookie_name in session.cookies:
        csrftoken = session.cookies[csrf_cookie_name]
        break
else:
    raise ValueError("Unable to locate CSRF token.")

# The token is submitted as a form field together with the credentials.
payload['csrfmiddlewaretoken'] = csrftoken

In [0]:
# Log in; the session keeps whatever cookies the server sets for the download.
r = session.post(login_url, data=payload)
r.raise_for_status()

# The archive name is the value of the `fname` query parameter.
filename = dataset_url.split('=')[-1]

print('downloading -', filename)
# Issue the request before creating the output file, so that a rejected
# request does not leave an empty or bogus "archive" on disk.
request = session.get(dataset_url, data=payload, stream=True)
try:
    request.raise_for_status()

    # An HTML response is never the tar.gz archive. NOTE(review): presumably
    # this is what a rejected login produces - confirm against the server.
    if 'text/html' in request.headers.get('Content-Type', ''):
        raise ValueError("Server returned an HTML page instead of the archive; check the credentials.")

    bytes_written = 0
    with open(filename, "wb") as file:
        # Stream in 512 KiB chunks so the archive is never held in memory.
        for chunk in request.iter_content(chunk_size=512*1024):
            if chunk:
                file.write(chunk)
                bytes_written += len(chunk)
finally:
    # Release the streamed connection even when the download fails part-way.
    request.close()

print('done')

In [0]:
# List the working directory to confirm the archive is present and check its size.
!ls -al

# Compute MD5 sum to check integrity.

### Check MD5 sum for train dataset split.

In [0]:
!md5sum vggface2_train.tar.gz # MD5 - 88813c6b15de58afc8fa75ea83361d7f

### Check MD5 sum for test dataset split.

In [0]:
!md5sum vggface2_test.tar.gz # MD5 - bb7a323824d1004e14e00c23974facd3

# Extract train and test dataset splits.

### Extract train dataset split.

In [0]:
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n000*' # 'train/n000000'
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n001*' # 'train/n000000'

In [0]:
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n002*' # 'train/n000000'
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n003*' # 'train/n000000'

In [0]:
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n004*' # 'train/n000000'
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n005*' # 'train/n000000'

In [0]:
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n006*' # 'train/n000000'
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n007*' # 'train/n000000'

In [0]:
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n008*' # 'train/n000000'
!tar -xzf vggface2_train.tar.gz --wildcards --no-anchored 'train/n009*' # 'train/n000000'

In [0]:
# Count the directories directly under train/ (long-listing lines that start with 'd').
!ls -l train | grep ^d | wc -l

In [0]:
#!rm -rf train

### Extract test dataset split.

In [0]:
!tar -xzf vggface2_test.tar.gz

# Align train or test dataset splits (run only one of the next two configuration cells).

In [0]:
# Paths for aligning the TEST split: images are read from the local
# extraction, bounding boxes from the CSV on Google Drive. The target
# directory is only used when overwrite_source is False.
_drive_vggface2_dir = '/content/drive/My Drive/datasets/vggface2'

source_image_dir = '/content/test'
target_image_dir = _drive_vggface2_dir + '/aligned_images/test'
bounding_box_filename = _drive_vggface2_dir + '/bbox_landmark/loose_bb_test.csv'

In [0]:
# Paths for aligning the TRAIN split: images are read from the local
# extraction, bounding boxes from the CSV on Google Drive. The target
# directory is only used when overwrite_source is False.
_drive_vggface2_dir = '/content/drive/My Drive/datasets/vggface2'

source_image_dir = '/content/train'
target_image_dir = _drive_vggface2_dir + '/aligned_images/train'
bounding_box_filename = _drive_vggface2_dir + '/bbox_landmark/loose_bb_train.csv'

In [0]:
# Read every line of the bounding-box CSV (header included) into memory.
# The context manager closes the file handle as soon as the lines are read,
# instead of leaving it open for the rest of the kernel session.
with open(bounding_box_filename, 'rt') as bounding_box_file:
    source_image_bboxes = bounding_box_file.readlines()
print(len(source_image_bboxes))

### Define letterbox resize function.

In [0]:
import numpy as np

def letterbox_image(input_image, target_size, fill_value=128):
    """Resize an image to fit inside ``target_size`` and pad the remainder.

    The image is scaled uniformly (aspect ratio preserved) with bicubic
    interpolation so that it fits entirely inside the target, then centred
    on a canvas filled with ``fill_value``.

    Args:
        input_image: Array whose first two axes are (height, width). The
            result is pasted into a 3-channel canvas, so a 3-channel image
            is expected.
        target_size: ``(target_height, target_width)`` of the output.
        fill_value: Value used for the padding; defaults to mid-grey 128.

    Returns:
        A ``uint8`` array of shape ``(target_height, target_width, 3)``.

    Raises:
        ValueError: If ``input_image`` has zero height or width.
    """
    image_height, image_width = input_image.shape[0:2]
    target_height, target_width = target_size

    if image_height == 0 or image_width == 0:
        raise ValueError("Cannot letterbox an image with zero height or width.")

    # The smaller of the two ratios makes the whole image fit in the target.
    scale = min(target_width/image_width, target_height/image_height)
    # Clamp to one pixel: for extreme aspect ratios the truncation yields 0,
    # which cv2.resize rejects.
    new_width = max(int(image_width*scale), 1)
    new_height = max(int(image_height*scale), 1)

    scaled_image = cv2.resize(input_image, (new_width, new_height), interpolation=cv2.INTER_CUBIC)

    output_image = np.full((target_height, target_width, 3), fill_value, dtype=np.uint8)

    # Centre the scaled image; any odd leftover pixel goes to the bottom/right.
    start_x = (target_width - new_width) // 2
    start_y = (target_height - new_height) // 2
    output_image[start_y:(start_y + new_height), start_x:(start_x + new_width),:] = scaled_image

    return(output_image)

### Align train and test dataset splits.

In [0]:
# True: each aligned image is written back over its source file.
# False: aligned images are written under target_image_dir instead, and
# images whose target file already exists are skipped.
overwrite_source = True

In [0]:
import cv2
from google.colab.patches import cv2_imshow

# Fraction of each image dimension added around the face box as margin
# (half on each side). NOTE(review): presumably chosen to match a 256 -> 224
# crop ratio - confirm.
image_offset_scale = (256. - 224.) / 256.
# Number of CSV rows whose source image was found on disk.
image_count = 0

# NOTE: with overwrite_source=True this cell is not idempotent - a second run
# applies the original bounding boxes to the already-cropped images.

# The first CSV line is the header; every other line is expected to hold
# "<class>/<image name>",x,y,width,height.
for source_image_bbox in source_image_bboxes[1:]:

  data = source_image_bbox.strip().split(',')
  if(len(data) < 5):
    # Blank or truncated line - nothing to align.
    continue

  image_filename = data[0].strip('"') + '.jpg'
  source_image_path = os.path.join(source_image_dir, image_filename)

  # Only part of the dataset may be extracted at a time; skip the rest.
  if(not os.path.exists(source_image_path)):
    continue

  image_count = image_count + 1

  class_name = image_filename.split('/')[0]
  print(class_name, image_filename, source_image_path)

  if(overwrite_source):
    target_image_path = source_image_path
  else:
    target_class_dir = os.path.join(target_image_dir, class_name)
    # image_filename already starts with the class directory, so only its
    # base name is appended; joining the full name duplicated the class
    # component and pointed at a directory that was never created.
    target_image_path = os.path.join(target_class_dir, os.path.basename(image_filename))
    if(os.path.exists(target_image_path)):
      continue
    # Also creates target_image_dir itself when it does not exist yet.
    os.makedirs(target_class_dir, exist_ok=True)

  input_image = cv2.imread(source_image_path, cv2.IMREAD_COLOR)
  if(input_image is None):
    # cv2.imread signals an unreadable or corrupt file by returning None.
    print('skipping unreadable image -', source_image_path)
    continue
  image_height, image_width = input_image.shape[0:2]

  # Clamp the annotated box to the image.
  # NOTE(review): a negative x/y is clamped to 0 without shrinking the
  # width/height accordingly - confirm this is intended.
  start_x = max(int(data[1]), 0)
  start_y = max(int(data[2]), 0)
  bbox_width = min(int(data[3]), image_width)
  bbox_height = min(int(data[4]), image_height)

  # Margin added on each side of the box, proportional to the image size.
  image_y_offset = int(image_height * image_offset_scale) / 2.
  image_x_offset = int(image_width * image_offset_scale) / 2.

  start_x_offset = int(max(start_x - image_x_offset, 0))
  start_y_offset = int(max(start_y - image_y_offset, 0))
  bbox_width_offset = int(min(bbox_width + 2 * image_x_offset, image_width))
  bbox_height_offset = int(min(bbox_height + 2 * image_y_offset, image_height))

  # Slices that run past the image border are truncated by numpy.
  cropped_image_offset = input_image[start_y_offset:start_y_offset+bbox_height_offset, start_x_offset:start_x_offset+bbox_width_offset]
  if(cropped_image_offset.size == 0):
    # A degenerate box (zero/negative size or outside the image) leaves
    # nothing to resize.
    print('skipping empty crop -', source_image_path)
    continue

  # Pad the crop to a square whose side is its longer edge.
  image_size = max(cropped_image_offset.shape[0], cropped_image_offset.shape[1])
  output_image = letterbox_image(cropped_image_offset, (image_size, image_size))

  # cv2.imwrite reports failure through its return value, not an exception.
  if(not cv2.imwrite(target_image_path, output_image)):
    print('failed to write -', target_image_path)

In [0]:
# Report how many CSV rows had a matching source image on disk.
print('number of images -', image_count)

### Compress and copy train dataset 0-1 split.

In [0]:
!tar -czf aligned_vggface2_train-0-1.tar.gz train

In [0]:
!cp aligned_vggface2_train-0-1.tar.gz '/content/drive/My Drive/.'

In [0]:
!rm -rf aligned_vggface2_train-0-1.tar.gz train

### Compress and copy train dataset 2-3 split.

In [0]:
!tar -czf aligned_vggface2_train-2-3.tar.gz train

In [0]:
!cp aligned_vggface2_train-2-3.tar.gz '/content/drive/My Drive/.'

In [0]:
!rm -rf aligned_vggface2_train-2-3.tar.gz train

### Compress and copy train dataset 4-5 split.

In [0]:
!tar -czf aligned_vggface2_train-4-5.tar.gz train

In [0]:
!cp aligned_vggface2_train-4-5.tar.gz '/content/drive/My Drive/.'

In [0]:
!rm -rf aligned_vggface2_train-4-5.tar.gz train

### Compress and copy train dataset 6-7 split.

In [0]:
!tar -czf aligned_vggface2_train-6-7.tar.gz train

In [0]:
!cp aligned_vggface2_train-6-7.tar.gz '/content/drive/My Drive/.'

In [0]:
!rm -rf aligned_vggface2_train-6-7.tar.gz train

### Compress and copy train dataset 8-9 split.

In [0]:
!tar -czf aligned_vggface2_train-8-9.tar.gz train

In [0]:
!cp aligned_vggface2_train-8-9.tar.gz '/content/drive/My Drive/.'

In [0]:
!rm -rf aligned_vggface2_train-8-9.tar.gz train

# Unmount Google Drive.

In [0]:
# List the Drive root to confirm the copied archives are there before unmounting.
!ls -al '/content/drive/My Drive/'

In [0]:
# Flush pending writes to Google Drive and unmount it.
drive.flush_and_unmount()