# Data Sampling for Test/Train Set

 - Dataset
   - Name: oxford_iiit_pet
   - Url: https://www.tensorflow.org/datasets/catalog/oxford_iiit_pet
   - Description: The Oxford-IIIT pet dataset is a 37 category pet image dataset with roughly 200 images for each class. The images have large variations in scale, pose and lighting. All images have an associated ground truth annotation of breed.

## Extract Files and Store Data Info

In [1]:
import tarfile
import pandas as pd
import numpy as np

# open tar file for reading
tar = tarfile.open('data/images.tar.gz', 'r')
tar_names = tar.getnames()

# extract all images from tar file into 'data' directory
tar.extractall('data')

filenames = []
labels = []

for i, name in enumerate(tar_names):
    if '.jpg' in name:
        filename = name.split('/')[-1]
        labels.append(filename.split('_')[0])
        filenames.append(filename)

# create data frame to hold data information
data = pd.DataFrame({'label': labels, 'filename': filenames})

# print first five rows of data
print(data.head())

# close tar file
tar.close()

       label              filename
0      boxer          boxer_16.jpg
1  chihuahua     chihuahua_165.jpg
2        pug           pug_183.jpg
3    english  english_setter_1.jpg
4  chihuahua     chihuahua_170.jpg


## Select Images for Test and Train Sets

In [2]:
from sklearn.model_selection import train_test_split

y = data['label']
X = data.drop('label',axis=1)

# split data into 15% test and 85% train
x_train, x_test, y_train, y_test=train_test_split(X, y, train_size=0.85, test_size=0.15, 
                                                  stratify=y, random_state=123)

In [3]:
import os
import shutil

def movefiles(filelist, olddir, newdir):
    '''
    Move files to newdir.
    
    Parameters:
    - filelist: a list of file names
    - olddir: current directory where the files reside
    - newdir: directory where the files should be moved to
    '''
    
    # move files to the specified directory in newdir
    for filename in filelist:
        shutil.move(os.path.join(olddir, filename), os.path.join(newdir, filename))

In [4]:
image_dir = 'data/images'
test_dir = 'data/test'
train_dir = 'data/train'

# create directory test/train if not exist
for folder in [test_dir, train_dir]:
    if not os.path.exists(folder):
        os.makedirs(folder)

# move images to 'test' folder
movefiles(x_test['filename'].to_list(), image_dir, test_dir)

# move images to 'train' folder
movefiles(x_train['filename'].to_list(), image_dir, train_dir)

# remove 'image' directory if empty
if len(os.listdir(image_dir)) == 0:
    os.remove(image_dir)

## Save Data to Pickle Files

In [8]:
# test dataset
test_set = pd.concat([x_test, y_test], axis=1)
test_set.reset_index(inplace=True)
test_set.drop(['index'], axis=1, inplace=True)

# train dataset
train_set = pd.concat([x_train, y_train], axis=1)
train_set.reset_index(inplace=True)
train_set.drop(['index'], axis=1, inplace=True)

# save data
test_set.to_pickle('data/test_data.pkl')
train_set.to_pickle('data/train_data.pkl')

print('train size:', train_set.shape)
print('test size:', test_set.shape)

train size: (6281, 2)
test size: (1109, 2)
