<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part One: Data Preparation </h2>	


<h3>  Import Statements </h3>	

In [1]:
import os
import re
import time
import zipfile
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd
import skimage
from skimage import io

### Zip Helper Method

In [2]:
def remove_directory_file_spaces_zip(dir_name='20_Validation'): 
    """
    Removes spaces in file names of specified directory in zip file.
    The zip file name should be specified from the root directory. 
    Extracts the zip file, strips spaces from and lowercases the names of the
    top-level entries, then rewrites the zip file with every file stored
    under its bare file name (sub-directory structure is flattened).
    
    Keyword arguments: 
    dir_name (string) -- name of zip file in root folder (without ".zip")
    
    Returns: 
        None 
    """
    current_directory = os.getcwd()
    directory = f"{current_directory}/{dir_name}"

    # Use a context manager so the read handle is released before the same
    # archive is reopened for writing further down.
    with zipfile.ZipFile(f"{current_directory}/{dir_name}.zip") as source_zip:
        source_zip.extractall(f"{directory}/")

    # Normalise only the entry name. Normalising the full joined path would
    # also rewrite the parent directories (e.g. the capital "V" in the default
    # dir_name), which points the rename at a different directory.
    for entry in os.listdir(directory):
        cleaned_entry = entry.replace(' ', '').lower()
        if cleaned_entry != entry:
            os.rename(os.path.join(directory, entry),
                      os.path.join(directory, cleaned_entry))

    with ZipFile(f"{dir_name}.zip", "w") as target_zip:
        for dirname, _subdirs, files in os.walk(directory): 
            for filename in files:
                # The archive name is the bare file name, so nested files end
                # up at the root of the rewritten zip.
                target_zip.write(os.path.join(dirname, filename), filename)
    
# Run the helper with its default dir_name ('20_Validation'); this rewrites
# 20_Validation.zip in the current working directory.
remove_directory_file_spaces_zip() 

### RGB Helper Method

In [3]:
def convert_gray(images):
    """
    Creates RGB representation of gray-level images. 
    
    Keyword arguments: 
    images (pd.Series) -- Series whose elements are image pixel arrays 
    
    Returns: 
        pd.Series of image arrays. Arrays with fewer than three dimensions
        are expanded to three identical channels on a new last axis; arrays
        that already have three or more dimensions are passed through as-is.
        
    """
    def to_rgb(image):
        if len(image.shape) < 3:
            # Replaces skimage.color.grey2rgb, a deprecated alias that is no
            # longer available in current scikit-image releases. Repeating
            # the single channel three times yields the same (H, W, 3) array.
            return np.stack((image,) * 3, axis=-1)
        return image

    return images.apply(to_rgb)

### Read Image Data Method 

<h4> Using the code for reference, take the given folder and create a dataframe with the picture object, and the encoding as listed below. </h4>	
0=Airplanes, 1=Bear, 2=Blimp, 3=Comet, 4=Crab, 5=Dog, 6=Dolphin, 7=Giraffe, 8=Goat, 9=Gorilla, 10=Kangaroo, 11=Killer-Whale, 12=Leopards, 13=Llama, 14=Penguin, 15=Porcupine, 16=Teddy-Bear, 17=Triceratops, 18=Unicorn, 19=Zebra

In [48]:
# Scratch cell: builds a Path object for the test cache file name. The value
# is only displayed as the cell output; it is not assigned to anything.
Path("/cleaned_test.pkl")

WindowsPath('cleaned_test.pkl')

In [35]:
def read_organize_data(folder_name='20_categories_training', isTest=False, isCache=True):    
    """
    Returns a dataframe with picture objects and category encodings of all images in folder. 
    
    Keyword arguments:
    folder_name (string) -- name of image folder in root directory; ignored
        when isTest is True, in which case '20_Validation' is used
    isTest (bool) -- Flag. True loads images from test set, False loads train set.
    isCache (bool) -- Flag. True loads images from cache, False overwrites cached images.
        
    Return: 
        pd.DataFrame with image objects and category encodings 
    """
    # Read in cached file, if exists
    cache_path = 'cleaned_test.pkl' if isTest else 'cleaned_train.pkl'
    if isCache and Path(cache_path).is_file(): 
        print(f"Loading cache {'test' if isTest else 'train'} file")
        # NOTE(review): unpickling can execute arbitrary code; only load cache
        # files that this notebook produced itself.
        return pd.read_pickle(cache_path)    
    
    folder_name = '20_Validation' if isTest else folder_name
    img_dir_path = Path(f"{os.getcwd()}/{folder_name}.zip") 
    
    # Extract next to the archive; the context manager closes the zip handle
    # as soon as extraction is done.
    with zipfile.ZipFile(img_dir_path) as category_zip:
        category_zip.extractall(f"{os.getcwd()}/{folder_name}/")
    
    image_names, category_names, images, encoding = get_image_data(folder_name, img_dir_path, isTest)
    df = create_cleaned_df(image_names, images, category_names, encoding)          
    
    # Cache dataframe. Reaching this point means the cache was either missing
    # or explicitly bypassed, so it is always (re)written.
    df.to_pickle(f"./{cache_path}")
    
    return df

In [36]:
def get_image_data(folder_name, img_dir_path, isTest):
    """
    Returns lists of image names, category names, and image pixels of all images in folder. 
    
    Entries of the zip file are skipped when they are not ".jpg" paths, when
    they are "._" resource-fork files, or when their name contains a space.
    For the train set, entries without a parent directory are skipped as well,
    so the returned lists always stay the same length.
    
    Keyword arguments:
    folder_name (string) -- path of the extracted image folder in root directory 
    img_dir_path (Path) -- path of the image zip file in root directory 
    isTest (bool) -- Flag. True loads images from test set, False loads train set.
        
    Return: 
        tuple (image_names, category_names, images, encoding). 
        category_names is empty and encoding is None when isTest is True.
    """
    image_names, category_names, images = [], [], []
    
    with ZipFile(img_dir_path, 'r') as images_zip: 
        for filename in images_zip.namelist(): 
            img_match = re.search(r"^.*\.jpg", filename)
            if img_match is None:
                # Directory entries and non-jpg files carry no image.
                continue
            img_name = img_match.group()

            # Skip macOS resource-fork companions ("._name") and names with
            # spaces before anything is appended, so names and images cannot
            # get out of step with each other.
            if "/._" in img_name or img_name.startswith("._") or ' ' in img_name: 
                continue 

            category_name = None
            if not isTest:
                category_match = re.search("(.*)/", filename)
                if category_match is None:
                    # A train image outside any category folder has no label.
                    continue
                category_name = category_match.group().replace("/", "")

            # Images are read from the extracted folder, not from the zip.
            image = io.imread(Path(f"{folder_name}/{img_name}"))

            image_names.append(img_name)
            images.append(image)
            if not isTest:
                category_names.append(category_name)
                
    # The test set has no labels. Return None instead of relying on the
    # interactive "_" variable, which is undefined outside IPython.
    encoding = None if isTest else encode_categories(category_names)
    return image_names, category_names, images, encoding

In [37]:
def encode_categories(category_names):
    """
    Maps every category name to its integer code.
    
    Keyword arguments:
    category_names (list) -- category names of all images 
        
    Return: 
        list of integer codes, one per entry of category_names, in the same
        order; a name that is not one of the twenty known categories raises
        KeyError
    """
    # Position in this list is the code: 0=airplanes ... 19=zebra.
    ordered_labels = [
        'airplanes', 'bear', 'blimp', 'comet', 'crab',
        'dog', 'dolphin', 'giraffe', 'goat', 'gorilla',
        'kangaroo', 'killer-whale', 'leopards', 'llama',
        'penguin', 'porcupine', 'teddy-bear', 'triceratops',
        'unicorn', 'zebra',
    ]
    code_by_label = {label: code for code, label in enumerate(ordered_labels)}

    codes = []
    for cat_name in category_names:
        codes.append(code_by_label[cat_name])
    return codes

In [38]:
def create_cleaned_df(image_names, images, category_names=None, encoding=None):
    """
    Builds the dataframe of images, with label columns when labels are given. 
    
    Keyword arguments:
    image_names (list) -- name of all images
    images (list) -- image objects of all images
    category_names (list) -- category names of all images 
    encoding (list) -- category encodings of all images
    
    Return: 
        pd.DataFrame with columns 'name' and 'image' (images passed through
        convert_gray), plus 'category' and 'encoding' when both
        category_names and encoding are non-empty
    """
    frame = pd.DataFrame() 
    frame['name'] = image_names
    frame['image'] = images
    # Gray-level images are expanded so every row holds an RGB array.
    frame['image'] = convert_gray(frame['image'])
    
    # Unlabelled data (e.g. the test set) stops here with two columns.
    if not (category_names and encoding):
        return frame

    frame['category'] = category_names
    frame['encoding'] = encoding
    return frame

### Measure Reading Time 

In [39]:
# Time how long it takes to obtain both datasets. With isCache=True each frame
# is read from its pickle cache when that file already exists.
# perf_counter is a monotonic clock, so the interval is not affected by
# system clock adjustments.
t1 = time.perf_counter()
training_data = read_organize_data(isTest=False, isCache=True)
validation_data = read_organize_data(isTest=True, isCache=True)
t2 = time.perf_counter()
print(f"Seconds: {round(t2 - t1, 2)}")
print(f"Training images loaded: {len(training_data)}")
print(f"Test images loaded: {len(validation_data)}")

Seconds: 13.74
Training images loaded: 1501
Test images loaded: 716


In [23]:
# Load the training frame; it is read from cleaned_train.pkl when that cache
# file exists, otherwise it is rebuilt from the training zip.
starting_data = read_organize_data(isTest=False, isCache=True)

Loading cache train file


In [24]:
# Select the rows whose image array still has fewer than 3 dimensions, i.e.
# images without an RGB representation after cleanup. An empty result means
# every image was converted.
starting_data[starting_data['image'].apply(lambda x: len(x.shape) < 3)]

Unnamed: 0,name,image,category,encoding


In [25]:
# Preview the first rows of the training frame.
starting_data.head()

Unnamed: 0,name,image,category,encoding
0,airplanes/airplanes_0001.jpg,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",airplanes,0
1,airplanes/airplanes_0002.jpg,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",airplanes,0
2,airplanes/airplanes_0003.jpg,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",airplanes,0
3,airplanes/airplanes_0004.jpg,"[[[174, 190, 187], [173, 189, 186], [172, 188,...",airplanes,0
4,airplanes/airplanes_0005.jpg,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",airplanes,0


In [None]:
# Testing...For nate dev only
# Compares two cached training frames row by row.
# NOTE(review): nothing in this notebook writes 'cleaned_train2.pkl'; it has
# to be produced separately before this cell can run.

df1 = pd.read_pickle('cleaned_train.pkl')
df2 = pd.read_pickle('cleaned_train2.pkl')

# Number of rows whose encodings agree. len() of the comparison would only
# report the row count, whether or not any value matches.
print((df1['encoding'] == df2['encoding']).sum())

# Number of rows whose images are identical. Comparing `.all()` of each image
# only compares two booleans, not the pixel data.
matching_images = sum(
    np.array_equal(first, second)
    for first, second in zip(df1['image'], df2['image'])
)
print(matching_images)