# Classifier für Wohnungsbilder

## Data Preparation

### Load Data

In [132]:
import PIL.Image
from PIL.ExifTags import TAGS

import os, sys
from shutil import copyfile

import numpy as np
import random 

# load images
# os.listdir returns entries in arbitrary, filesystem-dependent order. The
# seeded train/valid/test split further down assigns files by position, so
# the listing is sorted to make that split reproducible across machines.
all_files = sorted(os.listdir("data/raw"))

In [134]:
# extract image title from EXIF
def extract_labels(image_files, field_name="XPSubject", input_dir="data/raw/"):
    """Read one EXIF text field from every image and use it as that image's label.

    Parameters
    ----------
    image_files : iterable of str
        File names, relative to ``input_dir``.
    field_name : str
        EXIF tag name as listed in ``PIL.ExifTags.TAGS``.
    input_dir : str
        Directory that contains the images (defaults to the previously
        hard-coded ``"data/raw/"``).

    Returns
    -------
    dict
        Maps each file name to the decoded field value, or to ``None`` when
        the image carries no EXIF data or the requested field is absent.
    """

    # extract field from Exif
    def get_field(exif, field):
        # _getexif() hands back None for images without an EXIF block;
        # calling .items() on that would raise AttributeError.
        if not exif:
            return None
        for (k, v) in exif.items():
            if TAGS.get(k) == field:
                # The value is stored as raw UTF-16 bytes.
                return v.decode("utf-16")
        return None

    labels = {}

    for name in image_files:
        # The context manager closes the file even if EXIF parsing raises.
        with PIL.Image.open(os.path.join(input_dir, name)) as img:
            labels[name] = get_field(img._getexif(), field_name)

    return labels

# Raw (not yet consolidated) label for every listed file, keyed by file name.
all_labels = extract_labels(image_files=all_files)

In [136]:
# consolidate labels (e.g. different languages, Special Characters)
def consolidate_labels(labels):
    """Map raw image labels onto the small set of classes used for training.

    Two recodings are applied in sequence: French/Italian labels and spelling
    variants are first translated to their German equivalent, then related
    German labels are merged into broader groups. Labels that appear in
    neither table (including ``None``) are passed through unchanged.

    Parameters
    ----------
    labels : dict
        Maps file name -> raw label.

    Returns
    -------
    dict
        A new dict mapping file name -> consolidated label. The input dict
        is left untouched, so raw and cleaned labels stay distinct objects.
    """

    # consolidate languages
    recodes_language = {
        "Plan d'ensemble": "Grundriss",
        "Pianta": "Grundriss",
        "Vue extérieure": "Aussenansicht",
        "Cuisine": "Küche",
        "Salle de bains": "Bad",
        "Vue intérieure": "Innenansicht",
        "Cucina": "Küche",
        "Bagno": "Bad",
        "Diverse": "Divers",
        "Varie": "Divers",
        "Bild  LIA": "Divers",
        "immagine  LIA": "Divers",
        "Balkon/Terrasse/Sitzplatz": "Balkon_Terrasse_Sitzplatz",
        "Balcone/terrazza/posto a sedere": "Balkon_Terrasse_Sitzplatz",
        "Veduta esterna": "Aussenansicht",
        "Stanza": "Zimmer",
        "Vue": "Aussicht",
        "Prospettiva": "Aussicht",
        "Soggiorno": "Wohnzimmer",
        "Séjour": "Wohnzimmer",
        "Pièce": "Zimmer",
        "Veduta interna": "Innenansicht",
        "corridoio": "Korridor",
        "couloir": "Korridor",
    }

    # group classes to get approx. even distribution
    recodes_groups = {
        "Aussenansicht": "Aussenansicht",
        "Küche": "Küche",
        "Bad": "Bad",
        "Zimmer": "Innenansicht",
        "Innenansicht": "Innenansicht",
        "Wohnzimmer": "Innenansicht",
        "Korridor": "Innenansicht",
        "Grundriss": "Grundriss",
        "Divers": "Divers",
        "Aussicht": "Divers",
        "Balkon_Terrasse_Sitzplatz": "Divers",
    }

    consolidated = {}
    for name, label in labels.items():
        # Translate first, then group: the grouping table is keyed by the
        # German label that the translation step produces.
        label = recodes_language.get(label, label)
        consolidated[name] = recodes_groups.get(label, label)

    return consolidated


all_labels_clean = consolidate_labels(all_labels)
classes = set(all_labels_clean.values())
# Print the set that was just built; `label_set` is never defined in this
# file and would raise NameError.
print(classes)
print(len(classes), "Classes")

{'Bad', 'Innenansicht', 'Grundriss', 'Aussenansicht', 'Divers', 'Küche'}
6 Classes


In [163]:
# organize files in one folder per class
def organize_files(input_dir, output_dir, labels, files):
    """Move files into one sub-folder of ``output_dir`` per class label.

    Parameters
    ----------
    input_dir : str
        Directory the files currently live in.
    output_dir : str
        Directory under which the class folders are created. It is created
        too if it does not exist yet.
    labels : dict
        Maps file name -> class label (str). Must contain every entry of
        ``files``.
    files : iterable of str
        File names (relative to ``input_dir``) to move.

    Raises
    ------
    KeyError
        If a file has no entry in ``labels``.
    OSError
        If a file cannot be renamed to its destination.
    """

    # A folder is created for every label known to `labels`, even when none
    # of `files` carries it, so each output directory exposes the same set
    # of class folders.
    for c in set(labels.values()):
        # makedirs also creates missing parents and tolerates existing dirs.
        os.makedirs(os.path.join(output_dir, c), exist_ok=True)

    for f in files:
        src = os.path.join(input_dir, f)
        dst = os.path.join(output_dir, labels[f], f)
        os.rename(src, dst)
        

# Split training, validation and testset
def split_data(input_dir, output_dir, labels_clean, probs=(0.75, 0.125, 0.125)):
    """Randomly distribute the labelled images over train/valid/test folders.

    Every file is copied from ``input_dir`` into ``output_dir/train``,
    ``output_dir/valid`` or ``output_dir/test`` and then moved into a
    sub-folder named after its class by ``organize_files``.

    Parameters
    ----------
    input_dir : str
        Directory holding the source images.
    output_dir : str
        Directory under which ``train/``, ``valid/`` and ``test/`` are
        created (created itself if missing).
    labels_clean : dict
        Maps file name -> class label; its keys define which files are split.
    probs : sequence of 3 floats
        Probability of a file landing in train, valid and test respectively.
    """

    # setup folders; the trailing "" keeps a path separator at the end of
    # each directory string.
    split_dirs = [os.path.join(output_dir, name, "")
                  for name in ("train", "valid", "test")]
    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

    # split data
    # NOTE: this reseeds NumPy's global RNG so the split is repeatable for a
    # given key order of `labels_clean`.
    np.random.seed(1)
    # One draw per labelled file (the original referenced an undefined name
    # `labels` here).
    splits = np.random.choice([0, 1, 2], size=len(labels_clean), p=probs)
    labels_keys = np.array(list(labels_clean.keys()))

    for split_id, split_dir in enumerate(split_dirs):
        split_files = labels_keys[splits == split_id]

        # copy files to correct train-, valid-, test-folder
        for f in split_files:
            copyfile(os.path.join(input_dir, f), os.path.join(split_dir, f))

        # organize files by class
        organize_files(split_dir, split_dir, labels_clean, split_files)
    
# Pass the consolidated labels; the name `labels_clean` is never defined at
# module level (the variable is `all_labels_clean`).
split_data("data/raw/", "data/clean/", all_labels_clean)

In [13]:
def images_to_array(data_dir, img_width, img_height):
    """Load every image below ``data_dir`` into one array plus a label vector.

    ``data_dir`` is expected to contain one sub-folder per class; the folder
    name is used as the label of every image found inside it.

    Parameters
    ----------
    data_dir : str
        Directory containing the class folders.
    img_width, img_height : int
        Target size passed to ``skimage.transform.resize``.

    Returns
    -------
    X : numpy.ndarray
        Array of shape ``(n_images, img_width, img_height, 3)``.
    Y : numpy.ndarray
        Array of ``n_images`` class names; ``Y[i]`` is the label of ``X[i]``.
    """

    import os
    import numpy as np
    from imageio import imread
    from skimage.transform import resize

    # Collect path and label together. Building X from os.walk and Y from a
    # separate os.listdir only lines up if both happen to enumerate the class
    # folders in the same order; pairing them here makes it hold by
    # construction, and sorting makes the row order deterministic.
    samples = []
    for c in sorted(os.listdir(data_dir)):
        cdir = os.path.join(data_dir, c)
        if not os.path.isdir(cdir):
            # Stray files next to the class folders are not images of a class.
            continue
        for root, directories, filenames in os.walk(cdir):
            directories.sort()  # in-place sort fixes the traversal order
            for filename in sorted(filenames):
                samples.append((os.path.join(root, filename), c))

    X = np.zeros((len(samples), img_width, img_height, 3))
    for ind, (path, _label) in enumerate(samples):
        # NOTE(review): resize receives [img_width, img_height] as its output
        # shape, so the first image axis gets img_width. This is harmless for
        # square targets; confirm it is intended for non-square ones.
        X[ind, :, :, :] = resize(imread(path), [img_width, img_height])

    Y = [label for _path, label in samples]

    return X, np.array(Y)
    


In [14]:
# Load each split as an image array plus label vector, resized to 224 x 224.
X_train, Y_train = images_to_array(data_dir="data/clean/train/", img_width=224, img_height=224)
X_valid, Y_valid = images_to_array(data_dir="data/clean/valid/", img_width=224, img_height=224)
X_test, Y_test = images_to_array(data_dir="data/clean/test/", img_width=224, img_height=224)

  warn("The default mode, 'constant', will be changed to 'reflect' in "


(1465, 224, 224, 3)
(1465,)
(264, 224, 224, 3)


In [29]:
def convertToOneHot(Y, classes=None):
    """One-hot encode a 1-D sequence of class labels.

    Columns are ordered by sorted class label, which matches what the
    previous ``LabelEncoder`` + ``OneHotEncoder`` pipeline produced. The
    encoding is done with NumPy only, so it does not depend on the
    ``sparse=`` keyword that newer scikit-learn releases no longer accept.

    Parameters
    ----------
    Y : array-like of labels
        Labels to encode.
    classes : array-like of labels, optional
        Full set of classes to encode against. When given, the result always
        has one column per entry of ``classes`` (sorted, duplicates removed),
        even if some class does not occur in ``Y``. When omitted, the classes
        are the distinct values found in ``Y``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(Y), n_classes)`` with exactly one ``1.0``
        per row.

    Raises
    ------
    ValueError
        If ``classes`` is given and ``Y`` contains a label not listed in it.
    """
    import numpy as np

    Y = np.asarray(Y).ravel()

    if classes is None:
        # np.unique returns the sorted distinct labels plus, for every
        # element of Y, the index of its label in that sorted list.
        class_values, codes = np.unique(Y, return_inverse=True)
        n_classes = len(class_values)
    else:
        class_list = np.unique(np.asarray(classes)).tolist()
        index = {label: i for i, label in enumerate(class_list)}
        try:
            codes = np.array([index[label] for label in Y.tolist()],
                             dtype=np.intp)
        except KeyError as err:
            raise ValueError(
                "label {!r} is not listed in classes".format(err.args[0]))
        n_classes = len(class_list)

    # Row i of the identity matrix is the one-hot vector for class index i.
    return np.eye(n_classes)[codes]

In [31]:
# Encode the three splits in one pass. Encoding each split separately derives
# the columns from that split's own labels, so a class missing from one split
# would shift or drop columns there and the splits would no longer agree.
_n_train, _n_valid = len(Y_train), len(Y_valid)
_Y_onehot = convertToOneHot(np.concatenate([Y_train, Y_valid, Y_test]))
Y_train = _Y_onehot[:_n_train]
Y_valid = _Y_onehot[_n_train:_n_train + _n_valid]
Y_test = _Y_onehot[_n_train + _n_valid:]

In [32]:
# Sanity check: print the shape of each array, one per line, in the order
# X/Y for train, valid and test.
for _arr in (X_train, Y_train, X_valid, Y_valid, X_test, Y_test):
    print(_arr.shape)

(1465, 224, 224, 3)
(1465, 6)
(264, 224, 224, 3)
(264, 6)
(271, 224, 224, 3)
(271, 6)


In [38]:
def save_to_hdf5(datasets, output_path):
    """Write several arrays into one HDF5 file.

    Parameters
    ----------
    datasets : dict
        Maps dataset name -> array-like data; each entry becomes one dataset
        in the file under that name.
    output_path : str
        Path of the HDF5 file. It is opened in ``'w'`` mode.
    """

    import h5py

    # The context manager closes the file even if create_dataset raises,
    # which the explicit close() at the end of the function did not.
    with h5py.File(output_path, 'w') as h5f:
        for name, data in datasets.items():
            h5f.create_dataset(name, data=data)


In [39]:
# Bundle all splits; each key is used as the dataset name inside the HDF5 file.
datasets = {"X_train":X_train,
            "Y_train":Y_train,
            "X_valid":X_valid,
            "Y_valid":Y_valid,
            "X_test":X_test,
            "Y_test":Y_test }

# Persist everything in a single file.
save_to_hdf5(datasets, "data/wohnungsbilder2000_tvt.hdf5")