# Classifier für Wohnungsbilder

## Data Preparation

### Load Data

In [132]:
import PIL.Image
from PIL.ExifTags import TAGS

import os, sys
from shutil import copyfile

import numpy as np
import random 

# load images
# os.listdir returns entries in arbitrary order; sorting keeps the file order
# (and therefore the seeded train/valid/test split below) reproducible.
all_files = sorted(os.listdir("data/raw"))

In [134]:
# extract image title from EXIF
def extract_labels(image_files, field_name="XPSubject", input_dir="data/raw/"):
    """Read one EXIF field from every image and use it as that image's label.

    Parameters
    ----------
    image_files : iterable of str
        File names relative to ``input_dir``.
    field_name : str
        EXIF tag name as listed in ``PIL.ExifTags.TAGS``.
    input_dir : str
        Directory that contains the image files.

    Returns
    -------
    dict
        Maps each file name to the decoded field value, or to ``None`` when
        the image carries no EXIF data or the requested tag is absent.
    """

    # extract field from Exif
    def get_field(exif, field):
        # _getexif() yields None for images without an EXIF block
        if not exif:
            return None
        for tag_id, value in exif.items():
            if TAGS.get(tag_id) == field:
                if isinstance(value, bytes):
                    # NOTE(review): XP* tags are usually NUL-terminated UTF-16;
                    # strip the padding so labels compare equal to plain text.
                    return value.decode("utf-16").rstrip("\x00")
                return value
        return None

    labels = {}

    for name in image_files:
        # context manager closes the file even if EXIF parsing raises
        with PIL.Image.open(os.path.join(input_dir, name)) as img:
            labels[name] = get_field(img._getexif(), field_name)

    return labels

# map every raw image file name to the label read from its EXIF metadata
all_labels = extract_labels(all_files)

In [136]:
# consolidate labels (e.g. different languages, Special Characters)
def consolidate_labels(labels):
    """Normalise raw labels to a small set of German class names.

    Each label is first translated to its German equivalent (French and
    Italian variants) and then mapped onto a coarser group so the classes
    are roughly balanced. Labels unknown to either table are kept as-is.

    The ``labels`` dict is updated in place and the same object is returned.
    """

    # translation table: foreign-language label -> German label
    to_german = {
        "Plan d'ensemble": "Grundriss",
        "Pianta": "Grundriss",
        "Vue extérieure": "Aussenansicht",
        "Cuisine": "Küche",
        "Salle de bains": "Bad",
        "Vue intérieure": "Innenansicht",
        "Cucina": "Küche",
        "Bagno": "Bad",
        "Diverse": "Divers",
        "Varie": "Divers",
        "Bild  LIA": "Divers",
        "immagine  LIA": "Divers",
        "Balkon/Terrasse/Sitzplatz": "Balkon_Terrasse_Sitzplatz",
        "Balcone/terrazza/posto a sedere": "Balkon_Terrasse_Sitzplatz",
        "Veduta esterna": "Aussenansicht",
        "Stanza": "Zimmer",
        "Vue": "Aussicht",
        "Prospettiva": "Aussicht",
        "Soggiorno": "Wohnzimmer",
        "Séjour": "Wohnzimmer",
        "Pièce": "Zimmer",
        "Veduta interna": "Innenansicht",
        "corridoio": "Korridor",
        "couloir": "Korridor",
    }

    # grouping table: German label -> final class
    to_group = {
        "Aussenansicht": "Aussenansicht",
        "Küche": "Küche",
        "Bad": "Bad",
        "Zimmer": "Innenansicht",
        "Innenansicht": "Innenansicht",
        "Wohnzimmer": "Innenansicht",
        "Korridor": "Innenansicht",
        "Grundriss": "Grundriss",
        "Divers": "Divers",
        "Aussicht": "Divers",
        "Balkon_Terrasse_Sitzplatz": "Divers",
    }

    for image_name in labels:
        current = labels[image_name]
        german = to_german.get(current, current)
        labels[image_name] = to_group.get(german, german)

    return labels


# consolidate the extracted labels and report the resulting set of classes
all_labels_clean = consolidate_labels(all_labels)
classes = set(all_labels_clean.values())
print(classes)
print(len(classes), "Classes")

{'Bad', 'Innenansicht', 'Grundriss', 'Aussenansicht', 'Divers', 'Küche'}
6 Classes


In [163]:
# organize files in one folder per class
def organize_files(input_dir, output_dir, labels, files):
    """Move ``files`` from ``input_dir`` into one sub-folder per class.

    Parameters
    ----------
    input_dir : str
        Directory currently holding the files.
    output_dir : str
        Directory in which the class sub-folders are created (with or
        without a trailing separator; missing parents are created).
    labels : dict
        Maps file name to class name; class names are used as folder names.
    files : iterable of str
        File names to move. Each must be a key of ``labels``.

    A folder is created for every class occurring in ``labels``, even if
    none of ``files`` belongs to it.
    """
    for class_name in set(labels.values()):
        # exist_ok avoids the check-then-create race and makes reruns safe
        os.makedirs(os.path.join(output_dir, class_name), exist_ok=True)

    for f in files:
        src = os.path.join(input_dir, f)
        dst = os.path.join(output_dir, labels[f], f)
        os.rename(src, dst)
        

# Split training, validation and test set
def split_data(input_dir, output_dir, labels_clean, probs=(0.75, 0.125, 0.125), seed=1):
    """Randomly assign every labelled file to train/valid/test and copy it there.

    Parameters
    ----------
    input_dir : str
        Directory holding the source files.
    output_dir : str
        Directory that receives the ``train``, ``valid`` and ``test`` folders.
    labels_clean : dict
        Maps file name to class name; its keys are the files being split.
    probs : sequence of 3 floats
        Probability of a file landing in train, valid and test respectively.
    seed : int
        Seed passed to ``np.random.seed`` so the split is repeatable.

    Each file is copied into its split folder and then moved into a class
    sub-folder by ``organize_files``.
    """
    # The trailing "" keeps a trailing separator on every directory, so the
    # paths also work for code that appends file names by concatenation.
    split_dirs = [os.path.join(output_dir, name, "") for name in ("train", "valid", "test")]

    # setup folders
    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

    # split data: one draw from {0, 1, 2} per labelled file
    np.random.seed(seed)
    splits = np.random.choice([0, 1, 2], size=len(labels_clean), p=probs)
    labels_keys = np.array(list(labels_clean.keys()))

    for split_id, split_dir in enumerate(split_dirs):
        split_files = labels_keys[splits == split_id]

        # copy files to the correct train-, valid- or test-folder
        for f in split_files:
            copyfile(os.path.join(input_dir, f), os.path.join(split_dir, f))

        # organize files by class
        organize_files(split_dir, split_dir, labels_clean, split_files)
    
# copy the raw images into data/clean/{train,valid,test}/<class>/
split_data("data/raw/", "data/clean/", all_labels_clean)