In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from skimage import io
import multiprocessing
from joblib import Parallel, delayed
import wget
import os
import shutil
from google.cloud import storage
import sys
import math 
import time


import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import seaborn as sns
sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context('notebook', font_scale=1.5,
                rc={"lines.linewidth": 2.5})

from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.preprocessing.image import ImageDataGenerator
from keras import Model
from sklearn.utils import shuffle
import umap


from hdbscan import HDBSCAN

from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

# Storage layout: `path` is the gs:// prefix of this project inside the bucket;
# MOUNTED_PATH is the local directory other cells combine with PROJECT_NAME and
# IMAGE_PATH to reach the same files on disk.
BUCKET_NAME = 'product-classification'
PROJECT_NAME = 'project_001_freisteller'
path = 'gs://' + BUCKET_NAME + '/' + PROJECT_NAME + '/'
IMAGE_PATH = 'images'
MOUNTED_PATH = '/home/jupyter/product-classification/'

# FIXME(review): CORE_FILE has no value, so this cell is a SyntaxError as written.
# It is appended to `path` and read with pd.read_excel further down; fill in the
# file name of that Excel export before running.
CORE_FILE = 
# File name of the labelled image list; appended to `path` and read with pd.read_csv.
CLASSFIED_FILE = 'train_classifier.csv'

# Model Parameters:
# BATCH_SIZE is passed to the image generator and used to derive the number of
# prediction steps.
BATCH_SIZE = 512
# Job name made unique by appending the current Unix timestamp (whole seconds).
JOB_NAME = BUCKET_NAME + "_" + PROJECT_NAME + "_" + "{}".format(int(time.time()))

# Integer class id -> class name. Used to translate the numeric 'Label' column and
# to size/check the one-hot targets.
label_names = {0: 'Zweifarbig',
               1: 'Einfarbig',
               2: 'Freisteller',
               3: 'Ambiente',
               4: 'Abmaßungen',
               5: 'Sonstige'}



# NOTE(review): `os` is already imported in the notebook's import block above.
import os
# FIXME(review): GC_PROJECT has no value, so this cell is a SyntaxError as written;
# assign the project id as a string before running.
GC_PROJECT =  # REPLACE WITH YOUR PROJECT ID
REGION = 'us-central1' # REPLACE WITH YOUR BUCKET REGION e.g. us-central1

# do not change these
# Export the settings as environment variables so that shell cells can read them.
os.environ['GC_PROJECT'] = GC_PROJECT
os.environ['BUCKET'] = BUCKET_NAME
os.environ['REGION'] = REGION
os.environ['TFVERSION'] = '1.9'

In [None]:
%%bash
# Point gcloud at the project and region exported by the configuration cell.
# That cell exports the project id as GC_PROJECT; no PROJECT variable is set in
# this notebook, so $PROJECT would expand to an empty string here.
gcloud config set project "$GC_PROJECT"
gcloud config set compute/region "$REGION"

In [None]:
def clean_core_data(df):
    """Filter the core product table and reshape it to one row per image.

    Steps:
      1. Drop a fixed set of image columns that are not used.
      2. Keep only rows whose 'Klassifikation' and 'Warengruppe' are in the
         allowed lists and whose 'Produktname' and 'Suchfarbe' are present.
      3. Melt every remaining non-id column into ('Imagetype', 'Image') pairs.
      4. Drop pairs without an image value.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing the id columns listed below, the dropped columns
        and any number of further image columns.

    Returns
    -------
    pandas.DataFrame
        Long table with the id columns plus 'Imagetype' and 'Image'.
    """
    unused_columns = ['Detailbild14',
                      'Detailbild50',
                      'Detailbild62',
                      'Freisteller (Test)',
                      'Herstellerbild']
    allowed_classifications = ['WOHNLANDSCHAFT', 'SOFA', 'SESSEL', 'HOCKER']
    allowed_product_groups = ['Hocker',
                              'Polsterecken',
                              'Schlafsofas',
                              'Sessel',
                              'Sofas',
                              'Wohnlandschaften']
    id_columns = ['Artikelnummer', 'Produktname', 'Produkttyp',
                  'Warengruppe', 'Klassifikation', 'Suchfarbe']

    trimmed = df.drop(columns=unused_columns)

    # A single combined mask selects the same rows, in the same order, as
    # applying the four filters one after another.
    keep = (
        trimmed['Klassifikation'].isin(allowed_classifications)
        & trimmed['Warengruppe'].isin(allowed_product_groups)
        & trimmed['Produktname'].notnull()
        & trimmed['Suchfarbe'].notnull()
    )

    long_form = trimmed[keep].melt(id_vars=id_columns,
                                   var_name='Imagetype',
                                   value_name="Image")
    return long_form[long_form['Image'].notnull()]


In [None]:
def remnove_Not_Available(df, not_av):
    """Drop rows whose 'Image' value contains any of the given substrings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Image'.
    not_av : iterable of str
        Literal substrings (e.g. placeholder file names) marking images that
        should be removed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose 'Image' contains none of the substrings. Rows
        with a missing 'Image' are kept.
    """
    import re

    # Escape each entry so it is matched literally; otherwise the '.' in a file
    # name such as '6303722.jpg' acts as a regex wildcard.
    literal_patterns = [re.escape(str(name)) for name in not_av]

    # With nothing to remove, joining would give the empty pattern, which
    # matches every string and would drop all rows.
    if not literal_patterns:
        return df.copy()

    # na=False: a missing image cannot match, and the mask stays boolean so
    # that it can be inverted.
    unavailable = df['Image'].str.contains('|'.join(literal_patterns), na=False)
    return df[~unavailable]

In [None]:
def plt_images(df, n):
    """Display the first ``n`` images referenced in ``df['Image']``.

    Each value is read with ``io.imread`` and shown as its own figure. Nothing
    is shown when ``n`` is zero or negative.
    """
    for shown, location in enumerate(df['Image'].values):
        if not (shown < n):
            break
        plt.imshow(io.imread(location))
        plt.show()

In [None]:
def download_pic(pic):
    """Download one image URL into the mounted image directory.

    A failed download is reported on stdout and otherwise ignored, so one bad
    URL does not abort a batch of downloads.

    Parameters
    ----------
    pic : str
        URL of the image to fetch.
    """
    target_dir = MOUNTED_PATH + PROJECT_NAME + '/' + IMAGE_PATH
    try:
        wget.download(pic, target_dir)
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit can still stop a long download run.
    except Exception as err:
        # Include the reason; the URL alone does not explain what went wrong.
        print("ERROR: " + str(pic) + " (" + repr(err) + ")")
        

In [None]:
# Load the two inputs from the project's bucket prefix: the core product table
# (Excel) and the labelled image list (CSV).
core_df = pd.read_excel("{}{}".format(path, CORE_FILE))
classified_df = pd.read_csv("{}{}".format(path, CLASSFIED_FILE))

In [None]:
# Core table: clean it, drop the listed placeholder images, derive the image
# file name from the URL (the text between "otto/" and "?$ads_") and keep one
# row per file name. Note that groupby(...).first() takes the first non-null
# value of each column within a group.
core_df = (
    remnove_Not_Available(
        clean_core_data(core_df),
        ['6303722.jpg', '6303723.jpg', '6303721.jpg', '6303709.jpg'],
    )
    .assign(ImageName=lambda frame: frame['Image'].apply(
        lambda st: st[st.find("otto/")+5:st.find("?$ads_")]))
    .set_index('ImageName')
    .groupby(level=0)
    .first()
)

# Labelled images: labels as strings, indexed by image file name.
classified_df = (
    classified_df
    .assign(Label=lambda frame: frame['Label'].astype(str))
    .set_index('Image')
)

# Left join keeps every labelled image, with core attributes where available.
df = classified_df.join(core_df, how='left').rename_axis('ImageName')

print("Size of core %d" % core_df.shape[0])
print("Size of classified %d" % classified_df.shape[0])
print("Size of df %d" % df.shape[0])
print("")
plt_images(df, 0)

In [None]:
try:
    pictures_in_GCS = len([name for name in os.listdir((MOUNTED_PATH+PROJECT_NAME+'/'+IMAGE_PATH+'/'))])
except FileNotFoundError:
    !/usr/bin/gcsfuse product-classification /home/jupyter/product-classification

pictures_in_GCS = len([name for name in os.listdir((MOUNTED_PATH+PROJECT_NAME+'/'+IMAGE_PATH+'/'))])

if pictures_in_GCS < len(df):
    num_cores = multiprocessing.cpu_count()
    print("Start download with:")
    print('%d CPU\'s available' % num_cores)

    results = Parallel(n_jobs=num_cores)(delayed(download_pic)(i) for i in df.groupby('Image').count().index.values)
else:
    print("pictures up to date!")

In [None]:
def reshape(df, value_dict, size):
    """Return a class-capped subset of ``df`` with at most ``size`` rows overall.

    Each 'Label' group contributes its first ``floor(size / len(value_dict))``
    rows; groups appear in the order produced by ``groupby('Label')``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by 'ImageName' (or carrying it as a column after
        ``reset_index``) with a 'Label' column.
    value_dict : dict
        Mapping of the known classes; only its length is used.
    size : int
        Target total number of rows.

    Returns
    -------
    pandas.DataFrame
        The selected rows, indexed by 'ImageName'.
    """
    flat = df.reset_index()
    rows_per_class = math.floor(size / len(value_dict))

    # Collect the per-class slices and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame inside a loop is quadratic.
    slices = [group[:rows_per_class] for _, group in flat.groupby('Label')]
    if not slices:
        # pd.concat rejects an empty list; return an empty frame of the same shape.
        return flat.iloc[0:0].set_index('ImageName')
    return pd.concat(slices, ignore_index=True).set_index('ImageName')

# Small class-capped sample (about 50 rows in total) used by the feature
# extraction cells.
short_df = reshape(df, value_dict=label_names, size=50)
#short_df.head()

-------------------------------------

In [None]:
# Replace the numeric label ids with their class names. Not re-runnable as is:
# on a second run the column already holds names and pd.to_numeric raises.
df['Label'] = df['Label'].pipe(pd.to_numeric).map(label_names)

In [None]:
# Full gs:// location of every image: bucket prefix + image folder + file name
# (the frame's index).
image_prefix = path + IMAGE_PATH + '/'
df['path'] = image_prefix + df.index

In [None]:
# The (path, label) pairs to be split. `train_set` was referenced below without
# being defined in any visible cell, so the cell failed on a fresh kernel.
train_set = df[['path', 'Label']]

# Random split: each row goes to the training set with probability 0.9.
msk = np.random.rand(len(train_set)) < 0.9

train = train_set[msk]
test = train_set[~msk]

# Write both sets without header or index, plus the class names one per line.
train.to_csv(path +'train_set.csv', header=False, index=False)
test.to_csv(path +'eval_set.csv', header=False, index=False)
pd.DataFrame(data=list(label_names.values())).to_csv(path +'labels.txt', sep=',',index=False, header=False)

----------------------------------

In [None]:
# InceptionResNetV2 with ImageNet weights and without its classification head;
# with pooling='avg' the model outputs one pooled feature vector per image.
incesnet = InceptionResNetV2(
    include_top=False,
    weights='imagenet',
    input_shape=(299, 299, 3),
    pooling='avg',
)

# Scale pixel values by 1/255 when images are loaded.
generator = ImageDataGenerator(rescale=1.0 / 255)

In [None]:
# Stream the sampled images from the mounted bucket in file order
# (shuffle=False), resized to the 299x299 input the model was built with.
sample_frame = short_df.reset_index()
sample_image_dir = MOUNTED_PATH + PROJECT_NAME + '/' + IMAGE_PATH
gen = generator.flow_from_dataframe(
    dataframe=sample_frame,
    directory=sample_image_dir,
    x_col='ImageName',
    #y_col='Label',
    #has_ext=True,
    target_size=(299, 299),
    batch_size=BATCH_SIZE,
    class_mode='input',
    shuffle=False,
)

In [None]:
# `steps` is a number of batches, so round the batch count up to a whole number
# instead of passing a float; the final, possibly partial, batch is still included.
encoded_features = incesnet.predict_generator(gen, steps=math.ceil(gen.n / BATCH_SIZE), verbose=1, use_multiprocessing=True)

In [None]:
# Wrap the predicted feature array in a DataFrame (one row per image), then show
# the row count and a preview.
encoded_features = pd.DataFrame(data=encoded_features)
print(len(encoded_features.index))
encoded_features.head()

In [None]:
# NOTE(review): this import sits outside the notebook's top import cell, and
# preprocess_input is not used in any visible cell.
from keras.applications.resnet50 import preprocess_input, decode_predictions

# NOTE(review): encoded_features comes from `incesnet`, which was built with
# include_top=False and pooling='avg', and has since been wrapped in a
# DataFrame. decode_predictions is documented to take ImageNet class-score
# arrays, so this call probably raises — confirm whether the cell is still needed.
decode_predictions( encoded_features,top = 5)

In [None]:
# One-hot encode the sample's labels.
target = to_categorical(short_df['Label'])

# Attach the image names to the feature rows by position (row i of the sample
# matches row i of the encoded features), then drop the helper 'Label' column.
labels_by_position = short_df['Label'].reset_index()
features = (
    pd.merge(labels_by_position, encoded_features, left_index=True, right_index=True)
    .set_index('ImageName')
    .drop('Label', axis=1)
)

# Hold out 10% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.10)

print("Test-Size: %d" % y_test.shape[0])
print("Train-Size: %d" % y_train.shape[0])
print("Number of Features: %d" % X_test.shape[1])

In [None]:
# The one-hot targets must have one column per known class.
assert len(label_names) == target.shape[1]

print("Number of classes: %d" % len(label_names))
print('Labelnames:', label_names)

# Key each count by the group's own label id rather than by its position in the
# groupby result: if a class has no rows in the sample, positional numbering
# would attribute every later count to the wrong class name.
label_counts = short_df.groupby('Label').count()['Artikelnummer']
print('Labelcounts:', {label_names[int(label)]: count for label, count in label_counts.items()})