# UHK SMAP ovalek 01

In [5]:
# install packages
%pip install numpy pandas tensorflow

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow
  Downloading tensorflow-2.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
     |████████████████████████████████| 588.3 MB 51 kB/s              
Collecting keras<2.12,>=2.11.0
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
     |████████████████████████████████| 1.7 MB 9.0 MB/s            
Collecting h5py>=2.9.0
  Downloading h5py-3.7.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.5 MB)
     |████████████████████████████████| 4.5 MB 13.5 MB/s            
[?25hCollecting wrapt>=1.11.0
  Downloading wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (77 kB)
     |████████████████████████████████| 77 kB 4.9 MB/s             
[?25hCollecting flatbuffers>=2.0
  Downloading flatbuffers-22.12.6-py2.py3-none-any.whl (26 kB)
Collecting libclang>=13.0.0
  Downloading libclang-14.0.6

In [4]:
import numpy as np
import pandas as pd
import os
print(os.listdir("./data"))

['small', 'parameters_products_color_sanitized.csv', 'parameters_products_pattern_sanitized.csv']


## Build neural network model

In [23]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

# Initializes a sequential model
model = Sequential()

# First layer (pictures are 240x340=85000 x 3 colors => 255000)
model.add(Dense(2048, activation='relu', input_shape=(255000, )))

# Second layer
model.add(Dense(256, activation='relu'))

# Output layer (multi-label classification therefore use sigmoid activation function)
model.add(Dense(48, activation='sigmoid'))

model.summary()

2022-12-14 00:56:53.535623: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 2088960000 exceeds 10% of free system memory.
2022-12-14 00:56:53.748080: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 2088960000 exceeds 10% of free system memory.
2022-12-14 00:56:54.113711: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 2088960000 exceeds 10% of free system memory.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 2048)              522242048 
                                                                 
 dense_1 (Dense)             (None, 256)               524544    
                                                                 
 dense_2 (Dense)             (None, 48)                12336     
                                                                 
Total params: 522,778,928
Trainable params: 522,778,928
Non-trainable params: 0
_________________________________________________________________


## Compile a neural network

In [18]:
# multi-label classification therefore use binary_crossentropy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## Load data

In [35]:
import csv
import numpy as np
import os
import PIL
import PIL.Image
import tensorflow as tf

# load labels
label_ids = []
labels = []
with open('./data/parameters_list_sanitized.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            line_count += 1
            continue
        else:
            line_count += 1
            if not row[0] in label_ids:
                label_ids += [row[0]]
                labels += [row[2]]

# load product label relations
products_labels = {}
with open('./data/parameters_products_color_sanitized.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            line_count += 1
            continue
        else:
            line_count += 1
            if not row[0] in products_labels:
                products_labels[row[0]] = []
            for label_index, label_id in enumerate(label_ids):
                if label_id == row[2]:
                    products_labels[row[0]] += [labels[label_index]]
with open('./data/parameters_products_pattern_sanitized.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            line_count += 1
            continue
        else:
            line_count += 1
            if not row[0] in products_labels:
                products_labels[row[0]] = []
            for label_index, label_id in enumerate(label_ids):
                if label_id == row[2]:
                    products_labels[row[0]] += [labels[label_index]]

# load products with main photo
products = []
with open('./data/photos_sanitized.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            line_count += 1
            if row[4] != 'Y':
                continue
            found = False
            for product in products:
                if product[0] == row[0]:
                    found = True
                    break
            if not found:
                products += [(row[0], row[1], row[3], row[4], ), ]
    print(f'Processed {line_count} lines.')
    print(f'Found products {products.__len__}.')


Column names are id_product, id_photo, date_update, position, show_in_lead
Processed 16161 lines.
Found products <method-wrapper '__len__' of list object at 0x7fc21c72ae40>.


In [36]:
# prepare data for dataset
data_photos_paths = []
data_labels = []
for product in products:
    data_photos_paths += [f'./data/small/{int(product[0]) % 10}/{product[0]}/{product[0]}_{product[1]}_{product[3]}_{product[2]}_small.webp']
    tmp_product_labels = products_labels[product[0]]
    tmp_labels_vector = []
    for label in labels:
        tmp_labels_vector += [1 if label in tmp_product_labels else 0]
    data_labels += [tmp_labels_vector]

data_photos_paths = tf.constant(data_photos_paths)
data_labels = tf.constant(data_labels)
dataset = tf.data.Dataset.from_tensor_slices((data_photos_paths, data_labels))

def _prepare_picture_func(picturepath, labels):
    image_string = tf.io.read_file(picturepath)
    image = tf.image.decode_image(image_string, channels=3)
    image.set_shape([None, None, None])
    image = tf.image.resize(image, [150, 150])
    image = tf.subtract(image, 116.779) # Zero-center by mean pixel
    image.set_shape([150, 150, 3])
    # image = tf.reverse(image, axis=[2]) # 'RGB'->'BGR'
    # d = dict(zip([picturepath], [image])), labels
    return (picturepath, image), labels
    # return d
dataset = dataset.map(_prepare_picture_func)

## Prepare train test and validation data

In [None]:
dataset.batch()

## Use NN for data

In [20]:
# (train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

# train_data = train_data[(train_labels >= 0) & (train_labels < 3)][0:50].reshape(-1, 28, 28, 1)
# train_labels = train_labels[(train_labels >= 0) & (train_labels < 3)][0:50]
# train_labels = pd.get_dummies(train_labels).to_numpy()

# test_data = test_data[(test_labels >= 0) & (test_labels < 3)][0:10].reshape(-1, 28, 28, 1)
# test_labels = test_labels[(test_labels >= 0) & (test_labels < 3)][0:10]
# test_labels = pd.get_dummies(test_labels).to_numpy()