**Import packages**

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import cv2
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
import re
tqdm_notebook.pandas()
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
from sklearn import metrics
import itertools
import pickle
from joblib import dump, load

# Identification of product classes

 ***Création de DataFrames à partir des csv Textes***

In [2]:
df_xtrain = pd.read_csv('./data/texts/X_train_update.csv') 

In [3]:
df_xtest = pd.read_csv('./data/texts/X_test_update.csv') 

In [4]:
df_ytrain = pd.read_csv('./data/texts/Y_train_CVw08PX.csv') 

In [5]:
#print("df_xtrain shape: ", df_xtrain.shape)
#print("df_xtest shape: ", df_xtest.shape)
#print("df_ytrain shape: ", df_ytrain.shape)

In [6]:
df_xtrain.head()

Unnamed: 0.1,Unnamed: 0,designation,description,productid,imageid
0,0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786


In [7]:
df_xtest.head()

Unnamed: 0.1,Unnamed: 0,designation,description,productid,imageid
0,84916,Folkmanis Puppets - 2732 - Marionnette Et Théâ...,,516376098,1019294171
1,84917,Porte Flamme Gaxix - Flamebringer Gaxix - 136/...,,133389013,1274228667
2,84918,Pompe de filtration Speck Badu 95,,4128438366,1295960357
3,84919,Robot de piscine électrique,<p>Ce robot de piscine d&#39;un design innovan...,3929899732,1265224052
4,84920,Hsm Destructeur Securio C16 Coupe Crois¿E: 4 X...,,152993898,940543690


In [8]:
df_ytrain.head()

Unnamed: 0.1,Unnamed: 0,prdtypecode
0,0,10
1,1,2280
2,2,50
3,3,1280
4,4,2705


In [9]:
# Nan values
null_counts = df_xtrain.isnull().sum().sort_values(ascending=False)
print('df_xtrain: \n', null_counts[null_counts > 0])

null_counts = df_xtest.isnull().sum().sort_values(ascending=False)
print(' \n df_xtest: \n',null_counts[null_counts > 0])

null_counts = df_ytrain.isnull().sum().sort_values(ascending=False)
print(' \ndf_ytrain: \n',null_counts[null_counts > 0])

df_xtrain: 
 description    29800
dtype: int64
 
 df_xtest: 
 description    4886
dtype: int64
 
df_ytrain: 
 Series([], dtype: int64)


In [12]:
display(df_ytrain["prdtypecode"].unique())

unique, counts = np.unique(df_ytrain["prdtypecode"], return_counts=True)
dict(zip(unique, counts))

array([  10, 2280,   50, 1280, 2705, 2522, 2582, 1560, 1281, 1920, 2403,
       1140, 2583, 1180, 1300, 2462, 1160, 2060,   40,   60, 1320, 1302,
       2220, 2905, 2585, 1940, 1301], dtype=int64)

{10: 3116,
 40: 2508,
 50: 1681,
 60: 832,
 1140: 2671,
 1160: 3953,
 1180: 764,
 1280: 4870,
 1281: 2070,
 1300: 5045,
 1301: 807,
 1302: 2491,
 1320: 3241,
 1560: 5073,
 1920: 4303,
 1940: 803,
 2060: 4993,
 2220: 824,
 2280: 4760,
 2403: 4774,
 2462: 1421,
 2522: 4989,
 2582: 2589,
 2583: 10209,
 2585: 2496,
 2705: 2761,
 2905: 872}

In [13]:
df_ytrain["prdtypecode"].value_counts(normalize=True).mul(100).round(2).astype(str) + '%'

2583    12.02%
1560     5.97%
1300     5.94%
2060     5.88%
2522     5.88%
1280     5.74%
2403     5.62%
2280     5.61%
1920     5.07%
1160     4.66%
1320     3.82%
10       3.67%
2705     3.25%
1140     3.15%
2582     3.05%
40       2.95%
2585     2.94%
1302     2.93%
1281     2.44%
50       1.98%
2462     1.67%
2905     1.03%
60       0.98%
2220     0.97%
1301     0.95%
1940     0.95%
1180      0.9%
Name: prdtypecode, dtype: object

 *** Concaténation df_xtrain et df_ytrain(Retrouver les product codes) ***

In [14]:
# Sort des deux DF avant de faire la concaténation
df_xtrain = df_xtrain.sort_values(by = 'Unnamed: 0', ascending = True)
df_ytrain = df_ytrain.sort_values(by = 'Unnamed: 0', ascending = True)

In [15]:
# Comparaison des deux colonnes 'Unnamed: 0' des deux DF df_xtrain et  df_ytrain
df_xtrain['Unnamed: 0'].isin(df_ytrain['Unnamed: 0']).value_counts()

True    84916
Name: Unnamed: 0, dtype: int64

In [16]:
# Concaténation des deux DF - Train
df_combined = pd.concat([df_xtrain, df_ytrain['prdtypecode'] ], axis = 1)
df_combined.drop('Unnamed: 0', axis =1 , inplace = True)

In [17]:
df_combined

Unnamed: 0,designation,description,productid,imageid,prdtypecode
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,10
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,2280
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,50
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,1280
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,2705
...,...,...,...,...,...
84911,The Sims [ Import Anglais ],,206719094,941495734,40
84912,Kit piscine acier NEVADA déco pierre Ø 3.50m x...,<b>Description complète :</b><br />Kit piscine...,3065095706,1188462883,2583
84913,Journal Officiel De La Republique Francaise N°...,,440707564,1009325617,2280
84914,Table Basse Bois De Récupération Massif Base B...,<p>Cette table basse a un design unique et con...,3942400296,1267353403,1560


In [18]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84916 entries, 0 to 84915
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   designation  84916 non-null  object
 1   description  55116 non-null  object
 2   productid    84916 non-null  int64 
 3   imageid      84916 non-null  int64 
 4   prdtypecode  84916 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 3.9+ MB


In [16]:
df_combined.shape

(84916, 5)

In [19]:
#path_im = "./data/images/all/image_train/"  # Train folder containing org image  500 x 500 
path_im = "./data/images/all/image_train_resized/"  # Train folder containing resized  256 x 256 
#im_prfix = "image_"
im_prfix = "resized_image_"

#df_combined["filePath"] =  path_im +"image_" + df_combined.imageid.astype(str)+ "_product_" + df_combined.productid.astype(str) + ".jpg"
df_combined["filePath"] =  path_im  + im_prfix  + df_combined.imageid.astype(str)+ "_product_" + df_combined.productid.astype(str) + ".jpg"

In [18]:
df_combined

Unnamed: 0,designation,description,productid,imageid,prdtypecode,filePath
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,10,./data/images/all/image_train_resized/resized_...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,2280,./data/images/all/image_train_resized/resized_...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,50,./data/images/all/image_train_resized/resized_...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,1280,./data/images/all/image_train_resized/resized_...
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,2705,./data/images/all/image_train_resized/resized_...
...,...,...,...,...,...,...
84911,The Sims [ Import Anglais ],,206719094,941495734,40,./data/images/all/image_train_resized/resized_...
84912,Kit piscine acier NEVADA déco pierre Ø 3.50m x...,<b>Description complète :</b><br />Kit piscine...,3065095706,1188462883,2583,./data/images/all/image_train_resized/resized_...
84913,Journal Officiel De La Republique Francaise N°...,,440707564,1009325617,2280,./data/images/all/image_train_resized/resized_...
84914,Table Basse Bois De Récupération Massif Base B...,<p>Cette table basse a un design unique et con...,3942400296,1267353403,1560,./data/images/all/image_train_resized/resized_...


# Dataframe saving to CSV

In [20]:
import xlwt
import openpyxl
pathSave = './saves'
date = '27092021'
version = '_V0'

# To pickle file
df_combined.to_pickle(f"{pathSave}/df_combined.pkl")

# To CSV file
df_combined.to_csv(f"{pathSave}/df_combined_xtrain_ytrain_{date}{version}.csv")



# Iteration #1 - Architecture LeNet

In [None]:
# read saved DF - Combined  for train images

In [10]:
#df_train_im = pd.read_csv('./df_saves/df_combined_xtrain_ytrain_27092021_V0.csv', index_col=0) 
df_train_im = pd.read_pickle("./saves/df_combined.pkl")

In [11]:
display(df_train_im.shape)

(84916, 6)

In [12]:
display(df_train_im.shape)

(84916, 6)

In [14]:
#df_train_im, data_test = train_test_split(df_train['filePath'], test_size=0.2, random_state=123)
X_train_im, X_test_im = train_test_split(df_train_im, train_size=0.8, random_state=1234)

In [15]:
display(X_train_im.shape)
display(X_test_im.shape)

(67932, 6)

(16984, 6)

*** Générateur de données images ***

In [16]:
X_train_im["prdtypecode"] = X_train_im["prdtypecode"].astype(str)
X_test_im["prdtypecode"] = X_test_im["prdtypecode"].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_im["prdtypecode"] = X_train_im["prdtypecode"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_im["prdtypecode"] = X_test_im["prdtypecode"].astype(str)


In [54]:
unique, counts = np.unique(X_train_im["prdtypecode"], return_counts=True)
dict(zip(unique, counts))

{'10': 838,
 '1140': 741,
 '1160': 1079,
 '1180': 213,
 '1280': 1379,
 '1281': 564,
 '1300': 1434,
 '1301': 218,
 '1302': 739,
 '1320': 924,
 '1560': 1427,
 '1920': 1249,
 '1940': 207,
 '2060': 1485,
 '2220': 230,
 '2280': 1398,
 '2403': 1349,
 '2462': 419,
 '2522': 1361,
 '2582': 719,
 '2583': 2837,
 '2585': 727,
 '2705': 752,
 '2905': 263,
 '40': 724,
 '50': 483,
 '60': 241}

In [19]:
#Générateur de données
datagen = ImageDataGenerator(rotation_range = 10,
                                width_shift_range = 0.1, # Translation horizontale
                                height_shift_range= 0.1, # Translation verticale
                                zoom_range = 0.1,
                                horizontal_flip=True,
                                validation_split = 0.2)

In [20]:
#Itérateur 
train_generator = datagen.flow_from_dataframe(dataframe = X_train_im,                                           
                                            x_col = "filePath",
                                            y_col = 'prdtypecode',  #target data                                            
                                            class_mode = 'sparse',
                                            subset = 'training',
                                            # target_size = (256 , 256) - default
                                            #color_mode = 'rgb'- default
                                            batch_size = 32 # default
                                           )

test_generator = datagen.flow_from_dataframe(dataframe = X_test_im,                                            
                                            x_col = "filePath",
                                            y_col ='prdtypecode', 
                                            class_mode = 'sparse',
                                            subset = 'validation', 
                                            #target_size = (256 , 256)-default
                                            #color_mode = 'rgb'- default
                                            batch_size = 32 # default
                                   )

Found 54346 validated image filenames belonging to 27 classes.
Found 3396 validated image filenames belonging to 27 classes.


In [70]:
# Architecture du modèle
lenet = Sequential()

conv_1 = Conv2D(filters = 30,                     # Nombre de filtres
                kernel_size = (5, 5),            # Dimensions du noyau
                padding = 'valid',               # Mode de Dépassement
                input_shape = (256, 256, 3),       # Dimensions de l'image en entrée 256 x 256 !!!!!!!!
                activation = 'relu')             # Fonction d'activation

max_pool_1 = MaxPooling2D(pool_size = (2, 2))

conv_2 = Conv2D(filters = 16,                    
                kernel_size = (3, 3),          
                padding = 'valid',             
                activation = 'relu')

max_pool_2 = MaxPooling2D(pool_size = (2, 2))

flatten = Flatten()

dropout = Dropout(rate = 0.2)

dense_1 = Dense(units = 128,
                activation = 'relu')

dense_2 = Dense(units = 27,              # changer le nombre de units pour coller au nombre de classe!!!!!!
                activation = 'softmax')

lenet.add(conv_1)
lenet.add(max_pool_1)
lenet.add(conv_2)
lenet.add(max_pool_2)

lenet.add(dropout)
lenet.add(flatten)
lenet.add(dense_1)
lenet.add(dense_2)

lenet.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 252, 252, 30)      2280      
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 126, 126, 30)      0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 124, 124, 16)      4336      
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 62, 62, 16)        0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 62, 62, 16)        0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 61504)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 128)              

In [71]:
# Compilation
lenet.compile(loss='sparse_categorical_crossentropy',  # fonction de perte
              optimizer='adam',                 # algorithme de descente de gradient
              metrics=['accuracy'])             # métrique d'évaluation


In [None]:
#batch_size = 32
history = lenet.fit_generator(generator = train_generator, 
                              ##steps_per_epoch = len(X_train_im)//batch_size,
                              epochs = 5,                            
                              validation_data = test_generator,
                              #validation_steps = len(X_test_im)//batch_size
                             )


In [72]:
%%time
batch_size = 32

# Train Train generator
history = lenet.fit(train_generator,                    
                     epochs = 10,
                    )

#batch_size = 32
#history = lenet.fit_generator(generator = train_generator, 
                              ##steps_per_epoch = len(X_train_im)//batch_size,
                              #epochs = 5,                            
                              #validation_data = test_generator,
                              #validation_steps = len(X_test_im)//batch_size
                            # )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 13h 46min 21s


In [22]:
#Prediction - Test set
y_pred_proba = lenet.predict_generator(generator = test_generator)

# l'argmax pour obtenir les classes prédites
y_pred_class = np.argmax(y_pred_proba,axis = 1).astype(int)

# To get classes fro test generator
y_true = test_generator.classes


Instructions for updating:
Please use Model.predict, which supports generators.


In [23]:
unique, counts = np.unique(y_pred_class, return_counts=True)
dict(zip(unique, counts))

{0: 1, 4: 1, 8: 1, 9: 3257, 11: 1, 12: 2, 13: 4, 16: 1, 18: 1, 24: 1, 25: 126}

In [89]:
#Get the accuracy score
test_score = lenet.evaluate_generator(test_generator)

print("[INFO] accuracy: {:.2f}%".format(test_score[1] * 100)) 
print("[INFO] Loss: ",test_score[0])

Instructions for updating:
Please use Model.evaluate, which supports generators.
[INFO] accuracy: 0.00%
[INFO] Loss:  294.4401550292969


In [24]:
print(metrics.classification_report(y_true, y_pred_class))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       137
           1       0.00      0.00      0.00       112
           2       0.00      0.00      0.00       161
           3       0.00      0.00      0.00        31
           4       0.00      0.00      0.00       217
           5       0.00      0.00      0.00        85
           6       0.00      0.00      0.00       198
           7       0.00      0.00      0.00        34
           8       0.00      0.00      0.00        84
           9       0.04      0.95      0.08       137
          10       0.00      0.00      0.00       168
          11       0.00      0.00      0.00       170
          12       0.00      0.00      0.00        31
          13       0.00      0.00      0.00       219
          14       0.00      0.00      0.00        34
          15       0.00      0.00      0.00       160
          16       0.00      0.00      0.00       182
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
#Enregistrement d'un modèle Keras :
lenet.save("./saves/lenet_train_im_all_28082021_10_epoch.hdf5")
#The SavedModel and HDF5 file contains:
#the model's configuration (topology)
#the model's weights
#the model's optimizer's state (if any)

In [21]:
#Rechargement du modèle :
lenet = load_model('./saves/lenet_train_im_all_28082021_10_epoch.h5')