# Unification des modèles sur les images et le texte

## Chargement et transformation des données

In [None]:
# Mount Google Drive (Colab only): later cells read the image archive, the CSV files
# and the saved models from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Les images sont stockées dans un zip sur google drive
!tar -zxvf drive/MyDrive/rakuten_cropped_resized_images_in_classes.tar.gz

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
rakuten/images/2403/image_1309932192_product_4182252631.jpg
rakuten/images/2403/image_1127685977_product_2128700978.jpg
rakuten/images/2403/image_974193646_product_271205994.jpg
rakuten/images/2403/image_1145659570_product_2501106654.jpg
rakuten/images/2403/image_1153338974_product_2625371934.jpg
rakuten/images/2403/image_1226562830_product_3601106558.jpg
rakuten/images/2403/image_1141418440_product_2430131529.jpg
rakuten/images/2403/image_1000692914_product_359078892.jpg
rakuten/images/2403/image_1243148264_product_3770360934.jpg
rakuten/images/2403/image_1166751912_product_2823855643.jpg
rakuten/images/2403/image_1263204970_product_3915305487.jpg
rakuten/images/2403/image_1255799372_product_3871478961.jpg
rakuten/images/2403/image_1019080065_product_514299107.jpg
rakuten/images/2403/image_1125531084_product_2089420134.jpg
rakuten/images/2403/image_1220527866_product_3547511027.jpg
rakuten/imag

In [None]:
# Move all the images directly into the rakuten/images folder.
# The archive stores them as rakuten/images/<class code>/<file>; walk the class
# sub-directories instead of hard-coding one `mv` per class code.
# Re-running this cell is a harmless no-op (the shell version failed on empty folders).
import shutil
from pathlib import Path

images_root = Path("rakuten/images")
for class_dir in sorted(p for p in images_root.iterdir() if p.is_dir()):
    # Materialise the listing first: entries are moved out while we iterate.
    for image_path in list(class_dir.iterdir()):
        # Skip hidden entries, exactly as the shell glob `*` did.
        if image_path.name.startswith("."):
            continue
        shutil.move(str(image_path), str(images_root / image_path.name))


In [None]:
import pandas as pd

# Load the training set: features (X_train) and labels (y_train).
# The first CSV column is used as the index in both files.
X_train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/X_train.csv", sep=',',index_col=0)
y_train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Y_train.csv", sep=',',index_col=0)

# Load the category labels (tab-separated file, first column used as the index).
categories = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/categories.csv",sep='\t',index_col=0)
#categories2 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/categories.csv",sep=';')

In [None]:
# Build one lightly cleaned dataset holding, for every product:
#   - categorie: the category label mapped from prdtypecode,
#   - image:     the image file name rebuilt from imageid / productid,
#   - texte:     the title followed by the (possibly missing) description.
# The raw columns are dropped afterwards and rows with a duplicated text are removed.
X = (
    pd.concat([X_train, y_train], axis=1)
    .assign(
        categorie=lambda df: df.prdtypecode.map(categories.prdlabelcode),
        image=lambda df: "image_" + df.imageid.astype(str) + "_product_" + df.productid.astype(str) + ".jpg",
        texte=lambda df: df.designation + " " + df.description.fillna('').astype(str),
    )
    .drop(["productid", "imageid", "prdtypecode", "designation", "description"], axis=1)
    .drop_duplicates(subset="texte")
)

# On enleve les tags htmls dans le texte (titre + description) des annonces
from io import StringIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away the markup and accumulates only the text content."""

    def __init__(self):
        # The base constructor already calls reset(); convert_charrefs=True makes the
        # parser decode character references (e.g. &eacute;) before handle_data is called.
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Collect one chunk of text found outside of tags."""
        self.text.write(d)

    def get_data(self):
        """Return all the text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return `html` with every tag removed and character references decoded.

    Parameters:
        html: the string to clean (may contain no markup at all).

    Returns:
        The text content of `html` as a string.
    """
    s = MLStripper()
    s.feed(html)
    # close() flushes the text the parser is still buffering. Without it, trailing text
    # containing an unterminated "&..." (e.g. "AT&T") is held back and silently lost.
    s.close()
    return s.get_data()

# Remove the HTML markup from the text (title + description) of every listing.
X["texte"] = X.texte.apply(strip_tags)

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
X.info()
X.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83502 entries, 0 to 84915
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   categorie  83502 non-null  object
 1   image      83502 non-null  object
 2   texte      83502 non-null  object
dtypes: object(3)
memory usage: 2.5+ MB
None


Unnamed: 0,categorie,image,texte
0,Livre,image_1263597046_product_3804725264.jpg,Olivia: Personalisiertes Notizbuch / 150 Seite...
1,"journeaux, magazines",image_1008141237_product_436067568.jpg,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2,Accessoires Jeux,image_938777978_product_201115110.jpg,Grand Stylet Ergonomique Bleu Gamepad Nintendo...
3,jeux types playmobil,image_457047496_product_50418756.jpg,Peluche Donald - Europe - Disneyland 2000 (Mar...
4,Livres 2,image_1077757786_product_278535884.jpg,La Guerre Des Tuques Luc a des idées de grande...


In [None]:
import nltk

# Fetch the NLTK resources used further down: the stop-word lists and the WordNet data
# for the lemmatizer. NOTE(review): omw-1.4 is presumably a WordNet companion resource
# needed by recent NLTK versions — confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# Raw string: "\w" inside a normal string literal is an invalid escape sequence
# (a warning today, an error in future Python versions).
tokenizer = RegexpTokenizer(r"\w+")

# French, English and German stop words, plus the numbers 0-99 and the lone letter "x".
stop_words_fr = stopwords.words("french")
stop_words_en = stopwords.words("english")
stop_words_de = stopwords.words("german")
stop_words = stop_words_fr + stop_words_en + stop_words_de + [str(i) for i in range(0, 100)] + ["x"]
# A set gives constant-time membership tests, which is all the filtering needs.
stop_words = set(stop_words)

def stop_words_filtering(string_list):
    """Return the tokens of `string_list`, in order, that are not in `stop_words`."""
    kept = []
    for token in string_list:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
# Normalise the listing text in three passes: lower-case every word, lemmatise every
# word, then tokenise and drop the stop words (the column ends up holding token lists).
from nltk.stem import WordNetLemmatizer

lemmatisation = WordNetLemmatizer()


def _lowercase_words(text):
    """Lower-case each whitespace-separated word and re-join with single spaces."""
    return " ".join(word.lower() for word in text.split())


def _lemmatise_words(text):
    """Lemmatise each whitespace-separated word and re-join with single spaces."""
    return " ".join(lemmatisation.lemmatize(word) for word in text.split())


def _tokens_without_stop_words(text):
    """Tokenise the text and remove the stop words."""
    return stop_words_filtering(tokenizer.tokenize(text))


for step in (_lowercase_words, _lemmatise_words, _tokens_without_stop_words):
    X['texte'] = X['texte'].apply(step)

X.head(15)

Unnamed: 0,categorie,image,texte
0,Livre,image_1263597046_product_3804725264.jpg,"[olivia, personalisiertes, notizbuch, 150, sei..."
1,"journeaux, magazines",image_1008141237_product_436067568.jpg,"[journal, art, 133, 09, 2001, art, marche, sal..."
2,Accessoires Jeux,image_938777978_product_201115110.jpg,"[grand, stylet, ergonomique, bleu, gamepad, ni..."
3,jeux types playmobil,image_457047496_product_50418756.jpg,"[peluche, donald, europe, disneyland, 2000, ma..."
4,Livres 2,image_1077757786_product_278535884.jpg,"[guerre, tuques, luc, idées, grandeur, veut, o..."
5,"journeaux, magazines",image_393356830_product_5862738.jpg,"[afrique, contemporaine, 212, hiver, 2004, dos..."
6,Livre,image_907794536_product_91920807.jpg,"[christof, e, bildungsprozessen, spur]"
7,papeterie et accessoire papeterie,image_999581347_product_344240059.jpg,"[conquérant, sept, cahier, couverture, polypro..."
8,jeux types playmobil,image_1325918866_product_4239126071.jpg,"[puzzle, scooby, doo, poster, 2x35, piece]"
9,meubles jardin,image_1245644185_product_3793572222.jpg,"[tente, pliante, v3s5, pro, pvc, blanc, 4m50, ..."


## Création des jeux d'entraînement et de test

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Fixed seed: without it the split, and therefore the fitted vocabulary and the number
# of text features, changes on every run, which breaks any previously saved model.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop("categorie", axis=1), X.categorie, test_size=0.2, random_state=42
)

vectorizer = CountVectorizer(min_df=5)#, max_features=20000)
# NOTE(review): the token lists are flattened, so every single token is fed to fit() as
# its own "document" and min_df=5 acts on total occurrences — confirm this is intended.
vectorizer.fit(np.concatenate(X_train.texte.array))

# Bag-of-words counts per product, stored as uint8 in a sparse-backed DataFrame.
# pd.DataFrame(<scipy sparse matrix>) does not expand the matrix into one column per
# feature; DataFrame.sparse.from_spmatrix does, without densifying the data.
X_train_vect = pd.DataFrame.sparse.from_spmatrix(
    vectorizer.transform(X_train.texte.str.join(" ")).astype('uint8')
)
X_test_vect = pd.DataFrame.sparse.from_spmatrix(
    vectorizer.transform(X_test.texte.str.join(" ")).astype('uint8')
)


In [None]:
import tensorflow as tf

def load_image_dataset_in_memory(df, image_dir = "rakuten/images/", input_size=(128, 128, 3)):
  """Load every image listed in `df` and return them as flattened uint8 vectors.

  Parameters:
    df: DataFrame with an `image` column holding the image file names.
    image_dir: directory containing the image files (trailing slash optional).
    input_size: (height, width, channels) each image is resized / flattened to.

  Returns:
    A list with one 1-D uint8 array of length height * width * channels per row
    of `df`, in the same order as the rows.
  """
  import os

  result = []
  # Only the file names are needed, so iterate over the column instead of iterrows().
  for file_name in df["image"]:
    image = tf.keras.utils.load_img(
      os.path.join(image_dir, file_name),
      color_mode='rgb',
      # load_img expects (height, width); the channel count comes from color_mode.
      target_size=input_size[:2],
      interpolation='bilinear'
    )
    image = tf.keras.preprocessing.image.img_to_array(image).astype('uint8')
    image = image.reshape(input_size[0] * input_size[1] * input_size[2])
    result.append(image)
  return result
    
#def x_generator(df, df_text, y, image_dir = "rakuten/images/", input_size=(224, 224, 3)):
#  path = f"{image_dir}{df.iloc[index].image}"
#  image = tf.keras.utils.load_img(
#    path,
#    grayscale=False,
#    color_mode='rgb',
#    target_size=self.input_size,
#    interpolation='bilinear'
#  )
#  image = tf.keras.preprocessing.image.img_to_array(image)
#  yield [image, self.df_text.iloc[index]]

In [None]:
# Number of samples in the training split.
len(X_train)

66801

In [None]:
# Number of samples in the test split.
len(X_test)

16701

In [None]:
from scipy.sparse import hstack

def _pixels_plus_text(split, text_counts):
  """Return a frame with the flattened image pixels first and the text counts after."""
  pixels = pd.DataFrame(np.asarray(load_image_dataset_in_memory(split)))
  return pd.concat([pixels, text_counts], axis=1)


# One wide feature table per split: image pixels followed by bag-of-words counts.
X_train_union = _pixels_plus_text(X_train, X_train_vect)
X_test_union = _pixels_plus_text(X_test, X_test_vect)

In [None]:
from scipy import sparse

#train_imgs = sparse.csr_matrix(np.asarray(load_image_dataset_in_memory(X_train)))
#test_imgs = sparse.csr_matrix(np.asarray(load_image_dataset_in_memory(X_test)))

In [None]:
#X_train_text_mat = X_train_text_mat.astype('uint8')
#X_test_text_mat = X_test_text_mat.astype('uint8')

In [None]:
#train_imgs = train_imgs.reshape(train_imgs.shape[0], 128*128*3)
#test_imgs = test_imgs.reshape(test_imgs.shape[0], 128*128*3)

In [None]:
#pd.concat([test, X_train_vect.head(3)], axis=1)

In [None]:
#X_train_text_arr = X_train_text_mat.astype('uint8').toarray()
#X_test_text_arr = X_test_text_mat.astype('uint8').toarray()

In [None]:
#X_train_union = list(zip(train_imgs, X_train_text_arr))
#X_test_union = list(zip(test_imgs, X_test_text_arr))

In [None]:
#X_train_union = np.asarray(X_train_union)
#X_test_union = np.asarray(X_test_union)

In [None]:
import scipy 

# Store both combined feature tables as CSR sparse matrices of uint8.
# NOTE(review): `.values` builds a NumPy array of the whole frame first — presumably a
# large dense allocation for the training split; confirm there is enough memory.
X_train_union = scipy.sparse.csr_matrix(X_train_union.values, dtype='uint8')
X_test_union = scipy.sparse.csr_matrix(X_test_union.values, dtype='uint8')

## Modèle de régression logistique sur le texte

In [None]:
#Classification / apprentissage supervisé.
from sklearn.linear_model import LogisticRegression

# Switch: when True, nothing is trained in this cell.
use_loaded_model = True

# Otherwise fit a logistic regression on the bag-of-words training features and
# print its accuracy on the test features.
if not use_loaded_model:
  clf_reglog = LogisticRegression(C=1.0, max_iter=10000).fit(X_train_vect, y_train)
  print(clf_reglog.score(X_test_vect, y_test))

In [None]:
import pickle

# Path (on the mounted Drive) of the pickled logistic regression model.
filename = 'drive/MyDrive/logistic_regression.sav'

In [None]:
# Persist the freshly trained model (only when it was trained in this session).
if not use_loaded_model:
  # The context manager guarantees the file is flushed and closed even if dump() fails.
  with open(filename, 'wb') as model_file:
    pickle.dump(clf_reglog, model_file)

In [None]:
# Load the saved logistic regression model.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open(filename, 'rb') as model_file:
  clf_reglog = pickle.load(model_file)

## Modèle de deep learning sur les images

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input

# One output unit per row of the category table.
nb_of_classes = len(categories)
train_size = 67933
valid_size = 16983

# VGG16 convolutional base with ImageNet weights, without its classification head;
# all of its layers are frozen.
base_model = VGG16(weights='imagenet', include_top=False)
for layer in base_model.layers:
    layer.trainable = False

# Classifier: VGG16 preprocessing -> frozen base -> global average pooling ->
# three 256-unit ReLU layers -> softmax over the categories.
model = tf.keras.Sequential([
    layers.Lambda(preprocess_input, name='preprocessing', input_shape=(128, 128, 3)),
    base_model,
    layers.GlobalAveragePooling2D(),
    *(layers.Dense(256,activation='relu') for _ in range(3)),
    layers.Dense(nb_of_classes, activation='softmax'),
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Weights of the image model saved on the mounted Drive.
checkpoint_filepath = 'drive/MyDrive/checkpoint_vgg16_3_small_images_3'

# Best effort: carry on with the untrained head if the weights cannot be restored, but
# report the real cause instead of always claiming that the checkpoint is missing
# (a shape mismatch or a corrupted file used to be reported as "not found" too).
try:
  model.load_weights(checkpoint_filepath)
except Exception as error:
  print(f"Checkpoint {checkpoint_filepath} could not be loaded: {error!r}")

## Unification des modèles


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from keras.wrappers.scikit_learn import KerasClassifier 

def get_image(union_sample):
  """Return the image part of combined "image pixels + text counts" features.

  Parameters:
    union_sample: either one flat feature vector, or a 2-D matrix (NumPy array or
      SciPy sparse matrix) with one sample per row.

  Returns:
    The first 128*128*3 features: a slice of the vector for 1-D input, or the
    first 128*128*3 columns of every row for 2-D input. The pixels stay flattened.
  """
  n_pixels = 128 * 128 * 3
  # scikit-learn transformers receive the whole (n_samples, n_features) matrix, so the
  # features must be selected along the column axis; a plain [:n] would select rows.
  if getattr(union_sample, "ndim", 1) == 2:
    return union_sample[:, :n_pixels]
  return union_sample[:n_pixels]

def get_text(union_sample):
  """Return the text part of combined "image pixels + text counts" features.

  Parameters:
    union_sample: either one flat feature vector, or a 2-D matrix (NumPy array or
      SciPy sparse matrix) with one sample per row.

  Returns:
    Everything after the first 128*128*3 features: a slice of the vector for 1-D
    input, or the remaining columns of every row for 2-D input.
  """
  n_pixels = 128 * 128 * 3
  # scikit-learn transformers receive the whole (n_samples, n_features) matrix, so the
  # features must be selected along the column axis; a plain [n:] would select rows.
  if getattr(union_sample, "ndim", 1) == 2:
    return union_sample[:, n_pixels:]
  return union_sample[n_pixels:]

def _build_cnn():
  """Hand the already built Keras model to the scikit-learn wrapper."""
  return model


# Text branch: keep only the text features, then apply the logistic regression.
text_steps = [('select_input', FunctionTransformer(get_text)), ('reglog', clf_reglog)]
pipe_reg = Pipeline(text_steps)

# Image branch: keep only the image features, then apply the wrapped Keras model.
cnn = KerasClassifier(_build_cnn)
cnn._estimator_type = "classifier"
image_steps = [('select_input', FunctionTransformer(get_image)), ('cnn', cnn)]
pipe_cnn = Pipeline(image_steps)

# Stack both branches under a logistic regression meta-classifier.
base_estimators = [('reg', pipe_reg), ('cnn', pipe_cnn)]
clf = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression())

  del sys.path[0]


In [None]:
# Fit the stacked ensemble on the combined image + text training features.
clf.fit(X_train_union, y_train)

In [None]:
# Inspect the training labels as a NumPy array.
np.asarray(y_train)

array(['figurines 2', 'equipement baignade jardin', 'Jeux-vidéos', ...,
       'papeterie et accessoire papeterie', 'accessoires jardin',
       'accessoires, linges de maison'], dtype=object)

In [None]:
# Evaluate the ensemble fitted above on the combined test features.
# (`pipe` and `test_x_gen` are not defined anywhere in this notebook, so the previous
# call could not run on a fresh kernel.)
clf.score(X_test_union, y_test)

TypeError: ignored

In [None]:
# Pas de fit sur les generateurs!!
# Pas de partial_fit sur StackingClassifier


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline


# Alternative ensemble: soft voting between the text and image branches.
# These assignments rebind pipe_reg, pipe_cnn and clf.
# NOTE(review): FunctionTransformer hands the whole input to the lambda, so x[1] / x[0]
# index its first axis — presumably the input is meant to be an (images, texts) pair;
# confirm against the caller.
# NOTE(review): the 'cnn' step is the raw Keras `model`, not a scikit-learn wrapper —
# verify it exposes what voting='soft' relies on.
pipe_reg = Pipeline([('select_input', FunctionTransformer(lambda x: x[1])), ('reglog', clf_reglog)])
pipe_cnn = Pipeline([('select_input', FunctionTransformer(lambda x: x[0])), ('cnn', model)])
clf = VotingClassifier(estimators=[('reg', pipe_reg), ('cnn', pipe_cnn)], voting='soft')

In [None]:
# - Couche concatenate pour merger les modeles puis couche classification
# - essayer de crop puis de resize les images

# https://github.com/cerlymarco/MEDIUM_NoteBook/blob/master/NeuralNet_Ensemble/NeuralNet_Ensemble.ipynb 

In [None]:
# Dataset over every image found under rakuten/images/: no labels, no batching,
# no shuffling, images resized to 224x224 without aspect-ratio cropping.
ds = tf.keras.preprocessing.image_dataset_from_directory(
    directory="rakuten/images/",
    shuffle=False,
    crop_to_aspect_ratio=False,
    image_size=(224, 224),
    batch_size=None,
    label_mode=None,
)
# Cache the elements, then prefetch with an automatically tuned buffer size.
ds = ds.cache()
ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)


Found 84916 files belonging to 1 classes.


In [None]:
# Preview only the first rows of the bag-of-words table instead of rendering the
# whole frame.
X_train_vect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43406,43407,43408,43409,43410,43411,43412,43413,43414,43415
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66799,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Création d'un generateur qui regroupe les images et le texte

class CustomSequenceGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that pairs each product image with its text features and label.

    Parameters:
        df: DataFrame with an `image` column holding the image file names.
        df_text: DataFrame of text features, row-aligned (by position) with `df`.
        y: Series of labels, row-aligned (by position) with `df`.
        image_dir: directory prefix prepended to each file name.
        input_size: (height, width, channels) the images are resized to.
        batch_size: None (default) keeps the historical behaviour, where every
            index yields one single, unbatched sample. An integer >= 1 makes every
            index yield a batch of up to `batch_size` samples.
    """

    def __init__(self, df, df_text, y, image_dir = "rakuten/images/", input_size=(224, 224, 3), batch_size=None):
        super().__init__()
        if batch_size is not None and batch_size < 1:
            raise ValueError("batch_size must be None or a positive integer")
        self.df = df
        self.df_text = df_text
        self.y = y
        self.image_dir = image_dir
        self.input_size = input_size
        self.batch_size = batch_size
        self.size = len(self.df)

    def __len__(self):
        """Number of items: samples when batch_size is None, batches otherwise."""
        if self.batch_size is None:
            return self.size
        # Ceiling division so that the last, possibly smaller, batch is counted.
        return (self.size + self.batch_size - 1) // self.batch_size

    def _load_image(self, position):
        """Load the image of the row at `position` as a float array resized to input_size."""
        path = f"{self.image_dir}{self.df.iloc[position].image}"
        image = tf.keras.utils.load_img(
          path,
          color_mode='rgb',
          # load_img expects (height, width); the channel count comes from color_mode.
          target_size=self.input_size[:2],
          interpolation='bilinear'
        )
        return tf.keras.preprocessing.image.img_to_array(image)

    def __getitem__(self, index):
        """Return ([image(s), text features], label(s)) for the sample or batch at `index`."""
        if self.batch_size is None:
            # Historical behaviour: one unbatched sample per index.
            x = [self._load_image(index), self.df_text.iloc[index]]
            y = self.y.iloc[index]
            return x, y

        if not 0 <= index < len(self):
            raise IndexError(f"batch index {index} out of range")
        start = index * self.batch_size
        stop = min(start + self.batch_size, self.size)
        images = np.stack([self._load_image(position) for position in range(start, stop)])
        x = [images, self.df_text.iloc[start:stop].to_numpy()]
        y = self.y.iloc[start:stop].to_numpy()
        return x, y


In [None]:
# One generator per split, each combining the image names, the text features and the labels.
train_gen = CustomSequenceGenerator(df=X_train, df_text=X_train_vect, y=y_train)
test_gen = CustomSequenceGenerator(df=X_test, df_text=X_test_vect, y=y_test)