- OpenClassrooms Project 6, Data Scientist
- Author : Oumeima EL GHARBI
- Date : October, November 2022

### Importing libraries

In [None]:
%reset -f

import warnings
warnings.filterwarnings(action="ignore")

import pandas as pd
# to compute time of pipeline
from time import time, strftime, gmtime

from common_graphs import *

import os

import gensim
import tensorflow as tf
import tensorflow_hub as hub
from transformers import TFAutoModel

from NLP_functions.bag_of_words import *
from NLP_functions.Word2Vec import *
from NLP_functions.BERT import *
from NLP_functions.USE import *

%matplotlib inline
%autosave 300

In [None]:
# NOTE(review): TF_KERAS is presumably read by a Keras add-on library to select
# tf.keras — confirm. It is set after the import cell has run, so a module that
# reads it at import time would not see this value.
os.environ["TF_KERAS"] = '1'

# Environment report: TensorFlow version, number of visible GPUs, CUDA build flag.
print(tf.__version__)
# Stable API name; the tf.config.experimental.* spelling is only a legacy alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())

In [None]:
# Wall-clock reference; the final cell subtracts it to report the total run time.
t0 = time()

# Input CSV location, assembled from its directory and its file name.
input_path = "./dataset/cleaned/"
input_filename = "final_data_text.csv"
input_file = input_path + input_filename

# Output directory (currently the same folder as the input).
output_path = "./dataset/cleaned/"

## Transformations

In [None]:
# Read the input CSV; input_file is assembled in the configuration cell above.
data = pd.read_csv(input_file)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

In [None]:
# Category names. A set of strings iterates in an order that can change from one
# interpreter run to the next (hash randomization), so sort the names to keep the
# label order stable across kernel restarts.
# NOTE(review): confirm that this alphabetical order matches the numeric encoding
# stored in "target" (it would if the codes were assigned over sorted names).
l_cat = sorted(set(data['category']))
print("catégories : ", l_cat)

# Numeric category code per row; passed to ARI_fct and TSNE_visu_fct below.
y_cat_num = data["target"]
y_cat_num

### I) Bag of Words : Count words + TF-IDF

#### 1) fit / transform description - text

##### 1) Preparing the vectors

In [None]:
# Column used for this experiment. Author's scores with 'text': 0.399 and 0.5567
# (presumably the CountVectorizer and Tf-idf ARI — confirm).
# Alternative column: 'text_bag-of-words_lem'.
feat = ['text']

In [None]:
# Build the two bag-of-words matrices. `feat` is passed twice: the same column
# list serves as both the "fit" and the "transform" argument.
cv_transform, ctf_transform = create_bag_of_words_vectors(data, feat, feat)

In [None]:
# Shapes of the CountVectorizer matrix and of the Tf-idf matrix.
print(cv_transform.shape)
print(ctf_transform.shape)

##### 2) Executing the models

In [None]:
# Score both bag-of-words representations with ARI_fct, one after the other.
# ARI, X_tsne and labels are rebound on every pass, so once this cell finishes
# they hold the Tf-idf results only.
bow_runs = (
    ("CountVectorizer : ", "-----------------", cv_transform),
    ("Tf-idf : ", "--------", ctf_transform),
)
for run_index, (run_title, run_rule, bow_matrix) in enumerate(bow_runs):
    if run_index > 0:
        print()  # blank line between the two reports
    print(run_title)
    print(run_rule)
    ARI, X_tsne, labels = ARI_fct(bow_matrix, l_cat, y_cat_num)


##### 3) Visualization

In [None]:
# Plot the latest ARI_fct outputs. X_tsne, labels and ARI were last assigned by the
# Tf-idf run, so this figure reflects Tf-idf, not CountVectorizer.
TSNE_visu_fct(X_tsne, y_cat_num, l_cat, labels, ARI)

#### 2) fit / transform product_name + text

##### 1) Preparing the vectors

In [None]:
# Build the bag-of-words matrices (CountVectorizer and Tf-idf).
print("Separate vocabulary")

# Both columns are passed as the "fit" argument and as the "transform" argument.
textual_columns = ['product_name', 'text']

cv_transform, ctf_transform = create_bag_of_words_vectors(data, textual_columns, textual_columns)


In [None]:
# Shapes of the CountVectorizer matrix and of the Tf-idf matrix (product_name + text).
print(cv_transform.shape)
print(ctf_transform.shape)

##### 2) Executing the models

In [None]:
# Score both bag-of-words representations with ARI_fct, one after the other.
# ARI, X_tsne and labels are rebound on every pass, so once this cell finishes
# they hold the Tf-idf results only.
bow_runs = (
    ("CountVectorizer : ", "-----------------", cv_transform),
    ("Tf-idf : ", "--------", ctf_transform),
)
for run_index, (run_title, run_rule, bow_matrix) in enumerate(bow_runs):
    if run_index > 0:
        print()  # blank line between the two reports
    print(run_title)
    print(run_rule)
    ARI, X_tsne, labels = ARI_fct(bow_matrix, l_cat, y_cat_num)


KeyboardInterrupt: 

##### 3) Visualization

In [None]:
# Plot the latest ARI_fct outputs; they come from the Tf-idf run, which is the
# last one to assign X_tsne, labels and ARI in the cell above.
TSNE_visu_fct(X_tsne, y_cat_num, l_cat, labels, ARI)

#### 3) fit product_name / transform text

##### 1) Preparing the vectors

In [None]:
# Build the bag-of-words matrices (CountVectorizer and Tf-idf).

# Fit on one column, transform a different one.
feat_fit = ['product_name']
feat_transform = ['text_bag-of-words_lem']  # alternative: 'text'

cv_transform, ctf_transform = create_bag_of_words_vectors(data, feat_fit, feat_transform)

In [None]:
# Shapes of the CountVectorizer matrix and of the Tf-idf matrix (fit on product_name).
print(cv_transform.shape)
print(ctf_transform.shape)

##### 2) Executing the models

In [None]:
# Score both bag-of-words representations with ARI_fct, one after the other.
# ARI, X_tsne and labels are rebound on every pass, so once this cell finishes
# they hold the Tf-idf results only.
bow_runs = (
    ("CountVectorizer : ", "-----------------", cv_transform),
    ("Tf-idf : ", "--------", ctf_transform),
)
for run_index, (run_title, run_rule, bow_matrix) in enumerate(bow_runs):
    if run_index > 0:
        print()  # blank line between the two reports
    print(run_title)
    print(run_rule)
    ARI, X_tsne, labels = ARI_fct(bow_matrix, l_cat, y_cat_num)


##### 3) Visualization

In [None]:
# Plot the latest ARI_fct outputs; they come from the Tf-idf run, which is the
# last one to assign X_tsne, labels and ARI in the cell above.
TSNE_visu_fct(X_tsne, y_cat_num, l_cat, labels, ARI)

##### Conclusion bag of words
We tried counting the occurrences of each vocabulary word in each sentence (CountVectorizer), and we also tried TF-IDF weighting.
We got the best results while using "product_name".

We got better results with the raw sentences than with the cleaned ones.

The best ARI = 0.56 with fit/transform product_name or fit/transform text.

### II) Word Embeddings : Word2Vec

##### 1) Creating a Word2Vec model

In [None]:
# Word2Vec hyper-parameters, passed to create_w2v_model in the next cell.
w2v_size = 300
w2v_window = 5
w2v_min_count = 1
w2v_epochs = 100
# Forwarded to tokenize_sentences and create_embedding_model; adapt to the
# length of the sentences.
maxlen = 24

# Corpus: product names, tokenized with gensim's simple_preprocess.
# Author's ARI notes for other columns: 'text' 0.19, 'text_bag-of-words_lem' 0.21.
# gensim is referenced by name here, so it must be imported explicitly by the
# notebook rather than leaking in through a star import.
sentences = [
    gensim.utils.simple_preprocess(product_name)
    for product_name in data['product_name'].to_list()
]

In [None]:
# Build the Word2Vec model from the tokenized sentences and the hyper-parameters above.
# NOTE(review): w2v_words / model_vectors are presumably the vocabulary and its
# vectors — confirm in NLP_functions.Word2Vec.
w2v_words, model_vectors = create_w2v_model(sentences, w2v_min_count, w2v_size, w2v_window, w2v_epochs)

##### 2) Preparing the sentences (tokenization)

In [None]:
# Prepare the sentences for the embedding model; the tokenizer is reused to build
# the embedding matrix in the next step.
# NOTE(review): maxlen is presumably the padded sequence length — confirm.
x_sentences, tokenizer = tokenize_sentences(sentences, maxlen)

##### 3) Creating the embedding matrix

In [None]:
# Assemble the embedding matrix from the Word2Vec outputs and the tokenizer;
# vocab_size is returned alongside it and reused when the model is created.
embedding_matrix, vocab_size = create_embedding_matrix(w2v_words, model_vectors, tokenizer)

##### 4) Creating the embedded model

In [None]:
# Create the model that turns x_sentences into embeddings (called via .predict below).
embed_model = create_embedding_model(x_sentences, maxlen, vocab_size, w2v_size, embedding_matrix)

##### 5) Execution of the model

In [None]:
# Run the embedding model on the prepared sentences and display the result's shape.
embeddings = embed_model.predict(x_sentences)
embeddings.shape

In [None]:
# Score the Word2Vec embeddings with ARI_fct (same call as for the bag-of-words matrices).
ARI, X_tsne, labels = ARI_fct(embeddings, l_cat, y_cat_num)

In [None]:
# Plot the ARI_fct outputs obtained for the Word2Vec embeddings.
TSNE_visu_fct(X_tsne, y_cat_num, l_cat, labels, ARI)

##### Conclusion Word2Vec
We get good results with Word2Vec word embedding.

We got better results with the raw sentences than with the cleaned ones.

The best ARI = 0.5 with product_name.

### III) Word Embeddings : BERT

#### 1) BERT HuggingFace
* Using pre-trained model 'bert-base-uncased'

In [None]:
# Hugging Face checkpoint, loaded as a TensorFlow model.
model_type = 'bert-base-uncased'
model = TFAutoModel.from_pretrained(model_type)

# Arguments forwarded to feature_BERT_fct in the next code cell.
max_length = 64
batch_size = 10

# Corpus: product names.
# Author's note: clean_text 0.29 (presumably the ARI with that column — confirm).
sentences = data['product_name'].to_list()

Creating the features

In [None]:
# Extract BERT features with the Hugging Face model (mode='HF').
# last_hidden_states_tot is returned as well but is not used by the later cells shown here.
features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, sentences,
                                                         max_length, batch_size, mode='HF')

In [None]:
# Score the Hugging Face BERT features with ARI_fct.
ARI, X_tsne, labels = ARI_fct(features_bert, l_cat, y_cat_num)

In [None]:
# Plot the ARI_fct outputs obtained for the Hugging Face BERT features.
TSNE_visu_fct(X_tsne, y_cat_num, l_cat, labels, ARI)

#### 2) BERT hub Tensorflow
* Using pre-trained model : 'bert_en_uncased_L-12_H-768_A-12/4'

In [None]:
# Guide about Tensorflow hub : https://www.tensorflow.org/text/tutorials/classify_text_with_bert
model_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
# The layer is only handed to feature_BERT_fct to extract features and is never
# fitted in this notebook, so the pre-trained weights are kept frozen.
bert_layer = hub.KerasLayer(model_url, trainable=False)

In [None]:
# Corpus for the TF-Hub BERT run: product names (alternative noted by the author: 'text').
sentences = data['product_name'].to_list()

In [None]:
# The TF-Hub layer is passed as the model; model_type is still forwarded.
# NOTE(review): presumably it selects the matching tokenizer inside
# feature_BERT_fct — confirm.
model = bert_layer
model_type = 'bert-base-uncased'
max_length = 64
batch_size = 10

# Extract BERT features through the TF-Hub code path (mode='TFhub').
features_bert, last_hidden_states_tot = feature_BERT_fct(
    model, model_type, sentences, max_length, batch_size, mode='TFhub'
)

In [None]:
# Score the TF-Hub BERT features with ARI_fct.
ARI, X_tsne, labels = ARI_fct(features_bert, l_cat, y_cat_num)

In [None]:
# Plot the ARI_fct outputs obtained for the TF-Hub BERT features.
TSNE_visu_fct(X_tsne, y_cat_num, l_cat, labels, ARI)

In [None]:
# NOTE(review): same call with the same arguments as the previous cell, so it
# redraws the same figure — likely a leftover duplicate.
TSNE_visu_fct(X_tsne, y_cat_num, l_cat, labels, ARI)

##### Conclusion BERT

We get good results with BERT word embedding.

We got better results with the raw sentences than with the cleaned ones.

The best ARI = 0.62 with product_name for the pre-trained model : 'bert-base-uncased' from Hugging Face

### IV) Sentence Embeddings : USE (Universal Sentence Encoder)

https://tfhub.dev/google/universal-sentence-encoder/4

In [None]:
# Load the Universal Sentence Encoder (version 4) from TF-Hub; `embed` is passed
# to feature_USE_fct in the cells below.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

##### Using product_name

In [None]:
# Batch size forwarded to feature_USE_fct.
batch_size = 10
# Raw product names. Author's open question: results look better without
# preprocessing — still to be checked.
sentences = data['product_name'].to_list()

In [None]:
# Compute USE features for the product names, using the loaded encoder and batch_size.
features_USE = feature_USE_fct(embed, sentences, batch_size)

In [None]:
# Score the USE features (product_name) with ARI_fct.
ARI, X_tsne, labels = ARI_fct(features_USE, l_cat, y_cat_num)

In [None]:
# Plot the ARI_fct outputs obtained for the USE features (product_name).
TSNE_visu_fct(X_tsne, y_cat_num, l_cat, labels, ARI)

##### Using text / clean_text : cleaned or not

In [None]:
# Batch size forwarded to feature_USE_fct.
batch_size = 10
# NOTE(review): the section heading mentions text / clean_text, but the column
# actually read here is 'text_deeplearning'.
# Author's open question: results look better without preprocessing — still to be checked.
sentences = data['text_deeplearning'].to_list()

In [None]:
# Compute USE features for the 'text_deeplearning' sentences; this replaces the
# features_USE computed for product_name.
features_USE = feature_USE_fct(embed, sentences, batch_size)

In [None]:
# Score the USE features ('text_deeplearning') with ARI_fct.
ARI, X_tsne, labels = ARI_fct(features_USE, l_cat, y_cat_num)

In [None]:
# Plot the ARI_fct outputs obtained for the USE features ('text_deeplearning').
TSNE_visu_fct(X_tsne, y_cat_num, l_cat, labels, ARI)

##### Conclusion USE
We get good results with USE sentence embeddings.

We got better results with the raw sentences than with the cleaned ones.

The best ARI = 0.71 with product_name.

### Conclusion
* It seems like the classification of products based on NLP might be doable.
* Using KMeans clustering, we categorized the products using bag of words and word embeddings.
* Best ARI : 0.71 for product_name as the corpus and using USE word embedding method.

In [None]:
# Total wall-clock time since t0 was recorded in the configuration cell,
# reported once in seconds and once as HH:MM:SS.
t1 = time()
elapsed_seconds = t1 - t0
print("computing time : {:8.6f} sec".format(elapsed_seconds))
print("computing time : " + strftime('%H:%M:%S', gmtime(elapsed_seconds)))