# E-commerce Text Classification by Neural Language Processing (NLP)

Task: Categorize the unseen products into 4 categories namely "Electronics", "Household", "Books", and "Clothing & Accessories".

This task means we need to build a prediction machine learning based on a Neural Language Processing (NLP) model for product categorization using product names and descriptions on the dataset.

In [1]:
# 1. Import packages

import os
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras


2024-12-25 18:44:10.572041: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-25 18:44:10.584035: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-25 18:44:10.587579: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-25 18:44:10.597274: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 2. Data Loading

CSV_PATH = "Dataset/ecommerceDataset.csv"

df = pd.read_csv(CSV_PATH, names = ['category', 'text'])

print(df)

          category                                               text
0        Household  Paper Plane Design Framed Wall Hanging Motivat...
1        Household  SAF 'Floral' Framed Painting (Wood, 30 inch x ...
2        Household  SAF 'UV Textured Modern Art Print Framed' Pain...
3        Household  SAF Flower Print Framed Painting (Synthetic, 1...
4        Household  Incredible Gifts India Wooden Happy Birthday U...
...            ...                                                ...
50420  Electronics  Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421  Electronics  CrossBeats Wave Waterproof Bluetooth Wireless ...
50422  Electronics  Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423  Electronics  Samsung Guru FM Plus (SM-B110E/D, Black) Colou...
50424  Electronics                   Micromax Canvas Win W121 (White)

[50425 rows x 2 columns]


In [3]:
# Check the distinct category

distinct_categories = df['category'].unique()
print(distinct_categories)

['Household' 'Books' 'Clothing & Accessories' 'Electronics']


In [4]:
# Count the values of each categories

print(df['category'].value_counts())

# All values are seem imbalanced.
# So, class_weight is considered to be put in model.fit()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64


In [5]:
# 3. Data Inspection

# a. Shape of the data

print(df.shape)

(50425, 2)


In [6]:
# b. Data Info

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  50425 non-null  object
 1   text      50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB
None


In [7]:
# c. Data Description

df.describe()

Unnamed: 0,category,text
count,50425,50424
unique,4,27802
top,Household,Think & Grow Rich About the Author NAPOLEON HI...
freq,19313,30


In [8]:
# d. Check the missing values

print(df.isna().sum())

# The text has one missing value,
# Thus, we need to delete a row with the missing values.
# Why?
# Because there is no name or description of a product.

category    0
text        1
dtype: int64


In [9]:
# Delecting the row

updated_df = df.dropna(axis=0)

df = updated_df

print(df.isna().sum())

category    0
text        0
dtype: int64


In [10]:
# d. Check duplication

print(df.duplicated().sum())

22622


In [11]:
# 4. Data Preprocessing

# a. Seperate the feature and the target

import sklearn.preprocessing

feature = df['text'].values
target = df['category'].values

In [12]:
# b. Perform target encoding to the category column

target_encoder = sklearn.preprocessing.LabelEncoder()
target_encoded = target_encoder.fit_transform(target)
target_encoded[:5]

array([3, 3, 3, 3, 3])

In [13]:
# Inverse transform the target enconding to check what numbers inside target encoding represents:

sample_categories = target_encoder.inverse_transform([0,1,2,3]) # There are 4 categories
print(sample_categories)

['Books' 'Clothing & Accessories' 'Electronics' 'Household']


In [14]:
# 5. Perform train-val-test split
import sklearn.model_selection

seed = 42
X_train,X_split,y_train,y_split = sklearn.model_selection.train_test_split(feature,target_encoded,train_size=0.7,random_state=seed)
X_val,X_test,y_val,y_test = sklearn.model_selection.train_test_split(X_split,y_split,train_size=0.5,random_state=seed)

In [15]:
# 6. Neural Language Processing (NLP) development

# a. Tokenization
# Tokenization will convert individual text (word) into integers,
# the converted integers are called tokens.

tokenizer = keras.layers.TextVectorization(
    max_tokens=7500,
    output_sequence_length = 500
    )

tokenizer.adapt(X_train)

I0000 00:00:1735123454.125597   11097 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735123454.146845   11097 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735123454.148099   11097 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1735123454.150156   11097 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [16]:
# Test how the tokenizer works

sample_tokens = tokenizer(X_train[:2])
print(X_train[:2])

["Acer 18.5 inch (46.99 cm) LED Monitor - EB192Q (Black) Specifications LED 18.5 '' ACER EB192Qb (B). Brand ACER Model EB192Qb Response Time 5 ms Max. Resolution 1366x768 @ 60Hz Contrast Ratio 100 million: 1 (ACM). Brightness 200 nits (cd / m2). Display 18.5 inch Color System 16.7 m POWER Supply (100V-240V): Internal Power Consumption (Off): 0.45W Power Consumption (Sleep): 14W Power Consumption (on): 18W. VGA Port 1 Port."
 'SOUMIK ELECTRICALS 5-inch Subwoofer with Maximum 4 ohm(100 W) Thisb product is from the brand SOUMIK ELECTRICALS it presents a 5 Inch subwoofer with maximum 4 ohm and comes with 100W. Use in your home theatre.']


In [17]:
print(sample_tokens)

tf.Tensor(
[[3365 4947  173    1  147  152  622    1   55  595  152 4947 3365    1
  1613  151 3365  535    1 1478   61  105 1551  739  649    1    1 1619
  1972  115 1539   36    1 1495 1270    1 1373 3153  259 4947  173   65
   146    1  805   59  781    1 1290   59 1111  253    1   59 1111  943
     1   59 1111   15    1 1210  377   36  377    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0  

In [18]:
# b. Embedding
# Embendding converts tokens into a long vector.
# This long vector is a special representation that can help your machine learning model to understand the context of the words.

embedding = keras.layers.Embedding(7500, 128)

In [19]:
# 7. Model development

model = keras.Sequential()

# a. NLP layers

model.add(tokenizer)
model.add(embedding)

# b. Recurrent Neural Network (RNN)

model.add(keras.layers.Bidirectional(keras.layers.LSTM(16,return_sequences=False)))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(len(df['category'].unique()), activation='softmax'))

# Note that the RNN layers is developed simple and added with dropout to prevent an overfitting.

In [20]:
# 8. Model compile
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [21]:
# Before we train the model, we need to create the class weight in order to balance the data.

books = df['category'].value_counts()['Books']
clothing = df['category'].value_counts()['Clothing & Accessories']
electronics = df['category'].value_counts()['Electronics']
household = df['category'].value_counts()['Household']

total = books + clothing + electronics + household
print("Total =", total)

Total = 50424


In [22]:
# Scaling by total / 2.0 helps keep the loss to a similar magnitude

weight_for_0 = (1 / books) * (total / 2.0)
weight_for_1 = (1 / clothing) * (total / 2.0)
weight_for_2 = (1 / electronics) * (total / 2.0)
weight_for_3 = (1 / household) * (total / 2.0)

print(round(weight_for_0, 2)) # Books
print(round(weight_for_1, 2)) # Clothing & Accesories
print(round(weight_for_2, 2)) # Electronics
print(round(weight_for_3, 2)) # Household

2.13
2.91
2.37
1.31


In [23]:
class_weight = {0:weight_for_0, 1:weight_for_1, 2:weight_for_2, 3:weight_for_3}

In [24]:
# 9. Model Training

logpath = "tensorboard/nlp/" + datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")

tb = keras.callbacks.TensorBoard(logpath)
es = keras.callbacks.EarlyStopping(patience=3,verbose=3)

history = model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=20,batch_size=32,callbacks=[tb,es], class_weight=class_weight)

Epoch 1/20


2024-12-25 18:44:18.020104: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 21ms/step - accuracy: 0.8231 - loss: 1.0673 - val_accuracy: 0.9681 - val_loss: 0.1252
Epoch 2/20
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 20ms/step - accuracy: 0.9766 - loss: 0.1930 - val_accuracy: 0.9722 - val_loss: 0.1069
Epoch 3/20
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 20ms/step - accuracy: 0.9851 - loss: 0.1190 - val_accuracy: 0.9769 - val_loss: 0.1025
Epoch 4/20
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 20ms/step - accuracy: 0.9914 - loss: 0.0720 - val_accuracy: 0.9767 - val_loss: 0.1135
Epoch 5/20
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 20ms/step - accuracy: 0.9947 - loss: 0.0440 - val_accuracy: 0.9736 - val_loss: 0.1230
Epoch 6/20
[1m1103/1103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 20ms/step - accuracy: 0.9941 - loss: 0.0444 - val_accuracy: 0.9763 - val_loss: 0.1176
Epoch 6: early 

In [25]:
# Further evaluate with test data

evaluation = model.evaluate(X_test,y_test)
print(evaluation)

# Result : [test_loss, test_accuracy]

[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9799 - loss: 0.0995
[0.13389645516872406, 0.9754098653793335]


In [26]:
# Classification reports - to evaluate accuracy and f1 score of the model

import sklearn.metrics
from sklearn.metrics import classification_report

prediction = model.predict(X_test)
prediction_index = np.argmax(prediction, axis = 1)

model_report = classification_report(y_test, prediction_index)

print(model_report)


[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      1756
           1       0.98      0.98      0.98      1312
           2       0.96      0.97      0.97      1560
           3       0.98      0.98      0.98      2936

    accuracy                           0.98      7564
   macro avg       0.97      0.98      0.97      7564
weighted avg       0.98      0.98      0.98      7564



In [45]:
# 10. Test the model for predicting suitable category for each product descriptions.

predictions = model.predict(X_test[:20])
class_predictions = target_encoder.inverse_transform(np.argmax(predictions,axis=1))

print(X_test[4])

print("")

print(class_predictions[4])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Jack & Jones Men's Cotton  Sweater

Clothing & Accessories


In [28]:
# 11. Save the necessary components

import pickle

# a. Tokenizer

with open("tokenizer.json", "wb") as f:
    pickle.dump(tokenizer, f)

In [29]:
# b. Target Encoder

with open ("target_encoder.json", "wb") as f:
    pickle.dump(target_encoder, f)

In [30]:
# c. Model in .h5

keras.models.save_model(model, "saved_models/Classification.h5")

