# Importing library and data

In [None]:
pip install tensorflow_text

In [None]:
import tensorflow as tf
tf.__version__

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
import pandas as pd

dt = pd.read_csv('./datasets/ecommerceDataset.csv')
dt.head(10)

Unnamed: 0,Labels,Text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
5,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
6,Household,Paper Plane Design Starry Night Vangoh Wall Ar...
7,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
8,Household,SAF 'Ganesh Modern Art Print' Painting (Synthe...
9,Household,Paintings Villa UV Textured Modern Art Print F...


## **Data preprocessing**
### *Getting all categories*

In [3]:
categories = dt['Labels'].unique()
print(categories)

['Household' 'Books' 'Clothing & Accessories' 'Electronics']


In [4]:
dt.groupby('Labels').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Books,11820,6256,Think & Grow Rich About the Author NAPOLEON HI...,30
Clothing & Accessories,8670,5674,Diverse Men's Formal Shirt Diverse is a wester...,23
Electronics,10621,5308,HP 680 Original Ink Advantage Cartridge (Black...,26
Household,19313,10564,Nilkamal Series-24 Chest of Drawers (Cream Tra...,13


### *Handling dataframes*

In [11]:
df_accessories = dt[dt['Labels'] == 'Clothing & Accessories']
df_accessories.head(10)
df_accessories.shape


(8671, 2)

In [7]:
df_books = dt[dt['Labels'] == 'Books']
df_books.shape

(11820, 2)

In [8]:
df_household = dt[dt['Labels'] == 'Household']
df_household.shape

(19313, 2)

In [9]:
df_elec = dt[dt['Labels'] == 'Electronics']
df_elec.shape

(10621, 2)

### ***Balancing dataframes***

In [12]:
df_books_downsample = df_books.head(df_accessories.shape[0])
df_books_downsample.shape

(8671, 2)

In [13]:
df_household_downsample = df_household.head(df_accessories.shape[0])
df_household_downsample.shape

(8671, 2)

In [14]:
df_elec_downsample = df_elec.head(df_accessories.shape[0])
df_elec_downsample.shape
df_elec_downsample.head()

Unnamed: 0,Labels,Text
39804,Electronics,Dell 19.5V-3.34AMP 65W Laptop Adapter (Without...
39805,Electronics,Bluetooth Dongle USB CSR 4.0 Adapter Receiver ...
39806,Electronics,"Wi-Fi Receiver 300Mbps, 2.4GHz, 802.11b/g/n US..."
39807,Electronics,SanDisk 64GB Class 10 microSDXC Memory Card wi...
39808,Electronics,Gizga Essentials Laptop Power Cable Cord- 3 Pi...


### ***Merging dataframes***

In [15]:
dfs_array = [df_accessories, df_books_downsample, df_household_downsample, df_elec_downsample]
df_merged = pd.concat(dfs_array)
df_merged[df_merged['Labels'] == 'Household']

Unnamed: 0,Labels,Text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
...,...,...
8666,Household,Glync 5W LED Recessed Spot Light Ceiling Downl...
8667,Household,Philips AstraPrime 10-Watt Recessed LED Panel ...
8668,Household,D'MakTM LED Ceiling COB Spot Light - 3 Watt - ...
8669,Household,Philips AstraPrime 5-Watt Recessed LED Panel C...


### ***Adding 'type' column***

In [16]:
df_merged

Unnamed: 0,Labels,Text
31133,Clothing & Accessories,Woopower 36M Pink for 024M Baby Trouser Top Se...
31134,Clothing & Accessories,Amour Butterfly Design Sunglasses For Girls 6+...
31135,Clothing & Accessories,Vaenait Baby 024M Baby Girls Rashguard Swimwea...
31136,Clothing & Accessories,Amour Butterfly Design Sunglasses For Girls 6+...
31137,Clothing & Accessories,Kuchipoo Girl's Cotton Regular Fit T-Shirt - P...
...,...,...
48470,Electronics,LG GH24NSD1 Internal SATA DVD Writer The M-DIS...
48471,Electronics,LG GP65NB60 External DVD Writer (Black) LG GP6...
48472,Electronics,PIONEER DVD PLAYER DV-3052V Pioneer DV-3052 Mu...
48473,Electronics,LG DP546 DVD Player (Black) DivX-This is a for...


In [80]:
df_merged.drop(['Household', 'Books', 'Electronics', 'Clothing & Accessories'], axis = 1)

Unnamed: 0,Labels,Text,Type
31133,Clothing & Accessories,Woopower 36M Pink for 024M Baby Trouser Top Se...,"[0, 0, 0, 1]"
31134,Clothing & Accessories,Amour Butterfly Design Sunglasses For Girls 6+...,"[0, 0, 0, 1]"
31135,Clothing & Accessories,Vaenait Baby 024M Baby Girls Rashguard Swimwea...,"[0, 0, 0, 1]"
31136,Clothing & Accessories,Amour Butterfly Design Sunglasses For Girls 6+...,"[0, 0, 0, 1]"
31137,Clothing & Accessories,Kuchipoo Girl's Cotton Regular Fit T-Shirt - P...,"[0, 0, 0, 1]"
...,...,...,...
48470,Electronics,LG GH24NSD1 Internal SATA DVD Writer The M-DIS...,"[0, 1, 0, 0]"
48471,Electronics,LG GP65NB60 External DVD Writer (Black) LG GP6...,"[0, 1, 0, 0]"
48472,Electronics,PIONEER DVD PLAYER DV-3052V Pioneer DV-3052 Mu...,"[0, 1, 0, 0]"
48473,Electronics,LG DP546 DVD Player (Black) DivX-This is a for...,"[0, 1, 0, 0]"


In [17]:
Household = [1, 0, 0, 0]
Electronics = [0, 1, 0, 0]
Books = [0, 0, 1, 0]
Clo_Acce = [0, 0, 0, 1]

df_merged['Type'] = df_merged['Labels'].apply(
    lambda x: Household if x == 'Household'
    else Electronics if x == 'Electronics'
    else Books if x == 'Books'
    else Clo_Acce
)

df_merged.sample(10)

Unnamed: 0,Labels,Text,Type
41415,Electronics,Samsung EVO Plus Grade 3 Class 10 128GB MicroS...,"[0, 1, 0, 0]"
38597,Clothing & Accessories,Amour Butterfly Design Sunglasses For Girls 6+...,"[0, 0, 0, 1]"
47279,Electronics,Hikvision DS-2CE1AD0T-IRP 2MP 1080P Full HD Ni...,"[0, 1, 0, 0]"
33811,Clothing & Accessories,Elk Kids Baby Girls Plain Panties Brief Innerw...,"[0, 0, 0, 1]"
32744,Clothing & Accessories,EIO® New Born Baby Multi-Color Long Sleeve Cot...,"[0, 0, 0, 1]"
22728,Books,Errorless Physics (Universal Self Scorer) for ...,"[0, 0, 1, 0]"
34915,Clothing & Accessories,Amazon Brand - Symbol Men's Regular Fit Cotton...,"[0, 0, 0, 1]"
4742,Household,Painting Mantra Art Street Round Wall Mirror (...,"[1, 0, 0, 0]"
6920,Household,Yun Hai Hokipo Striped Design Bamboo Dinning T...,"[1, 0, 0, 0]"
26153,Books,Comprehensive Guide to SBI Bank PO Preliminary...,"[0, 0, 1, 0]"


In [18]:
df_merged['Type'].head(1).dtype

dtype('O')

In [122]:
df_merged['Type'][1]


[1, 0, 0, 0]

In [19]:
df_merged['Household'] = df_merged['Labels'].apply(
    lambda x: 1 if x == 'Household' else 0
)
df_merged['Electronics'] = df_merged['Labels'].apply(
    lambda x: 1 if x == 'Electronics' else 0
)
df_merged['Books'] = df_merged['Labels'].apply(
    lambda x: 1 if x == 'Books' else 0
)

df_merged['Clothing & Accessories'] = df_merged['Labels'].apply(
    lambda x: 1 if x == 'Clothing & Accessories' else 0
)

df_merged.sample(50)

Unnamed: 0,Labels,Text,Type,Household,Electronics,Books,Clothing & Accessories
19745,Books,EAT THAT FROG 3RD EDITION,"[0, 0, 1, 0]",0,0,1,0
33535,Clothing & Accessories,BLINKIN Mesh Yoga Gym and Active Sports Fitnes...,"[0, 0, 0, 1]",0,0,0,1
3043,Household,Usha Crisp Air Premia BV 100mm Exhaust Fan (Wh...,"[1, 0, 0, 0]",1,0,0,0
20059,Books,Monopoly Electronic Banking Board Game Style N...,"[0, 0, 1, 0]",0,0,1,0
32229,Clothing & Accessories,US Polo Association Boys' Trousers Tend to you...,"[0, 0, 0, 1]",0,0,0,1
3112,Household,Usha Duos Mist Air 400mm Table Fan (Purple) Ae...,"[1, 0, 0, 0]",1,0,0,0
32618,Clothing & Accessories,FabSeasons Acrylic Woolen Winter Gloves for Ba...,"[0, 0, 0, 1]",0,0,0,1
26683,Books,The Challenger Sale: Taking Control of the Cus...,"[0, 0, 1, 0]",0,0,1,0
36245,Clothing & Accessories,Karatcart Platinum Plated Trendy Elegant Austr...,"[0, 0, 0, 1]",0,0,0,1
8467,Household,Pigma Peerithi Rechargeable Emergency Automati...,"[1, 0, 0, 0]",1,0,0,0


### ***Splitting dataframe***

In [108]:
cols = df_merged.select_dtypes(include=['object'])
for col in cols.columns.values:
    df_merged[col] = df_merged[col].fillna('')

In [110]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_merged['Text'], df_merged['Type'], train_size=0.09, stratify=df_merged['Type'])
y_train.head()

36681    [0, 0, 0, 1]
35662    [0, 0, 0, 1]
26569    [0, 0, 1, 0]
7349     [1, 0, 0, 0]
40428    [0, 1, 0, 0]
Name: Type, dtype: object

### ***Importing BERT and getting embeding vectors for data***

In [None]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

**Example getting embeding of sentence**

In [None]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

**Building model**

In [None]:
#BERT Layer
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name = "text")
preprocessed_inputs = bert_preprocess(text_input)
encoded_outputs = bert_encoder(preprocessed_inputs)

#Neural network
layer = tf.keras.layers.Dropout(0.1, name='dropout') (encoded_outputs['pooled_output'])
layer= tf.keras.layers.Dense(4, activation='sigmoid', name="output")(layer)

#Construct the final model
model = tf.keras.Model(inputs=[text_input], outputs=[layer])

In [150]:
encoded_outputs['pooled_output'][4]

<KerasTensor: shape=(768,) dtype=float32 (created by layer 'tf.__operators__.getitem_3')>

In [145]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                

In [None]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer='adam',
              metrics=[tf.keras.metrics.BinaryAccuracy(name = 'Accuracy'),
                       tf.keras.metrics.Precision(name = 'Precision'), 
                       tf.keras.metrics.Recall(name = 'Recall')])

In [135]:
y_train = y_train.astype(str)

In [136]:
y_train.head(10)

36681    [0, 0, 0, 1]
35662    [0, 0, 0, 1]
26569    [0, 0, 1, 0]
7349     [1, 0, 0, 0]
40428    [0, 1, 0, 0]
44173    [0, 1, 0, 0]
46171    [0, 1, 0, 0]
4267     [1, 0, 0, 0]
19940    [0, 0, 1, 0]
21308    [0, 0, 1, 0]
Name: Type, dtype: object

In [137]:
from keras.models import Sequential
from keras.layers import Dense

MODEL = Sequential()
MODEL.add(Dense(20, input_dim = 768, kernel_initializer='he_uniform', activation='relu'))
MODEL.add(Dense(4, activation = 'sigmoid'))
MODEL.compile(loss='binary_crossentropy', optimizer='adam')

In [138]:
MODEL.fit(X_train, y_train, verbose=0, epochs=3)





ValueError: in user code:

    File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\input_spec.py", line 277, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer "sequential_12" "                 f"(type Sequential).
    
    Input 0 of layer "dense_24" is incompatible with the layer: expected axis -1 of input shape to have value 768, but received input with shape (None, 1)
    
    Call arguments received by layer "sequential_12" "                 f"(type Sequential):
      • inputs=tf.Tensor(shape=(None, 1), dtype=string)
      • training=True
      • mask=None


In [None]:
import numpy as np
y_train = np.asarray(y_train).astype(np.int64)
model.fit(X_train, y_train, epochs = 10)