# Importing library and data

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [6]:
import pandas as pd

dt = pd.read_csv('./datasets/ecommerceDataset.csv', header=None)
print(dt)

                 0                                                  1
0        Household  Paper Plane Design Framed Wall Hanging Motivat...
1        Household  SAF 'Floral' Framed Painting (Wood, 30 inch x ...
2        Household  SAF 'UV Textured Modern Art Print Framed' Pain...
3        Household  SAF Flower Print Framed Painting (Synthetic, 1...
4        Household  Incredible Gifts India Wooden Happy Birthday U...
...            ...                                                ...
50420  Electronics  Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421  Electronics  CrossBeats Wave Waterproof Bluetooth Wireless ...
50422  Electronics  Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423  Electronics  Samsung Guru FM Plus (SM-B110E/D, Black) Colou...
50424  Electronics                   Micromax Canvas Win W121 (White)

[50425 rows x 2 columns]


## **Data preprocessing**
### *Getting all categories*

In [10]:
categories = dt[0].unique()
print(categories)

['Household' 'Books' 'Clothing & Accessories' 'Electronics']


In [11]:
dt.groupby(0).describe()

Unnamed: 0_level_0,1,1,1,1
Unnamed: 0_level_1,count,unique,top,freq
0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Books,11820,6256,Think & Grow Rich About the Author NAPOLEON HI...,30
Clothing & Accessories,8670,5674,Diverse Men's Formal Shirt Diverse is a wester...,23
Electronics,10621,5308,HP 680 Original Ink Advantage Cartridge (Black...,26
Household,19313,10564,Nilkamal Series-24 Chest of Drawers (Cream Tra...,13


### *Handling dataframes*

In [12]:
df_accessories = dt[dt[0] == 'Clothing & Accessories']
df_accessories.head(10)
df_accessories.shape


(8671, 2)

In [13]:
df_books = dt[dt[0] == 'Books']
df_books.shape

(11820, 2)

In [14]:
df_household = dt[dt[0] == 'Household']
df_household.shape

(19313, 2)

In [15]:
df_elec = dt[dt[0] == 'Electronics']
df_elec.shape

(10621, 2)

### ***Balancing dataframes***

In [16]:
df_books_downsample = df_books.head(df_accessories.shape[0])
df_books_downsample.shape

(8671, 2)

In [17]:
df_household_downsample = df_household.head(df_accessories.shape[0])
df_household_downsample.shape

(8671, 2)

In [18]:
df_elec_downsample = df_elec.head(df_accessories.shape[0])
df_elec_downsample.shape
df_elec_downsample.head()

Unnamed: 0,0,1
39804,Electronics,Dell 19.5V-3.34AMP 65W Laptop Adapter (Without...
39805,Electronics,Bluetooth Dongle USB CSR 4.0 Adapter Receiver ...
39806,Electronics,"Wi-Fi Receiver 300Mbps, 2.4GHz, 802.11b/g/n US..."
39807,Electronics,SanDisk 64GB Class 10 microSDXC Memory Card wi...
39808,Electronics,Gizga Essentials Laptop Power Cable Cord- 3 Pi...


### ***Merging dataframes***

In [20]:
dfs_array = [df_accessories, df_books_downsample, df_household_downsample, df_elec_downsample]
df_merged = pd.concat(dfs_array)
df_merged

Unnamed: 0,0,1
31133,Clothing & Accessories,Woopower 36M Pink for 024M Baby Trouser Top Se...
31134,Clothing & Accessories,Amour Butterfly Design Sunglasses For Girls 6+...
31135,Clothing & Accessories,Vaenait Baby 024M Baby Girls Rashguard Swimwea...
31136,Clothing & Accessories,Amour Butterfly Design Sunglasses For Girls 6+...
31137,Clothing & Accessories,Kuchipoo Girl's Cotton Regular Fit T-Shirt - P...
...,...,...
48470,Electronics,LG GH24NSD1 Internal SATA DVD Writer The M-DIS...
48471,Electronics,LG GP65NB60 External DVD Writer (Black) LG GP6...
48472,Electronics,PIONEER DVD PLAYER DV-3052V Pioneer DV-3052 Mu...
48473,Electronics,LG DP546 DVD Player (Black) DivX-This is a for...


In [None]:
df_merged['type'] = df_merged[0].apply(lambda x: )

### ***Splitting dataframe***

In [70]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import numpy as np

labels = df_merged[0]
texts = df_merged[1]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(labels)
encoded_Y = encoder.transform(labels)
# convert integers to dummy variables (i.e. one hot encoded)
y_cat = np_utils.to_categorical(encoded_Y)
y_cat = y_cat.astype(int)
y_cat

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       ...,
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [69]:
cols = df_merged.select_dtypes(include=['object'])
for col in cols.columns.values:
    df_merged[col] = df_merged[col].fillna('')

In [71]:
import sklearn
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_merged[1], y_cat, train_size=0.2)
#y_train = y_train.astype('string')
y_train[:2]

array([[0, 0, 0, 1],
       [1, 0, 0, 0]])

### ***Importing BERT and getting embeding vectors for data***

In [33]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3")

**Example getting embeding of sentence**

In [34]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351695, -0.5132727 , -0.88845736, ..., -0.74748874,
        -0.75314736,  0.91964495],
       [-0.87208354, -0.50543964, -0.94446677, ..., -0.8584749 ,
        -0.7174534 ,  0.88082975]], dtype=float32)>

**Building model**

In [111]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import Adam


#BERT Layer
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name = "text")
preprocessed_inputs = bert_preprocess(text_input)
encoded_outputs = bert_encoder(preprocessed_inputs)

#Neural network
layer = tf.keras.layers.Dropout(0.2, name="dropout")(encoded_outputs['pooled_output'])
layer= tf.keras.layers.Dense(4, activation='softmax', name="output")(layer)

#Construct the final model
model = tf.keras.Model(inputs=[text_input], outputs=[layer])

In [86]:
encoded_outputs['pooled_output']

<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'keras_layer_8')>

In [112]:
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_7 (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                

In [118]:
from keras.optimizers import Adam

model.compile(Adam(learning_rate = 0.01), "categorical_crossentropy", metrics=[tf.keras.metrics.BinaryAccuracy(name = 'Accuracy'),
                       tf.keras.metrics.Precision(name = 'Precision'), 
                       tf.keras.metrics.Recall(name = 'Recall')])

In [51]:
y_train = np.asarray(y_train.astype(float))

In [113]:
y_train

array([[0, 0, 0, 1],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       ...,
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0]])

In [119]:
model.fit(x_train, y_train, epochs = 1)

  5/217 [..............................] - ETA: 54:01 - loss: 2.9355 - Accuracy: 0.4094 - Precision: 0.2465 - Recall: 0.6625

KeyboardInterrupt: 