In [1]:
# Importing useful libariries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss, mean_squared_error, accuracy_score
import math
import os
import xgboost as xgb
from time import time

from sklearn.preprocessing import OneHotEncoder


In [2]:
# Reading given csv file
df = pd.read_csv("ecommerceDataset.csv", header=None, names=["Types","Text"])

In [3]:
# Looking at first 5 rows of the dataframe
df.head()

Unnamed: 0,Types,Text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [4]:
# Prininting target types and their distributions
df["Types"].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: Types, dtype: int64

In [5]:
# Looking for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Types   50425 non-null  object
 1   Text    50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [6]:
# Dropping missing value as there is only one row having missing value
df.dropna(inplace=True)

In [7]:
# Cross-checking
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50424 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Types   50424 non-null  object
 1   Text    50424 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [8]:
df.Text[0]

'Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal blis

# Data Preprocessing

In [9]:
import re
import nltk
import numpy as np
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
# Setting stopwords
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.remove("not")

# Function for words like can't, don't ==> can not , do not
def expand(text):
    expanded_text = []
    for word in text:
        if re.search("n't", word):
            expanded_text.append(word.split("n't")[0])
            expanded_text.append("not")
        else:
            expanded_text.append(word)
    return expanded_text

In [11]:
#removing special characters
df['Clean_text'] = df['Text'].str.replace("[^a-zA-Z' ]","")
df.head()

Unnamed: 0,Types,Text,Clean_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",SAF 'Floral' Framed Painting Wood inch x inc...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",SAF Flower Print Framed Painting Synthetic in...
4,Household,Incredible Gifts India Wooden Happy Birthday U...,Incredible Gifts India Wooden Happy Birthday U...


In [12]:
df.tail(10)

Unnamed: 0,Types,Text,Clean_text
50415,Electronics,TP-Link TL-WN725N 150Mbps Wireless N Nano USB ...,TPLink TLWNN Mbps Wireless N Nano USB Adapter ...
50416,Electronics,"Lenovo Tab4 10 Tablet (10.1 inch,16GB,Wi-Fi + ...",Lenovo Tab Tablet inchGBWiFi G LTE Slate Bl...
50417,Electronics,SanDisk 128GB Class 10 microSDXC Memory Card w...,SanDisk GB Class microSDXC Memory Card with A...
50418,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",Samsung Guru FM Plus SMBED Black ColourBlack ...
50419,Electronics,"Nokia Lumia 530 (Dual SIM, Grey) Colour:Grey ...",Nokia Lumia Dual SIM Grey ColourGrey Produc...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...,Strontium MicroSD Class GB Memory Card Black ...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...,Karbonn Titanium Wind W White Karbonn Titanium...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",Samsung Guru FM Plus SMBED Black ColourBlack ...
50424,Electronics,Micromax Canvas Win W121 (White),Micromax Canvas Win W White


In [13]:
types = df.Types.value_counts().index.tolist()

In [14]:
# Observing text rows from each type
for i in types:
    print(df[df["Types"]==i].head(4),"\n")

       Types                                               Text  \
0  Household  Paper Plane Design Framed Wall Hanging Motivat...   
1  Household  SAF 'Floral' Framed Painting (Wood, 30 inch x ...   
2  Household  SAF 'UV Textured Modern Art Print Framed' Pain...   
3  Household  SAF Flower Print Framed Painting (Synthetic, 1...   

                                          Clean_text  
0  Paper Plane Design Framed Wall Hanging Motivat...  
1  SAF 'Floral' Framed Painting Wood  inch x  inc...  
2  SAF 'UV Textured Modern Art Print Framed' Pain...  
3  SAF Flower Print Framed Painting Synthetic  in...   

       Types                                               Text  \
19313  Books  Inner Engineering: A Yogi's Guide to Joy About...   
19314  Books       Muslims and Missionaries in Pre-Mutiny India   
19315  Books  The PCOD - Thyroid Book - Compiled From Women ...   
19316  Books  The Monk Who Sold His Ferrari Review Everyone ...   

                                              Clean

In [15]:
#removing emebbed links if any
df['Clean_text'] = df['Clean_text'].replace(re.compile(r"((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))"), "")
df.head()

Unnamed: 0,Types,Text,Clean_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",SAF 'Floral' Framed Painting Wood inch x inc...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",SAF Flower Print Framed Painting Synthetic in...
4,Household,Incredible Gifts India Wooden Happy Birthday U...,Incredible Gifts India Wooden Happy Birthday U...


In [16]:
#removing single characters
df['Clean_text'] = df['Clean_text'].replace(re.compile(r"(^| ).( |$)"), "")
df.tail()

Unnamed: 0,Types,Text,Clean_text
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...,Strontium MicroSD Class GB Memory Card Black ...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...,Karbonn Titanium WindWhite Karbonn Titanium Wi...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",Samsung Guru FM Plus SMBED Black ColourBlackCo...
50424,Electronics,Micromax Canvas Win W121 (White),Micromax Canvas WinWhite


In [17]:
df[df.Types=="Electronics"].Clean_text

39804    Dell VAMPLaptop Adapter Without power Cord Des...
39805    Bluetooth Dongle USB CSR  Adapter Receiver Tra...
39806    WiFi Receiver Mbps GHz bgn USB  Wireless Mini ...
39807    SanDisk GB Class  microSDXC Memory Card with A...
39808    Gizga Essentials Laptop Power Cable Cord  Pin ...
                               ...                        
50420    Strontium MicroSD Class  GB Memory Card Black ...
50421    CrossBeats Wave Waterproof Bluetooth Wireless ...
50422    Karbonn Titanium WindWhite Karbonn Titanium Wi...
50423    Samsung Guru FM Plus SMBED Black ColourBlackCo...
50424                             Micromax Canvas WinWhite
Name: Clean_text, Length: 10621, dtype: object

In [18]:
#tokenizing
df['Clean_text'] = df['Clean_text'].str.split()
df['Clean_text'] = df['Clean_text'].apply(lambda text: [word for word in text if word not in STOPWORDS])
df['Clean_text'] = df['Clean_text'].apply(lambda text: expand(text))
df.head()

Unnamed: 0,Types,Text,Clean_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,"[Paper, Plane, Design, Framed, Wall, Hanging, ..."
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...","[SAF, 'Floral', Framed, Painting, Wood, inch, ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,"[SAF, 'UV, Textured, Modern, Art, Print, Frame..."
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...","[SAF, Flower, Print, Framed, Painting, Synthet..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...,"[Incredible, Gifts, India, Wooden, Happy, Birt..."


In [19]:
#Initialising word processing methods
wordNetLemmatizer = WordNetLemmatizer()
porterStemmer = PorterStemmer()

df['Clean_text'] = df['Clean_text'].apply(lambda text: [wordNetLemmatizer.lemmatize(word) for word in text])
df['Clean_text'] = df['Clean_text'].apply(lambda text: [porterStemmer.stem(word) for word in text])

df.head()

Unnamed: 0,Types,Text,Clean_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,"[paper, plane, design, frame, wall, hang, moti..."
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...","[saf, 'floral', frame, paint, wood, inch, inch..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,"[saf, 'uv, textur, modern, art, print, framed'..."
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...","[saf, flower, print, frame, paint, synthet, in..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...,"[incred, gift, india, wooden, happi, birthday,..."


In [20]:
# combining words back to original
df['Clean_text'] = df['Clean_text'].apply(lambda text: ' '.join(text))
df.head()

Unnamed: 0,Types,Text,Clean_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,paper plane design frame wall hang motiv offic...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",saf 'floral' frame paint wood inch inch specia...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,saf 'uv textur modern art print framed' paint ...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",saf flower print frame paint synthet inch inch...
4,Household,Incredible Gifts India Wooden Happy Birthday U...,incred gift india wooden happi birthday uniqu ...


In [21]:
df.tail()

Unnamed: 0,Types,Text,Clean_text
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...,strontium microsd class gb memori card black w...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...,crossbeat wave waterproof bluetooth wireless e...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...,karbonn titanium windwhit karbonn titanium win...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",samsung guru fm plu smbed black colourblackcom...
50424,Electronics,Micromax Canvas Win W121 (White),micromax canva winwhit


In [22]:
# Observing text rows from each type
for i in types:
    print(df[df["Types"]==i].head(4),"\n")

       Types                                               Text  \
0  Household  Paper Plane Design Framed Wall Hanging Motivat...   
1  Household  SAF 'Floral' Framed Painting (Wood, 30 inch x ...   
2  Household  SAF 'UV Textured Modern Art Print Framed' Pain...   
3  Household  SAF Flower Print Framed Painting (Synthetic, 1...   

                                          Clean_text  
0  paper plane design frame wall hang motiv offic...  
1  saf 'floral' frame paint wood inch inch specia...  
2  saf 'uv textur modern art print framed' paint ...  
3  saf flower print frame paint synthet inch inch...   

       Types                                               Text  \
19313  Books  Inner Engineering: A Yogi's Guide to Joy About...   
19314  Books       Muslims and Missionaries in Pre-Mutiny India   
19315  Books  The PCOD - Thyroid Book - Compiled From Women ...   
19316  Books  The Monk Who Sold His Ferrari Review Everyone ...   

                                              Clean

In [23]:
#df.to_csv("data/clean_text.csv",index=False)

In [26]:
train_data, test_data = train_test_split(df, train_size=0.9, stratify=df.Types, random_state=100)

In [27]:
import tensorflow as tf

# Train Data Labels
train_data["Types"]  = train_data["Types"].astype('category')
train_data["Type_Labels"] = train_data["Types"].cat.codes

# OneHotEncoding target using tensorflow
train_features, train_labels = train_data["Clean_text"], tf.one_hot(train_data["Type_Labels"], 4)

# Test Data Labels
test_data["Types"]  = test_data["Types"].astype('category')
test_data["Type_Labels"] = test_data["Types"].cat.codes

# OneHotEncoding target using tensorflow
test_features, test_labels = test_data["Clean_text"], tf.one_hot(test_data["Type_Labels"], 4)

In [33]:
#train_data.head()

In [34]:
#test_data.head()

In [32]:
# Tokenizing
import nltk
from nltk.tokenize import word_tokenize
tokenized_train_features = [word_tokenize(each_train_text) for each_train_text in train_features]
tokenized_test_features = [word_tokenize(each_test_text) for each_test_text in test_features]

In [35]:
# Generating Word2vec model
from gensim.models import word2vec

#config
vector_size = 300

w2v_model = word2vec.Word2Vec(
    tokenized_train_features,
    vector_size=vector_size,  # Dimensionality of the word vectors
    window=20,
    min_count=1,
    sg=1  # setting as 1 for skip-gram
)

In [36]:
# Fetching vocab list
vocab_list = list(w2v_model.wv.key_to_index.keys())

# Function for OOV(Out of Vocab) words
def remove_OOV_vocab(sample: list, list_vocab):
    """ Takes in tokenized sample in the form of list 
    and the vocabulary list and removes tokens from sample
    that are not in the vocabulary list"""
    in_vocab_sample = []
    for each_token in sample:
        if each_token in list_vocab:
            in_vocab_sample.append(each_token)
    return in_vocab_sample
  
tokenized_test_features = [remove_OOV_vocab(each_test_sample, vocab_list) for each_test_sample in tokenized_test_features]

In [39]:
# Embedding matrix for embedding layer
vocab = w2v_model.wv.key_to_index.keys()
embedding_matrix = w2v_model.wv[vocab]

In [41]:
embedding_matrix.shape

(115611, 300)

In [52]:
df["length"] = df["Clean_text"].apply(lambda x: len(x))

In [53]:
df["length"].describe()

count    50424.000000
mean       484.392928
std        625.507034
min          2.000000
25%        157.000000
50%        337.000000
75%        640.000000
max      31620.000000
Name: length, dtype: float64

In [60]:
# Chossing value between 50-75th percentile
max_seq_len = 500

In [61]:
# Replacing the tokens with the index of the word in the word2vec vocabulary and padding
from tensorflow.keras.preprocessing.sequence import pad_sequences

def w2v_indexed_token_sequences(w2v_model, list_features):
    indexed_features = []
    for each_seq in list_features:
        list_token_indices = []
        for each_token in each_seq:
            try:
                list_token_indices.append(w2v_model.wv.key_to_index[each_token])
            except KeyError as e:
                continue
        indexed_features.append(list_token_indices)
    return indexed_features

indexed_train_features = w2v_indexed_token_sequences(w2v_model, tokenized_train_features)
indexed_test_features = w2v_indexed_token_sequences(w2v_model, tokenized_test_features)

padded_train = pad_sequences(indexed_train_features, padding = 'post', maxlen=max_seq_len, truncating='post')
padded_test = pad_sequences(indexed_test_features, padding = 'post', maxlen=max_seq_len, truncating='post')

In [68]:
padded_train

array([[  989, 19701, 63065, ...,     0,     0,     0],
       [ 7561, 15501,   366, ...,     0,     0,     0],
       [   61,   134,   104, ...,     0,     0,     0],
       ...,
       [ 4402,    72,     1, ...,     0,     0,     0],
       [36741,  1205,  1950, ...,     0,     0,     0],
       [ 7885, 13714,    72, ...,     0,     0,     0]])

In [69]:
padded_test

array([[ 1902,  5136,  1007, ...,     0,     0,     0],
       [ 3585,  6048,  1494, ...,     0,     0,     0],
       [ 7245,   280,    20, ...,     0,     0,     0],
       ...,
       [ 4264, 13514, 34955, ...,     0,     0,     0],
       [ 5093,  4893, 14271, ...,     0,     0,     0],
       [ 1827, 30862,    39, ...,     0,     0,     0]])

In [70]:
print(padded_train.shape,"......",padded_test.shape)

(45381, 500) ...... (5043, 500)


In [58]:
# Keras imports
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM

In [66]:
len(w2v_model.wv)

115611

In [75]:
vocab_size = len(w2v_model.wv)

def get_model():
    model = Sequential()
    model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=vector_size,
                  weights=[embedding_matrix],
                  input_length=max_seq_len))
    model.add(Dropout(0.4))
    model.add(LSTM(max_seq_len,return_sequences=True))
    model.add(LSTM(4))
    model.add(Dense(4,activation='softmax'))
    return model

In [76]:
# Adding callbacks for best model checkpoint
callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                  patience=2,
                                  verbose=1,
                                  restore_best_weights=True),
    keras.callbacks.ModelCheckpoint(filepath='models/lstm_with_w2v.hdf5',
                                    verbose=1,
                                    save_best_only=True)
]

In [77]:
# Compiling
model = get_model()
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
tf.config.run_functions_eagerly(True)

In [None]:
history = model.fit(padded_train, 
                    train_labels,
                    validation_split=0.33,
                    callbacks=callbacks,
                    epochs=2)

In [None]:
model_with_w2v = keras.models.load_model('data/lstm_with_w2v.hdf5')

In [None]:
y_pred_one_hot_encoded = (model_with_w2v.predict(padded_train)> 0.5).astype("int32")

# Output:
# array([[0, 0, 0, 0, 1, 0],
#        [0, 0, 0, 0, 1, 0],
#        [1, 0, 0, 0, 0, 0],
#        ...,
#        [0, 0, 1, 0, 0, 0],
#        [1, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 1, 0]])

# Decode the one-hot encoded predictions
y_pred_train = np.array(tf.argmax(y_pred_one_hot_encoded, axis=1))

In [None]:
y_pred_one_hot_encoded = (model_with_w2v.predict(padded_test)> 0.5).astype("int32")
y_pred_test = np.array(tf.argmax(y_pred_one_hot_encoded, axis=1)

In [None]:
from sklearn.metrics import classification_report
# Training set
print(classification_report(train_data['emotion_label'], y_pred_train))

# Test Set
print(classification_report(train_data['emotion_label'], y_pred_test))