### **The objective of the project is to classify a product into the four categories Electronics, Household, Books and Clothing & Accessories, based on its description available on the e-commerce platform.**

In [1]:
import time
import json

# Data manipulation
import numpy as np
import pandas as pd

# NLP
import string, re, nltk
from string import punctuation
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from textblob import TextBlob

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Model evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_csv('/content/ecommerceDataset.csv', names = ['label', 'description'])
data = data[['description', 'label']]

data

Unnamed: 0,description,label
0,Paper Plane Design Framed Wall Hanging Motivat...,Household
1,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",Household
2,SAF 'UV Textured Modern Art Print Framed' Pain...,Household
3,"SAF Flower Print Framed Painting (Synthetic, 1...",Household
4,Incredible Gifts India Wooden Happy Birthday U...,Household
...,...,...
50420,Strontium MicroSD Class 10 8GB Memory Card (Bl...,Electronics
50421,CrossBeats Wave Waterproof Bluetooth Wireless ...,Electronics
50422,Karbonn Titanium Wind W4 (White) Karbonn Titan...,Electronics
50423,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",Electronics


In [3]:
# Example description
data['description'].iloc[0]

'Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal blis

In [4]:
# Missing values
data.isna().sum()

description    1
label          0
dtype: int64

In [5]:
# Duplicate observations
data.duplicated().sum()

22622

In [6]:
data.dropna(inplace = True) # Dropping observations with missing values
data.drop_duplicates(inplace = True) # Dropping duplicate observations
data.reset_index(drop = True, inplace = True) # Resetting index

In [7]:
data.shape

(27802, 2)

In [8]:
# Manually encoding of labels
label_dict = {'Electronics': 0, 'Household': 1, 'Books': 2, 'Clothing & Accessories': 3}
data.replace({'label': label_dict}, inplace = True)

data

Unnamed: 0,description,label
0,Paper Plane Design Framed Wall Hanging Motivat...,1
1,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",1
2,SAF 'UV Textured Modern Art Print Framed' Pain...,1
3,"SAF Flower Print Framed Painting (Synthetic, 1...",1
4,Incredible Gifts India Wooden Happy Birthday U...,1
...,...,...
27797,Micromax Bharat 5 Plus Zero impact on visual d...,0
27798,Microsoft Lumia 550 8GB 4G Black Microsoft lum...,0
27799,"Microsoft Lumia 535 (Black, 8GB) Colour:Black ...",0
27800,Karbonn Titanium Wind W4 (White) Karbonn Titan...,0


In [9]:
data['label'].value_counts()

1    10564
2     6256
3     5674
0     5308
Name: label, dtype: int64

Train-Validation-Test Split

In [10]:
# Feature-target split
X = data.drop('label', axis = 1)
y = data['label']

# Train-test split (from complete data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 40)
data_train = pd.concat([X_train, y_train], axis = 1)

# Validation-test split (from test data)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 40)
data_val, data_test = pd.concat([X_val, y_val], axis = 1), pd.concat([X_test, y_test], axis = 1)


In [11]:
print('shape of data_train = ',data_train.shape)
print('shape of data_test = ',data_test.shape)
print('shape of data_val = ',data_val.shape)

shape of data_train =  (22241, 2)
shape of data_test =  (2781, 2)
shape of data_val =  (2780, 2)


## Text Normalization




* Convertion to Lowercase Removal of Whitespaces
* Removal of Punctuations
* Removal of Unicode Characters
* Substitution of Acronyms
* Substitution of Contractions
* Removal of Stop Words
* Spelling Correction
* Stemming and Lemmatization
* Discardment of Non-alphabetic Words
* Integration of the Processes
* Implementation on Product Description

In [12]:
# Converting to lowercase
def convert_to_lowercase(text):
  return text.lower()

In [13]:
# Removing whitespaces
def remove_whitespace(text):
  return text.strip()

In [14]:
# Removing punctuations
def remove_punctuation(text):
  punct_str = string.punctuation
  punct_str = punct_str.replace("'", "") # discarding apostrophe from the string to keep the contractions intact
  return text.translate(str.maketrans("", "", punct_str))


In [15]:
# Removing HTML tags
def remove_html(text):
  html = re.compile(r'<.*?>')
  return html.sub(r'', text)

## Example
# text = '<a href = "https://www.kaggle.com/datasets/ecommerce-text-classification"> Ecommerce Text Classification </a>'
# print("Input: {}".format(text))
# print("Output: {}".format(remove_html(text)))

In [16]:
# Removing emojis
def remove_emoji(text):
  emoji_pattern = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          "]+", flags = re.UNICODE)
  return emoji_pattern.sub(r'', text)

# # Example
# text = "This innovative hd printing technique results in durable and spectacular looking prints 😊"
# print("Input: {}".format(text))
# print("Output: {}".format(remove_emoji(text)))

In [17]:
# Removing other unicode characters
def remove_http(text):
  http = "https?://\S+|www\.\S+" # matching strings beginning with http (but not just "http")
  pattern = r"({})".format(http) # creating pattern
  return re.sub(pattern, "", text)

# # Example
# text = "It's a function that removes links starting with http: or https such as https://en.wikipedia.org/wiki/Unicode_symbols"
# print("Input: {}".format(text))
# print("Output: {}".format(remove_http(text)))

In [18]:
# Substitution of Acronyms

# Dictionary of acronyms
acronyms_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json'
acronyms_dict = pd.read_json(acronyms_url, typ = 'series')

print("Example: Original form of the acronym 'fyi' is '{}'".format(acronyms_dict['fyi']))

# Dataframe of acronyms
print('\n')
pd.DataFrame(acronyms_dict.items(), columns = ['acronym', 'original']).head()

Example: Original form of the acronym 'fyi' is 'for your information'




Unnamed: 0,acronym,original
0,aka,also known as
1,asap,as soon as possible
2,brb,be right back
3,btw,by the way
4,dob,date of birth


In [19]:
# Function to convert contractions in a text
def convert_acronyms(text):
  text_words = text.split()
  for i, word in enumerate(text_words):
      if word.lower() in acronyms_dict:
          text_words[i] = acronyms_dict[word]
  return ' '.join(text_words)

text = "btw you've to fill in the details including dob"
print("Input: {}".format(text))
print("Output: {}".format(convert_acronyms(text)))

Input: btw you've to fill in the details including dob
Output: by the way you've to fill in the details including date of birth


In [20]:
# Substitution of Contractions

# Dictionary of contractions
contractions_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json'
contractions_dict = pd.read_json(contractions_url, typ = 'series')

# Dataframe of contractions
pd.DataFrame(contractions_dict.items(), columns = ['contraction', 'original']).head()

Unnamed: 0,contraction,original
0,'aight,alright
1,ain't,are not
2,amn't,am not
3,arencha,are not you
4,aren't,are not


In [21]:
# Function to convert contractions in a text
def convert_contractions(text):
  text_words = text.split()
  for i, word in enumerate(text_words):
      if word.lower() in contractions_dict:
          text_words[i] = contractions_dict[word]
  return ' '.join(text_words)

text = "he's doin' fine"
print("Input: {}".format(text))
print("Output: {}".format(convert_contractions(text)))

Input: he's doin' fine
Output: he is doing fine


In [22]:
# Stopwords
stops = stopwords.words("english") # stopwords
addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"] # additional stopwords
allstops = stops + addstops

In [23]:
# Function to remove stopwords from a list of texts
def remove_stopwords(text):
  words = text.split()
  return " ".join([word for word in words if word not in allstops])

In [24]:
# Spelling Correction
def spellchecker(text):
  word_list = text.split()
  word_list_corrected = []
  for word in word_list:
      corr_spell = TextBlob(word)
      word_list_corrected.append(str(corr_spell.correct()))

  text_corrected = " ".join(word_list_corrected)
  return text_corrected


In [25]:
# Stemming
stemmer = PorterStemmer()
def text_stemmer(text):
  word_list = text.split()
  text_stem = [stemmer.stem(word) for word in word_list]
  return " ".join(text_stem)


In [26]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
def text_lemmatizer(text):
  word_list = text.split()
  text_lemm = [lemmatizer.lemmatize(word, pos="v") for word in word_list]
  return " ".join(text_lemm)


In [27]:
# Discardment of Non-alphabetic Words
def discard_non_alpha(text):
  word_list = text.split()
  word_list_non_alpha = [word for word in word_list if word.isalpha()]
  text_non_alpha = " ".join(word_list_non_alpha)
  return text_non_alpha


In [28]:
# Integration of the Processes
def text_normalizer(text):
  text = convert_to_lowercase(text)
  text = remove_whitespace(text)
  text = re.sub('\n' , '', text) # converting text to one line
  text = re.sub('\[.*?\]', '', text) # removing square brackets
  text = remove_http(text)
  text = remove_punctuation(text)
  text = remove_html(text)
  text = remove_emoji(text)
  text = convert_acronyms(text)
  text = convert_contractions(text)
  text = remove_stopwords(text)
  #text = spellchecker(text)
  #text = text_stemmer(text)
  text = text_lemmatizer(text)
  text = discard_non_alpha(text)
  return text

# Example
# text = "Combine all functions into 1 SINGLE FUNCTION 🙂 & apply on @product #descriptions https://en.wikipedia.org/wiki/Text_normalization"
# print("Input: {}".format(text))
# print("Output: {}".format(text_normalizer(text)))

In [29]:
%%time
# Implementing text normalization
data_train_norm, data_val_norm, data_test_norm = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

data_train_norm['normalized description'] = data_train['description'].apply(text_normalizer)
data_val_norm['normalized description'] = data_val['description'].apply(text_normalizer)
data_test_norm['normalized description'] = data_test['description'].apply(text_normalizer)

data_train_norm['label'] = data_train['label']
data_val_norm['label'] = data_val['label']
data_test_norm['label'] = data_test['label']

data_train_norm

CPU times: user 34.3 s, sys: 187 ms, total: 34.5 s
Wall time: 41.5 s


Unnamed: 0,normalized description,label
15525,practical approach acupuncture author author g...,2
1536,nice goods leatherette office arm chair brown ...,1
21984,ekan fashionable fedora hat girls boys fedora ...,3
25056,techyshop professional concert showlightning c...,0
25213,marantz fully automatic beltdrive turntable pr...,0
...,...,...
23992,apple ipad pro tablet inch wifi gold matter ta...,0
27640,printelligent laptop skin stickers super heroe...,0
14501,challenger sale take control customer conversa...,2
14555,international mathematics olympiad work book o...,2


# Text Vectorization

In [30]:
# Features and labels
X_train_norm, y_train = data_train_norm['normalized description'].tolist(), data_train_norm['label'].tolist()
X_val_norm, y_val = data_val_norm['normalized description'].tolist(), data_val_norm['label'].tolist()
X_test_norm, y_test = data_test_norm['normalized description'].tolist(), data_test_norm['label'].tolist()

In [31]:
# TF-IDF vectorization
TfidfVec = TfidfVectorizer()
X_train_tfidf = TfidfVec.fit_transform(X_train_norm)
X_val_tfidf = TfidfVec.transform(X_val_norm)
X_test_tfidf = TfidfVec.transform(X_test_norm)

In [32]:
#TF-IDF Baseline Modeling

# Classifiers
names = [
    "Logistic Regression",
    "Linear SVM",
    "Random Forest",
    "XGBoost"
]

models = [
    LogisticRegression(max_iter = 1000),
    svm.SVC(kernel = 'linear'),
    RandomForestClassifier(n_estimators = 100),
    XGBClassifier()
]

In [33]:
# Function to return summary of baseline models
def score(X_train, y_train, X_val, y_val, names = names, models = models):
  score_df, score_train, score_val = pd.DataFrame(), [], []

  for model in models:
      model.fit(X_train, y_train)
      y_train_pred, y_val_pred = model.predict(X_train), model.predict(X_val)
      score_train.append(accuracy_score(y_train, y_train_pred))
      score_val.append(accuracy_score(y_val, y_val_pred))

  score_df["Classifier"], score_df["Training accuracy"], score_df["Validation accuracy"] = names, score_train, score_val
  score_df.sort_values(by = 'Validation accuracy', ascending = False, inplace = True)
  return score_df

In [34]:
# Summary of baseline models
score(X_train_tfidf, y_train, X_val_tfidf, y_val, names = names, models = models)

Unnamed: 0,Classifier,Training accuracy,Validation accuracy
1,Linear SVM,0.981161,0.956475
0,Logistic Regression,0.969561,0.948201
2,Random Forest,0.999955,0.929137
3,XGBoost,0.96502,0.92518
