In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
# Toy example: fit TF-IDF on a one-sentence corpus and inspect what it learned.
corpus = ['i am nitin verma and i work in optum UHG.']
v = TfidfVectorizer()
t = v.fit_transform(corpus)  # document-term matrix, one row per document

# Token -> column-index mapping learned during fit. The printed vocabulary
# shows lower-cased tokens and no entry for the single-character word 'i'.
v.vocabulary_

{'am': 0,
 'nitin': 3,
 'verma': 6,
 'and': 1,
 'work': 7,
 'in': 2,
 'optum': 4,
 'uhg': 5}

In [49]:
# Convert the TF-IDF matrix to a dense array so the per-term weights can be displayed.
s= t.toarray()
s

array([[0.35355339, 0.35355339, 0.35355339, 0.35355339, 0.35355339,
        0.35355339, 0.35355339, 0.35355339]])

In [50]:
import pandas as pd

In [51]:
# Load the e-commerce dataset, supplying the two column names explicitly.
data = pd.read_csv(
    'resources/ecommerceDataset.csv',
    names=['label', 'text'],
)
data.head()

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [52]:
# Number of rows per class in the raw data.
data.label.value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: label, dtype: int64

In [53]:
# The data is imbalanced, so undersample every class down to the size of the
# smallest class. The size is derived from the data rather than hard-coded so
# the cell keeps working if the dataset changes.
n_per_class = int(data.label.value_counts().min())

# Each class is sampled independently with the same seed for reproducibility.
d_h = data[data.label == 'Household'].sample(n=n_per_class, random_state=3241)
d_b = data[data.label == 'Books'].sample(n=n_per_class, random_state=3241)
d_e = data[data.label == 'Electronics'].sample(n=n_per_class, random_state=3241)
d_c = data[data.label == 'Clothing & Accessories'].sample(n=n_per_class, random_state=3241)

# Stack the per-class samples row-wise into one balanced frame.
df = pd.concat([d_h, d_b, d_e, d_c], axis=0)
df.head()

Unnamed: 0,label,text
13967,Household,Fortune Nestwell Stainless Steel Power Free Ha...
16546,Household,Scotch-Brite Bathroom Squeegee Wiper Style:Bat...
5039,Household,"Orpio Double Layer Plastic Egg Box (32 Grid ,W..."
9163,Household,Alda Non Stick Carbon Steel Bunt Form Cake Tin...
8169,Household,"Cello Alpha 3 Polypropylene Drawer, Grey Color..."


In [54]:
# Check the per-class row counts after undersampling.
df.label.value_counts()

Household                 8671
Books                     8671
Electronics               8671
Clothing & Accessories    8671
Name: label, dtype: int64

In [55]:
# (rows, columns) of the undersampled frame.
df.shape

(34684, 2)

In [56]:
# Count missing values per column; a non-zero count for `text` means that
# some data entries have no text.
df.isnull().sum()

label    0
text     1
dtype: int64

In [57]:
# Display the entries whose text is missing.
# Since there is just one, it can simply be dropped.
m = df[df.text.isnull()]
m

Unnamed: 0,label,text
39330,Clothing & Accessories,


In [58]:
# Drop every row that contains a missing value.
df= df.dropna()

In [59]:
# Repeat the missing-text lookup to confirm nothing is left after dropping.
m = df[df.text.isnull()]
m

Unnamed: 0,label,text


In [60]:
# Handle the categorical target: replace the string labels with integer codes.
from sklearn.preprocessing import LabelEncoder

l = LabelEncoder()
df['label'] = l.fit_transform(df['label'])
df['label'].value_counts()

3    8671
0    8671
2    8671
1    8670
Name: label, dtype: int64

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [62]:
# Hold out 20% of the raw text for testing, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    df.text,
    df.label,
    test_size=0.2,
    stratify=df.label,
    random_state=3241,
)

In [65]:
# Baseline without any preprocessing:
# TF-IDF over 1- to 3-grams feeding a multinomial naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 3))),
    ('nb', MultinomialNB()),
])
clf.fit(x_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      1735
           1       0.97      0.98      0.97      1734
           2       0.96      0.96      0.96      1734
           3       0.93      0.96      0.94      1734

    accuracy                           0.96      6937
   macro avg       0.96      0.96      0.96      6937
weighted avg       0.96      0.96      0.96      6937



In [67]:
import spacy
# Load the small English spaCy pipeline once; preprocess() below reuses it.
# NOTE(review): preprocess() only reads lemma_, is_stop and is_punct. If nothing
# else needs the parser/NER, loading with those components disabled would
# presumably speed up the apply step; confirm before changing.
nlp= spacy.load('en_core_web_sm')

def preprocess(text):
    """Return `text` reduced to the lemmas of its meaningful tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped; every other token
    contributes its lemma.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces, or an empty string when no
        token survives the filter.
    """
    doc = nlp(text)
    # A generator avoids both the manual append loop and a local name that
    # would shadow the builtin `filter`.
    return ' '.join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct)
    )

In [68]:
# Preprocessing: clean every document and store the result in a new column
# next to the raw text.
df['text_new'] = df['text'].apply(preprocess)

In [69]:
# Same stratified 80/20 split as before, this time on the preprocessed text.
x_train, x_test, y_train, y_test = train_test_split(
    df.text_new,
    df.label,
    test_size=0.2,
    stratify=df.label,
    random_state=3241,
)

In [72]:
# After preprocessing:
# TF-IDF over 1- and 2-grams feeding a multinomial naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
    ('nb', MultinomialNB()),
])
clf.fit(x_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96      1735
           1       0.97      0.98      0.98      1734
           2       0.96      0.96      0.96      1734
           3       0.93      0.96      0.94      1734

    accuracy                           0.96      6937
   macro avg       0.96      0.96      0.96      6937
weighted avg       0.96      0.96      0.96      6937

