In [1]:
''' 
Term Frequency(TF) = [number of times word appeared / total no of words in a document]
Inverse Document Frequency(IDF) = [log(Total number of documents / number of documents that contains the word)]
TF-IDF = Term Frequency(TF) * Inverse Document Frequency(IDF)
'''

' \nTerm Frequency(TF) = [number of times word appeared / total no of words in a document]\nInverse Document Frequency(IDF) = [log(Total number of documents / number of documents that contains the word)]\nTF-IDF = Term Frequency(TF) * Inverse Document Frequency(IDF)\n'

In [2]:
import pandas as pd
import numpy as np
import spacy 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
# Toy corpus of seven short documents used to illustrate TF-IDF weighting.
# Five of them share the pattern "<company> is announcing new <product> tomorrow",
# so those shared words appear in many documents while the company/product
# names each appear in only one.
text = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [24]:
# Fit TF-IDF on the toy corpus to learn the vocabulary and the per-term IDF weights.
vectorizer = TfidfVectorizer()
vectorizer.fit(text)

# A bare expression in the middle of a cell is evaluated and then discarded, so
# the vocabulary size has to be printed explicitly to appear in the cell output.
print("vocabulary size:", len(vectorizer.vocabulary_))

# Feature names in the column order of the TF-IDF matrix; kept as the last
# expression so the notebook displays the array.
features = vectorizer.get_feature_names_out()
features

array(['already', 'am', 'amazon', 'and', 'announcing', 'apple', 'are',
       'ate', 'biryani', 'dot', 'eating', 'eco', 'google', 'grapes',
       'iphone', 'ironman', 'is', 'loki', 'microsoft', 'model', 'new',
       'pixel', 'pizza', 'surface', 'tesla', 'thor', 'tomorrow', 'you'],
      dtype=object)

In [28]:
# Show every vocabulary term next to its learned IDF weight, rounded to 3 decimals.
# `features` is ordered by column index, so it lines up element-for-element with
# `vectorizer.idf_` and the two can be walked in parallel.
for word, idf_weight in zip(features, vectorizer.idf_):
    print(word, np.round(idf_weight, 3))

already 2.386
am 2.386
amazon 2.386
and 2.386
announcing 1.288
apple 2.386
are 2.386
ate 2.386
biryani 2.386
dot 2.386
eating 1.981
eco 2.386
google 2.386
grapes 2.386
iphone 2.386
ironman 2.386
is 1.134
loki 2.386
microsoft 2.386
model 2.386
new 1.288
pixel 2.386
pizza 2.386
surface 2.386
tesla 2.386
thor 2.386
tomorrow 1.288
you 2.386


In [27]:
# Project the corpus onto the fitted vocabulary (sparse matrix, one row per document).
text_transform = vectorizer.transform(text)

# Densify and display the TF-IDF vector of the first document only.
dense_rows = text_transform.toarray()
dense_rows[0]

array([0.24266547, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.24266547, 0.        , 0.        ,
       0.40286636, 0.        , 0.        , 0.        , 0.        ,
       0.24266547, 0.11527033, 0.24266547, 0.        , 0.        ,
       0.        , 0.        , 0.72799642, 0.        , 0.        ,
       0.24266547, 0.        , 0.        ])

In [36]:
# `df` has to exist before it can be previewed. On a fresh kernel
# (Restart Kernel -> Run All) this cell executes before any later cell that
# reads the CSV, so the data is loaded here to avoid a NameError.
df = pd.read_csv("Ecommerce.txt")
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [46]:
# Load the e-commerce product descriptions and encode each category as an integer id.
df = pd.read_csv("Ecommerce.txt")
classes = df["label"].unique()
classes_dic = {'Household': 0, 'Electronics': 1, 'Clothing & Accessories': 2, 'Books': 3}

# Series.map turns any label that is missing from classes_dic into NaN without
# complaint, which would otherwise only show up later as an obscure failure in
# splitting or training. Check the labels actually present and fail loudly here.
unmapped_labels = [label for label in classes if label not in classes_dic]
if unmapped_labels:
    raise ValueError(f"Labels with no entry in classes_dic: {unmapped_labels!r}")

df["label_num"] = df["label"].map(classes_dic)

In [57]:
# Stratified hold-out split on the raw text; test_size is left at scikit-learn's
# default and the seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df["Text"],
    df["label_num"],
    random_state=123,
    stratify=df["label_num"],
)

In [58]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
model = Pipeline(
    steps=[
        ("TfidfVectorizer", TfidfVectorizer()),
        ("NB", MultinomialNB()),
    ]
)

# Pipeline.fit returns the fitted pipeline itself, so training and prediction
# on the held-out texts can be chained.
y_pred = model.fit(x_train, y_train).predict(x_test)

In [59]:
# Per-class precision / recall / F1 on the held-out split; rows are the numeric
# label ids from classes_dic.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.97      0.95      1500
           1       0.96      0.97      0.97      1500
           2       0.98      0.98      0.98      1500
           3       0.98      0.92      0.95      1500

    accuracy                           0.96      6000
   macro avg       0.96      0.96      0.96      6000
weighted avg       0.96      0.96      0.96      6000



# Modelling with preprocessing of text

In [3]:
# Load the spaCy pipeline once at module level; preprocess() below calls this
# shared `nlp` object for every text it is given.
nlp=spacy.load("en_core_web_sm")
def preprocess(text):
    """Return `text` reduced to the lemmas of its non-stop-word, non-punctuation tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens flagged
    as stop words or as punctuation are dropped; the lemmas of the remaining
    tokens are joined with single spaces, in their original order.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)

In [4]:
# Reload the e-commerce product descriptions and encode each category as an integer id.
df = pd.read_csv("Ecommerce.txt")
classes = df["label"].unique()
classes_dic = {'Household': 0, 'Electronics': 1, 'Clothing & Accessories': 2, 'Books': 3}

# Series.map turns any label that is missing from classes_dic into NaN without
# complaint, which would otherwise only show up later as an obscure failure in
# splitting or training. Check the labels actually present and fail loudly here.
unmapped_labels = [label for label in classes if label not in classes_dic]
if unmapped_labels:
    raise ValueError(f"Labels with no entry in classes_dic: {unmapped_labels!r}")

df["label_num"] = df["label"].map(classes_dic)

# Run preprocess() on every description; the cleaned text goes into its own
# column so the raw "Text" column stays available for comparison.
df["Text_new"] = df["Text"].map(preprocess)

In [15]:
# Write the current frame to disk. index=True is pandas' default, spelled out
# here because it means the row index is written as an extra leading column.
df.to_csv("Ecommerce_pro.txt", index=True)

In [5]:
# Same stratified, seeded split as for the raw text, but on the preprocessed
# "Text_new" column so the two experiments can be compared.
x_train, x_test, y_train, y_test = train_test_split(
    df["Text_new"],
    df["label_num"],
    random_state=123,
    stratify=df["label_num"],
)

In [6]:
# TF-IDF features feeding a multinomial Naive Bayes classifier, this time
# trained on whatever text the current x_train holds.
model = Pipeline(
    steps=[
        ("TfidfVectorizer", TfidfVectorizer()),
        ("NB", MultinomialNB()),
    ]
)

# Pipeline.fit returns the fitted pipeline itself, so training and prediction
# on the held-out texts can be chained.
y_pred = model.fit(x_train, y_train).predict(x_test)

In [7]:
# Per-class precision / recall / F1 on the held-out split; rows are the numeric
# label ids from classes_dic.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1500
           1       0.96      0.97      0.97      1500
           2       0.97      0.98      0.98      1500
           3       0.99      0.92      0.95      1500

    accuracy                           0.96      6000
   macro avg       0.96      0.96      0.96      6000
weighted avg       0.96      0.96      0.96      6000

