In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus for exploring TF-IDF: two "eating" sentences bracket five
# near-identical product-announcement sentences, so words such as
# "announcing", "new" and "tomorrow" recur across documents while each
# company and product name occurs in only one.
corpus = [
    # Food sentence; "pizza" appears three times inside this single document.
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",

    # Product announcements that differ only in the company and the product.
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",

    # Second food sentence; shares "eating" with the first document.
    "I am eating biryani and you are eating grapes",
]

In [3]:
# Learn the vocabulary and IDF weights from the toy corpus and vectorize it in
# one call; fit_transform(corpus) yields the same matrix as fit(corpus)
# followed by transform(corpus).
v = TfidfVectorizer()
transform_output = v.fit_transform(corpus)

In [4]:
# Show the fitted term -> column-index mapping of the vectorizer.
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [5]:
# Column index of the term "eating" in the fitted vocabulary.
# The dict is indexed directly instead of going through .get(): for a term
# that is not in the vocabulary .get() returns None, and a later
# `v.idf_[None]` would not fail but return the whole IDF array with an extra
# axis. A KeyError raised here surfaces the problem at its source.
i = v.vocabulary_["eating"]
print(i)

10


In [6]:
# IDF weight learned for the term whose column index is stored in `i`.
v.idf_[i]

np.float64(1.9808292530117262)

In [7]:
# Vocabulary terms as an array; used below to label each IDF weight.
all_feature_names = v.get_feature_names_out()

In [8]:
# List every vocabulary term next to its IDF weight. get_feature_names_out()
# is ordered by column index, the same order as v.idf_, so the two arrays are
# walked in lockstep instead of looking each index up in v.vocabulary_.
for word, idf_weight in zip(all_feature_names, v.idf_):
    print(f"{word} : {idf_weight}")

already : 2.386294361119891
am : 2.386294361119891
amazon : 2.386294361119891
and : 2.386294361119891
announcing : 1.2876820724517808
apple : 2.386294361119891
are : 2.386294361119891
ate : 2.386294361119891
biryani : 2.386294361119891
dot : 2.386294361119891
eating : 1.9808292530117262
eco : 2.386294361119891
google : 2.386294361119891
grapes : 2.386294361119891
iphone : 2.386294361119891
ironman : 2.386294361119891
is : 1.1335313926245225
loki : 2.386294361119891
microsoft : 2.386294361119891
model : 2.386294361119891
new : 1.2876820724517808
pixel : 2.386294361119891
pizza : 2.386294361119891
surface : 2.386294361119891
tesla : 2.386294361119891
thor : 2.386294361119891
tomorrow : 1.2876820724517808
you : 2.386294361119891


In [9]:
# Convert the sparse TF-IDF matrix to a dense array so the individual weights can be inspected.
print(transform_output.toarray())

[[0.24266547 0.         0.         0.         0.         0.
  0.         0.24266547 0.         0.         0.40286636 0.
  0.         0.         0.         0.24266547 0.11527033 0.24266547
  0.         0.         0.         0.         0.72799642 0.
  0.         0.24266547 0.         0.        ]
 [0.         0.         0.         0.         0.30652086 0.5680354
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5680354  0.         0.26982671 0.
  0.         0.         0.30652086 0.         0.         0.
  0.         0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.26982671 0.
  0.         0.5680354  0.30652086 0.         0.         0.
  0.5680354  0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.

In [11]:
# Load the e-commerce dataset into a DataFrame.
# on_bad_lines='skip' makes the parser drop malformed rows without reporting
# them, so the frame may contain fewer rows than the file has lines.
# NOTE(review): '/content/...' looks like a Colab working directory — confirm
# the path before running this notebook elsewhere.
df = pd.read_csv(
    '/content/Ecommerce_data.csv',
    engine='python',
    on_bad_lines='skip',
)

In [13]:
# Preview the first five rows to check which columns were loaded.
df.head(5)

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [15]:
# Count the rows per category to see how balanced the classes are.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Books,5332
Household,5328
Clothing & Accessories,5325
Electronics,5303


In [17]:
# (rows, columns) of the loaded frame.
df.shape

(21288, 2)

In [18]:
# Encode the four product categories as integer class ids for the classifier.
label_to_num = {'Household': 0, 'Electronics': 1, 'Clothing & Accessories': 2, 'Books': 3}
df['label_num'] = df.label.map(label_to_num)

# Series.map yields NaN for any label that is missing from the dict (which
# would also turn the column into floats). Fail here with the offending
# labels instead of failing later inside model fitting.
unmapped_labels = df.loc[df['label_num'].isna(), 'label'].unique()
assert len(unmapped_labels) == 0, f"Labels without a numeric code: {unmapped_labels}"

In [19]:
# Preview the first five rows again to check the new label_num column against label.
df.head(5)

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


In [20]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['label_num'],
    test_size=0.2,
    random_state=20,
)

In [21]:
# Number of training documents.
len(X_train)

17030

In [22]:
# Number of held-out test documents.
len(X_test)

4258

In [23]:
# Fit a fresh TF-IDF vectorizer on the training texts only and vectorize them.
# Despite its name, X_train_count holds TF-IDF weights rather than raw counts,
# and `tf` is the vectorizer object (unrelated to TensorFlow).
tf = TfidfVectorizer()
X_train_count = tf.fit_transform(X_train)
X_train_count

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1148085 stored elements and shape (17030, 45748)>

In [24]:
# Vectorize the held-out texts with the already-fitted vectorizer (transform only, no refit on test data).
X_test_tf = tf.transform(X_test)

In [27]:
from sklearn.metrics import classification_report

# Train a decision tree on the TF-IDF training matrix. fit() returns the
# estimator itself, so construction and training are chained; the fixed
# random_state keeps the fitted tree the same from run to run.
model = DecisionTreeClassifier(random_state=20).fit(X_train_count, y_train)

# Predict on the held-out set and report per-class precision / recall / F1.
# The class rows in the report are the values of y_test (label_num codes).
y_pred = model.predict(X_test_tf)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.92      0.92      1076
           1       0.96      0.94      0.95      1077
           2       0.97      0.95      0.96      1062
           3       0.93      0.96      0.95      1043

    accuracy                           0.94      4258
   macro avg       0.94      0.94      0.94      4258
weighted avg       0.94      0.94      0.94      4258



In [29]:
# Classify a hand-written product description with the trained model.
# An alternative input that was tried, apparently copied from the dataset:
#   "Indira Designer Women's Art Mysore Silk Saree With Blouse Piece (Star-Red) This Saree Is Of Art Mysore Silk & Comes With Blouse Piece."
msg = [
    "Satyajit's designer women art saree silk blouse piece, saree with pipili chandua work",
]

# Reuse the vectorizer fitted on the training data (transform only).
msg_tf = tf.transform(msg)

# The prediction is a label_num code; in the mapping used to build
# label_num, 2 stands for 'Clothing & Accessories'.
model.predict(msg_tf)

array([2])