In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
corpus = [
    "the cat is sleeping on the cat bed",
    "the dog is barking at the other dog",
    "a bird is flying over the bird nest",
    "the cat and the dog are playing together",
    "the bird is watching the cat and the dog",
    "the dog chased the cat through the garden",
    "a cat a dog a bird all in the house"
]

In [27]:
v = TfidfVectorizer()
v.fit(corpus)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [28]:
transform_output = v.transform(corpus)

In [29]:
print(v.get_feature_names_out())

['all' 'and' 'are' 'at' 'barking' 'bed' 'bird' 'cat' 'chased' 'dog'
 'flying' 'garden' 'house' 'in' 'is' 'nest' 'on' 'other' 'over' 'playing'
 'sleeping' 'the' 'through' 'together' 'watching']


In [30]:
print(v.vocabulary_)

{'the': 21, 'cat': 7, 'is': 14, 'sleeping': 20, 'on': 16, 'bed': 5, 'dog': 9, 'barking': 4, 'at': 3, 'other': 17, 'bird': 6, 'flying': 10, 'over': 18, 'nest': 15, 'and': 1, 'are': 2, 'playing': 19, 'together': 23, 'watching': 24, 'chased': 8, 'through': 22, 'garden': 11, 'all': 0, 'in': 13, 'house': 12}


In [31]:
tmp = v.vocabulary_.get('cat')
v.idf_[tmp]

np.float64(1.2876820724517808)

In [32]:
# Print the idf of each word

all_feature_name = v.get_feature_names_out()

for word in all_feature_name:
    indx = v.vocabulary_.get(word)
    idf_score = v.idf_[tmp]
    print(f"{word} : {idf_score}")

all : 1.2876820724517808
and : 1.2876820724517808
are : 1.2876820724517808
at : 1.2876820724517808
barking : 1.2876820724517808
bed : 1.2876820724517808
bird : 1.2876820724517808
cat : 1.2876820724517808
chased : 1.2876820724517808
dog : 1.2876820724517808
flying : 1.2876820724517808
garden : 1.2876820724517808
house : 1.2876820724517808
in : 1.2876820724517808
is : 1.2876820724517808
nest : 1.2876820724517808
on : 1.2876820724517808
other : 1.2876820724517808
over : 1.2876820724517808
playing : 1.2876820724517808
sleeping : 1.2876820724517808
the : 1.2876820724517808
through : 1.2876820724517808
together : 1.2876820724517808
watching : 1.2876820724517808


In [33]:
# Print Transform output from tf-idf

print(transform_output.toarray())

[[0.         0.         0.         0.         0.         0.43657447
  0.         0.47116493 0.         0.         0.         0.
  0.         0.         0.26893834 0.         0.43657447 0.
  0.         0.         0.43657447 0.3659016  0.         0.
  0.        ]
 [0.         0.         0.         0.43657447 0.43657447 0.
  0.         0.         0.         0.47116493 0.         0.
  0.         0.         0.26893834 0.         0.         0.43657447
  0.         0.         0.         0.3659016  0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.60133854 0.         0.         0.         0.42375842 0.
  0.         0.         0.26104341 0.42375842 0.         0.
  0.42375842 0.         0.         0.17758011 0.         0.
  0.        ]
 [0.         0.3722     0.44838734 0.         0.         0.
  0.         0.24195688 0.         0.24195688 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.44838734 0.         0.37580

### Working with E-commarce model

------------------------------------------

In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [35]:
df = pd.read_csv('./Ecommerce_data.csv')
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [36]:
df['label'].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [37]:
df.shape

(24000, 2)

In [38]:
df['label_num'] = df['label'].map({
    'Household' : 0,
    'Electronics' : 1,
    'Clothing & Accessories' : 2,
    'Books' : 3
})

#### Train Test split

In [39]:
x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['label_num'], test_size=.20)

In [40]:
x_train.shape

(19200,)

#### TF-IDF

In [41]:
tf = TfidfVectorizer()
x_train_tf = tf.fit_transform(x_train)
x_test_tf = tf.transform(x_test)

#### Classification Model

In [45]:
clf = DecisionTreeClassifier()
clf.fit(x_train_tf, y_train)

y_pred = clf.predict(x_test_tf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      1184
           1       0.95      0.96      0.96      1201
           2       0.98      0.96      0.97      1234
           3       0.96      0.96      0.96      1181

    accuracy                           0.95      4800
   macro avg       0.95      0.95      0.95      4800
weighted avg       0.95      0.95      0.95      4800



#### Testing on new data

In [53]:
test_sentence = ["stylish cotton t shirt for men and boys t shirt with soft cotton fabric"]

msg_tf = tf.transform(test_sentence)

clf.predict(msg_tf)

array([2])