##  TF-IDF

In [1]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus of three short documents. The word "is" occurs in every document,
# which makes it a handy example of a low-IDF term.
corpus = [
    "I love NLP and is Machine Learning",
    "genai is amazing",
    "deep learning is good is easy",
]



In [2]:
# Build a TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the corpus, and encode the corpus
# as a sparse document-term matrix in the same pass.
transformed_output = vectorizer.fit_transform(corpus)



In [3]:
# Learned vocabulary: maps each term to its feature (column) index in the TF-IDF matrix.
vectorizer.vocabulary_

{'love': 8,
 'nlp': 10,
 'and': 1,
 'is': 6,
 'machine': 9,
 'learning': 7,
 'genai': 4,
 'amazing': 0,
 'deep': 2,
 'good': 5,
 'easy': 3}

In [4]:
# Feature (term) names as an array, one entry per column of the TF-IDF matrix.
# NOTE: the variable name is misspelled ("fearture"); it is kept because the
# following cell refers to it by this exact name.
all_fearture_names=vectorizer.get_feature_names_out()

In [5]:
# Print every term next to its IDF weight. Feature names are ordered by column
# index, so they line up one-to-one with the entries of `idf_`.
for word, idf_score in zip(all_fearture_names, vectorizer.idf_):
    print(f"{word} {idf_score}")

amazing 1.6931471805599454
and 1.6931471805599454
deep 1.6931471805599454
easy 1.6931471805599454
genai 1.6931471805599454
good 1.6931471805599454
is 1.0
learning 1.2876820724517808
love 1.6931471805599454
machine 1.6931471805599454
nlp 1.6931471805599454


In [6]:
# Observe: the IDF score is lowest for terms that occur in the most documents,
# e.g. "is" appears in all three documents and gets the minimum IDF of 1.0.

In [7]:
# Dense TF-IDF vector of the first document only (slice the sparse row first,
# then densify, so the full matrix is never materialised).
transformed_output[:1].toarray()

array([[0.        , 0.45050407, 0.        , 0.        , 0.        ,
        0.        , 0.26607496, 0.34261996, 0.45050407, 0.45050407,
        0.45050407]])

In [8]:
# Example: classifying e-commerce product descriptions using TF-IDF features

In [9]:
import pandas as pd

In [10]:
# Load the e-commerce product descriptions and preview the first rows.
df = pd.read_csv("Ecommerce_data.csv")
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [11]:
# (rows, columns) of the loaded frame.
df.shape

(24000, 2)

In [12]:
# Count rows per category to check how balanced the classes are.
df["label"].value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [13]:
# Encode each category name as an integer class id for the classifiers.
LABEL_TO_NUM = {
    'Household': 0,
    'Books': 1,
    'Electronics': 2,
    'Clothing & Accessories': 3,
}
df['label_num'] = df['label'].map(LABEL_TO_NUM)

# Series.map yields NaN for any label missing from the mapping; fail loudly
# here instead of letting NaN targets flow into the train/test split.
assert df['label_num'].notna().all(), "found labels missing from LABEL_TO_NUM"

In [47]:
# Preview the frame again to confirm the new `label_num` column.
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,3
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,3


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the rows for evaluation. Stratifying on the encoded label
# keeps the class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    df["Text"],          # features (independent variable): raw product text
    df["label_num"],     # target (dependent variable): encoded category
    test_size=0.2,
    random_state=2022,   # fixed seed so the split is reproducible
    stratify=df["label_num"],
)

In [17]:
# Number of training documents left after holding out the 20% test split.
X_train.shape

(19200,)

In [18]:
# Model training: TF-IDF features fed into different classifiers


In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Chain TF-IDF vectorisation with a k-nearest-neighbours classifier so raw
# text can be passed straight to fit/predict.
clf = Pipeline(steps=[
    ("vectorizer_tfidf", TfidfVectorizer()),
    ("KNN", KNeighborsClassifier()),
])

# fit() returns the fitted pipeline, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)




In [20]:
# Per-class precision, recall and F1 of the current predictions on the held-out test set.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      1200
           1       0.97      0.95      0.96      1200
           2       0.97      0.97      0.97      1200
           3       0.97      0.98      0.97      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [21]:
# First three test documents, to eyeball against their predicted labels.
X_test[:3]

20706    Lal Haveli Designer Handmade Patchwork Decorat...
19166    GOTOTOP Classical Retro Cotton & PU Leather Ne...
15209    FabSeasons Camouflage Polyester Multi Function...
Name: Text, dtype: object

In [22]:
# Predicted class ids for the first three test documents.
y_pred[:3] 

array([0, 2, 3])

In [23]:
# Label encoding, for reading the predictions:
#   'Household': 0
#   'Books': 1
#   'Electronics': 2
#   'Clothing & Accessories': 3

In [24]:
from sklearn.naive_bayes import MultinomialNB
# TF-IDF vectorisation feeding a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ("vectorizer_tfidf", TfidfVectorizer()),
    ("multi NB", MultinomialNB()),
])

# fit() returns the fitted pipeline, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)


In [36]:
# Per-class precision, recall and F1 of the current predictions on the held-out test set.
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.97      0.97      0.97      1200
           2       0.98      0.97      0.97      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [48]:
# Sanity check: classify a few unseen product titles with the fitted pipeline.
# Label encoding:
#   'Household': 0, 'Books': 1, 'Electronics': 2, 'Clothing & Accessories': 3
new_book = [
    "Indira Designer Women's Art Mysore Silk Saree",
    "Science book",
    "HP external USB DVD Drive DVDRW DVD-ROM A2U56A",
]
prediction = clf.predict(new_book)
print("Predicted label:", prediction)

# Observe: the model assigns a category id to each title from its text alone.

Predicted label: [3 1 2]


In [31]:
# Observe: in the recorded run, RandomForestClassifier scores on par with
# MultinomialNB and slightly ahead of KNN on this split.

from sklearn.ensemble import RandomForestClassifier
# TF-IDF vectorisation feeding a random forest. Forest training is randomised,
# so the seed is pinned (same value as the train/test split) to make the
# reported scores reproducible across kernel restarts.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state=2022)),
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1200
           1       0.97      0.97      0.97      1200
           2       0.98      0.97      0.97      1200
           3       0.98      0.98      0.98      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800

