<h1 style="text-align:center;color:mediumvioletred">TF-IDF</h1>

## TF-IDF Basics

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Toy corpus: eight one-sentence documents, small enough to inspect the
# resulting TF-IDF vocabulary and weights by hand.
docs = [
    "Iron Man fights alongside Captain America.",
    "Spider Man teams up with Iron Man.",
    "Thor and Hulk battle powerful enemies together.",
    "Black Widow helps Captain America in the mission.",
    "Doctor Strange joins Spider-Man to protect the city.",
    "Hulk and Thor are strong Avengers.",
    "Captain America leads the Avengers with Iron Man.",
    "Black Widow and Hawkeye fight as a team."
]

In [32]:
# Vectorizer with library-default settings. fit_transform learns the vocabulary
# and IDF weights from `docs` and returns their TF-IDF matrix in one step.
v = TfidfVectorizer()
transformed_docs = v.fit_transform(docs)
# vocabulary_ maps each lower-cased term to its column index in that matrix.
print(v.vocabulary_)

{'iron': 18, 'man': 21, 'fights': 13, 'alongside': 0, 'captain': 8, 'america': 1, 'spider': 25, 'teams': 29, 'up': 34, 'with': 36, 'thor': 31, 'and': 2, 'hulk': 16, 'battle': 6, 'powerful': 23, 'enemies': 11, 'together': 33, 'black': 7, 'widow': 35, 'helps': 15, 'in': 17, 'the': 30, 'mission': 22, 'doctor': 10, 'strange': 26, 'joins': 19, 'to': 32, 'protect': 24, 'city': 9, 'are': 3, 'strong': 27, 'avengers': 5, 'leads': 20, 'hawkeye': 14, 'fight': 12, 'as': 4, 'team': 28}


In [33]:
# Learned terms listed in column order: position i here is column i of the matrix.
v.get_feature_names_out()

array(['alongside', 'america', 'and', 'are', 'as', 'avengers', 'battle',
       'black', 'captain', 'city', 'doctor', 'enemies', 'fight', 'fights',
       'hawkeye', 'helps', 'hulk', 'in', 'iron', 'joins', 'leads', 'man',
       'mission', 'powerful', 'protect', 'spider', 'strange', 'strong',
       'team', 'teams', 'the', 'thor', 'to', 'together', 'up', 'widow',
       'with'], dtype=object)

In [34]:
all_feature_names = v.get_feature_names_out()

# get_feature_names_out() lists the terms in column order and idf_ holds one
# weight per column, so the two line up position by position -- no need to go
# back through vocabulary_ for every term.
# Terms that occur in fewer documents receive a larger IDF.
for word, idf in zip(all_feature_names, v.idf_):
    print(f"{word}: {round(idf,3)}")

alongside: 2.504
america: 1.811
and: 1.811
are: 2.504
as: 2.504
avengers: 2.099
battle: 2.504
black: 2.099
captain: 1.811
city: 2.504
doctor: 2.504
enemies: 2.504
fight: 2.504
fights: 2.504
hawkeye: 2.504
helps: 2.504
hulk: 2.099
in: 2.504
iron: 1.811
joins: 2.504
leads: 2.504
man: 1.588
mission: 2.504
powerful: 2.504
protect: 2.504
spider: 2.099
strange: 2.504
strong: 2.504
team: 2.504
teams: 2.504
the: 1.811
thor: 2.099
to: 2.504
together: 2.504
up: 2.504
widow: 2.099
with: 2.099


In [35]:
# The first two documents, shown so their TF-IDF vectors can be read against the text.
docs[:2]

['Iron Man fights alongside Captain America.',
 'Spider Man teams up with Iron Man.']

In [36]:
# Dense TF-IDF vector of docs[0]; the non-zero entries sit at the columns of its terms.
transformed_docs.toarray()[0]

array([0.50181732, 0.36291057, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.36291057, 0.        ,
       0.        , 0.        , 0.        , 0.50181732, 0.        ,
       0.        , 0.        , 0.        , 0.36291057, 0.        ,
       0.        , 0.31819258, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [37]:
# Dense TF-IDF vector of docs[1]; "man" occurs twice there, hence its larger weight.
transformed_docs.toarray()[1]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.30736636, 0.        ,
       0.        , 0.53898511, 0.        , 0.        , 0.        ,
       0.35619419, 0.        , 0.        , 0.        , 0.42501315,
       0.        , 0.        , 0.        , 0.        , 0.42501315,
       0.        , 0.35619419])

## Text Classification Using TF-IDF

#### Reading and exploring dataset

In [38]:
import pandas as pd

# Product descriptions (Text) with their category (label). The path is
# relative to the notebook's working directory.
df = pd.read_csv("Ecommerce_data.csv")
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [39]:
# (rows, columns) of the loaded frame.
df.shape

(24000, 2)

In [40]:
# Rows per category -- shows how balanced the classes are.
df.label.value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

#### Label Mapping

In [42]:
# Integer class id for every category name (ids follow alphabetical order).
label_to_num = {
    'Books': 0,
    'Clothing & Accessories': 1,
    'Electronics': 2,
    'Household': 3,
}

# Add the numeric target next to the text label.
df['label_num'] = df['label'].map(label_to_num)

df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,3
1,"Contrast living Wooden Decorative Box,Painted ...",Household,3
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,1
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,1


#### Train Test Split

In [43]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the raw text. Stratifying on the numeric label keeps
# every class equally represented in both parts; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'], df['label_num'],
    test_size=0.2,
    random_state=69,
    stratify=df['label_num'],
)

In [48]:
# Sanity check: number of samples in the train and test partitions.
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (19200,)
Shape of X_test:  (4800,)


In [45]:
# Class counts in the training split; the split above is stratified on the label.
y_train.value_counts()

label_num
0    4800
2    4800
3    4800
1    4800
Name: count, dtype: int64

In [47]:
# Class counts in the test split; the split above is stratified on the label.
y_test.value_counts()

label_num
2    1200
1    1200
0    1200
3    1200
Name: count, dtype: int64

### Model Training and Testing

#### KNN

In [51]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Raw text -> TF-IDF vectors -> k-nearest-neighbours classifier, both steps
# with library-default settings.
clf = Pipeline(steps=[
    ('tf_idf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
])

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1200
           1       0.98      0.98      0.98      1200
           2       0.96      0.97      0.97      1200
           3       0.96      0.96      0.96      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [52]:
# First five held-out descriptions, shown together with their row labels.
X_test[:5]

16069    SAMSUNG M378T5663QZ3-CF7 Samsung 2gb Ddr2 800m...
15343    Generic High Gain 16dBi 2.4GHz Wifi Yagi Anten...
11342    Acer Switch Atom 10.1-inch Laptop (2GB/32GB/Wi...
17139    Sathiyas Girls Graphic Printed T-Shirt - (Pack...
20846    IT VISION ™ 15 PIN MALE TO MALE VGA CABLE 1.5 ...
Name: Text, dtype: object

In [57]:
# Full text of the first held-out sample. Positional access is used instead of
# a hard-coded row label so the cell keeps working whichever rows the split
# places in the test set.
X_test.iloc[0]

'SAMSUNG M378T5663QZ3-CF7 Samsung 2gb Ddr2 800mhz Pc2-6400 240pins 256mx64 C SAMSUNG 2gb ddr2 800mhz pc2-6400 240pins desktop RAM part number m378t5663qz3-cf7.'

| Label                   | Class |
|--------------------------|-------|
| Books                   | 0     |
| Clothing & Accessories  | 1     |
| Electronics             | 2     |
| Household               | 3     |

In [53]:
# True class ids of the first five test samples.
y_test[:5]

16069    2
15343    2
11342    2
17139    1
20846    2
Name: label_num, dtype: int64

In [54]:
# Predicted class ids for the first five test samples, in the same order as y_test.
y_pred[:5]

array([2, 2, 2, 1, 2])

#### Naive Bayes

In [58]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Raw text -> TF-IDF vectors -> multinomial Naive Bayes, both steps with
# library-default settings.
clf = Pipeline(steps=[
    ('tf_idf', TfidfVectorizer()),
    ('NB', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96      1200
           1       0.99      0.99      0.99      1200
           2       0.97      0.97      0.97      1200
           3       0.93      0.98      0.95      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



#### Random Forest

In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Raw text -> TF-IDF vectors -> random forest.
# The forest is randomised (bootstrap samples, feature sub-sampling), so it is
# seeded -- with the same seed as the train/test split -- to make the reported
# metrics reproducible from run to run.
clf = Pipeline([
    ('tf_idf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=69)),
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1200
           1       0.98      0.98      0.98      1200
           2       0.99      0.98      0.98      1200
           3       0.97      0.97      0.97      1200

    accuracy                           0.98      4800
   macro avg       0.98      0.98      0.98      4800
weighted avg       0.98      0.98      0.98      4800



### Model training after preprocessing

In [64]:
import spacy

# Load the spaCy pipeline once at module level; preprocess() below calls this `nlp`.
nlp = spacy.load("en_core_web_md")

def preprocess(text):
    """Reduce `text` to its space-joined content-word lemmas.

    The text is run through the module-level spaCy pipeline `nlp`. Stop words,
    punctuation and whitespace-only tokens are discarded; every remaining
    token contributes its `lemma_`.

    Args:
        text: Raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    doc = nlp(text)
    # spaCy emits separate tokens for runs of extra spaces and for newlines.
    # They are neither stop words nor punctuation, so they must be excluded
    # explicitly or they end up in the result as stray blank "lemmas".
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(kept_lemmas)

In [66]:
# Clean every description with preprocess(); this makes one spaCy call per row.
df['preprocessed_text'] = df['Text'].apply(preprocess)

In [67]:
# Check the new preprocessed_text column next to the original Text.
df.head()

Unnamed: 0,Text,label,label_num,preprocessed_text
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,3,Urban Ladder Eisner Low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,3,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2,IO Crest SY pci40010 PCI RAID Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,1,ISAKAA Baby Socks bear 8 years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,1,Indira Designer Women Art Mysore Silk Saree Bl...


In [68]:
# Original description of the row labelled 0, for a before/after comparison.
df.Text[0]

'Urban Ladder Eisner Low Back Study-Office Computer Chair(Black) A study in simple. The Eisner study chair has a firm foam cushion, which makes long hours at your desk comfortable. The flexible meshed back is designed for air-circulation and support when you lean back. The curved arms provide ergonomic forearm support. Adjust the height using the gas lift to find that comfortable position and the nylon castors make it easy to move around your space. Chrome legs refer to the images for dimension details any assembly required will be done by the UL team at the time of delivery indoor use only.'

In [69]:
# The same row after preprocessing: stop words and punctuation removed, words lemmatised.
df.preprocessed_text[0]

'Urban Ladder Eisner Low Study Office Computer Chair(Black study simple Eisner study chair firm foam cushion make long hour desk comfortable flexible mesh design air circulation support lean curved arm provide ergonomic forearm support adjust height gas lift find comfortable position nylon castor easy space chrome leg refer image dimension detail assembly require UL team time delivery indoor use'

In [70]:
# Re-split on the cleaned text. Test size, seed and stratification match the
# split of the raw text, so only the input column differs.
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_text'], df['label_num'],
    test_size=0.2,
    random_state=69,
    stratify=df['label_num'],
)

In [71]:
# Preprocessed text -> TF-IDF vectors -> random forest.
# The forest is randomised, so it is seeded (same seed as the train/test
# split): the reported metrics become reproducible, and run-to-run noise does
# not blur the comparison with the model trained on raw text.
clf = Pipeline([
    ('tf_idf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=69)),
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1200
           1       0.98      0.98      0.98      1200
           2       0.98      0.98      0.98      1200
           3       0.98      0.97      0.98      1200

    accuracy                           0.98      4800
   macro avg       0.98      0.98      0.98      4800
weighted avg       0.98      0.98      0.98      4800

