<a href="https://colab.research.google.com/github/namantam1/ml-ai-dnn/blob/main/TF_IDF_NLP_Ecommerce_data_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Term Frequency(TF) = [number of times word appeared / total no of words in a document]
# Inverse Document Frequency
#     (IDF) = [log(Total number of documents / number of documents that contains the word)]
# TF-IDF = Term Frequency(TF) * Inverse Document Frequency(IDF)


corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]
corpus

['Thor eating pizza, Loki is eating pizza, Ironman ate pizza already',
 'Apple is announcing new iphone tomorrow',
 'Tesla is announcing new model-3 tomorrow',
 'Google is announcing new pixel-6 tomorrow',
 'Microsoft is announcing new surface tomorrow',
 'Amazon is announcing new eco-dot tomorrow',
 'I am eating biryani and you are eating grapes']

In [21]:
#let's create the vectorizer and fit the corpus and transform them accordingly
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
  doc = nlp(text)
  return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])


v = TfidfVectorizer(
    preprocessor=preprocess
)
v.fit(corpus[:1])

op = v.transform(corpus[:2]).A

print(op.shape)

op[:2]

(2, 5)


array([[0.21821789, 0.21821789, 0.65465367, 0.65465367, 0.21821789],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [7]:
print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [8]:
import pandas as pd

df = pd.read_csv("https://github.com/codebasics/nlp-tutorials/raw/main/12_tf_idf/Ecommerce_data.csv")

print(df.shape)

df.head()

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [9]:
df.label.value_counts()

Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: label, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.Text,
    df.label,
    test_size=0.2,
    random_state=1,
    stratify=df.label
)

y_train.value_counts()

Books                     4800
Electronics               4800
Clothing & Accessories    4800
Household                 4800
Name: label, dtype: int64

In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


clf = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1, 1),
        preprocessor=preprocess # takes lot of time 
    ),
    MultinomialNB()
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

display(
    y_pred[:10],
    y_test[:10].to_numpy(),
)

print(classification_report(y_pred, y_test))

array(['Clothing & Accessories', 'Household', 'Books', 'Household',
       'Household', 'Household', 'Clothing & Accessories',
       'Clothing & Accessories', 'Books', 'Electronics'], dtype='<U22')

array(['Clothing & Accessories', 'Household', 'Books', 'Household',
       'Household', 'Household', 'Clothing & Accessories',
       'Clothing & Accessories', 'Books', 'Electronics'], dtype=object)

                        precision    recall  f1-score   support

                 Books       0.90      0.99      0.95      1094
Clothing & Accessories       0.97      0.96      0.97      1221
           Electronics       0.97      0.96      0.97      1214
             Household       0.97      0.91      0.94      1271

              accuracy                           0.96      4800
             macro avg       0.96      0.96      0.96      4800
          weighted avg       0.96      0.96      0.96      4800



In [23]:
# import pickle

# with open("model.pkl", "wb") as fp:
#   pickle.dump(clf, fp)

In [24]:
from sklearn.neighbors import KNeighborsClassifier

clf = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1, 1),
    ),
    KNeighborsClassifier()
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

display(
    y_pred[:10],
    y_test[:10].to_numpy(),
)

print(classification_report(y_pred, y_test))

array(['Clothing & Accessories', 'Household', 'Books', 'Household',
       'Clothing & Accessories', 'Household', 'Clothing & Accessories',
       'Clothing & Accessories', 'Books', 'Electronics'], dtype=object)

array(['Clothing & Accessories', 'Household', 'Books', 'Household',
       'Household', 'Household', 'Clothing & Accessories',
       'Clothing & Accessories', 'Books', 'Electronics'], dtype=object)

                        precision    recall  f1-score   support

                 Books       0.94      0.97      0.96      1166
Clothing & Accessories       0.98      0.96      0.97      1214
           Electronics       0.97      0.96      0.96      1214
             Household       0.96      0.95      0.96      1206

              accuracy                           0.96      4800
             macro avg       0.96      0.96      0.96      4800
          weighted avg       0.96      0.96      0.96      4800



In [25]:
from sklearn.linear_model import LogisticRegression

clf = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1, 1),
    ),
    LogisticRegression()
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

display(
    y_pred[:10],
    y_test[:10].to_numpy(),
)

print(classification_report(y_pred, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['Clothing & Accessories', 'Household', 'Books', 'Household',
       'Household', 'Household', 'Clothing & Accessories',
       'Clothing & Accessories', 'Books', 'Electronics'], dtype=object)

array(['Clothing & Accessories', 'Household', 'Books', 'Household',
       'Household', 'Household', 'Clothing & Accessories',
       'Clothing & Accessories', 'Books', 'Electronics'], dtype=object)

                        precision    recall  f1-score   support

                 Books       0.96      0.97      0.96      1188
Clothing & Accessories       0.98      0.98      0.98      1204
           Electronics       0.98      0.98      0.98      1202
             Household       0.97      0.96      0.96      1206

              accuracy                           0.97      4800
             macro avg       0.97      0.97      0.97      4800
          weighted avg       0.97      0.97      0.97      4800



In [26]:
from sklearn.svm import SVC

clf = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1, 1),
    ),
    SVC() # very time consuming
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

display(
    y_pred[:10],
    y_test[:10].to_numpy(),
)

print(classification_report(y_pred, y_test))

array(['Clothing & Accessories', 'Household', 'Books', 'Household',
       'Household', 'Household', 'Clothing & Accessories',
       'Clothing & Accessories', 'Books', 'Electronics'], dtype=object)

array(['Clothing & Accessories', 'Household', 'Books', 'Household',
       'Household', 'Household', 'Clothing & Accessories',
       'Clothing & Accessories', 'Books', 'Electronics'], dtype=object)

                        precision    recall  f1-score   support

                 Books       0.97      0.98      0.97      1185
Clothing & Accessories       0.98      0.98      0.98      1196
           Electronics       0.98      0.98      0.98      1205
             Household       0.98      0.97      0.97      1214

              accuracy                           0.98      4800
             macro avg       0.98      0.98      0.98      4800
          weighted avg       0.98      0.98      0.98      4800



# Exercise

In [28]:
df = pd.read_csv("https://github.com/codebasics/nlp-tutorials/raw/main/12_tf_idf/Emotion_classify_Data.csv")
print(df.shape)
df.head()

(5937, 2)


Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [29]:
df.Emotion.value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    df.Comment,
    df.Emotion,
    test_size=.2,
    random_state=1,
    stratify=df.Emotion
)

y_train.value_counts()

joy      1600
anger    1600
fear     1549
Name: Emotion, dtype: int64

In [32]:
clf = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    MultinomialNB()
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

display(
    y_test[:10].to_numpy(),
    y_pred[:10]
)

print(classification_report(y_pred, y_test))

array(['anger', 'joy', 'fear', 'joy', 'anger', 'fear', 'anger', 'joy',
       'anger', 'anger'], dtype=object)

array(['anger', 'joy', 'anger', 'joy', 'anger', 'anger', 'anger', 'joy',
       'anger', 'anger'], dtype='<U5')

              precision    recall  f1-score   support

       anger       0.92      0.86      0.89       428
        fear       0.85      0.91      0.87       362
         joy       0.89      0.90      0.89       398

    accuracy                           0.89      1188
   macro avg       0.89      0.89      0.89      1188
weighted avg       0.89      0.89      0.89      1188



In [33]:
clf = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    KNeighborsClassifier()
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

display(
    y_test[:10].to_numpy(),
    y_pred[:10]
)

print(classification_report(y_pred, y_test))

array(['anger', 'joy', 'fear', 'joy', 'anger', 'fear', 'anger', 'joy',
       'anger', 'anger'], dtype=object)

array(['anger', 'joy', 'fear', 'joy', 'anger', 'fear', 'anger', 'fear',
       'anger', 'anger'], dtype=object)

              precision    recall  f1-score   support

       anger       0.76      0.62      0.69       488
        fear       0.75      0.68      0.71       431
         joy       0.52      0.78      0.62       269

    accuracy                           0.68      1188
   macro avg       0.68      0.69      0.67      1188
weighted avg       0.70      0.68      0.68      1188



In [34]:
clf = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    LogisticRegression()
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

display(
    y_test[:10].to_numpy(),
    y_pred[:10]
)

print(classification_report(y_pred, y_test))

array(['anger', 'joy', 'fear', 'joy', 'anger', 'fear', 'anger', 'joy',
       'anger', 'anger'], dtype=object)

array(['anger', 'joy', 'fear', 'joy', 'anger', 'fear', 'anger', 'joy',
       'anger', 'anger'], dtype=object)

              precision    recall  f1-score   support

       anger       0.90      0.89      0.90       403
        fear       0.86      0.91      0.88       366
         joy       0.90      0.86      0.88       419

    accuracy                           0.89      1188
   macro avg       0.89      0.89      0.89      1188
weighted avg       0.89      0.89      0.89      1188



In [35]:
from sklearn.ensemble import RandomForestClassifier

clf = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    RandomForestClassifier()
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

display(
    y_test[:10].to_numpy(),
    y_pred[:10]
)

print(classification_report(y_pred, y_test))

array(['anger', 'joy', 'fear', 'joy', 'anger', 'fear', 'anger', 'joy',
       'anger', 'anger'], dtype=object)

array(['anger', 'joy', 'fear', 'joy', 'anger', 'anger', 'anger', 'joy',
       'anger', 'joy'], dtype=object)

              precision    recall  f1-score   support

       anger       0.91      0.91      0.91       401
        fear       0.87      0.93      0.89       363
         joy       0.92      0.87      0.89       424

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188

