### 🗞️E-Commerce Product Classification: BoW + ML

📂 Step 1: Import Libraries

In [1]:
import pandas as pd
import spacy

🗂️ Step 2: Load Dataset

In [2]:
# Read the preprocessed e-commerce CSV from the working directory and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer="processed_ecommerceDataset.csv")
df.head(5)

Unnamed: 0,label,text,processed_text
0,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",saf floral framed painting wood 30 inch x 10 i...
1,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,saf uv texture modern art print framed paintin...
2,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",saf flower print framed painting synthetic 13....
3,Household,Incredible Gifts India Wooden Happy Birthday U...,incredible gifts india wooden happy birthday u...
4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...,pitaara box romantic venice canvas painting 6 ...


🔍Step 3: Explore Data

In [None]:
# Per-column count of missing values (isna is the canonical name for isnull).
df.isna().sum()

label             0
text              0
processed_text    1
dtype: int64

In [6]:
# Remove every row that contains a missing value, then report the resulting
# (rows, columns). Rebinding the result instead of using inplace=True avoids
# mutating the frame in place, keeps the call chainable, and stays idempotent
# when the cell is re-run.
df = df.dropna()
df.shape

(27800, 3)

🪓 Step 5: Train/Test Split

In [7]:
from sklearn.model_selection import train_test_split

# Features are the preprocessed text column; targets are the category labels.
X, y = df["processed_text"], df["label"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

🤖Step 6: Train the Model

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

🧪Attempt 1 (Baseline - Unigrams Only)

In [9]:
# Attempt 1 - baseline: single-token (unigram) counts feeding a
# multinomial Naive Bayes classifier.
clf_1 = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 1))),
    ('model', MultinomialNB()),
]).fit(X_train, y_train)  # fit() returns the fitted pipeline itself

y_pred_1 = clf_1.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print("Attempt 1 (Unigrams):", classification_report(y_test, y_pred_1), sep="\n")

Attempt 1 (Unigrams):
                      precision    recall  f1-score   support

               Books       0.97      0.92      0.95      1270
Clothing_Accessories       0.95      0.98      0.96      1124
         Electronics       0.92      0.93      0.93      1049
           Household       0.94      0.96      0.95      2117

            accuracy                           0.95      5560
           macro avg       0.95      0.95      0.95      5560
        weighted avg       0.95      0.95      0.95      5560



🧪Attempt 2 (Unigrams + Bigrams)

In [10]:
# Attempt 2: counts of single tokens plus adjacent token pairs
# (ngram_range 1..2), same multinomial Naive Bayes classifier.
clf_2 = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('model', MultinomialNB()),
]).fit(X_train, y_train)  # fit() returns the fitted pipeline itself

y_pred_2 = clf_2.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print("Attempt 2 (Unigrams+Bigrams):", classification_report(y_test, y_pred_2), sep="\n")

Attempt 2 (Unigrams+Bigrams):
                      precision    recall  f1-score   support

               Books       0.98      0.92      0.95      1270
Clothing_Accessories       0.97      0.97      0.97      1124
         Electronics       0.94      0.93      0.94      1049
           Household       0.93      0.97      0.95      2117

            accuracy                           0.95      5560
           macro avg       0.96      0.95      0.95      5560
        weighted avg       0.95      0.95      0.95      5560



🧪Attempt 3 (Bigrams Only)

In [11]:
# Attempt 3: only adjacent token pairs (ngram_range 2..2) are counted;
# single tokens are excluded from the vocabulary.
clf_3 = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(2, 2))),
    ('model', MultinomialNB()),
]).fit(X_train, y_train)  # fit() returns the fitted pipeline itself

y_pred_3 = clf_3.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print("Attempt 3 (Bigrams Only):", classification_report(y_test, y_pred_3), sep="\n")

Attempt 3 (Bigrams Only):
                      precision    recall  f1-score   support

               Books       0.98      0.88      0.92      1270
Clothing_Accessories       0.95      0.97      0.96      1124
         Electronics       0.93      0.92      0.93      1049
           Household       0.91      0.96      0.94      2117

            accuracy                           0.94      5560
           macro avg       0.94      0.93      0.94      5560
        weighted avg       0.94      0.94      0.94      5560



🧪Attempt 4 (Unigrams to Trigrams)

In [12]:
# Attempt 4: counts of one-, two- and three-token sequences
# (ngram_range 1..3), same multinomial Naive Bayes classifier.
clf_4 = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('model', MultinomialNB()),
]).fit(X_train, y_train)  # fit() returns the fitted pipeline itself

y_pred_4 = clf_4.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print("Attempt 4 (Unigrams+Bigrams+Trigrams):", classification_report(y_test, y_pred_4), sep="\n")

Attempt 4 (Unigrams+Bigrams+Trigrams):
                      precision    recall  f1-score   support

               Books       0.98      0.92      0.95      1270
Clothing_Accessories       0.98      0.96      0.97      1124
         Electronics       0.94      0.93      0.94      1049
           Household       0.93      0.98      0.95      2117

            accuracy                           0.95      5560
           macro avg       0.96      0.95      0.95      5560
        weighted avg       0.95      0.95      0.95      5560



⚖️Compare Results & Save Best Model

In [13]:
from sklearn.metrics import f1_score

# Weighted F1 on the held-out split for each attempt, keyed by display label.
# Insertion order follows the attempt number.
results = {
    label: f1_score(y_test, preds, average='weighted')
    for label, preds in [
        ('Attempt 1 (1,1)', y_pred_1),
        ('Attempt 2 (1,2)', y_pred_2),
        ('Attempt 3 (2,2)', y_pred_3),
        ('Attempt 4 (1,3)', y_pred_4),
    ]
}

# Find best model
# Pick the label with the highest score; on a tie, max() keeps the
# earliest-inserted label.
best_attempt = max(results, key=lambda label: results[label])
print(f"\nBest Model: {best_attempt} | F1-Score: {results[best_attempt]:.4f}")


Best Model: Attempt 2 (1,2) | F1-Score: 0.9516


In [14]:
# Select the fitted pipeline that corresponds to the winning attempt label.
# Any label other than attempts 1-3 falls back to clf_4.
best_model = {
    'Attempt 1 (1,1)': clf_1,
    'Attempt 2 (1,2)': clf_2,
    'Attempt 3 (2,2)': clf_3,
}.get(best_attempt, clf_4)

In [15]:
import joblib

# Persist the selected pipeline (vectorizer + classifier) to disk.
# The path is defined once so the confirmation message always names the file
# that was actually written; previously the message reported an unrelated
# filename ('best_fake_news_model.pkl').
MODEL_PATH = 'best_model(bow).pkl'
joblib.dump(best_model, MODEL_PATH)
print(f"Best model saved as '{MODEL_PATH}'")

Best model saved as 'best_fake_news_model.pkl'
