### Import Dataset

In [49]:
import pandas as pd

ecom = pd.read_csv(r'data/ecommerceDataset.csv', header=None, names=['class', 'comment'])
ecom = ecom[~ecom['comment'].isnull()]

In [50]:
ecom.head()

Unnamed: 0,class,comment
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


### Handling imbalance class

In [53]:
ecom.groupby('class')['comment'].count().reset_index()

Unnamed: 0,class,comment
0,Books,11820
1,Clothing & Accessories,8670
2,Electronics,10621
3,Household,19313


- Since there is an imbalace into the class we will try to follow the oversampling method using text modifiction
- Text modification will include synonims replacement but we can also use LLM to write similar sentances or augment the sentances
- We will compare both the results and look at the best model

In [51]:
os_data = ecom.copy()

In [52]:
import nltk
import random
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

# Function to replace words with synonyms
def synonym_replacement(sentence, n=1):
    words = sentence.split()
    new_words = words.copy()
    random_words = list(set(words))
    random.shuffle(random_words)
    
    num_replaced = 0
    for word in random_words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words = [synonym if w == word else w for w in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    
    return ' '.join(new_words)

# Function to augment sentences
def augment_sentences(df, category_column, text_column, augment_factor=1):
    augmented_texts = []
    augmented_labels = []
    
    class_counts = df[category_column].value_counts()
    
    class_count_data = class_counts.reset_index()
    class_count_data['count'] = 19313-class_count_data['count']
    values = class_count_data[class_count_data['class']!='Household'][['class', 'count']].values
    
    for category_count in values:
        subset = df[df[category_column] == category_count[0]]
        for _, row in subset.iterrows():
            for _ in range(augment_factor):
                augmented_texts.append(synonym_replacement(row[text_column]))
                augmented_labels.append(category_count[0])
    
    augmented_df = pd.DataFrame({text_column: augmented_texts, category_column: augmented_labels})
    return pd.concat([df, augmented_df], ignore_index=True)


os_data = augment_sentences(os_data, category_column='class', text_column='comment', augment_factor=1)
print("Data augmentation completed!")


[nltk_data] Downloading package wordnet to /Users/omshree/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/omshree/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Data augmentation completed!


In [54]:
os_data.groupby('class')['comment'].count().reset_index()

Unnamed: 0,class,comment
0,Books,23640
1,Clothing & Accessories,17340
2,Electronics,21242
3,Household,19313


In [55]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omshree/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/omshree/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/omshree/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/omshree/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Noise Removal and cleaning function
- For noise removal we will follow following steps:
    - Convert to lowercase
    - Remove html tags
    - Special character removal
    - Remove punctuations
    - Lemmatization

- After the cleaning is done we will use TF-IDF vectorization method to get the frequency of the words 

In [61]:
df_1 = ecom.copy()
df_2 = os_data.copy()


# Text Cleaning Function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)  # Remove special characters
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = nltk.word_tokenize(text)  # Tokenization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatization & Stopword Removal
    return ' '.join(tokens)

# Apply Cleaning
df_1['comment'] = df_1['comment'].apply(clean_text)

df_2['comment'] = df_2['comment'].apply(clean_text)


# Encode Labels
df_1['class'] = df_1['class'].astype('category').cat.codes

df_2['class'] = df_2['class'].astype('category').cat.codes

In [66]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(df_1['comment'], df_1['class'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Initialize Models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest with Imbalance": RandomForestClassifier(n_estimators=100, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Naive Bayes": MultinomialNB()
}

# Train & Evaluate Each Model
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    y_pred_train = model.predict(X_train_tfidf)

    training_accuracy = accuracy_score(y_train, y_pred_train)


    print(f"\n{name} Performance:")
    print("Test Accuracy:", accuracy_score(y_test, y_pred))
    print(f"Training Accuracy: {training_accuracy}")

    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))





Logistic Regression Performance:
Test Accuracy: 0.964700049578582
Training Accuracy: 0.9737722799276135
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96      2378
           1       0.98      0.98      0.98      1750
           2       0.96      0.95      0.95      2082
           3       0.96      0.97      0.97      3875

    accuracy                           0.96     10085
   macro avg       0.97      0.96      0.96     10085
weighted avg       0.96      0.96      0.96     10085

Confusion Matrix:
 [[2268    8   34   68]
 [   9 1711   13   17]
 [  30    8 1978   66]
 [  36   24   43 3772]]

Random Forest with Imbalance Performance:
Test Accuracy: 0.9704511650966783
Training Accuracy: 0.9998016807555963
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      2378
           1       0.98      0.97      0.97      1750
           2       0.9

- We see that without over sampling although the Random Forest is performing quite well but the training accuracy is really high which suggests overfitting problem

In [68]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(df_2['comment'], df_2['class'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_os = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_os.fit_transform(X_train)
X_test_tfidf = tfidf_os.transform(X_test)

# Initialize Models
models_os = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Naive Bayes": MultinomialNB()
}

# Train & Evaluate Each Model
for name, model in models_os.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    y_pred_train = model.predict(X_train_tfidf)

    training_accuracy = accuracy_score(y_train, y_pred_train)


    print(f"\n{name} Performance:")
    print("Test Accuracy:", accuracy_score(y_test, y_pred))
    print(f"Training Accuracy: {training_accuracy}")

    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




Logistic Regression Performance:
Test Accuracy: 0.972036548721408
Training Accuracy: 0.9776629668240633
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      4747
           1       0.99      0.98      0.99      3556
           2       0.97      0.97      0.97      4210
           3       0.95      0.97      0.96      3794

    accuracy                           0.97     16307
   macro avg       0.97      0.97      0.97     16307
weighted avg       0.97      0.97      0.97     16307

Confusion Matrix:
 [[4623   12   39   73]
 [  13 3499   14   30]
 [  58    9 4066   77]
 [  42   25   64 3663]]

Random Forest Performance:
Test Accuracy: 0.990249586067333
Training Accuracy: 0.9997547065677317
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      4747
           1       0.99      0.99      0.99      3556
           2       0.99      0.99     

- After oversampling the accuracy improved quite a lot and given us a good model

### Grid Search
- We will find the best parameters for the random forest model

In [69]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(df_2['comment'], df_2['class'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_os = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_os.fit_transform(X_train)
X_test_tfidf = tfidf_os.transform(X_test)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 4],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_tfidf, y_train)

best_rf = grid_search.best_estimator_
print("Best Random Forest Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.8s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.9s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.9s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   5.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   5.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   5.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   2.9s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   3.0s
[



[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=  23.4s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=  15.1s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=  15.2s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=  23.5s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=  15.3s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  12.7s
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=  18.9s
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  13.0s
[CV] END bootstrap=False, max_depth=30, min_samples_

### Best model evaluation

In [71]:
X_train, X_test, y_train, y_test = train_test_split(df_2['comment'], df_2['class'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_os = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_os.fit_transform(X_train)
X_test_tfidf = tfidf_os.transform(X_test)


y_pred = grid_search.predict(X_test_tfidf)
y_pred_train = grid_search.predict(X_train_tfidf)

training_accuracy = accuracy_score(y_train, y_pred_train)


print(f"Performance:")
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(f"Training Accuracy: {training_accuracy}")

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Performance:
Test Accuracy: 0.9404550193168578
Training Accuracy: 0.9506806892745446
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93      4747
           1       0.98      0.97      0.97      3556
           2       0.98      0.92      0.95      4210
           3       0.92      0.90      0.91      3794

    accuracy                           0.94     16307
   macro avg       0.94      0.94      0.94     16307
weighted avg       0.94      0.94      0.94     16307

Confusion Matrix:
 [[4606   23   14  104]
 [  64 3435   10   47]
 [ 187   12 3864  147]
 [ 261   41   61 3431]]


- Some how it has reduced the overfitting of the model and found the best estimator for random forest

### Save the model for deployment

In [72]:
# Save Best Model (Example: Logistic Regression)
joblib.dump(models_os["Random Forest"], "models/best_model.pkl")
joblib.dump(tfidf_os, "models/tfidf_vectorizer.pkl")

print("Model and Vectorizer saved!")

Model and Vectorizer saved!
