In [2]:
import pandas as pd
import spacy

In [3]:
# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

In [6]:
df = pd.read_csv("Ecommerce_data.csv")

In [7]:
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [10]:
df.label.value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [12]:
df.label.unique()

array(['Household', 'Electronics', 'Clothing & Accessories', 'Books'],
      dtype=object)

In [14]:
df['label_num'] = df['label'].map({
    'Household': 0,
    'Electronics': 1,
    'Clothing & Accessories': 2,
    'Books': 3
})

In [16]:
df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
td_idf_vc = TfidfVectorizer()

In [24]:
td_idf_vc.fit(df.Text.to_numpy())

In [25]:
transformed_text = td_idf_vc.transform(df.Text.to_numpy())

In [27]:
dir(td_idf_vc)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_params',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_t

#### Vocabulary

In [31]:
td_idf_vc.vocabulary_

{'urban': 47187,
 'ladder': 27341,
 'eisner': 17642,
 'low': 28538,
 'back': 8291,
 'study': 43467,
 'office': 32912,
 'computer': 13204,
 'chair': 11697,
 'black': 9453,
 'in': 24483,
 'simple': 41382,
 'the': 45011,
 'has': 22809,
 'firm': 19859,
 'foam': 20184,
 'cushion': 14587,
 'which': 48807,
 'makes': 29119,
 'long': 28401,
 'hours': 23713,
 'at': 7759,
 'your': 49891,
 'desk': 15653,
 'comfortable': 12933,
 'flexible': 20046,
 'meshed': 30072,
 'is': 25575,
 'designed': 15630,
 'for': 20293,
 'air': 6235,
 'circulation': 12281,
 'and': 6768,
 'support': 43886,
 'when': 48793,
 'you': 49874,
 'lean': 27684,
 'curved': 14580,
 'arms': 7407,
 'provide': 36564,
 'ergonomic': 18402,
 'forearm': 20317,
 'adjust': 5864,
 'height': 23095,
 'using': 47294,
 'gas': 21123,
 'lift': 28039,
 'to': 45519,
 'find': 19785,
 'that': 45003,
 'position': 35672,
 'nylon': 32693,
 'castors': 11376,
 'make': 29114,
 'it': 25653,
 'easy': 17368,
 'move': 31109,
 'around': 7423,
 'space': 42335,
 'ch

#### Vocabulary Size

In [34]:
len(td_idf_vc.vocabulary_)

50515

In [36]:
td_idf_vc.get_feature_names_out()

array(['00', '000', '0000', ..., '③ergonomics', '④anti', 'ﬁltration'],
      dtype=object)

#### Get TF-IDF score of each word

In [39]:
all_feature_names = td_idf_vc.get_feature_names_out()

for word in all_feature_names:
    
    idx = td_idf_vc.vocabulary_.get(word)

    score = td_idf_vc.idf_[idx]
    print(f'{word} : {score}')

00 : 7.3481811568453494
000 : 5.797583744434183
0000 : 7.950356559199568
00001 : 9.476412862694618
0001 : 10.392703594568772
0006 : 9.699556414008827
000764 : 9.476412862694618
0008a : 10.392703594568772
000ers : 9.987238486460608
000hrs : 10.392703594568772
000hz : 8.252637431072502
000hzcable : 10.392703594568772
000hzimpedance : 10.392703594568772
000mah : 9.476412862694618
000times : 10.392703594568772
001 : 7.589343213662238
001176 : 9.476412862694618
001276 : 9.987238486460608
001726 : 9.987238486460608
001773 : 10.392703594568772
002 : 8.313262052888938
00207 : 9.987238486460608
00251 : 9.699556414008827
003 : 8.600944125340718
003711 : 10.392703594568772
004 : 8.888626197792497
0040 : 10.392703594568772
004069 : 10.392703594568772
004654 : 9.987238486460608
0048 : 9.987238486460608
004_army : 10.392703594568772
005 : 9.699556414008827
006 : 9.294091305900663
006342 : 9.294091305900663
006p : 9.699556414008827
007 : 9.699556414008827
0075 : 10.392703594568772
008 : 10.3927035945

In [77]:
#print(transformed_text.toarray()[0])

#### More Analysis if possible

#### Train Test Split

In [45]:
from sklearn.model_selection import train_test_split

In [47]:
X_train, X_test, y_train, y_test = train_test_split(
    df.Text,
    df.label_num,
    test_size=0.2,
    random_state=100,
    stratify=df.label_num
)

In [49]:
X_train.shape

(19200,)

In [51]:
X_test.shape

(4800,)

In [53]:
y_train.value_counts()

label_num
0    4800
3    4800
1    4800
2    4800
Name: count, dtype: int64

In [55]:
y_test.value_counts()

label_num
3    1200
2    1200
1    1200
0    1200
Name: count, dtype: int64

## Without Text Preprocessing

### Without Pipeline

#### Transform Training Data

In [61]:
X_train_transformed = td_idf_vc.transform(X_train)

#### Trasnform Test Data

In [71]:
X_test_transformed = td_idf_vc.transform(X_test)

### Use KNN

In [64]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [66]:
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean')

##### Training

In [68]:
knn.fit(X_train_transformed, y_train)

##### Predictions

In [73]:
y_pred = knn.predict(X_test_transformed)

In [74]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95      1200
           1       0.96      0.97      0.96      1200
           2       0.97      0.98      0.97      1200
           3       0.97      0.93      0.95      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



### Use Naive Baise

In [88]:
from sklearn.naive_bayes import MultinomialNB

In [95]:
nb = MultinomialNB()

##### Training

In [97]:
nb.fit(X_train_transformed, y_train)

##### Predictions

In [101]:
y_pred_nb = nb.predict(X_test_transformed)

In [103]:
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1200
           1       0.97      0.97      0.97      1200
           2       0.98      0.98      0.98      1200
           3       0.99      0.92      0.95      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



### Use Random Forest

In [107]:
from sklearn.ensemble import RandomForestClassifier

In [109]:
rf = RandomForestClassifier()

##### Training

In [112]:
rf.fit(X_train_transformed, y_train)

##### Predictions

In [115]:
y_pred_rf = rf.predict(X_test_transformed)

In [117]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1200
           1       0.98      0.97      0.98      1200
           2       0.98      0.98      0.98      1200
           3       0.97      0.97      0.97      1200

    accuracy                           0.98      4800
   macro avg       0.98      0.98      0.98      4800
weighted avg       0.98      0.98      0.98      4800



## Text Preprocessing and Pipelines