## Data Preprocessing,

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import random
from numpy.linalg import norm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve, auc, \
accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import statistics
from sklearn.neighbors import KNeighborsClassifier
import time



In [2]:
%%time
# Import training data of ecommerce items including its labeled categories
df_item = pd.read_csv('ecommerce_items_v2.csv')
df_item = df_item[['category', 'sub_category', 'item_name']]
df_item.head()

CPU times: user 92.1 ms, sys: 12.6 ms, total: 105 ms
Wall time: 107 ms


Unnamed: 0,category,sub_category,item_name
0,"Tickets, Vouchers & Services",Telco,aon 6 ( 6 gb )
1,"Tickets, Vouchers & Services",Phone Data,[paling murah] jual kuota smartfren terlengkap
2,"Tickets, Vouchers & Services",Shopping,transaksi 1kg
3,"Tickets, Vouchers & Services",E-Money,kartu flazz bca gen 2 motif gucci custome nama
4,"Tickets, Vouchers & Services",Phone Data,voucher smartfren unlimited 7 hari vocer smart...


In [3]:
# Check whether there are null values in any of the dataframe columns
df_item.isnull().any()

category        False
sub_category    False
item_name       False
dtype: bool

In [4]:
# Set a prior distribution of the ecommerce categories (I)
prior_dist = df_item.groupby(by=["category"]).count().reset_index()

In [5]:
prior_dist

Unnamed: 0,category,sub_category,item_name
0,Audio,3500,3500
1,Automobiles,3500,3500
2,Baby & Kids Fashion,3500,3500
3,Beauty,3500,3500
4,Books & Magazines,3500,3500
5,Cameras & Drones,3500,3500
6,Computers & Accessories,3500,3500
7,Fashion Accessories,3500,3500
8,Food & Beverages,3500,3500
9,Gaming & Consoles,3500,3500


In [6]:
# Set a prior distribution of the ecommerce categories (II)
prior_dist['percentage'] = prior_dist.sub_category/sum(prior_dist.sub_category)
prior_dist

Unnamed: 0,category,sub_category,item_name,percentage
0,Audio,3500,3500,0.033333
1,Automobiles,3500,3500,0.033333
2,Baby & Kids Fashion,3500,3500,0.033333
3,Beauty,3500,3500,0.033333
4,Books & Magazines,3500,3500,0.033333
5,Cameras & Drones,3500,3500,0.033333
6,Computers & Accessories,3500,3500,0.033333
7,Fashion Accessories,3500,3500,0.033333
8,Food & Beverages,3500,3500,0.033333
9,Gaming & Consoles,3500,3500,0.033333


In [8]:
# Check total items of the whole dataset
sum(prior_dist.sub_category)

105000

In [9]:
# Drop any row with null item name
df_item_clean = df_item.loc[~df_item.isnull().any(axis = 1)]
df_item_clean = df_item_clean.reset_index(drop = True)
df_item_clean

Unnamed: 0,category,sub_category,item_name
0,"Tickets, Vouchers & Services",Telco,aon 6 ( 6 gb )
1,"Tickets, Vouchers & Services",Phone Data,[paling murah] jual kuota smartfren terlengkap
2,"Tickets, Vouchers & Services",Shopping,transaksi 1kg
3,"Tickets, Vouchers & Services",E-Money,kartu flazz bca gen 2 motif gucci custome nama
4,"Tickets, Vouchers & Services",Phone Data,voucher smartfren unlimited 7 hari vocer smart...
...,...,...,...
104995,Home & Living,Dinnerware,fynshop ♛ ah09 popsocket bt21 pop socket kpop ...
104996,Home & Living,Home Care Supplies,*cod*fh-c51 sendok garpu set alat makan anak b...
104997,Home & Living,Gardening,dithane m 45 80wp repack 50 gram fungisida pem...
104998,Home & Living,Kitchenware,device open the vial nurse ampule breakers doc...


In [10]:
# Now, there's no more null values in the data
df_item_clean.isnull().any()

category        False
sub_category    False
item_name       False
dtype: bool

In [11]:
# Copy the dataframe to maintain the original version before data pre-processing phase
df_nlp = df_item_clean.copy()
df_nlp

Unnamed: 0,category,sub_category,item_name
0,"Tickets, Vouchers & Services",Telco,aon 6 ( 6 gb )
1,"Tickets, Vouchers & Services",Phone Data,[paling murah] jual kuota smartfren terlengkap
2,"Tickets, Vouchers & Services",Shopping,transaksi 1kg
3,"Tickets, Vouchers & Services",E-Money,kartu flazz bca gen 2 motif gucci custome nama
4,"Tickets, Vouchers & Services",Phone Data,voucher smartfren unlimited 7 hari vocer smart...
...,...,...,...
104995,Home & Living,Dinnerware,fynshop ♛ ah09 popsocket bt21 pop socket kpop ...
104996,Home & Living,Home Care Supplies,*cod*fh-c51 sendok garpu set alat makan anak b...
104997,Home & Living,Gardening,dithane m 45 80wp repack 50 gram fungisida pem...
104998,Home & Living,Kitchenware,device open the vial nurse ampule breakers doc...


In [12]:
# Pre-process item_name to remove unnecessary symbols or numbers and to standardize the string values (I)

# UDF creation for standardizing item name

import re
import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer
def string_standardization(df_item_column):
    corpus = []
    for i in range(len(df_item_column)):
      review = re.sub('[^a-zA-Z]', ' ', df_item_column[i])
      review = review.lower()
      review = review.split()
    #   ps = PorterStemmer()
    #   all_stopwords = stopwords.words('english')
    #   all_stopwords.remove('not')
    #   review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
      review = ' '.join(review)
      corpus.append(review)
    return corpus



In [13]:
# Create a custom class weight from population data if used in the algorithm
prior_prob = pd.read_csv('prior_prob_population.csv')
class_labels = prior_prob['category']
class_weights = prior_prob['f0_']
prior_prob_dict = dict(zip(class_labels, class_weights))
prior_prob_dict['Health']

0.0626961041545133

In [14]:
%%time
# Remove redundant item names which have high similarity with others to remove overfitting
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
threshold = 0.95
df_training = pd.DataFrame({'item_name':[], 'category': []})
y = []
for category in class_labels:
    list_item = list(df_nlp['item_name'][df_nlp['category'] == category])
    corpus = string_standardization(list_item)
    cv = TfidfVectorizer(analyzer = 'char_wb', ngram_range = (3, 4), max_features = 10000, max_df = 0.034)
    X = cv.fit_transform(corpus).toarray()
    y = df_nlp['category'][df_nlp['category'] == category].values
    similarities = cosine_similarity(X, X)
    items_to_remove = set()
    for i in range(len(X)):
        for j in range(i+1, len(X)):
            if similarities[i][j] >= threshold:
                items_to_remove.add(j)  # Mark item j for removal
    filtered_item_names = [corpus[i] for i in range(len(corpus)) if i not in items_to_remove]
    filtered_category = [y[i] for i in range(len(y)) if i not in items_to_remove]
    df_category = pd.DataFrame({'item_name':filtered_item_names, 'category': filtered_category})
    df_training = pd.concat([df_training, df_category], ignore_index = True)
df_training

CPU times: user 4min 44s, sys: 29.5 s, total: 5min 14s
Wall time: 1min 9s


Unnamed: 0,item_name,category
0,biosyafa gastira kemasan baru isi ml,Health
1,curvit cl emulsion dgn tambahan curcuma impor ...,Health
2,paket hemat tradislim isi kapsul limmun box,Health
3,fhuta chuan yau wan obat sakit pinggang punggung,Health
4,nervofit spray solusi ampuh atasi syaraf kejepit,Health
...,...,...
102020,sendal wanita selop ban dua lisban non slip se...,Women Shoes
102021,sepatu slop bustong converse all star wanita p...,Women Shoes
102022,sandal jepit swallow nice wanita karet warna p...,Women Shoes
102023,sandal wedges jeans rumbai kp,Women Shoes


In [17]:
# deleting some objects to give more memory for the important objects
del similarities
del corpus
del cv
del df_category

NameError: name 'similarities' is not defined

## Model Training

In [19]:
%%time
# Pre-process item_name to remove unnecessary symbols or numbers and to standardize the string values (II)
corpus = string_standardization(df_training['item_name'])
corpus

CPU times: user 357 ms, sys: 4.62 ms, total: 362 ms
Wall time: 360 ms


['biosyafa gastira kemasan baru isi ml',
 'curvit cl emulsion dgn tambahan curcuma impor utk nafsu makan anak oleh klinik kecantikan athena',
 'paket hemat tradislim isi kapsul limmun box',
 'fhuta chuan yau wan obat sakit pinggang punggung',
 'nervofit spray solusi ampuh atasi syaraf kejepit',
 'kertas ph merck universal ph lakmus merck ph paper merck',
 'organicup size a the menstrual cup organi cup',
 'pen needle bd gx mm micro fine',
 'alat kesehatan',
 'dr el dr el premium baby toothpaste pasta gigi anak organic odol anak odol bayi dari korea dr el',
 'squalene vitasqua',
 'softlens bening bulanan bausch lomb bausch lomb bulanan soflen bening bausch lomb bulanan soflens bening bausch lomb minus',
 'bisa cod super grow up original peninggi badan',
 'flimty fiber pelangsing herbal detox sachet',
 'dental composite nanofill resin komposit tambal gigi gr',
 'nature s way alive men s men ultra potency multivitamin tabs once daily multivitamin men tabs men s energy multivitamin tabs men

In [21]:
%%time
# Build tokenization using TFIDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
cv = TfidfVectorizer(analyzer = 'char_wb', ngram_range = (3, 4), max_df = 0.034, max_features = 50000)
X = cv.fit_transform(corpus).toarray()
y = df_training['category'].values
# Convert to CSR format if it's not already sparse

CPU times: user 3.48 s, sys: 1.83 s, total: 5.31 s
Wall time: 6.52 s


In [23]:
%%time
# Transforming dense matrix to sparse matrix, allowing a bigger feature set before going out of memory
if not isinstance(X, csr_matrix):
tfidf_matrix = csr_matrix(X)

CPU times: user 22.2 s, sys: 12.3 s, total: 34.5 s
Wall time: 44.5 s


In [25]:
# Delete unnecessary objects
del X

In [26]:
# Create a class and function for CPU usage
import psutil
import random
import threading

class DisplayCPU(threading.Thread):

    def run(self):

        self.running = True

        currentProcess = psutil.Process()

        while self.running:
            print(currentProcess.cpu_percent(interval=1))

    def stop(self):
        self.running = False

In [27]:
%%time
# Predict classes of each string using Naive Bayes
# from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
display_cpu = DisplayCPU()

display_cpu.start()
try:
    # classifier = LinearSVC(random_state=42)
    # classifier = LinearSVC(random_state=42, class_weight = prior_prob_dict)
    # classifier = MultinomialNB()
    classifier = RandomForestClassifier(random_state=0, max_depth = 200, n_estimators = 200)
    classifier.fit(tfidf_matrix, y)
finally: # stop thread even when I press Ctrl+C
    display_cpu.stop()

99.6
100.1
99.8
99.8
100.5
100.0
99.5
100.5
99.6
100.4
100.3
99.8
99.3
100.1
99.8
99.8
100.4
100.3
100.0
99.9
100.0
99.5
100.5
99.3
100.5
100.1
99.9
99.6
100.6
100.0
100.1
99.5
100.5
100.4
99.7
100.3
100.0
100.0
99.6
100.5
100.1
99.6
100.6
99.4
100.4
99.5
100.5
100.1
100.0
100.1
99.2
100.8
99.5
100.5
99.5
100.5
99.9
100.4
99.3
100.7
99.8
99.9
99.7
100.7
99.7
100.2
100.1
99.6
100.5
100.1
100.0
100.2
99.9
99.6
100.5
100.2
99.5
100.1
99.0
99.3
100.0
100.5
99.5
100.6
97.4
100.0
99.8
100.3
99.9
100.1
96.4
100.8
99.5
100.4
97.0
100.0
100.1
99.9
100.1
100.0
100.0
99.6
100.5
99.7
97.6
100.1
99.8
100.1
100.2
100.0
99.9
99.5
99.3
99.9
99.3
100.5
100.3
99.8
99.3
100.3
100.4
99.6
100.5
99.5
100.7
99.3
99.9
99.6
100.3
100.2
99.5
100.5
100.0
100.1
99.6
100.5
99.5
100.6
100.0
100.0
99.5
100.6
100.0
99.5
100.5
98.7
99.6
100.4
99.7
100.0
100.4
99.7
99.8
100.5
99.6
100.5
99.5
100.6
100.1
99.6
100.5
99.7
99.9
99.6
100.5
99.8
96.6
100.0
99.4
100.4
99.7
100.2
99.9
100.3
100.5
100.1
99.2
100.4
100.2
99.6
99

In [28]:
%%time
# Try predicting the category of training set to see how good the prediction the is on known set
df_predict = pd.DataFrame(df_training['item_name'], columns = ['item_name'])
list_proba = classifier.predict(tfidf_matrix)
df_predict_proba = pd.DataFrame(data = list_proba, columns = ['predicted_cat'])
df_pred = pd.concat([df_predict, df_predict_proba], axis = 1)
df_pred['actual_label'] = y
df_pred

CPU times: user 8.86 s, sys: 866 ms, total: 9.72 s
Wall time: 10.1 s


Unnamed: 0,item_name,predicted_cat,actual_label
0,biosyafa gastira kemasan baru isi ml,Health,Health
1,curvit cl emulsion dgn tambahan curcuma impor ...,Health,Health
2,paket hemat tradislim isi kapsul limmun box,Health,Health
3,fhuta chuan yau wan obat sakit pinggang punggung,Health,Health
4,nervofit spray solusi ampuh atasi syaraf kejepit,Health,Health
...,...,...,...
102020,sendal wanita selop ban dua lisban non slip se...,Women Shoes,Women Shoes
102021,sepatu slop bustong converse all star wanita p...,Women Shoes,Women Shoes
102022,sandal jepit swallow nice wanita karet warna p...,Women Shoes,Women Shoes
102023,sandal wedges jeans rumbai kp,Women Shoes,Women Shoes


In [29]:
from sklearn.metrics import f1_score

# Check the functionality performance of the training set prediction (F1)
# Replace these with your actual ground truth and predicted labels
true_labels = y
predicted_labels = df_pred['predicted_cat']

# Calculate the macro-averaged F1 score
macro_f1 = f1_score(true_labels, predicted_labels, average='macro')

# Calculate the micro-averaged F1 score
micro_f1 = f1_score(true_labels, predicted_labels, average='micro')

# Calculate the F1 score for each class
class_f1_scores = f1_score(true_labels, predicted_labels, average=None)

print("Macro-averaged F1 score:", macro_f1)
print("Micro-averaged F1 score:", micro_f1)
print("F1 score for each class:", class_f1_scores)

Macro-averaged F1 score: 0.9763622069662062
Micro-averaged F1 score: 0.9750257289879931
F1 score for each class: [0.98135543 0.98668211 0.98283703 0.98531763 0.98632528 0.98506571
 0.9848105  0.97983461 0.97936806 0.98119605 0.98174971 0.97820753
 0.81428065 0.97514996 0.97952424 0.9887542  0.98460172 0.99114779
 0.98093842 0.99277874 0.98717949 0.98510544 0.98794831 0.97691973
 0.98091532 0.97261288 0.97832528 0.96538289 0.97608217 0.98046934]


In [30]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Check the functionality performance of the training set prediction (Precision/Recall/Accuracy)
# Calculate precision scores
precision_scores = precision_score(true_labels, predicted_labels, average=None)

# Calculate recall scores
recall_scores = recall_score(true_labels, predicted_labels, average=None)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)

print("Precision scores for each class:", precision_scores)
print("Recall scores for each class:", recall_scores)
print("Accuracy:", accuracy)


Precision scores for each class: [0.99281437 0.99358601 0.99436034 0.98833479 0.99705882 0.99427193
 0.99109528 0.98397436 0.97894433 0.9903547  0.9886231  0.98460198
 0.69240048 0.96577086 0.98695388 0.99617422 0.99283368 0.99758381
 0.99287622 0.99854736 0.99617759 0.99701493 0.99677703 0.98076363
 0.99175721 0.97855228 0.98338558 0.9844132  0.99224344 0.98749619]
Recall scores for each class: [0.97015799 0.97987349 0.97157773 0.98231884 0.97582038 0.97602841
 0.97860492 0.97572956 0.97979215 0.97220525 0.97497123 0.97189561
 0.98823529 0.98471301 0.97220562 0.9814439  0.97650514 0.98479428
 0.96928427 0.98707639 0.97834248 0.97347712 0.97927461 0.97310584
 0.97030792 0.96674514 0.97331679 0.94707439 0.96043893 0.97354179]
Accuracy: 0.9750257289879931


In [27]:
# Delete unnecessary objects to reduce memory
del tfidf_matrix

NameError: name 'tfidf_matrix' is not defined

## Performance Test on Example Set

In [28]:
# Let's try to predict sample cases, will it return to the right class?
# First, we need to pre-process the data to remove unnecessary symbols and numbers and to standardize its value
example = ['mouse gaming', 'baju kaos hitam lengan pendek', 'voucher xl 3gb', 'tas macbook air', 
           'atasan rok pria', 'uniland kasur lipat', 'pakaian', 'Vaseline UV Protect Sun Stick', 'Acne Tea Tree Oil Clay Mask123',
          'Flimty Fiber', 'us sliced beef', 'benang rajut katun']
corpus_example = string_standardization(example)
corpus_example

['mouse gaming',
 'baju kaos hitam lengan pendek',
 'voucher xl gb',
 'tas macbook air',
 'atasan rok pria',
 'uniland kasur lipat',
 'pakaian',
 'vaseline uv protect sun stick',
 'acne tea tree oil clay mask',
 'flimty fiber',
 'us sliced beef',
 'benang rajut katun']

In [29]:
# First, let's transform the example into vectorizer
corpus_example = cv.transform(corpus_example).toarray()
corpus_example

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
%%time
# Try predicting the category of example set to see how good the prediction the is on known set
df_item_predict = pd.DataFrame(example, columns = ['item_name'])
list_proba = classifier.predict(corpus_example)
df_predict_proba = pd.DataFrame(data = list_proba, columns = ['predicted_cat'])
df_pred = pd.concat([df_item_predict, df_predict_proba], axis = 1)
df_pred

CPU times: user 17.8 ms, sys: 14.4 ms, total: 32.2 ms
Wall time: 29.9 ms


Unnamed: 0,item_name,predicted_cat
0,mouse gaming,Computers & Accessories
1,baju kaos hitam lengan pendek,Men Clothes
2,voucher xl 3gb,"Tickets, Vouchers & Services"
3,tas macbook air,Computers & Accessories
4,atasan rok pria,Women Clothes
5,uniland kasur lipat,Beauty
6,pakaian,Home & Living
7,Vaseline UV Protect Sun Stick,Food & Beverages
8,Acne Tea Tree Oil Clay Mask123,Beauty
9,Flimty Fiber,Health


In [31]:
# Delete unnecessary objects to reduce memory
del corpus_example

In [34]:
# Import test data of ecommerce items including its labeled categories
df_test = pd.read_csv('item_cat_label - test_ecommerce_item_v2.csv')
df_test.head()

Unnamed: 0,item_name,predicted_cat,actual_label,is_correct,reassigned_label
0,speaker polytron pma pma bluetooth aux usb kar...,Audio,Audio,True,Audio
1,ready stok simbadda bluetooth speaker cst n cs...,Audio,Audio,True,Audio
2,lenyes s speaker bluetooth waterproof tahan air,Audio,Audio,True,Audio
3,koo broadband receiver sdr khz ke ghz untuk ms...,Audio,Audio,True,Audio
4,speaker gaming rgb with rhythm equalizeer and ...,Audio,Audio,True,Audio


In [36]:
# Check whether there are null values in any of the dataframe columns
df_test.isnull().any()

item_name            True
predicted_cat       False
actual_label        False
is_correct          False
reassigned_label    False
dtype: bool

## Model Performance on Test Set

In [37]:
# Set a prior distribution of the ecommerce categories (I)
prior_dist_test = df_test.groupby(by=["actual_label"]).count().reset_index()

In [39]:
# Set a prior distribution of the ecommerce categories (II)
prior_dist_test['percentage'] = prior_dist_test.item_name/sum(prior_dist_test.item_name)
prior_dist_test

Unnamed: 0,actual_label,item_name,predicted_cat,is_correct,reassigned_label,percentage
0,Audio,500,500,500,500,0.033347
1,Automobiles,500,500,500,500,0.033347
2,Baby & Kids Fashion,500,500,500,500,0.033347
3,Beauty,500,500,500,500,0.033347
4,Books & Magazines,498,500,500,500,0.033213
5,Cameras & Drones,500,500,500,500,0.033347
6,Computers & Accessories,500,500,500,500,0.033347
7,Fashion Accessories,500,500,500,500,0.033347
8,Food & Beverages,500,500,500,500,0.033347
9,Gaming & Consoles,500,500,500,500,0.033347


In [41]:
# Check total items of the whole dataset
sum(prior_dist_test.item_name)

14994

In [42]:
# Drop any row with null item name and remove 'OTHERS' category
df_test_clean = df_test.loc[~df_test.isnull().any(axis = 1)]
df_test_clean = df_test_clean[df_test_clean['reassigned_label'] != 'OTHERS']
df_test_clean = df_test_clean.reset_index(drop = True)
df_test_clean

Unnamed: 0,item_name,predicted_cat,actual_label,is_correct,reassigned_label
0,speaker polytron pma pma bluetooth aux usb kar...,Audio,Audio,True,Audio
1,ready stok simbadda bluetooth speaker cst n cs...,Audio,Audio,True,Audio
2,lenyes s speaker bluetooth waterproof tahan air,Audio,Audio,True,Audio
3,koo broadband receiver sdr khz ke ghz untuk ms...,Audio,Audio,True,Audio
4,speaker gaming rgb with rhythm equalizeer and ...,Audio,Audio,True,Audio
...,...,...,...,...,...
14986,sepatu sneakers pria sup sport terlaris,Men Shoes,Women Shoes,False,Men Shoes
14987,sepatu kets keren polos,Men Shoes,Women Shoes,False,Women Shoes
14988,winola payless sepatu flat wanita import jelly...,Women Shoes,Women Shoes,True,Women Shoes
14989,beli gratis fashion sneakers,Women Shoes,Women Shoes,True,Women Shoes


In [43]:
# All correct labels that will be predicted
(df_test_clean['reassigned_label'].unique())

array(['Audio', 'Motorcycles', 'Computers & Accessories',
       'Home Appliances', 'Home & Living', 'Tickets, Vouchers & Services',
       'Hobbies & Collections', 'Mobile & Gadgets', 'Stationery',
       'Gaming & Consoles', 'Fashion Accessories', 'Health',
       'Cameras & Drones', 'Watches', 'Automobiles', 'Women Bags',
       'Beauty', 'Baby & Kids Fashion', 'Men Shoes', 'Sports & Outdoors',
       'Travel & Luggage', 'Pets', 'Men Bags', 'Women Clothes',
       'Muslim Fashion', 'Women Shoes', 'Mom & Baby', 'Men Clothes',
       'Books & Magazines', 'Food & Beverages'], dtype=object)

In [44]:
# Now, there's no more null values in the data
df_test_clean.isnull().any()

item_name           False
predicted_cat       False
actual_label        False
is_correct          False
reassigned_label    False
dtype: bool

In [45]:
# Copy the dataframe to maintain the original version before data pre-processing phase
df_nlp_test = df_test_clean.copy()
df_nlp_test

Unnamed: 0,item_name,predicted_cat,actual_label,is_correct,reassigned_label
0,speaker polytron pma pma bluetooth aux usb kar...,Audio,Audio,True,Audio
1,ready stok simbadda bluetooth speaker cst n cs...,Audio,Audio,True,Audio
2,lenyes s speaker bluetooth waterproof tahan air,Audio,Audio,True,Audio
3,koo broadband receiver sdr khz ke ghz untuk ms...,Audio,Audio,True,Audio
4,speaker gaming rgb with rhythm equalizeer and ...,Audio,Audio,True,Audio
...,...,...,...,...,...
14986,sepatu sneakers pria sup sport terlaris,Men Shoes,Women Shoes,False,Men Shoes
14987,sepatu kets keren polos,Men Shoes,Women Shoes,False,Women Shoes
14988,winola payless sepatu flat wanita import jelly...,Women Shoes,Women Shoes,True,Women Shoes
14989,beli gratis fashion sneakers,Women Shoes,Women Shoes,True,Women Shoes


In [46]:
# Pre-process test set item_name to remove unnecessary symbols or numbers and to standardize the string values (II)
test = string_standardization(df_nlp_test['item_name'])
test

['speaker polytron pma pma bluetooth aux usb karaoke',
 'ready stok simbadda bluetooth speaker cst n cst n cst',
 'lenyes s speaker bluetooth waterproof tahan air',
 'koo broadband receiver sdr khz ke ghz untuk msi sdr radio',
 'speaker gaming rgb with rhythm equalizeer and modes nyk x kronos',
 'speaker inci ashley lf v lf v watt komponen',
 'mixer microverb best bluetooth channel microverb',
 'spliter av amp load rca input output usat splitter video audio',
 'mixer yamaha mg xu mixer audio yamaha mg xu channel',
 'speker meeting bluetooth advance k',
 'mcdodo audio converter splitter for iphoneto mm dual lightning charging musik ca ca',
 'piston assy std gx',
 'headset gaming rgb microphone audio d rexus vonix f free splitter',
 'headset samsung galaxy buds pro buds live buds buds plus',
 'handsfree headset in ear jbl t a original',
 'headset gaming rexus vonix f dual jack dengan usb promo list putih',
 'membran keypad indicator sonic a e',
 'speaker bluetooth portable bass speaker b

In [47]:
%%time
# First, let's transform the example into vectorizer
corpus_test = cv.transform(test).toarray()

CPU times: user 543 ms, sys: 285 ms, total: 828 ms
Wall time: 994 ms


In [49]:
%%time
# Transform dense to sparse matrix to save memory
if not isinstance(corpus_test, csr_matrix):
    tfidf_matrix_test = csr_matrix(corpus_test)
y_test = df_nlp_test['reassigned_label'].values
tfidf_matrix_test

CPU times: user 3.27 s, sys: 2.23 s, total: 5.5 s
Wall time: 7.3 s


<14991x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 791699 stored elements in Compressed Sparse Row format>

In [50]:
# Delete unnecessary objects
del corpus_test

In [51]:
%%time
# Try predicting the category of training set to see how good the prediction the is on known set
df_test_predict = pd.DataFrame(test, columns = ['item_name'])
list_proba = classifier.predict(tfidf_matrix_test)
df_predict_proba_test = pd.DataFrame(data = list_proba, columns = ['predicted_label'])
df_pred_test = pd.concat([df_test_predict, df_predict_proba_test], axis = 1)
df_pred_test['reassigned_label'] = df_nlp_test['reassigned_label']
df_pred_test

CPU times: user 1.37 s, sys: 444 ms, total: 1.82 s
Wall time: 2.14 s


Unnamed: 0,item_name,predicted_label,reassigned_label
0,speaker polytron pma pma bluetooth aux usb kar...,Audio,Audio
1,ready stok simbadda bluetooth speaker cst n cs...,Audio,Audio
2,lenyes s speaker bluetooth waterproof tahan air,Audio,Audio
3,koo broadband receiver sdr khz ke ghz untuk ms...,Audio,Audio
4,speaker gaming rgb with rhythm equalizeer and ...,Audio,Audio
...,...,...,...
14986,sepatu sneakers pria sup sport terlaris,Men Shoes,Men Shoes
14987,sepatu kets keren polos,Women Shoes,Women Shoes
14988,winola payless sepatu flat wanita import jelly...,Women Shoes,Women Shoes
14989,beli gratis fashion sneakers,Women Shoes,Women Shoes


In [52]:
from sklearn.metrics import f1_score

# Replace these with your actual ground truth and predicted labels
true_labels = y_test
predicted_labels = df_pred_test['predicted_label']

# Calculate the macro-averaged F1 score
macro_f1 = f1_score(true_labels, predicted_labels, average='macro')

# Calculate the micro-averaged F1 score
micro_f1 = f1_score(true_labels, predicted_labels, average='micro')

# Calculate the F1 score for each class
class_f1_scores = f1_score(true_labels, predicted_labels, average=None)

print("Macro-averaged F1 score:", macro_f1)
print("Micro-averaged F1 score:", micro_f1)
print("F1 score for each class:", class_f1_scores)

Macro-averaged F1 score: 0.7630373346471706
Micro-averaged F1 score: 0.7571209392302047
F1 score for each class: [0.80694586 0.7794994  0.73148148 0.78416149 0.85239492 0.82207207
 0.79279279 0.7543554  0.7404783  0.87368421 0.6857671  0.57142857
 0.46782178 0.69806094 0.74663677 0.76996198 0.7857868  0.86964981
 0.73132664 0.83143744 0.8284264  0.83770651 0.75585284 0.68879668
 0.89846154 0.71100363 0.90391459 0.69742489 0.67900093 0.79478827]


In [53]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Calculate precision scores
precision_scores = precision_score(true_labels, predicted_labels, average=None)

# Calculate recall scores
recall_scores = recall_score(true_labels, predicted_labels, average=None)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)

print("Precision scores for each class:", precision_scores)
print("Recall scores for each class:", recall_scores)
print("Accuracy:", accuracy)


Precision scores for each class: [0.78217822 0.83631714 0.82291667 0.78294574 0.82732448 0.79347826
 0.80816327 0.77321429 0.66987179 0.8234127  0.73757455 0.62566845
 0.43150685 0.67741935 0.69957983 0.75984991 0.73434535 0.91038697
 0.72888889 0.78823529 0.83265306 0.87070707 0.84328358 0.77934272
 0.90495868 0.6419214  0.85810811 0.71902655 0.71819961 0.82993197]
Recall scores for each class: [0.83333333 0.72991071 0.65833333 0.78538103 0.87903226 0.85280374
 0.77799607 0.73639456 0.82772277 0.93049327 0.64075993 0.5258427
 0.51081081 0.72       0.80048077 0.78034682 0.84497817 0.83240223
 0.73378076 0.87964989 0.82424242 0.8071161  0.68484848 0.61710037
 0.89205703 0.79674797 0.95488722 0.67708333 0.64385965 0.7625    ]
Accuracy: 0.7571209392302047
