<a href="https://colab.research.google.com/github/mazayayumna/NLP-Product-Category/blob/main/Product_Categories_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import gensim
import nltk.corpus
from gensim.models import KeyedVectors
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from nltk.corpus import stopwords

# --- Text / embedding hyper-parameters ---------------------------------------
MAX_NB_WORDS = 200000      # passed to the Keras Tokenizer as `num_words`
MAX_SEQUENCE_LENGTH = 30   # `maxlen` used for every pad_sequences() call
EMBEDDING_DIM = 100        # number of columns of the embedding matrix

# Path of the pre-trained word2vec model file.
file_embedding = "idwiki_word2vec_100.model"
# Optional sanity check of the embedding file (disabled on purpose; kept as a
# real comment instead of a bare string literal, which is an executed statement):
#   id_w2v = gensim.models.word2vec.Word2Vec.load(file_embedding)
#   print(id_w2v.most_similar('sepatu'))

# Category name -> integer label. Labels start at 1, so label 0 is never used.
category_index = {"olahraga":1, "pertukangan":2, "fashion":3, "elektronik":4, "handphone":5}
# Integer label -> category name (inverse of `category_index`).
category_reverse_index = {label: name for name, label in category_index.items()}
# Indonesian stop words from the NLTK corpus, as a set for fast membership tests.
STOPWORDS = set(stopwords.words("indonesian"))

**LOADING DATA**

In [None]:
# Read one CSV per product category; the unpacking order matches `csv_paths`.
csv_paths = ("olahraga.csv", "pertukangan.csv", "fashion.csv", "elektronik.csv", "handphone.csv")
olahraga, pertukangan, fashion, elektronik, handphone = (
    pd.read_csv(csv_path, sep=',') for csv_path in csv_paths
)

# Keep the frames together so later cells can loop over all categories.
datasets = [olahraga, pertukangan, fashion, elektronik, handphone]

# Report, per frame, whether any cell holds a null value.
print("Make sure there are no null values in the datasets")
for frame in datasets:
    has_nulls = frame.isnull().values.any()
    print("Has null values: ", has_nulls)

Make sure there are no null values in the datasets
Has null values:  False
Has null values:  False
Has null values:  False
Has null values:  False
Has null values:  False


**PREPROCESSING**

In [None]:
def preprocess(text):
    """Normalise a title string before tokenisation.

    The text is stripped, lower-cased and split on whitespace; every token
    found in the module-level ``STOPWORDS`` set is dropped and the remaining
    tokens are joined back together with single spaces.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string (empty if every token was a stop word).
    """
    tokens = text.strip().lower().split()
    kept_tokens = [token for token in tokens if token not in STOPWORDS]
    return " ".join(kept_tokens)
    
# Clean the 'title' column of every category frame; the column is overwritten
# in place with the preprocessed text.
for frame in datasets:
    frame['title'] = frame['title'].map(preprocess)

In [None]:
# Collect every title from every category to fit the tokenizer vocabulary.
# The frames must be stacked with pd.concat: `Series + Series` aligns on the
# index and concatenates the strings element-wise (NaN wherever the frames
# differ in length), which would shrink the corpus to a few glued-together rows.
all_texts = pd.concat(
    [
        olahraga['title'],
        pertukangan['title'],
        fashion['title'],
        elektronik['title'],
        handphone['title'],
    ],
    ignore_index=True,
)
# Keep one copy of each repeated title. `keep=False` would discard every title
# that occurs more than once, removing its words from the vocabulary.
all_texts = all_texts.drop_duplicates()

# Build the word -> integer index from the combined corpus.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(all_texts)

# Convert each category's titles into lists of word indices.
olahraga_sequences = tokenizer.texts_to_sequences(olahraga['title'])
pertukangan_sequences = tokenizer.texts_to_sequences(pertukangan['title'])
fashion_sequences = tokenizer.texts_to_sequences(fashion['title'])
elektronik_sequences = tokenizer.texts_to_sequences(elektronik['title'])
handphone_sequences = tokenizer.texts_to_sequences(handphone['title'])

# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH so each category becomes
# a 2-D integer array with one row per title.
olahraga_data = pad_sequences(olahraga_sequences, maxlen=MAX_SEQUENCE_LENGTH)
pertukangan_data = pad_sequences(pertukangan_sequences, maxlen=MAX_SEQUENCE_LENGTH)
fashion_data = pad_sequences(fashion_sequences, maxlen=MAX_SEQUENCE_LENGTH)
elektronik_data = pad_sequences(elektronik_sequences, maxlen=MAX_SEQUENCE_LENGTH)
handphone_data = pad_sequences(handphone_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Show the integer id the fitted tokenizer assigned to each word of a sample
# title. A word that is missing from the vocabulary raises KeyError here.
word_index = tokenizer.word_index
test_string = "sepatu nike terbaru"

print("word\t\tid")
print("-" * 20)
for token in test_string.split():
    token_id = word_index[token]
    print("%s\t\t%s" % (token, token_id))

word		id
--------------------
sepatu		10
nike		7
terbaru		290


In [None]:
# Demonstrate the two encoding steps on a pair of sample titles:
# text -> list of word ids -> array padded to MAX_SEQUENCE_LENGTH.
sample_titles = ["sepatu nike terbaru", "sepatu adidas terbaru"]
test_sequence = tokenizer.texts_to_sequences(sample_titles)
padded_sequence = pad_sequences(test_sequence, maxlen=MAX_SEQUENCE_LENGTH)

print("Text to Vector", test_sequence)
print("Padded Vector", padded_sequence)

Text to Vector [[10, 7, 290], [10, 113, 290]]
Padded Vector [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0  10   7 290]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0  10 113 290]]


In [None]:
# Print the one-hot vector of each category label. The vectors have 6 slots
# because the labels in `category_index` run from 1 to 5 and slot 0 is unused.
for heading, category_name in (
    ("olahraga: \t", "olahraga"),
    ("pertukangan: \t", "pertukangan"),
    ("fashion: \t", "fashion"),
    ("elektronik: \t", "elektronik"),
    ("handphone: \t", "handphone"),
):
    print(heading, to_categorical(category_index[category_name], 6))

olahraga: 	 [0. 1. 0. 0. 0. 0.]
pertukangan: 	 [0. 0. 1. 0. 0. 0.]
fashion: 	 [0. 0. 0. 1. 0. 0.]
elektronik: 	 [0. 0. 0. 0. 1. 0.]
handphone: 	 [0. 0. 0. 0. 0. 1.]


In [None]:
# Report the (rows, MAX_SEQUENCE_LENGTH) shape of each padded category array.
for heading, padded in (
    ("olahraga shape: ", olahraga_data),
    ("pertukangan shape: ", pertukangan_data),
    ("fashion shape: ", fashion_data),
    ("elektronik shape: ", elektronik_data),
    ("handphone shape: ", handphone_data),
):
    print(heading, padded.shape)

# Stack the features of all categories row-wise. The labels are concatenated
# in the same category order so row i of `data` matches row i of `category`.
data = np.vstack((olahraga_data, pertukangan_data, fashion_data, elektronik_data, handphone_data))
label_columns = [
    olahraga['category'],
    pertukangan['category'],
    fashion['category'],
    elektronik['category'],
    handphone['category'],
]
category = pd.concat(label_columns).values
# One-hot encode the integer labels.
category = to_categorical(category)

print("-"*10)
print("combined data shape: ", data.shape)
print(category)
print("combined category/label shape: ", category.shape)

olahraga shape:  (7838, 30)
pertukangan shape:  (1826, 30)
fashion shape:  (8910, 30)
elektronik shape:  (15897, 30)
handphone shape:  (6136, 30)
----------
combined data shape:  (40607, 30)
[[0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]]
combined category/label shape:  (40607, 6)


In [None]:
VALIDATION_SPLIT = 0.4  # fraction of all rows held out for validation
SPLIT_SEED = 42         # fixed seed so the shuffle (and the split) is reproducible

# Shuffle features and labels with the same row permutation so they stay paired.
indices = np.random.default_rng(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]          # shuffled product titles (x)
category = category[indices]  # shuffled one-hot labels (y)

nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Split on an explicit boundary rather than with negative slices:
# `data[:-0]` is empty, so a validation count of 0 would leave no training rows.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = category[:split_at]
x_val = data[split_at:]
y_val = category[split_at:]

**WORD2VEC EMBEDDING**

In [None]:
# Load the saved word2vec model; its word vectors are reached through `.wv`.
word2vec = gensim.models.KeyedVectors.load(file_embedding)

# gensim 4 removed `wv.vocab` in favour of `wv.key_to_index`; support both so
# the cell runs on either major version.
word_vectors = word2vec.wv
known_words = word_vectors.key_to_index if hasattr(word_vectors, "key_to_index") else word_vectors.vocab
print('Found %s word vectors of word2vec' % len(known_words))

Found 331792 word vectors of word2vec


In [None]:
from keras.layers import Embedding

word_index = tokenizer.word_index
# Row 0 is reserved for padding, hence the +1.
nb_words = min(MAX_NB_WORDS, len(word_index))+1

# gensim 4 removed `wv.vocab` in favour of `wv.key_to_index`; support both.
word_vectors = word2vec.wv
known_words = word_vectors.key_to_index if hasattr(word_vectors, "key_to_index") else word_vectors.vocab

# Row i holds the pre-trained vector of the word whose tokenizer id is i.
# Words missing from word2vec keep an all-zero row.
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        # `word_index` holds every word seen, even beyond MAX_NB_WORDS;
        # writing those rows would raise IndexError.
        continue
    if word in known_words:
        embedding_matrix[i] = word_vectors[word]
# Count rows that are entirely zero. Testing the row *sum* against 0 could also
# count a real vector whose components happen to cancel out.
print('Null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

# Embedding layer initialised with the pre-trained weights and frozen
# (trainable=False), so the vectors are not updated during training.
embedding_layer = Embedding(embedding_matrix.shape[0], # or len(word_index) + 1
                            embedding_matrix.shape[1], # or EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

Null word embeddings: 362


**MODEL**

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Flatten
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

# 1-D CNN classifier on top of the frozen word2vec embedding:
# embedding -> Conv1D (50 filters, width 5) -> global max pool
# -> Dense(50) + dropout + ReLU -> Dense(6) + softmax (one unit per label slot).
model_1 = Sequential([
    embedding_layer,
    Conv1D(50, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(50),
    Dropout(0.2),
    Activation('relu'),
    Dense(6),
    Activation('softmax'),
])
model_1.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model_1.summary()

Model: "sequential_41"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 30, 100)           80100     
_________________________________________________________________
conv1d_79 (Conv1D)           (None, 26, 50)            25050     
_________________________________________________________________
global_max_pooling1d_22 (Glo (None, 50)                0         
_________________________________________________________________
dense_82 (Dense)             (None, 50)                2550      
_________________________________________________________________
dropout_79 (Dropout)         (None, 50)                0         
_________________________________________________________________
activation_44 (Activation)   (None, 50)                0         
_________________________________________________________________
dense_83 (Dense)             (None, 6)               

In [None]:
# Train for 5 epochs in batches of 128, monitoring the held-out split.
model_1.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=128,
)

# NOTE: the metrics below are computed on (x_val, y_val), the same split that
# is passed as validation_data above, even though they are labelled "Test".
score = model_1.evaluate(x_val, y_val, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.10311932116746902
Test accuracy: 0.960472822189331


In [None]:
# Encode one raw product title exactly like the training data:
# preprocess -> word ids -> padded array of shape (1, MAX_SEQUENCE_LENGTH).
example_product = "Paket Headband Lace Tamagoo Bandana Rambut Anak baby Premium"
example_product = preprocess(example_product)
example_sequence = tokenizer.texts_to_sequences([example_product])
example_padded_sequence = pad_sequences(example_sequence, maxlen=MAX_SEQUENCE_LENGTH)

# Run the model once and reuse the result for both the predicted class and
# the per-class probabilities (the original ran a second forward pass).
probabilities = model_1.predict(example_padded_sequence, verbose=0)
probabilities = probabilities[0]
predicted_label = int(np.argmax(probabilities, axis=-1))

print("-"*10)
# Slot 0 of the softmax has no category name, so fall back to a placeholder
# instead of raising KeyError if it ever wins the argmax.
print("Predicted category: ", category_reverse_index.get(predicted_label, "unknown (label %d)" % predicted_label))
print("-"*10)
print("Olahraga Probability: ",probabilities[category_index["olahraga"]] )
print("Pertukangan Probability: ",probabilities[category_index["pertukangan"]] )
print("Fashion probability: ",probabilities[category_index["fashion"]] )
print("Elektronik probability: ",probabilities[category_index["elektronik"]] )
print("Handphone probability: ",probabilities[category_index["handphone"]] )

----------
Predicted category:  fashion
----------
Olahraga Probability:  0.00019297855
Pertukangan Probability:  2.6492437e-09
Fashion probability:  0.9992138
Elektronik probability:  0.00023739394
Handphone probability:  0.00035584954
