In [16]:
import re
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 2.9.0


In [2]:
def backlist(text):
    result = re.sub(r"\[|\]|\,|\'",'', text)
    return result
def tolist(text):
    return text.split()

In [3]:
df = pd.read_csv('final_data.csv')
df.tail()

Unnamed: 0.1,Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
15741,15741,15741,11.0,0.4868,"politik, milu, yang, orang, tunda, jadi, hanya...",di malaysia siuk tukar politik di negara luar ...
15742,15742,15742,5.0,0.6006,"politik, ada, nak, aru, akyat, johor, jadi, da...",makna isra miraj kita idak boleh putus asa dal...
15743,15743,15743,11.0,0.8951,"politik, milu, yang, orang, tunda, jadi, hanya...",pn bn mampu tahan rana amal politik matang
15744,15744,15744,4.0,0.7062,"politik, dan, ini, itu, milu, tunda, la, jika,...",gak mungkin jago itu airlangga hartanto ahy ca...
15745,15745,15745,0.0,0.9528,"ada, jaat, presiden, panjang, masa, partai, ta...",ada uni cancel uat majlis konvo kata s kes mak...


In [4]:
col = ['Text', 'Dominant_Topic']
df = df[col]
df = df[pd.notnull(df['Text'])]
df.tail()

Unnamed: 0,Text,Dominant_Topic
15741,di malaysia siuk tukar politik di negara luar ...,11.0
15742,makna isra miraj kita idak boleh putus asa dal...,5.0
15743,pn bn mampu tahan rana amal politik matang,11.0
15744,gak mungkin jago itu airlangga hartanto ahy ca...,4.0
15745,ada uni cancel uat majlis konvo kata s kes mak...,0.0


In [40]:
df['Dominant_Topic'] = df['Dominant_Topic'].apply(str)

In [41]:
df.isnull().sum()

Text              0
Dominant_Topic    0
dtype: int64

In [42]:
df['Dominant_Topic'].value_counts()

8     3465
0     1283
14    1222
15     893
5      883
13     877
12     842
9      799
11     723
6      719
16     679
17     675
2      615
1      602
7      565
3      374
4      370
10     160
Name: Dominant_Topic, dtype: int64

In [43]:
# Split data into train and test
train_size = int(len(df) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(df) - train_size))

Train size: 12596
Test size: 3150


In [44]:
train_text = df['Text'][:train_size]
train_product = df['Dominant_Topic'][:train_size]

test_text = df['Text'][train_size:]
test_product = df['Dominant_Topic'][train_size:]

In [45]:
max_words = 10000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [46]:
tokenize.fit_on_texts(train_text) # only fit on train
x_train = tokenize.texts_to_matrix(train_text)
x_test = tokenize.texts_to_matrix(test_text)

In [47]:
# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)

In [48]:
# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [49]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (12596, 10000)
x_test shape: (3150, 10000)
y_train shape: (12596, 18)
y_test shape: (3150, 18)


In [50]:
batch_size = 32
epochs = 5

In [51]:
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [52]:
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [53]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.7299131155014038
Test accuracy: 0.8003174662590027


In [54]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 

for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(df.iloc[i][:50], "...")
    print('Actual label:' + test_product.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

Text              inggal gmn tmn2 di partai politik dengar gelom...
Dominant_Topic                                                    1
Name: 0, dtype: object ...
Actual label:6
Predicted label: 6

Text              bego di kami nyak pilih tuk di aju jd presiden...
Dominant_Topic                                                   14
Name: 1, dtype: object ...
Actual label:12
Predicted label: 11

Text              awal saya husnuzon angin kuat ila tengok ni es...
Dominant_Topic                                                   14
Name: 2, dtype: object ...
Actual label:15
Predicted label: 15

Text              contoh gila lueran histeria massal lukis artis...
Dominant_Topic                                                   17
Name: 3, dtype: object ...
Actual label:9
Predicted label: 9

Text              dpd medan minta pemko fokus
Dominant_Topic                              9
Name: 4, dtype: object ...
Actual label:13
Predicted label: 12

Text              kalau cari pasang cari yang ol

In [55]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 512)               5120512   
                                                                 
 activation_2 (Activation)   (None, 512)               0         
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_3 (Dense)             (None, 18)                9234      
                                                                 
 activation_3 (Activation)   (None, 18)                0         
                                                                 
Total params: 5,129,746
Trainable params: 5,129,746
Non-trainable params: 0
_________________________________________________________________


In [56]:
model.save('Model_Klasifikasi_Topik.h5')