In [1]:
import re
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# This code was originally tested with TensorFlow v1.4; print the installed
# version so that any mismatch with that release is visible in the output.
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 2.9.0


In [2]:
def backlist(text):
    """Remove list punctuation from a stringified list.

    Every ``[``, ``]``, ``,`` and ``'`` character in ``text`` is deleted; all
    other characters (including whitespace) are returned unchanged.
    """
    list_punctuation = re.compile(r"\[|\]|\,|\'")
    return list_punctuation.sub('', text)
def tolist(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens as a list."""
    tokens = text.split()
    return tokens

In [3]:
# Load the dataset from the CSV file in the working directory and preview
# its last rows.
df = pd.read_csv(filepath_or_buffer='final_data.csv')
df.tail()

Unnamed: 0.1,Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Emotion
15740,15740,15740,23.0,0.8127,"politik, ni, kuasa, tak, dah, nak, orang, kan,...",malaysia sibuk tukar politik negara luar terus...,negative
15741,15741,15741,1.0,0.5758,"politik, milik, bukan, seni, kerja, putus, mam...",makna isra miraj putus asa ada apa,negative
15742,15742,15742,1.0,0.8921,"politik, milik, bukan, seni, kerja, putus, mam...",prn bn mampu tahan rana amal politik matang,negative
15743,15743,15743,15.0,0.8666,"politik, indonesia, cari, buat, kalau, jadi, b...",gak mungkin jago airlangga hartanto ahy cak pa...,negative
15744,15744,15744,29.0,0.9429,"buat, kata, makin, tinggi, sbb, bagai, kes, of...",uni cancel buat majlis konvo kata sbb kes maki...,negative


In [4]:
# Keep only the model input (Text) and the target (Dominant_Topic), and drop
# rows whose text is missing.
col = ['Text', 'Dominant_Topic']
# A single .loc selection followed by an explicit copy gives an independent
# frame. Without the copy, `df` is a filtered slice of the loaded frame and the
# later column assignment (`df['Dominant_Topic'] = ...`) can raise pandas'
# SettingWithCopyWarning.
df = df.loc[pd.notnull(df['Text']), col].copy()
df.tail()

Unnamed: 0,Text,Dominant_Topic
15740,malaysia sibuk tukar politik negara luar terus...,23.0
15741,makna isra miraj putus asa ada apa,1.0
15742,prn bn mampu tahan rana amal politik matang,1.0
15743,gak mungkin jago airlangga hartanto ahy cak pa...,15.0
15744,uni cancel buat majlis konvo kata sbb kes maki...,29.0


In [5]:
# Turn every topic id into its string form (e.g. 23.0 -> '23.0') so topics are
# handled as categorical labels rather than numbers.
df['Dominant_Topic'] = df['Dominant_Topic'].map(str)

In [6]:
# Count missing values per column.
# NOTE(review): Dominant_Topic has already been converted with str() at this
# point, so a missing topic would show up as the string 'nan' and would not be
# counted here.
df.isna().sum()

Text              0
Dominant_Topic    0
dtype: int64

In [7]:
# Class distribution of the target, most frequent topic first.
df['Dominant_Topic'].value_counts(sort=True, ascending=False)

4.0     3458
32.0     749
30.0     733
3.0      717
19.0     702
26.0     641
11.0     584
29.0     565
5.0      563
24.0     546
33.0     474
20.0     469
12.0     451
13.0     439
9.0      436
22.0     410
27.0     382
31.0     359
23.0     348
16.0     344
10.0     322
1.0      304
14.0     279
34.0     229
18.0     205
15.0     184
0.0      182
2.0      143
21.0     140
6.0       93
8.0       85
7.0       84
28.0      82
17.0      41
25.0       2
Name: Dominant_Topic, dtype: int64

In [8]:
# Size of the train/test partitions: the first 80% of the rows (rounded down)
# are used for training, the remainder for testing.
n_rows = len(df)
train_size = int(n_rows * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (n_rows - train_size))

Train size: 12596
Test size: 3149


In [9]:
# Positional split: the first `train_size` rows form the training set and the
# rest form the test set.
# NOTE(review): the rows are not shuffled before splitting — confirm that the
# CSV is not ordered in a way that biases either partition.
train_text = df['Text'].iloc[:train_size]
test_text = df['Text'].iloc[train_size:]

train_product = df['Dominant_Topic'].iloc[:train_size]
test_product = df['Dominant_Topic'].iloc[train_size:]

In [10]:
# Vocabulary limit handed to the tokenizer; it is also the width of the model's
# input layer.
max_words = 10_000
# Word-level (not character-level) tokenizer restricted to `max_words` entries.
tokenize = text.Tokenizer(char_level=False, num_words=max_words)

In [11]:
# Build the vocabulary from the training texts only, so nothing from the test
# split influences it.
tokenize.fit_on_texts(train_text)
# Turn each split into a (n_samples, max_words) matrix with the same tokenizer.
x_train, x_test = (
    tokenize.texts_to_matrix(split_texts) for split_texts in (train_text, test_text)
)

In [12]:
# Use sklearn utility to convert label strings to numbered index.
# The encoder is fitted on the labels of BOTH splits: LabelEncoder.transform
# raises ValueError for a label it did not see during fit, so a rare topic that
# happens to occur only in the test slice would otherwise abort the notebook.
# (Learning the set of label names involves no model fitting on test data.)
encoder = LabelEncoder()
encoder.fit(list(train_product) + list(test_product))
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)

In [13]:
# Converts the labels to a one-hot representation.
# The number of classes is taken from the encoder's class list rather than from
# the largest index found in y_train, so the one-hot width always matches the
# encoder even if the highest-indexed class is absent from the training labels.
num_classes = len(encoder.classes_)
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [14]:
# Inspect the dimensions of our training and test data (this is helpful to debug)
for _label, _array in (
    ('x_train shape:', x_train),
    ('x_test shape:', x_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
):
    print(_label, _array.shape)

x_train shape: (12596, 10000)
x_test shape: (3149, 10000)
y_train shape: (12596, 35)
y_test shape: (3149, 35)


In [15]:
# Training hyperparameters: mini-batch size and number of epochs passed to
# model.fit / model.evaluate.
batch_size, epochs = 32, 5

In [16]:
# Build the model: a single hidden layer of 512 ReLU units with 50% dropout on
# top of the `max_words`-wide input, followed by a softmax over the
# `num_classes` topics.
# NOTE(review): no random seed is set in the visible cells, so initial weights
# (and therefore the reported metrics) will vary between runs.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [17]:
# Train the classifier, holding out 10% of the training data for validation;
# the returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# Evaluate the trained model on the held-out test split. `score` holds the loss
# first and the accuracy metric second, matching the compile() configuration.
score = model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
for _label, _value in zip(('Test score:', 'Test accuracy:'), score):
    print(_label, _value)

Test score: 1.0560972690582275
Test accuracy: 0.739599883556366


In [19]:
# Show predictions for the first few test examples next to their true labels.
text_labels = encoder.classes_

# Never ask for more examples than the test set contains.
n_examples = min(10, len(x_test))

# Run the model once on the whole batch instead of calling predict() separately
# for every example; each row of `predictions` is the class-probability vector
# of one example.
predictions = model.predict(x_test[:n_examples])
predicted_indices = np.argmax(predictions, axis=1)

for i in range(n_examples):
    predicted_label = text_labels[predicted_indices[i]]
    print(test_text.iloc[i][:50], "...")
    print('Actual label:' + test_product.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

just tak masyarakat lebih parah politik kotor ...
Actual label:30.0
Predicted label: 30.0

sikap ikat alumni ilmu politik iisip soal wacana t ...
Actual label:33.0
Predicted label: 19.0

indonesia objek politik mau kacau barat lewat panj ...
Actual label:15.0
Predicted label: 15.0

so malaysians sekarang ni dah la huru hara pasal p ...
Actual label:23.0
Predicted label: 23.0

nu nu kata mau jauh diri politik omong nada ama po ...
Actual label:33.0
Predicted label: 3.0

bukan zaman angkat senjata peperangan cuma sekadar ...
Actual label:5.0
Predicted label: 5.0

malam usaha selamat ajar spm smk tengku ampu intan ...
Actual label:27.0
Predicted label: 27.0

usul tunda milu hendak negara gaduh nafsu politik  ...
Actual label:19.0
Predicted label: 19.0

aneh saran lebih baik tingkat kwalitas per nasi pa ...
Actual label:28.0
Predicted label: 11.0

politik kambing hitam siap lihat pro rakyat sangku ...
Actual label:3.0
Predicted label: 21.0



In [20]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               5120512   
                                                                 
 activation (Activation)     (None, 512)               0         
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 35)                17955     
                                                                 
 activation_1 (Activation)   (None, 35)                0         
                                                                 
Total params: 5,138,467
Trainable params: 5,138,467
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Persist the trained model to disk.
# NOTE(review): the fitted tokenizer and label encoder are not saved in the
# visible cells; both are needed to reuse this model on new text.
model.save(filepath='Model_Klasifikasi_Topik.h5')