In [14]:
import json
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [15]:
# Load the raw place records and extract the model inputs (place names)
# and the targets (the list of types attached to each place).

# Placeholders for the train/test split; they are overwritten with the real
# slices once the data has been shuffled and split.
training_places = []
training_labels = []
testing_places = []
testing_labels = []

# Read the JSON explicitly as UTF-8 so non-ASCII place names decode the same
# way on every platform instead of depending on the locale's default encoding.
with open("/tmp/Yogyakarta.json", 'r', encoding="utf-8") as f:
    datastore = json.load(f)

# To inspect a single record: print(datastore['results'][2]['name'])
places = [item['name'] for item in datastore['results']]
labels = [item['types'] for item in datastore['results']]

In [16]:
# Turn each place's list of types into a fixed-width 0/1 indicator row.
one_hot = MultiLabelBinarizer()
one_hot.fit(labels)

# Column order of the encoded matrix; reused as DataFrame column names later.
class_types = one_hot.classes_

# NOTE: this rebinds `labels` from lists of type names to the encoded matrix,
# so re-running this cell without re-running the loading cell would fit the
# binarizer on already-encoded rows.
labels = one_hot.transform(labels)

In [17]:
def extractPlaces(lst):
  """Wrap every element of ``lst`` in its own single-item list.

  Args:
    lst: iterable of items (place names in this notebook).

  Returns:
    A new list with one ``[item]`` entry per element, in the original order.
  """
  return [[entry] for entry in lst]

In [18]:
# Pair each place name (wrapped as a one-element row) with its encoded label
# row in a tf.data pipeline.
# NOTE(review): the training cell below feeds NumPy arrays to model.fit, so
# `dataset` does not appear to be consumed in the visible cells - confirm
# whether it is still needed.
place_rows = extractPlaces(places)
dataset = tf.data.Dataset.from_tensor_slices((place_rows, labels))

In [19]:
# Shuffle names and label rows together so every name stays paired with its
# labels. The seed is fixed so the train/test split is identical on every
# Restart & Run All; change SHUFFLE_SEED to draw a different split.
SHUFFLE_SEED = 42
places_shuffled, labels_shuffled = shuffle(places, labels,
                                           random_state=SHUFFLE_SEED)

# The first 80% of the shuffled data is used for training, the rest for testing.
num_split = int(0.8*len(places))

# split data testing and training
training_places = places_shuffled[:num_split]
training_labels = labels_shuffled[:num_split]
testing_places = places_shuffled[num_split:]
testing_labels = labels_shuffled[num_split:]

In [20]:
# Inspection tables: one row per place name, one column per label class.
#   f -> all shuffled data
#   g -> training split
#   h -> testing split
# Put `f`, `g` or `h` on the last line of a cell to display the frame.
f, g, h = (
    pd.DataFrame(data=label_rows, index=place_names, columns=class_types)
    for label_rows, place_names in (
        (labels_shuffled, places_shuffled),
        (training_labels, training_places),
        (testing_labels, testing_places),
    )
)

In [21]:
# --- Tokenizer settings ---
vocab_size = 12_000   # passed as num_words to the Tokenizer and as the Embedding input size
oov_tok = "<OOV>"     # token substituted for words outside the fitted vocabulary

# --- Sequence shape ---
max_length = 24       # padded sequence length, also the Embedding input_length
trunc_type = "post"   # truncation mode handed to pad_sequences

# --- Model ---
embedding_dim = 16    # size of each embedding vector

In [22]:
# Build the vocabulary from the training names only; words that appear only
# in the test names are mapped to the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_places)
word_index = tokenizer.word_index

# Encode the training names and force every sequence to max_length.
sequences = tokenizer.texts_to_sequences(training_places)
training_padded = pad_sequences(sequences,
                                maxlen=max_length,
                                truncating=trunc_type)

# The test names must be padded/truncated exactly like the training names.
# Without maxlen the width would follow the longest test sequence and could
# differ from the max_length the model is built for.
testing_sequences = tokenizer.texts_to_sequences(testing_places)
testing_padded = pad_sequences(testing_sequences,
                               maxlen=max_length,
                               truncating=trunc_type)

In [23]:
# The output layer needs one unit per label column produced by the
# MultiLabelBinarizer; deriving it from class_types keeps the model in sync
# with the data instead of relying on a hard-coded 5.
num_classes = len(class_types)

# Embedding -> 1D convolution over the token sequence -> global max pooling
# -> dense classifier head.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(vocab_size,
                            embedding_dim,
                            input_length=max_length),
  tf.keras.layers.Conv1D(64, 5, activation='relu'),
  tf.keras.layers.GlobalMaxPooling1D(),
  tf.keras.layers.Dense(48, activation='relu'),
  tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Categories: Culture, History, Nature Reserve, Maritime, Religious
# (original labels: Budaya, Sejarah, Cagar Alam, Maritim, Religi)

# NOTE(review): the targets come from MultiLabelBinarizer, so a row may carry
# several 1s, while softmax + categorical_crossentropy assumes exactly one
# class per sample. If places can have more than one type, sigmoid +
# binary_crossentropy is the usual pairing - confirm against the data.
model.compile(loss="categorical_crossentropy",
              metrics=["accuracy"],
              optimizer='adam')

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 24, 16)            192000    
_________________________________________________________________
conv1d (Conv1D)              (None, 20, 64)            5184      
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 48)                3120      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 245       
Total params: 200,549
Trainable params: 200,549
Non-trainable params: 0
_________________________________________________________________


In [24]:
num_epochs = 10

# Hand Keras plain NumPy arrays for both the inputs and the targets.
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

# Train on the training split; the testing split is passed as validation data.
history = model.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(testing_padded, testing_labels),
    epochs=num_epochs,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# to be done after training is finished

# `time` is imported in the notebook's import cell rather than mid-notebook.
# The file is named after the current Unix timestamp (whole seconds), so runs
# started at different seconds do not overwrite each other's saved model.
saved_model_path = f"./{int(time.time())}.h5"

model.save(saved_model_path)

In [None]:
# Shell escape: run tensorflowjs_converter on the saved Keras model.
# IPython substitutes {saved_model_path} with the Python variable's value;
# "./" is the second positional argument (presumably the output directory -
# confirm against the converter's documentation).
!tensorflowjs_converter --input_format=keras {saved_model_path} ./