In [36]:
import pandas as pd
import numpy as np
import plotly.express as px
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import InputLayer, TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow import keras

from keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import Precision

import matplotlib.pyplot as plt
import seaborn as sns

import json 
import pickle


In [37]:
df = pd.read_csv('DatasetTag.csv')

In [38]:
df

Unnamed: 0,Kalimat,kata,pos,tag
0,Kalimat 1,forza,,B-Game
1,Kalimat 1,horizon,,E-Game
2,Kalimat 1,5,,O
3,Kalimat 1,om,,O
4,Kalimat 1,install,,O
...,...,...,...,...
2358,Kalimat 449,putih,,O
2359,Kalimat 450,ganti,,B-Request
2360,Kalimat 450,mobonya,,B-Spek
2361,Kalimat 450,b660,,O


In [39]:
df = df[['Kalimat','kata','tag']]

In [40]:
df.isnull().sum()

Kalimat    0
kata       0
tag        0
dtype: int64

In [41]:
df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [42]:
print("Unique words in corpus:", df['kata'].nunique())
print("Unique tags in corpus:", df['tag'].nunique())

words = list(set(df["kata"].values))
words.append("ENDPAD")
num_words = len(words)

tags = list(set(df["tag"].values))
num_tags = len(tags)

Unique words in corpus: 753
Unique tags in corpus: 44


In [43]:
# Class to get sentences
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s: [(w, t) for w, t in zip(s['kata'].values.tolist(),
                                                s['tag'].values.tolist())]
        self.grouped = self.df.groupby("Kalimat").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Kalimat: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s
        except KeyError:
            return None

# Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]

  self.grouped = self.df.groupby("Kalimat").apply(agg)


In [44]:
sent = getter.get_text()
print(sent)

None


In [45]:
sentences = getter.sentences

In [46]:
sentences[1]

[('kak', 'O'),
 ('jakarta', 'B-Tempat'),
 ('timur', 'E-Tempat'),
 ('pakai', 'O'),
 ('packing', 'B-Pengiriman'),
 ('kayu', 'E-Pengiriman'),
 ('aman', 'O')]

In [47]:
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [48]:
X = [[word2idx[w[0]] for w in s] for s in sentences]
X

[[514, 496, 733],
 [232, 360, 388, 658, 607, 502, 173],
 [733, 476, 40, 354, 731, 19, 20, 182, 233, 395, 224],
 [1, 230, 586, 377, 294],
 [331, 480, 62, 733],
 [271, 489],
 [691, 378, 733],
 [70, 460, 487, 733, 460, 707, 45, 130, 431],
 [667, 271, 489, 442, 705],
 [40, 443, 182, 36],
 [389, 159, 344],
 [168, 588],
 [223, 733, 398, 586],
 [309, 667, 594],
 [723, 223, 529, 690, 223, 729, 675, 599],
 [398, 75, 309, 733],
 [169, 657, 188, 733],
 [454, 473, 200, 311, 317, 173, 482, 248],
 [41, 644, 199],
 [707, 658, 454],
 [667, 227],
 [24, 67, 282],
 [232, 131, 732],
 [520, 657, 733],
 [724, 464, 733],
 [639, 520, 472, 70, 183, 566, 137, 391],
 [480, 40],
 [667, 271, 489, 733],
 [733, 398, 586, 3, 309, 313],
 [21, 733],
 [733, 396, 359, 658, 522, 91, 707, 657],
 [309, 667, 271, 489, 733],
 [733, 460, 738, 114, 707, 640, 254, 76],
 [686, 132],
 [362],
 [392, 305, 703, 292, 195, 510],
 [733,
  169,
  188,
  629,
  334,
  460,
  431,
  646,
  629,
  733,
  571,
  7,
  657,
  629,
  733,
  516

In [49]:
max_len = 70

X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words-1)

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [50]:
X.shape

(448, 70)

In [51]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [52]:
x_test

array([[ 58, 520, 450, ..., 753, 753, 753],
       [700, 237, 432, ..., 753, 753, 753],
       [667, 412, 554, ..., 753, 753, 753],
       ...,
       [684, 657, 629, ..., 753, 753, 753],
       [158, 234, 605, ..., 753, 753, 753],
       [232, 323, 538, ..., 753, 753, 753]], dtype=int32)

In [53]:
# Define the embedding dimension
embedding_dim = 50  # This is an example value; choose based on your needs

model = keras.Sequential()
model.add(InputLayer((max_len,)))
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=max_len))
model.add(SpatialDropout1D(0.1))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))

model.summary()




In [54]:
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])

batch_size = 20
epochs = 8

history = model.fit(x_train, np.array(y_train), batch_size=batch_size, epochs=epochs,
                    validation_split=0.1)


Epoch 1/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.5376 - loss: 5.8345 - val_accuracy: 0.9512 - val_loss: 1.0734
Epoch 2/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9532 - loss: 0.7950 - val_accuracy: 0.9556 - val_loss: 0.4120
Epoch 3/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9574 - loss: 0.3339 - val_accuracy: 0.9563 - val_loss: 0.3548
Epoch 4/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9602 - loss: 0.2539 - val_accuracy: 0.9579 - val_loss: 0.3264
Epoch 5/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9638 - loss: 0.2139 - val_accuracy: 0.9591 - val_loss: 0.3066
Epoch 6/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.9671 - loss: 0.1883 - val_accuracy: 0.9587 - val_loss: 0.3006
Epoch 7/8
[1m17/17[0m [32m━━━━━━━━━━━

In [55]:
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [56]:
batch_size = 20
epochs = 8

In [57]:
history = model.fit(x_train, np.array(y_train), batch_size=batch_size, epochs=epochs,
                    validation_split=0.1)

Epoch 1/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.9721 - loss: 0.1472 - val_accuracy: 0.9675 - val_loss: 0.2703
Epoch 2/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.9783 - loss: 0.1415 - val_accuracy: 0.9583 - val_loss: 0.3552
Epoch 3/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9739 - loss: 0.1913 - val_accuracy: 0.9627 - val_loss: 0.3018
Epoch 4/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9743 - loss: 0.2000 - val_accuracy: 0.9611 - val_loss: 0.3034
Epoch 5/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9767 - loss: 0.1661 - val_accuracy: 0.9651 - val_loss: 0.3002
Epoch 6/8
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.9797 - loss: 0.1442 - val_accuracy: 0.9611 - val_loss: 0.3091
Epoch 7/8
[1m17/17[0m [32m━━━━━━━━━━━

In [58]:
# Evaluation
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step


In [61]:
y_pred[0]

array([35, 35, 27, 35, 27, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
       35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
       35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
       35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
       35, 35])

In [None]:
x_test.shape

In [None]:
len(word2idx)

In [None]:
y_pred.shape

In [None]:
# Plot Training Results

model_lstm_1_hist_df = pd.DataFrame(history.history)

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.lineplot(data=model_lstm_1_hist_df[['accuracy', 'val_accuracy']])
plt.grid()
plt.title('Accuracy vs Val-Accuracy')

plt.subplot(1, 2, 2)
sns.lineplot(data=model_lstm_1_hist_df[['loss', 'val_loss']])
plt.grid()
plt.title('Loss vs Val-Loss')
plt.show()

In [None]:
data_inferential = np.array([[264, 554, 404, 668, 404, 668, 581, 753, 753, 753, 753, 753, 753,
       753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753,
       753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753,
       753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753,
       753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753,
       753, 753, 753, 753, 753]])
predict = model.predict(data_inferential)
predict

In [None]:
# export model SVM after hyperparamter tuning to file model_svm.pkl
with open('model.pkl', 'wb') as file_1:
  pickle.dump(model, file_1)

with open("word_dict.txt", 'w') as file_2:
  json.dump(word2idx,file_2)