In [None]:
# Import Library yang dibutuhkan
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import os
import keras
import keras.utils

from sklearn import preprocessing
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
# from keras.callbacks import ModelCheckpoint, EarlyStopping

import time
from keras import metrics

In [None]:
from keras.utils.vis_utils import plot_model

In [None]:
# Settings used by the cells below
MAX_SEQUENCES_LENGTH = 50 # Maximum number of words per sentence (50)
MAX_NB_WORDS = 1000 # Maximum vocabulary size
EMBEDDING_DIM = 50 # Dimensionality of the GloVe word vectors

In [None]:
# Load the raw dataset with Python's json module.
import json

# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform default (the GloVe vector file is opened
# with encoding='utf-8' as well).
with open('dataset/dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the records stored under the top-level "items" key into a DataFrame.
df = pd.json_normalize(data, record_path=['items'])

In [None]:
# Preview the first rows of the loaded dataset
df.head()

In [None]:
# Pre-processing of the question text:
#   * strip digits and punctuation marks
#   * case folding (lower-casing) is applied where this function is called

# Characters that are removed from every question. The backslash is written as
# an explicit '\\' escape: the previous '\]' spelling is an invalid escape
# sequence, which Python only tolerates with a warning. The resulting string is
# unchanged. Note that '-' and whitespace are not part of this set.
punctuation = '!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\'0123456789'


def remove_punctuation(text):
    """Return ``text`` with every character listed in ``punctuation`` removed.

    Parameters
    ----------
    text : str
        The raw sentence to clean.

    Returns
    -------
    str
        ``text`` without the listed punctuation marks and digits; all other
        characters (letters, whitespace, hyphens, ...) are kept in order.
    """
    return "".join(char for char in text if char not in punctuation)

# Store the punctuation-free, lower-cased version of every question
# in the new 'punc' column.
def _clean_question(raw_question):
    """Strip punctuation/digits from one question and lower-case the result."""
    return remove_punctuation(raw_question).lower()

df['punc'] = df['questions'].apply(_clean_question)
df['punc']

In [None]:
# NORMALIZATION
# Load the lookup table that is used to normalise and unify word variants.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform default.
with open('normalisasi.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the records stored under the top-level "items" key into a DataFrame.
normalizad_word = pd.json_normalize(data, record_path=['items'])
normalizad_word.head()

In [None]:
# 1. Tokenize
# 2. Normalize
# 3. Detokenize (because tokenization is done again later with the Keras library)
from nltk.tokenize import word_tokenize 
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Tokenizing helper
def word_tokenize_wrapper(text):
    """Return the result of NLTK's ``word_tokenize`` for ``text``.

    Thin wrapper used as the callable for ``Series.apply`` when the
    'tokenize' column is built.
    """
    return word_tokenize(text)

# Normalisation helper
def normalized_term(document):
    """Map every token of ``document`` through ``normalizad_word_dict``.

    Tokens that have an entry in the lookup table are replaced by the mapped
    value; every other token is passed through unchanged. A new list is
    returned and ``document`` itself is not modified.
    """
    return [normalizad_word_dict.get(token, token) for token in document]

# Detokenizing helper
def detoken(text):
    """Join a sequence of tokens back into a single string.

    Uses a fresh ``TreebankWordDetokenizer`` for every call and returns
    whatever its ``detokenize`` method produces for ``text``.
    """
    detokenizer = TreebankWordDetokenizer()
    return detokenizer.detokenize(text)

# Tokenize the punctuation-free questions.
df['tokenize'] = df['punc'].apply(word_tokenize_wrapper)

# Build the lookup table from the first two columns of the normalisation
# frame (first column -> second column). The columns are addressed
# positionally through .iloc: the frame's column labels come from JSON keys
# (strings), so the former row[0] / row[1] relied on pandas' deprecated
# "integer key as position" fallback of Series.__getitem__.
# setdefault keeps the first occurrence of a duplicated term, as before.
normalizad_word_dict = {}
for term, replacement in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    normalizad_word_dict.setdefault(term, replacement)

# Replace every token by its normalised form where one exists.
df['normalized'] = df['tokenize'].apply(normalized_term)

# Join the normalised tokens back into plain text, which makes the later
# tokenization with the Keras library easier.
df['normalized'] = df['normalized'].apply(detoken)

df['normalized']

In [None]:
# # Stemming
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()

# def stemming(text):
#     stream = stemmer.stem(text)
#     return stream

# df['stemming'] = df['normalized'].apply(lambda x: stemming(x))
# df['stemming']

In [None]:
# Preview the frame again, now including the columns added during preprocessing
df.head()

In [None]:
# Save the processed dataset to JSON.
# The 'stemming' column only exists when the optional stemming cell has been
# run (it is commented out in this notebook), so it is exported only when
# present instead of failing with a KeyError.
headers = ['labels', 'questions', 'answers', 'normalized']
if 'stemming' in df.columns:
    headers.append('stemming')
new_data = [df[column] for column in headers]
new_df = pd.concat(new_data, axis=1, keys=headers)


new_df.to_json('server/v1.1.0/dataset/data_final.json', orient='records')
# (the written file is afterwards prettified with an online JSON beautifier)

new_df.head()

In [None]:
# Prepare the questions as model input X
# NOTE(review): Tokenizer() is created without arguments, so MAX_NB_WORDS is
# not applied as a vocabulary cap here — confirm this is intended.
questions = df.normalized
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)
sequences = tokenizer.texts_to_sequences(questions)
# The words seen by the tokenizer, mapped to their integer index
word_index = tokenizer.word_index

In [None]:
# Inspect the word -> index mapping built by the tokenizer
word_index

In [None]:
# Store the tokenizer output in X, padded (padding='post') to MAX_SEQUENCES_LENGTH
X = pad_sequences(sequences, maxlen=MAX_SEQUENCES_LENGTH, padding='post')

In [None]:
# Prepare the targets y
# The labels are alphabetic, so they are first encoded as integers ...
# ... and then converted to categorical form
le = LabelEncoder()
label = df.labels
labelEncode = le.fit_transform(label)
y = to_categorical(labelEncode)

In [None]:
# Show the labels known to the fitted encoder
list(le.classes_)

In [None]:
# Split the data into a train and a test part (test_size=0.2, stratified on y, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20, stratify=y)

In [None]:
# Save hasil dari split dataset
# import pickle
# with open('pengujian/final/split_dataset/datset60.pickle', 'wb') as f:
#     pickle.dump([X_train, X_test, y_train, y_test], f)

In [None]:
# Load split dataset
# with open('pengujian/final/split_dataset/datset60.pickle', 'rb') as f:
#     X_train, X_test, y_train, y_test = pickle.load(f)

In [None]:
# Load the GloVe vectors into a {word: vector} dictionary.
print('Indexing word vectors.')

embeddings_index = {}
# The context manager closes the file even when a line fails to parse.
with open('dataset/glove/vectors.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of raising IndexError on values[0].
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
num_words = min(MAX_NB_WORDS, len(word_index))
# One row per tokenizer index (vocabulary size + 1), initialised to zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        # Words without a pre-trained vector keep their all-zero row.
        continue
    embedding_matrix[row] = pretrained

In [None]:
# Create the CNN model
def build_cnn_model(num_classes=None):
    """Build and compile the 1D-CNN text classifier.

    The network is: Embedding (initialised from ``embedding_matrix``,
    trainable) -> Conv1D(128, kernel 3, relu) -> GlobalMaxPooling1D ->
    Dense(64, relu) -> Dense(num_classes, softmax). It is compiled with the
    Adam optimizer, categorical cross-entropy loss and the accuracy metric.
    The model summary is printed as a side effect.

    Reads the notebook-level names ``word_index``, ``EMBEDDING_DIM``,
    ``MAX_SEQUENCES_LENGTH``, ``embedding_matrix`` and (when ``num_classes``
    is omitted) ``y``.

    Parameters
    ----------
    num_classes : int, optional
        Number of output units. Defaults to the number of columns of the
        one-hot label matrix ``y`` so the output layer always matches the
        encoded labels (previously hard-coded to 15).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    if num_classes is None:
        # One output unit per column of the categorical label matrix.
        num_classes = y.shape[1]

    model = Sequential()
    model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCES_LENGTH,weights=[embedding_matrix],trainable=True))
    model.add(Conv1D(128, 3, padding='same', activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64,activation="relu"))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer="adam",metrics=["accuracy"],loss="categorical_crossentropy")
    model.summary()
    return model

In [None]:
# save model cnn
# cnn_model.save('model.h5')

In [None]:
# load model
# from keras.models import load_model
# cnn_model = load_model('pengujian/final/training60_e20/model60_e20.h5')

In [None]:
# Train the model for 30 epochs
# NOTE(review): X_test / y_test are passed as validation data here and are
# also what the evaluation cells predict on — confirm this is intended.
cnn_model = build_cnn_model()
cnn_history = cnn_model.fit(X_train,y_train,epochs=30,validation_data=(X_test, y_test))

In [None]:
# plot_model(cnn_model, to_file='model.png', show_shapes=True, show_dtype=False, show_layer_names=True, rankdir='TB', expand_nested=True, dpi=96)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns

# Model outputs for the test split.
ypred = cnn_model.predict(X_test)

# Collapse the one-hot targets and the model outputs to class indices once.
true_classes = y_test.argmax(axis=-1)
predicted_classes = ypred.argmax(axis=-1)

cnn_accuracy = accuracy_score(true_classes, predicted_classes)
cnn_cn = confusion_matrix(true_classes, predicted_classes)

# Confusion-matrix heatmap, labelled with the encoder's class names and
# titled with the accuracy.
cm_fig, cm_ax = plt.subplots(figsize=(18,14))
sns.heatmap(
    cnn_cn,
    annot=True,
    fmt="1d",
    cbar=False,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=cm_ax,
)
cm_ax.set_title("CNN Accuracy: {}".format(cnn_accuracy), fontsize=50)
cm_ax.set_xlabel("Predicted", fontsize=15)
cm_ax.set_ylabel("Actual", fontsize=15)
plt.show()

In [None]:
# Training curves: accuracy on the left panel, loss on the right panel.
fig3, axe1 = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
accuracy_ax, loss_ax = axe1

accuracy_ax.plot(cnn_history.history["accuracy"], label="accuracy", color="blue")
accuracy_ax.title.set_text("CNN Accuracy")
accuracy_ax.set_xlabel("Epoch")
accuracy_ax.set_ylabel("Rate")

loss_ax.plot(cnn_history.history["loss"], label="loss", color="red")
loss_ax.title.set_text("CNN Loss")
loss_ax.set_xlabel("Epoch")

plt.show()

In [None]:
from sklearn.metrics import classification_report
# Print the classification report for the test predictions, using the
# encoder's class names as target names
print(classification_report(y_test.argmax(axis=-1), ypred.argmax(axis=-1),target_names= le.classes_))

In [None]:
# Predicted class index for every test sample (arg-max over the model outputs).
predictions = cnn_model.predict(X_test).argmax(axis=-1)
# True class index for every test sample (arg-max over the one-hot targets).
# This previously looped over range(len(y_test80)); y_test80 is never defined
# in this notebook, so the cell failed with a NameError.
labels = y_test.argmax(axis=-1)

In [None]:
# Note: this rebinds the name `metrics`, which the import cell had bound to
# keras.metrics.
from sklearn import metrics

# Accuracy first, then the weighted-average precision, recall and F1,
# each printed as a percentage.
print("Accuracy: " + str(100 * metrics.accuracy_score(labels, predictions)))
for caption, scorer in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print(caption + str(100 * scorer(labels, predictions, average="weighted")))