In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import os
import keras
import keras.utils

from sklearn import preprocessing
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
# from keras.callbacks import ModelCheckpoint, EarlyStopping

import time
from keras import metrics

MAX_SEQUENCES_LENGTH = 50 # Number of token ids kept per question (pad/truncate length, also the model's input_length)
MAX_NB_WORDS = 20000 # Intended vocabulary size cap; NOTE(review): not referenced by any visible cell
EMBEDDING_DIM = 64 # Output width of the Embedding layer; the visible cells train it from scratch (no GloVe vectors are loaded)

In [None]:
import json

# Load the raw chatbot dataset. JSON text is UTF-8, so the encoding is pinned
# explicitly instead of relying on the locale-dependent platform default,
# which can mis-decode or reject non-ASCII characters.
with open('dataset/chatbot_eksyar.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the records stored under the top-level "items" key, one row per record.
df = pd.json_normalize(data, record_path=['items'])

In [None]:
# Build the vocabulary from the question column and encode every question
# as a list of integer token ids (the raw material for X).
question = df["questions"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(question)
sequences = tokenizer.texts_to_sequences(question)
# word -> integer id mapping learned by the tokenizer
word_index = tokenizer.word_index

In [None]:
# Turn the token-id lists into the fixed-width input matrix X; padding is
# appended after the tokens ('post').
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCES_LENGTH,
    padding='post',
)

In [None]:
# Prepare the targets: encode the label column as integer ids, then one-hot
# encode those ids (the model is compiled with categorical cross-entropy).
label = df["labels"]
le = LabelEncoder()
labelEncode = le.fit_transform(label)
y = to_categorical(labelEncode)

In [None]:
# Split into train and test parts: 30% is held out, the split is stratified on
# y, and random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=20,
)

In [None]:
# checkpoint_path = 'data-target-model-wtl.h5'
# early_stopper = EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, mode='auto')
# checkpointer = ModelCheckpoint(filepath=checkpoint_path, verbose=0, save_best_only=True)

# To enable these, pass callbacks=[early_stopper, checkpointer] to model.fit

In [None]:
# Build the CNN text classifier
def build_cnn_model(num_classes=None, vocab_size=None):
    """Build, compile and summarise the 1-D CNN question classifier.

    Architecture: Embedding -> Conv1D(64 filters, kernel 3, relu)
    -> GlobalMaxPooling1D -> Dense(250, relu) -> Dense(num_classes, softmax).

    Parameters
    ----------
    num_classes : int, optional
        Number of softmax outputs. When omitted it is read from the one-hot
        target matrix ``y`` (``y.shape[1]``), so the output layer always
        matches the labels the model is trained on instead of a fixed 5.
    vocab_size : int, optional
        ``input_dim`` of the Embedding layer. When omitted it is
        ``len(word_index) + 1``; the extra slot covers id 0, which is the
        padding value written by ``pad_sequences``.

    Returns
    -------
    keras.models.Sequential
        The compiled model (Adam optimizer, categorical cross-entropy loss,
        accuracy metric). The layer summary is printed as a side effect.
    """
    if num_classes is None:
        num_classes = y.shape[1]
    if vocab_size is None:
        vocab_size = len(word_index) + 1

    model = Sequential()

    model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCES_LENGTH))

    model.add(Conv1D(64, 3, padding='same', activation='relu'))
    # Global max pooling already produces a flat (batch, filters) tensor, so no
    # separate Flatten layer is needed before the dense head.
    model.add(GlobalMaxPooling1D())

    model.add(Dense(units=250, activation="relu"))
    model.add(Dense(units=num_classes, activation="softmax"))

    model.compile(optimizer="adam", metrics=["accuracy"], loss="categorical_crossentropy")

    model.summary()
    return model

In [None]:
# Fit the CNN for 30 epochs; the held-out split is passed as validation data,
# so its loss and accuracy are recorded in the history after every epoch.
cnn_model = build_cnn_model()
cnn_history = cnn_model.fit(
    X_train,
    y_train,
    epochs=30,
    validation_data=(X_test, y_test),
)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns

# Collapse the one-hot targets and the softmax outputs to integer class ids once.
ypred = cnn_model.predict(X_test)
y_true_ids = y_test.argmax(axis=-1)
y_pred_ids = ypred.argmax(axis=-1)

cnn_accuracy = accuracy_score(y_true_ids, y_pred_ids)
#print("CNN Accuracy:",cnn_accuracy)

# Pin the label set so the matrix always has one row/column per encoder class,
# even when a class is absent from both the test targets and the predictions;
# otherwise the tick labels below would no longer line up with the cells.
cnn_cn = confusion_matrix(y_true_ids, y_pred_ids, labels=np.arange(len(le.classes_)))

# Draw on an explicit Axes rather than relying on pyplot's "current axes" state.
fig, ax = plt.subplots(figsize=(18,14))
sns.heatmap(cnn_cn, annot=True, fmt="1d", cbar=False,
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title("CNN Accuracy: {}".format(cnn_accuracy), fontsize=50)
ax.set_xlabel("Predicted", fontsize=15)
ax.set_ylabel("Actual", fontsize=15)
plt.show()

In [None]:
# Learning curves: accuracy on the left panel, loss on the right panel.
cnn_curves = cnn_history.history
fig3, axe1 = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
axe1[0].plot(cnn_curves["accuracy"], label="accuracy", color="blue")
axe1[1].plot(cnn_curves["loss"], label="loss", color="red")
# The validation curves are drawn when the history contains them (it does when
# fit() was given validation_data); they make over-fitting visible.
if "val_accuracy" in cnn_curves:
    axe1[0].plot(cnn_curves["val_accuracy"], label="val_accuracy", color="blue", linestyle="--")
if "val_loss" in cnn_curves:
    axe1[1].plot(cnn_curves["val_loss"], label="val_loss", color="red", linestyle="--")
axe1[0].title.set_text("CNN Accuracy")
axe1[1].title.set_text("CNN Loss")
axe1[0].set_xlabel("Epoch")
axe1[1].set_xlabel("Epoch")
axe1[0].set_ylabel("Rate")
axe1[1].set_ylabel("Loss")
# Without legend() the label= arguments above are never rendered.
axe1[0].legend()
axe1[1].legend()
plt.show()

In [None]:
def cnn_predict(quest):
    """Predict the label of one or more questions with the trained CNN.

    Parameters
    ----------
    quest : str or list of str
        A single question or a list of questions. A bare string is wrapped in
        a list first; otherwise the tokenizer would iterate over it character
        by character and return one "prediction" per character.

    Returns
    -------
    numpy.ndarray
        The entries of ``le.classes_`` selected by the arg-max class id of
        each question, in input order.
    """
    if isinstance(quest, str):
        quest = [quest]
    puretext = tokenizer.texts_to_sequences(quest)
    # Pad with the same length constant used for the training matrix and the
    # model's input_length, so the three can never drift apart.
    text_pad = pad_sequences(puretext, maxlen=MAX_SEQUENCES_LENGTH, padding='post')
    predicted = cnn_model.predict(text_pad)
    predicted_category = predicted.argmax(axis=1)
    return le.classes_[predicted_category]

In [None]:
# Classify one sample question and print the predicted label(s).
quest = ['apa prinsip-prinsip investasi berdasarkan syariah']
quest_category = cnn_predict(quest)
print("CNN Predicted Category: {}".format(quest_category))

Testing code untuk menghasilkan nilai y

In [None]:
# Sample question (a one-element list) used by the step-by-step prediction cells below.
quest = ['apa prinsip ekonomi syariah?']

In [None]:
# Same steps as cnn_predict, run at top level so each intermediate value can be inspected.
puretext = tokenizer.texts_to_sequences(quest)
# Pad with the shared length constant instead of a literal 50, so inference
# stays consistent with the training matrix and the model's input_length.
text_pad = pad_sequences(puretext, maxlen=MAX_SEQUENCES_LENGTH, padding='post')
predicted = cnn_model.predict(text_pad)
predicted_category = predicted.argmax(axis=1)

In [None]:
# Inspect the token-id sequences the tokenizer produced for `quest`.
puretext

In [None]:
# Inspect the padded ('post', maxlen 50) version of `puretext` that is fed to the model.
text_pad 

In [None]:
# Inspect the raw model output for the sample question (one row of softmax scores per input).
predicted

In [None]:
# Displays the padded input matrix X built from all questions.
# NOTE(review): this shows the whole array; a slice such as X[:5] would keep the output short.
X

In [None]:
# Inspect the arg-max class id(s) picked from `predicted`.
predicted_category

In [None]:
# Map the class id(s) back to the label values stored in the fitted LabelEncoder.
le.classes_[predicted_category]

In [None]:
# Array built from the raw model output for the sample question.
# NOTE(review): `predictions` is reassigned by the later evaluation cell to integer class ids for X_test.
predictions = np.array(predicted)

In [None]:
# Inspect `predictions` (at this point: the model output for the sample question).
predictions

Menggunakan RAKE untuk mencari jawaban

In [23]:
# RAKE algorithm implementation (MIT License): https://github.com/csurfer/rake-nltk
from rake_nltk import Rake, Metric

TO CONTROL STOPWORD

    r = Rake(
        stopwords=<list of stopwords>,
        punctuations=<string of punctuations to ignore>
    )
    
TO CONTROL METRIC FOR RANKING

    # Paper uses d(w)/f(w) as the final metric. You can use this API with the
    # following metrics:

    # 1. d(w)/f(w) (Default metric) Ratio of degree of word to its frequency.
    r = Rake(ranking_metric=Metric.DEGREE_TO_FREQUENCY_RATIO)

    # 2. d(w) Degree of word only.
    r = Rake(ranking_metric=Metric.WORD_DEGREE)

    # 3. f(w) Frequency of word only.
    r = Rake(ranking_metric=Metric.WORD_FREQUENCY)
 
TO CONTROL THE MAX OR MIN WORDS IN A PHRASE

    r = Rake(min_length=2, max_length=4)


In [30]:
# RAKE extractor configured for Indonesian stop words; phrases of 1 to 4 words are kept.
r = Rake(
    language="indonesian",
    min_length=1,
    max_length=4,
)

In [None]:
# Sanity check of the extractor on a hand-written string.
# NOTE(review): the string appears to mix a name with common Indonesian function words,
# presumably to see which of them the stop-word list filters out — confirm intent.
textsu = "muhammad rizki aditia kenapa bisa adalah adanya adapun ataukah ataupun awal awalnya bagai bagaikan"
# extract_* stores its result on `r`; the following call reads that stored result.
r.extract_keywords_from_text(textsu)
r.get_ranked_phrases_with_scores()

In [None]:
# Run the extractor over every dataset question (`question` is the df.questions column),
# then display the ranked (score, phrase) pairs; this replaces the result of the previous extraction on `r`.
r.extract_keywords_from_sentences(question)
r.get_ranked_phrases_with_scores()

In [42]:
# Extract ranked key phrases from a single user question and keep them in `sentences`.
quests = "Bagaimana bank konvensional menurut Islam?"
r.extract_keywords_from_text(quests)
sentences = r.get_ranked_phrases_with_scores()

In [43]:
# Show the (score, phrase) pairs extracted from `quests`.
sentences 

[(4.0, 'bank konvensional'), (1.0, 'islam')]

Menghitung Accuracy, Precision, Recall, dan F1-score dari Model CNN

In [None]:
# Integer class ids for the test split: row-wise arg-max of the model output
# (predictions) and of the one-hot targets (labels).
predictions = cnn_model.predict(X_test).argmax(axis=1)
labels = y_test.argmax(axis=1)

In [None]:
# NOTE: this rebinds the name `metrics`, shadowing the earlier `from keras import metrics`.
from sklearn import metrics

# Scores are reported as percentages; the per-class metrics are averaged
# weighted by class support.
print ("Accuracy: " + str(100*metrics.accuracy_score(labels, predictions)))
for caption, scorer in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("f1_score: ", metrics.f1_score),
):
    print (caption + str(100*scorer(labels, predictions, average="weighted")))