In [1]:
# Import Libraries
import json
import nltk
import time
import random
import string
import pickle
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Model
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Embedding, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import concatenate, Dense, Dropout, Conv1D, MaxPooling1D, Flatten

# Keras text tokenizer configured with num_words=1000.
# NOTE(review): not referenced by the visible cells (the vocabulary below is
# built by hand in `kata_index`) — confirm whether it is still needed.
tokenizer = Tokenizer(num_words=1000)
# scikit-learn label encoder instance.
# NOTE(review): also unused in the visible cells; labels are mapped manually.
le = LabelEncoder()

In [None]:
# NLTK resource 'punkt': sentence tokenizer package.
nltk.download('punkt')
# NLTK resource 'wordnet': package used for lemmatization.
nltk.download('wordnet')
# NLTK resource 'omw-1.4': multilingual WordNet data package.
nltk.download('omw-1.4')

In [None]:
# Load the conversation dataset (UTF-8 CSV) into `df`; low_memory=False makes
# pandas read the file in one pass instead of in chunks.
df = pd.read_csv(
    "indonesian_conversation_data.csv",
    encoding='utf8',
    low_memory=False,
)
# Preview the first rows.
df.head()

In [None]:
# Keep only the question text ('Pertanyaan') and its category label ('Kategori').
# The explicit .copy() makes `data` an independent DataFrame, so the column
# assignments done during text cleaning modify `data` itself instead of a
# slice of `df` (which is what triggers pandas' SettingWithCopyWarning).
data = df[['Pertanyaan', 'Kategori']].copy()
# Preview the first rows.
data.head()

In [None]:
# Removing punctuation and normalizing the question text.
def _clean_question(text):
    """Normalize one question string for vocabulary building.

    Steps, applied in this order:
      1. drop every character listed in ``string.punctuation`` (ASCII punctuation);
      2. lowercase the remaining characters;
      3. replace every remaining non-word character (``[^\\w]``: spaces,
         newlines, tabs, non-ASCII symbols) with a single space.

    Step 3 already covers the newline replacement, and step 1 already removes
    ``( ) , - /``, so no separate substitutions are needed for those.

    Args:
        text: the raw question string.

    Returns:
        The normalized string.
    """
    kept = ''.join(ch.lower() for ch in text if ch not in string.punctuation)
    return re.sub(r"[^\w]", ' ', kept)


# Clean the whole column in one pass and bind the result to a new frame.
# This avoids chained assignment (`data['Pertanyaan'][i] = ...`), which emits
# SettingWithCopyWarning, silently does nothing under pandas copy-on-write,
# and only works when the index is the default 0..n-1 range.
data = data.assign(Pertanyaan=data['Pertanyaan'].map(_clean_question))

In [6]:
# Vocabulary: maps each distinct word to an integer id. Ids start at 1 and are
# handed out in order of first appearance, so 0 is never assigned to a word.
kata_index = {}
for question in data['Pertanyaan']:
    for token in question.lower().split():
        # setdefault only stores the new id when the word is not present yet.
        kata_index.setdefault(token, len(kata_index) + 1)

In [7]:
# Encode every question as a list of word ids: lowercase, split on whitespace,
# then look each word up in the vocabulary dictionary.
sequences = [
    [kata_index[token] for token in question.lower().split()]
    for question in data['Pertanyaan']
]

In [8]:
# Length of the longest encoded question.
max_length = max(map(len, sequences))

# Right-pad every sequence with zeros up to max_length.
padded_sequences = [
    encoded + [0] * (max_length - len(encoded))
    for encoded in sequences
]

# Keep the padded result under a second name as well.
padded_sequences_variable = padded_sequences

In [9]:
# Alias the padded sequences as the training inputs.
train = padded_sequences_variable

In [10]:
# Convert the list of padded sequences into a NumPy array.
x_train = np.array(train)

In [11]:
# Input length: size of x_train's second axis; reused at prediction time to
# pad the user's message to the same length.
panjang_input = x_train.shape[1]

In [12]:
# Category labels and their integer encoding (ids follow first-appearance order).
labels = data['Kategori']
kategori_unik = labels.unique()
label_mapping = dict(zip(kategori_unik, range(len(kategori_unik))))
# Encoded labels, first as a Series and then as a NumPy array.
y_train1 = labels.map(label_mapping)
y_train = y_train1.to_numpy()

In [13]:
# Reverse lookup: integer id -> category label.
label_mapping_inverse = dict(zip(label_mapping.values(), label_mapping.keys()))

In [None]:
# Vocabulary size: number of distinct words collected in kata_index.
vocabulary = len(kata_index)
print("Jumlah vocabulary data pertanyaan: ", vocabulary)

# Output size: number of distinct categories found by np.unique.
tag = np.unique(data['Kategori'])
panjang_output = tag.size
print("Jumlah kelas unik: ", panjang_output)
# Show at most the first five categories.
print("Kelas unik: ", tag[:5])

In [15]:
# Group the answers by category: category -> list of answers, in row order.
jawaban = {}

for _, baris in df.iterrows():
    # setdefault creates the list on first sight of a category, then we append.
    jawaban.setdefault(baris['Kategori'], []).append(baris['Jawaban'])

In [16]:
# Load the saved Keras model from 'model.h5' (path relative to the working directory).
# NOTE(review): presumably trained with the same vocabulary ids and label
# order that the cells above rebuild — confirm against the training notebook.
model = tf.keras.models.load_model('model.h5')

In [None]:
# Read one message from the user.
prediction_input = input('👨‍🦰 Kamu : ')

# Normalize the message the same way the training questions were normalized:
# drop ASCII punctuation, lowercase, then turn any other non-word character
# into a space.
prediction_input = ''.join(
    letters.lower() for letters in prediction_input if letters not in string.punctuation
)
prediction_input = re.sub(r"[^\w]", ' ', prediction_input)

# Tokenize and encode. Words missing from the vocabulary become 0, the same
# value used for padding.
words = prediction_input.split()
sequence = [kata_index.get(word, 0) for word in words]
# The input must be exactly `panjang_input` ids long: keep only the first
# `panjang_input` words of a longer message, and right-pad a shorter one with 0.
sequence = sequence[:panjang_input]
padded_sequence = sequence + [0] * (panjang_input - len(sequence))

# Predict; the class with the highest score wins.
prediction = model.predict(np.array([padded_sequence]))
predicted_class = int(np.argmax(prediction))

# Map the class id back to its category label (None when the id is unknown).
predicted_tag = label_mapping_inverse.get(predicted_class)

# Candidate answers for that category (at most the first 20); an unknown
# category yields an empty list instead of raising.
predicted_answer = jawaban.get(predicted_tag, [])[0:20]

# Reply with a random candidate, or report that nothing was found.
if predicted_answer:
    print("SMILE :) : ", random.choice(predicted_answer))
else:
    print("Jawaban tidak ditemukan.")