# Import Libraries and Dependencies

In [24]:
import nltk  # Import NLTK (Natural Language Toolkit).
nltk.download('punkt_tab')  # Dataset for tokenization
nltk.download('stopwords')  # Dataset for stopwords

from nltk.tokenize import word_tokenize  # Tokenize text
from nltk.corpus import stopwords  # Stopwords list

import re, string
import numpy as np

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Model and Dataset

In [36]:
from tensorflow.keras.models import load_model
import pandas as pd

# Load data
df = pd.read_csv('app_review.csv')
# Draw a reproducible (seeded) 50,000-row sample; the tokenizer further down
# is fitted on this sample's 'content' column.
# NOTE(review): DataFrame.sample raises ValueError when the CSV holds fewer
# than 50,000 rows (sampling without replacement) - confirm the dataset size.
df_limit = df.sample(n=50000, random_state=42)

# Load trained model
# One Keras model per recurrent architecture, read from HDF5 files in the
# working directory.
model_lstm = load_model('model_lstm.h5')
model_gru = load_model('model_gru.h5')
model_rnn = load_model('model_simplernn.h5')

# Label mapping
# The predict* functions index this list with the argmax of the model output,
# so position i must be the name of class i.
# NOTE(review): presumably this order matches the label encoding used during
# training - confirm against the training notebook.
labels = ['negative', 'neutral', 'positive']



# Preprocessing

## Removing Special Characters

In [5]:
def cleaningText(text):
    """Strip mentions, hashtags, retweet markers, links, digits and punctuation.

    Args:
        text: Raw review/tweet string.

    Returns:
        The cleaned string with leading and trailing spaces removed. Inner
        runs of spaces left behind by the removals are kept as they are.
    """
    # Applied strictly in this order; each pattern's matches are deleted.
    removal_patterns = (
        r'@[A-Za-z0-9]+',  # mention
        r'#[A-Za-z0-9]+',  # hashtag
        r'RT[\s]',         # "RT" followed by whitespace (retweet marker)
        r"http\S+",        # link
        r'[0-9]+',         # number
        r'[^\w\s]',        # anything that is neither a word char nor whitespace
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # Line breaks become plain spaces.
    text = text.replace('\n', ' ')

    # The regex above keeps "_" (it counts as a word character); this pass
    # removes it together with any remaining ASCII punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table).strip(' ')

## Case Folding

In [6]:
def casefoldingText(text):
    """Return ``text`` with every character converted to lowercase."""
    return text.lower()

## Tokenizing

In [7]:
def tokenizingText(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: String to tokenize.

    Returns:
        The token sequence produced by ``word_tokenize``.
    """
    return word_tokenize(text)

## Stopwords & Slangwords

In [8]:
# Stopword sets keyed by language, filled on first use so the NLTK corpus
# file is read once instead of on every call.
_STOPWORDS_CACHE = {}


def filteringText(text):  # delete stopwords
    """Remove English stopwords from a sequence of tokens.

    Args:
        text: Iterable of token strings (e.g. the output of tokenizingText).

    Returns:
        A new list holding, in the original order, every token that is not
        in NLTK's English stopword list. Matching is exact (case-sensitive).
    """
    # This function runs once per review, so the stopword list is loaded
    # lazily and reused for later calls.
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    english_stopwords = _STOPWORDS_CACHE['english']

    return [token for token in text if token not in english_stopwords]

In [9]:
# Lookup table from lowercase slang word to its replacement. No entries are
# defined at this point in the file.
slangwords = {}


def fix_slangwords(text):
    """Replace slang words in ``text`` using the ``slangwords`` table.

    The text is split on whitespace; each word is looked up in lowercase
    form and swapped for its table entry when one exists, otherwise kept
    unchanged.

    Args:
        text: String to normalise.

    Returns:
        The words joined back together with single spaces.
    """
    return ' '.join(
        slangwords.get(token.lower(), token) for token in text.split()
    )

## Lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def lemmatizingText(text):  # Reduce each word to its base form
    """Lemmatize a sequence of tokens and join them into one string.

    Args:
        text: Iterable of token strings.

    Returns:
        A single space-separated string in which every token has been
        lowercased and passed through ``WordNetLemmatizer.lemmatize``.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return ' '.join(
        wordnet_lemmatizer.lemmatize(token.lower()) for token in text
    )

[nltk_data] Downloading package wordnet to /root/nltk_data...


## Applying

In [13]:
def preprocessingText(text):
    """Run one raw text through the full preprocessing pipeline.

    Args:
        text: Raw review string.

    Returns:
        The string returned by the last step, lemmatizingText.
    """
    # Each step consumes the previous step's output: clean -> lowercase ->
    # tokenize -> drop stopwords -> lemmatize and re-join.
    pipeline = (
        cleaningText,
        casefoldingText,
        tokenizingText,
        filteringText,
        lemmatizingText,
    )
    result = text
    for step in pipeline:
        result = step(result)
    return result

# Prediction Setup

In [37]:
# Tokenizer Setup
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

# Preprocess the 'content' column of the sampled reviews; preprocessingText
# returns one cleaned string per review.
preprocessed_texts = df_limit['content'].apply(preprocessingText)  # make sure this yields strings

# Fit a fresh word index on the sampled reviews; words missing from the index
# are mapped to the '<OOV>' token.
# NOTE(review): the models loaded above were trained elsewhere. Their
# predictions are only meaningful if this tokenizer reproduces the vocabulary
# used at training time - confirm, or load the training-time tokenizer instead.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(preprocessed_texts)

# Save the tokenizer
# textToSequence reads this file back on every call.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [42]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def textToSequence(text):
    """Turn one raw text into a padded integer sequence for the models.

    Args:
        text: Raw review string.

    Returns:
        The result of ``pad_sequences`` for the single encoded text,
        post-padded to a length of 100.
    """
    cleaned_text = preprocessingText(text)

    # Reload the pickled tokenizer so the word indices are the saved ones.
    with open('tokenizer.pkl', 'rb') as tokenizer_file:
        saved_tokenizer = pickle.load(tokenizer_file)

    # Convert to a sequence of integers; the tokenizer expects a list of
    # texts, hence the one-element list.
    encoded = saved_tokenizer.texts_to_sequences([cleaned_text])

    # Pad so every input has the same length.
    return pad_sequences(encoded, padding='post', maxlen=100)

def predictLstm(text):
    """Classify one raw text with the LSTM model.

    Args:
        text: Raw review string.

    Returns:
        A ``(label, confidence)`` tuple: the entry of ``labels`` at the
        argmax of the model output, and the largest output value as a float.
    """
    scores = model_lstm.predict(textToSequence(text))

    # Only one text is predicted, so the first row's argmax is the class.
    top_class = np.argmax(scores, axis=1)[0]
    top_score = np.max(scores)

    return labels[top_class], float(top_score)

def predictGru(padded_text):
    """Classify one raw text with the GRU model.

    Args:
        padded_text: Despite its name, the RAW review string; preprocessing
            and padding happen inside this function. The parameter name is
            kept so existing callers keep working.

    Returns:
        A ``(label, confidence)`` tuple: the entry of ``labels`` at the
        argmax of the model output, and the largest output value as a float.
    """
    # Use the argument itself. The previous body read an undefined local
    # name ``text`` and therefore silently picked up the notebook-global
    # variable instead of the value passed in.
    sequence = textToSequence(padded_text)

    # prediction
    pred = model_gru.predict(sequence)

    # Only one text is predicted, so the first row's argmax is the class.
    predicted_class = np.argmax(pred, axis=1)[0]
    confidence = np.max(pred)

    return labels[predicted_class], float(confidence)

def predictRnn(padded_text):
    """Classify one raw text with the SimpleRNN model.

    Args:
        padded_text: Despite its name, the RAW review string; preprocessing
            and padding happen inside this function. The parameter name is
            kept so existing callers keep working.

    Returns:
        A ``(label, confidence)`` tuple: the entry of ``labels`` at the
        argmax of the model output, and the largest output value as a float.
    """
    # Use the argument itself. The previous body read an undefined local
    # name ``text`` and therefore silently picked up the notebook-global
    # variable instead of the value passed in.
    sequence = textToSequence(padded_text)

    # prediction
    pred = model_rnn.predict(sequence)

    # Only one text is predicted, so the first row's argmax is the class.
    predicted_class = np.argmax(pred, axis=1)[0]
    confidence = np.max(pred)

    return labels[predicted_class], float(confidence)

# Inference

In [45]:
# Positive
# Run one review through all three models and print each predicted label
# with that model's own confidence.
text = "This app is perfect!"
sent_lstm, conf_lstm = predictLstm(text)
print(f"Prediksi LSTM: {sent_lstm} (Confidence LSTM: {conf_lstm:.2f})")
sent_gru, conf_gru = predictGru(text)
print(f"Prediksi GRU: {sent_gru} (Confidence GRU: {conf_gru:.2f})")
sent_rnn, conf_rnn = predictRnn(text)
print(f"Prediksi RNN: {sent_rnn} (Confidence RNN: {conf_rnn:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
Prediksi LSTM: positive (Confidence LSTM: 1.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
Prediksi GRU: positive (Confidence LSTM: 1.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Prediksi RNN: positive (Confidence LSTM: 1.00)


In [46]:
# Negative
# Run one review through all three models and print each predicted label
# with that model's own confidence.
text = "This app is really bad!"
sent_lstm, conf_lstm = predictLstm(text)
print(f"Prediksi LSTM: {sent_lstm} (Confidence LSTM: {conf_lstm:.2f})")
sent_gru, conf_gru = predictGru(text)
print(f"Prediksi GRU: {sent_gru} (Confidence GRU: {conf_gru:.2f})")
sent_rnn, conf_rnn = predictRnn(text)
print(f"Prediksi RNN: {sent_rnn} (Confidence RNN: {conf_rnn:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Prediksi LSTM: negative (Confidence LSTM: 1.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Prediksi GRU: negative (Confidence LSTM: 1.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Prediksi RNN: negative (Confidence LSTM: 0.54)


In [47]:
# Neutral
# Run one review through all three models and print each predicted label
# with that model's own confidence.
text = "This app allows you to listen to music all day!"
sent_lstm, conf_lstm = predictLstm(text)
print(f"Prediksi LSTM: {sent_lstm} (Confidence LSTM: {conf_lstm:.2f})")
sent_gru, conf_gru = predictGru(text)
print(f"Prediksi GRU: {sent_gru} (Confidence GRU: {conf_gru:.2f})")
sent_rnn, conf_rnn = predictRnn(text)
print(f"Prediksi RNN: {sent_rnn} (Confidence RNN: {conf_rnn:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Prediksi LSTM: neutral (Confidence LSTM: 0.77)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Prediksi GRU: neutral (Confidence LSTM: 0.96)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Prediksi RNN: positive (Confidence LSTM: 0.72)
