In [1]:
import pandas as pd
import nltk
import string
import re
import ast
import numpy as np
import json
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from keras.models import load_model

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [3]:
!pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory



# Preprocessing Function

In [4]:
with open("slang_words.txt", "r") as slang_file:
  slang_content = slang_file.read()
  slang_words = ast.literal_eval(slang_content)

In [5]:
def remove_punctuation(text):
  text = re.sub('-',' ',text)
  text = text.translate(str.maketrans('', '', string.punctuation))
  return text

def case_folding(text):
  text = text.lower()
  return text

def tokenizingText(text):
  text = word_tokenize(text)
  return text

def slang_word(text):
  filtered = []
  for txt in text:
    if txt not in slang_words.keys():
      filtered.append(txt)
    if txt in slang_words.keys():
      x = txt.replace(txt, slang_words[txt])
      filtered.append(x)
  text = filtered
  return text

def stemmingText(text):
  factory = StemmerFactory()
  stemmer = factory.create_stemmer()
  text = [stemmer.stem(word) for word in text]
  return text

def toSentence(list_words):
  sentence = ' '.join(word for word in list_words)
  return sentence

In [6]:
def data_preprocessing(list):
  cleaned_text = [remove_punctuation(x) for x in list]
  cleaned_text = [case_folding(x) for x in cleaned_text]
  tokenized_text = [tokenizingText(x) for x in cleaned_text]
  standarized_text = [slang_word(x) for x in tokenized_text]
  stemmed_text = [stemmingText(x) for x in standarized_text]
  preprocessed_text = [toSentence(x) for x in tokenized_text]

  return preprocessed_text

# Tokenizing Function

In [7]:
tokenizer = Tokenizer(oov_token='<OOV>')
with open('word_index.json') as json_word_index:
  word_index = json.load(json_word_index)
tokenizer.word_index = word_index

In [8]:
def token_for_sequences(preprocessed_text):
  sequences = tokenizer.texts_to_sequences(preprocessed_text)
  padded_sequences = pad_sequences(sequences, padding='post', maxlen=150)

  return padded_sequences

# Model and Prediction

In [9]:
model = load_model('lstm_model.h5')

In [10]:
def preprocess_to_predict(text):
  preprocessed_text = data_preprocessing(text)
  padded_sequences = token_for_sequences(preprocessed_text)
  result = model.predict(padded_sequences)
  weight = 0
  denominator = 0
  for value in result:
    weight +=1
    value *= weight
    denominator+=weight
  avg_result = np.sum(result)/denominator
  return avg_result

# Testing

In [11]:
preprocess_to_predict(['Sangat agresif saat melihat anjing lain; Saya takut berantem dengan anjing lain.',
                       'Ketika aku memanggilnya untuk makan, Henry akan datang dengan cepat, mengibaskan ekornya. Dia akan duduk dengan sabar menunggu sinyal dariku, dan kemudian menikmati makanannya dengan lahap.',
                       'Anjingku tidak nafsu makan, meskipun dia masih sering bermain dengan teman-temannya',
                       'anjingku benar-benar malas banget, dia cuma tidur-tiduran tidak karuan. AKu juga sudah mencoba membawanya berkeliling, tapi dia kelihatan gak semangat.',
                       'Hari ini anjingku tidak lagi kencing sembarangan dan saya senang sekali',
                       'Ia selalu nyenyak saat majikannya, Rani, memberinya tempat tidur yang nyaman, seperti kasur, bantal, atau selimut.',
                       'Gani adalah anjing yang suka bersosialisasi. Gani adalah anjing yang ramah dan asertif. Ia selalu senang saat majikannya, Rani, membawanya ke tempat-tempat ramai, seperti pasar, mall, atau taman.',
                       ])



0.5429657186780658