# 1. Import Libraries

In [1]:
# Import libraryu for load file pkl and txt
import pickle  
import json 

# import libnrary for proses data manipulation
import pandas as pd 
import numpy as np
import math

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from tensorflow.keras.preprocessing.sequence import pad_sequences

# 2. Load Model

In [2]:
# load model from file pkl
with open('model.pkl', 'rb') as file_1:
  model = pickle.load(file_1)

with open('word_dict.txt', 'r') as file_1:
  word_dict = json.load(file_1)

# 3. Data Pre Processing Text

In [3]:
# create data dummy for model inferential 
data_inferential = "met untuk bisa recordd MLBB kepulauan dann PUBG kuat kah gan?? "

In [4]:

def clean_special_character(text):
    '''
    This function is used to transofrm text so there are no special character ( alphabetic and numeric only)

    parameter description
    ===========================
    text = question or regular sentence 

    usage example 
    ===================
    data_inferential = "untuk record MLBB dan PUBG kuat kah gan??"
    data_inferential = clean_special_character(data_inferential)
    '''
    result = "";
    for char in text:
        if char == " " or char.isalnum():
            result+= char
    return result

# Remove special character in data inferential
data_inferential = clean_special_character(data_inferential)
data_inferential

'met untuk bisa recordd MLBB kepulauan dann PUBG kuat kah gan '

In [5]:
# convert data inferential to lower case 
data_inferential = data_inferential.lower()
data_inferential

'met untuk bisa recordd mlbb kepulauan dann pubg kuat kah gan '

In [6]:
# remove whitespaces in data inferential 
data_inferential = data_inferential.strip()
data_inferential

'met untuk bisa recordd mlbb kepulauan dann pubg kuat kah gan'

In [7]:
data_inferential_token = data_inferential.split(" ")
data_inferential_token

['met',
 'untuk',
 'bisa',
 'recordd',
 'mlbb',
 'kepulauan',
 'dann',
 'pubg',
 'kuat',
 'kah',
 'gan']

In [9]:
# import file Slang2.csv
slang = pd.read_csv("../dataset/Slang2.csv")

# Create a dictionary from the slang DataFrame for faster lookups
slang_dict = dict(zip(slang['slang'], slang['formal']))

def replace_slang(tokens):
    '''
    This function is used for replace slang word indonesia into standard word 
    example "met" become "selamat"

    parameter description
    =============================
    tokens = list of word 

    example usage 
    ==============================
    data_inferential_token = ['untuk', 'recordd', 'mlbb', 'dann', 'pubg', 'kuat', 'kah', 'gan']
    data_inferential_token  = replace_slang(data_inferential_token)
    '''
    # Replace each token if it matches a slang term
    list_result = [];
    for token in tokens:
        steming_slang = slang_dict.get(token)
        if steming_slang == None:
            list_result.append(token)
        else:
            try:
                if(math.isnan(steming_slang)):
                    list_result.append(token)
                else:
                    list_result.append(steming_slang)
            except:
                list_result.append(steming_slang)
    return list_result
    # return [token if math.isnan(slang_dict.get(token)) else slang_dict.get(token)  for token in tokens]

# Apply the slang replacement function to the data_inferential_token
data_inferential_token  = replace_slang(data_inferential_token)
data_inferential_token

['selamat',
 'untuk',
 'bisa',
 'record',
 'mlb',
 'kepulauan',
 'dan',
 'pubg',
 'kuat',
 'kah',
 'gan']

In [10]:
# Initiate object that can be used for Stemming word in indonesia
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def steming_word_sastrawi(list_word):
    '''
    This function is used for stemming word in indonesia 
    example word "kepulauan" become "pulau"

    parameter description 
    ======================
    list_word = list of word 

    usage example 
    ======================
    data_inferential_token = ['untuk', 'recordd', 'mlbb', 'dann', 'pubg', 'kuat', 'kah', 'gan']
    data_inferential_token  = steming_word_sastrawi(data_inferential_token)
    '''
    list_result = []
    for word in list_word:
        stemmed_word = stemmer.stem(word)
        list_result.append(stemmed_word)
    return list_result

# apply function steming_word_sastrawi in data_inferential_token
data_inferential_token= steming_word_sastrawi(data_inferential_token)
data_inferential_token

['selamat',
 'untuk',
 'bisa',
 'record',
 'mlb',
 'pulau',
 'dan',
 'pubg',
 'kuat',
 'kah',
 'gan']

In [11]:
# Initiate object that can be used to remove stopword in list word
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

def remove_stopword(list_word):
    '''
    This function is used to remove stopword in list word using Sastrawati 
    example word "untuk" will be removed because it doesn't have meaning

    parameter description 
    ========================
    list_word = list of word 

    example usage 
    ========================
    data_inferential_token = ['untuk', 'recordd', 'mlbb', 'dann', 'pubg', 'kuat', 'kah', 'gan']
    data_inferential_token  = remove_stopword(data_inferential_token)ata_inferential_token= remove_stopword(data_inferential_token)
    data_inferential_token
    '''
    list_result = []
    for word in list_word:
        stopword_word = stopword_remover.remove(word)
        if stopword_word != '':
            list_result.append(word)
    return list_result
    
data_inferential_token= remove_stopword(data_inferential_token)
data_inferential_token

['selamat', 'record', 'mlb', 'pulau', 'pubg', 'kuat', 'gan']

In [12]:
def convert_word_to_nominal_category(list_word):
    '''
    This function is used to change list word to categorial nominal based on word_dict that created when 
    process modelling 

    parameter_description 
    ========================
    list_word = list of word

    example usage 
    ====================
    data_inferential_token = ['untuk', 'recordd', 'mlbb', 'dann', 'pubg', 'kuat', 'kah', 'gan']
    data_inferential_token  = convert_word_to_nominal_category(data_inferential_token)ata_inferential_token= remove_stopword(data_inferential_token)
    data_inferential_token
    '''
    list_result = []
    for word in list_word:
        try:
            index = word_dict[word]
        except:
            index = word_dict['ENDPAD']
        list_result.append(index)
    return list_result

 
data_inferential_token= convert_word_to_nominal_category(data_inferential_token)
data_inferential_token

[754, 30, 754, 754, 36, 641, 133]

In [15]:
max_len = 70
num_words = len(word_dict)
X = [data_inferential_token]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=num_words-1)
X

array([[754,  30, 754, 754,  36, 641, 133, 753, 753, 753, 753, 753, 753,
        753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753,
        753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753,
        753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753,
        753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753, 753,
        753, 753, 753, 753, 753]], dtype=int32)

In [14]:
X

[754, 30, 754, 754, 36, 641, 133]

In [16]:
predict = model.predict(X)

ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("sequential_1/Cast:0", shape=(32,), dtype=float32). Expected shape (None, 70), but input has incompatible shape (32,)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32,), dtype=int32)
  • training=False
  • mask=None