## 1. Import Libraries

In [1]:
# Import libraries for loading the pkl and json files
import pickle  
import json 

# Import libraries for data manipulation
import pandas as pd 
import numpy as np
import math

# import for process stemming using sastrawi 
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


from collections import defaultdict
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file,
# so only load a model_NER.pkl that comes from a trusted source.
with open("model_NER.pkl", "rb") as file_1:
    model = pickle.load(file_1)

# Pass the encoding explicitly so both dictionaries are decoded the same way
# on every platform instead of relying on the locale's default codec.
with open("wordDict.json", "r", encoding="utf-8") as file_2:
    word2idx = json.load(file_2)

with open("tagDict.json", "r", encoding="utf-8") as file_3:
    tag2idx = json.load(file_3)

In [3]:
# Dummy input sentence used to run the inference pipeline.
# It contains upper-case words, repeated letters and punctuation, which the
# cleaning cells below operate on.
data_inferential =  "met forza horizon bisa recordd MLBB kepulauan dann PUBG kuat kah gan?? "

In [4]:
def clean_special_character(text):
    '''
    Remove every character that is not a letter, a digit or a plain space.

    parameter description
    ===========================
    text = question or regular sentence (str)

    returns
    ===========================
    A new string made of the alphanumeric characters and spaces of `text`,
    in their original order. Any other character (punctuation, tabs,
    newlines, ...) is dropped without inserting a replacement, so the
    characters on either side of it end up adjacent.

    usage example
    ===================
    data_inferential = "untuk record MLBB dan PUBG kuat kah gan??"
    data_inferential = clean_special_character(data_inferential)
    # -> "untuk record MLBB dan PUBG kuat kah gan"
    '''
    # Build the result in one pass with join() rather than repeated string
    # concatenation inside a loop.
    return "".join(char for char in text if char == " " or char.isalnum())

# Remove special characters from the inference sentence, then display the result
data_inferential = clean_special_character(data_inferential)
data_inferential

'met forza horizon bisa recordd MLBB kepulauan dann PUBG kuat kah gan '

In [5]:
# Convert the inference sentence to lower case, then display the result
data_inferential = data_inferential.lower()
data_inferential

'met forza horizon bisa recordd mlbb kepulauan dann pubg kuat kah gan '

In [6]:
# Strip leading and trailing whitespace only; spaces between words are kept
data_inferential = data_inferential.strip()
data_inferential

'met forza horizon bisa recordd mlbb kepulauan dann pubg kuat kah gan'

In [7]:
# Tokenise on runs of whitespace. Unlike split(" "), split() with no argument
# never produces empty-string tokens when two spaces end up side by side
# (for example after a standalone punctuation mark has been removed).
data_inferential_token = data_inferential.split()
data_inferential_token

['met',
 'forza',
 'horizon',
 'bisa',
 'recordd',
 'mlbb',
 'kepulauan',
 'dann',
 'pubg',
 'kuat',
 'kah',
 'gan']

In [8]:
# Load the slang lookup table from Slang2.csv
slang = pd.read_csv("Slang2.csv")

# Build a dictionary keyed by the 'slang' column with the 'formal' column as
# values, for faster lookups than scanning the DataFrame
slang_dict = dict(zip(slang['slang'], slang['formal']))

def replace_slang(tokens, slang_map=None):
    '''
    Replace Indonesian slang words with their standard (formal) form,
    for example "met" becomes "selamat".

    parameter description
    =============================
    tokens = list of words
    slang_map = optional dict mapping a slang word to its formal word; when
                omitted, the notebook-level `slang_dict` is used

    returns
    =============================
    A new list with the same length and order as `tokens`. A token is kept
    unchanged when it has no entry in the mapping, or when its entry is a
    NaN float (a missing value in the lookup table).

    example usage
    ==============================
    data_inferential_token = ['untuk', 'recordd', 'mlbb', 'dann', 'pubg', 'kuat', 'kah', 'gan']
    data_inferential_token  = replace_slang(data_inferential_token)
    '''
    if slang_map is None:
        slang_map = slang_dict

    list_result = []
    for token in tokens:
        formal = slang_map.get(token)
        # Only floats can be NaN; checking the type first means string values
        # never reach math.isnan, so no exception has to be swallowed.
        is_missing = formal is None or (isinstance(formal, float) and math.isnan(formal))
        list_result.append(token if is_missing else formal)
    return list_result

# Apply the slang replacement function to data_inferential_token, then display the result
data_inferential_token  = replace_slang(data_inferential_token)
data_inferential_token

['selamat',
 'forza',
 'horizon',
 'bisa',
 'recordd',
 'mlbb',
 'kepulauan',
 'dann',
 'pubg',
 'kuat',
 'kah',
 'gan']

In [9]:
def remove_stopwords(tokens, remover=None):
    '''
    Remove stopwords from a list of tokens.

    parameter description
    =============================
    tokens = list of words
    remover = optional object exposing remove(text) -> str; when omitted, a
              default Sastrawi stopword remover is created

    returns
    =============================
    List of the words that remain after the remover has processed the
    space-joined tokens.

    example usage
    ==============================
    data_inferential_token = remove_stopwords(data_inferential_token)
    '''
    if remover is None:
        # NOTE(review): this function used to read a global `stopword` that no
        # cell in the notebook defines, so a fresh kernel failed with NameError.
        # The default Sastrawi remover is assumed here - confirm it matches the
        # one used when the model was trained.
        remover = StopWordRemoverFactory().create_stop_word_remover()

    text = ' '.join(tokens)  # Convert list of tokens to a single string
    filtered_text = remover.remove(text)  # Remove stopwords
    return filtered_text.split()  # Convert the filtered string back to a list of tokens

In [10]:
# Drop stopwords from the token list
data_inferential_token = remove_stopwords(data_inferential_token)

In [11]:
# Maps each stemmed form to the list of original tokens that produced it
original_tokens = defaultdict(list)

In [12]:
# No earlier cell creates `stemmer`, so build the Sastrawi stemmer here;
# otherwise this cell raises NameError on a fresh kernel.
stemmer = StemmerFactory().create_stemmer()

# data_inferential_token is a flat list of words, so iterate over it directly.
# A nested loop would walk every word character by character and record single
# letters instead of words.
for token in data_inferential_token:
    stemmed_token = stemmer.stem(token)
    original_tokens[stemmed_token].append(token)

In [13]:
# For every stem keep its most frequent original token. dict.fromkeys() removes
# duplicates while preserving first-seen order, so a tie is always resolved in
# favour of the token seen first (iterating a set of strings gives an order that
# can change between interpreter sessions).
common_original_tokens = {stem: max(dict.fromkeys(tokens), key=tokens.count) for stem, tokens in original_tokens.items()}

# Step 2: Define the stemming function
def stem_tokens(tokens, token_stemmer=None):
    '''
    Stem every token in a list.

    parameter description
    =============================
    tokens = list of words
    token_stemmer = optional object exposing stem(word) -> str; when omitted,
                    a default Sastrawi stemmer is created. Pass an existing
                    stemmer to avoid building a new one on each call.

    returns
    =============================
    List with the stemmed form of each token, in the same order as `tokens`.
    '''
    if token_stemmer is None:
        # NOTE(review): this function used to read a global `stemmer` that no
        # cell in the notebook defines; the default Sastrawi stemmer is assumed.
        token_stemmer = StemmerFactory().create_stemmer()
    return [token_stemmer.stem(token) for token in tokens]

# Replace every token in data_inferential_token with its stemmed form
data_inferential_token = stem_tokens(data_inferential_token)

In [14]:
def convert_word_to_nominal_category(list_word, vocab=None):
    '''
    Convert a list of words into their integer ids (nominal categories) using
    the word dictionary that was created during the modelling process.

    parameter description
    ========================
    list_word = list of words
    vocab = optional dict mapping word -> integer id; when omitted, the
            notebook-level `word2idx` is used

    returns
    ========================
    List of ids, in input order. Words that are not present in the dictionary
    are skipped, so the result can be shorter than `list_word`.

    example usage
    ====================
    data_inferential_token = ['untuk', 'recordd', 'mlbb', 'dann', 'pubg', 'kuat', 'kah', 'gan']
    data_inferential_token_id = convert_word_to_nominal_category(data_inferential_token)
    data_inferential_token_id
    '''
    if vocab is None:
        vocab = word2idx
    return [vocab[word] for word in list_word if word in vocab]

In [15]:
# Map the stemmed tokens to their ids; tokens missing from word2idx are dropped
data_inferential_token_id = convert_word_to_nominal_category(data_inferential_token)

In [16]:
# Display the ids of the tokens that were found in the vocabulary
data_inferential_token_id

[30, 146, 496, 339, 410, 438]

In [17]:
# Turn the ids into a single-row 2-D array (one sentence per row), which is the
# form passed to model.predict later in the notebook. reshape(1, -1) keeps this
# cell idempotent: re-running it does not nest the array one level deeper, as
# wrapping the value in another list would.
data_inferential_token_id = np.array(data_inferential_token_id).reshape(1, -1)

In [18]:
# Display the array to check that the ids now sit in a single row
data_inferential_token_id

array([[ 30, 146, 496, 339, 410, 438]])

In [19]:
# Inspect the word -> id dictionary loaded from wordDict.json
word2idx

{'370': 1,
 '1': 2,
 'fortnite': 3,
 'last': 4,
 'assalmmualaikum kak': 5,
 'blender': 6,
 'call': 7,
 'ssdny': 8,
 'hhd': 9,
 'ethernal': 10,
 'desain': 11,
 'mini': 12,
 'intel': 13,
 'aplikasi': 14,
 'casingnya': 15,
 'simpan': 16,
 'case': 17,
 'eksternal': 18,
 'steamnya': 19,
 'alamat': 20,
 'microsd': 21,
 'varian': 22,
 'dessert': 23,
 'ram': 24,
 'i7gen': 25,
 'cyber': 26,
 '24mp88': 27,
 'vanguard': 28,
 'lengkap': 29,
 'forza': 30,
 'us': 31,
 'harga56jtan': 32,
 'cod': 33,
 'merek': 34,
 'redy': 35,
 'gta': 36,
 'paking': 37,
 'pes': 38,
 '8x2': 39,
 'kredit': 40,
 'ongkir': 41,
 'baja': 42,
 'pointblank': 43,
 'lokasi': 44,
 'core': 45,
 'cooler': 46,
 'vesta': 47,
 'redemtion2': 48,
 'center': 49,
 'include': 50,
 'mouse': 51,
 'ganti': 52,
 'usb': 53,
 '27': 54,
 'duty': 55,
 'memori': 56,
 'gtx750': 57,
 '16gb': 58,
 'alden': 59,
 'pcnya': 60,
 'fs2020': 61,
 'indonesia': 62,
 'asus': 63,
 'ride': 64,
 'ngerakit': 65,
 'vganya': 66,
 'wifii': 67,
 'point': 68,
 'ets2': 

In [20]:
# Run the model loaded from model_NER.pkl on the single-sentence batch of ids
y_pred = model.predict(data_inferential_token_id)
# Keep, along the last axis, the index of the highest value
# (presumably one predicted tag id per token - confirm against the model output shape)
y_pred = np.argmax(y_pred, axis=-1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


In [21]:
# Display the predicted indices
y_pred

array([[ 42,  42,  42, 130, 167, 128]], dtype=int64)

In [22]:
# Inspect the tag -> id dictionary loaded from tagDict.json
tag2idx

{'B-Kegiatan': 0,
 'E-GamePerformance': 1,
 'E-Aksesoris': 2,
 'B-Lengkap': 3,
 'B-DevicePerformance': 4,
 'B-FullsetAtauRakit': 5,
 'E-Original': 6,
 'B-Request': 7,
 'B-Pengiriman': 8,
 'E-Request': 9,
 'E-Tempat': 10,
 'B-Garansi': 11,
 'E-Software': 12,
 'B-Bayar': 13,
 'I-Device': 14,
 'B-Harga': 15,
 'E-Spek': 16,
 'I-Merek': 17,
 'B-Merek': 18,
 'I-Spek': 19,
 'I-Aksesoris': 20,
 'B-GamePerformance': 21,
 'I-Bayar': 22,
 'I-GamePerformance': 23,
 'B-rekomendasi': 24,
 'I-Request': 25,
 'I-FullsetAtauRakit': 26,
 'B-Penipuan': 27,
 'I-Pengiriman': 28,
 'B-Software': 29,
 'B-Kecewa': 30,
 'B-Rekomendasi': 31,
 'B-Tempat': 32,
 'E-Pengiriman': 33,
 'I-Software': 34,
 'E-Kegiatan': 35,
 'E-Merek': 36,
 'E-Bayar': 37,
 'E-FullsetAtauRakit': 38,
 'B-Aksesoris': 39,
 'I-Game': 40,
 'B-Game': 41,
 'E-Game': 42,
 'I-Kegiatan': 43,
 'E-Device': 44,
 'B-Spek': 45,
 'B-Original': 46,
 'B-Device': 47,
 'B-Ketersediaan': 48}

In [26]:
# Invert both dictionaries so integer ids can be mapped back to their labels
idx2tag = {v: k for k, v in tag2idx.items()}
idx2word = {v: k for k, v in word2idx.items()}

# Decode data_inferential_token_id; ids without a vocabulary entry become 'Unknown'
decoded_words = np.vectorize(lambda x: idx2word.get(x, 'Unknown'))(data_inferential_token_id)

# Decode y_pred. Predicted ids that have no entry in tag2idx are mapped straight
# to the placeholder 'ENPAD' (presumably the padding tag - confirm against the
# training notebook). Doing this inside the lookup, instead of first writing
# 'None' and then overwriting it in place, matters because the result is a
# fixed-width numpy string array: when every tag is 'None' its dtype is '<U4',
# and assigning 'ENPAD' into it would be silently truncated to 'ENPA'.
decoded_tags = np.vectorize(lambda x: idx2tag.get(x, 'ENPAD'))(y_pred)

print("Decoded words:", decoded_words)
print("Decoded tags:", decoded_tags)

Decoded words: [['forza' 'horizon' 'mlbb' 'pulau' 'pubg' 'kuat']]
Decoded tags: [['E-Game' 'E-Game' 'E-Game' 'ENPAD' 'ENPAD' 'ENPAD']]


In [27]:
# Print the decoded words of the first sentence next to their predicted tags
# as a two-column table: header, separator line, then one row per word.
row_template = "{:15}\t{}"
print(row_template.format("Word", "Pred"))
print("-" * 30)

first_sentence = zip(decoded_words[0], decoded_tags[0])
for word, tag in first_sentence:
    print(row_template.format(word, tag))

Word           	Pred
------------------------------
forza          	E-Game
horizon        	E-Game
mlbb           	E-Game
pulau          	ENPAD
pubg           	ENPAD
kuat           	ENPAD
