## 1. Import Libraries

In [1]:
# Import libraryu for load file pkl and txt
import pickle  
import json 

# import libnrary for proses data manipulation
import pandas as pd 
import numpy as np
import math

# import for process stemming using sastrawi 
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


from collections import defaultdict
from tensorflow.keras.preprocessing.sequence import pad_sequences

## 2. Load model and tag,word dictionary.

In [2]:
with open("model_NER.pkl","rb") as file_1:
    model = pickle.load(file_1)

with open("wordDict.json","r") as file_2:
    word2idx = json.load(file_2)

with open("tagDict.json","r") as file_3:
    tag2idx = json.load(file_3)

## 3. Data Inferential Preparation

In [3]:
# create data dummy for model inferential 
data_inferential =  "Fortnite Brp fps kk "

### 3.1 Replace enter with space

In [4]:
# Replace newline into space
data_inferential = data_inferential.replace('\n',' ')

### ***3.2 Remove unnecessary characters (Only alphabetic remaining)***

In [5]:
result = ""
for char in data_inferential:
    if (char == " " or char.isalpha()) and char != "²":
        result+= char
    else:
        result += " "

data_inferential = result
data_inferential

'Forza horizon   bsa om  bsa skalian di install  '

### ***3.3 Change letters to lowercase***

In [6]:
# convert data inferential to lower case 
data_inferential = data_inferential.lower()
data_inferential

'forza horizon   bsa om  bsa skalian di install  '

### ***3.4 Remove White Spaces***

In [7]:
# remove whitespaces in data inferential 
data_inferential = data_inferential.strip()
data_inferential

'forza horizon   bsa om  bsa skalian di install'

### ***3.5  Tokenization***

In [8]:
data_inferential_token = data_inferential.split(" ")
data_inferential_token

['forza',
 'horizon',
 '',
 '',
 'bsa',
 'om',
 '',
 'bsa',
 'skalian',
 'di',
 'install']

### ***3.6 Change Slang Word into Normal***

In [9]:
# read file csv for slang dictionary 
slang = pd.read_csv("../dataset/Slang2.csv")
slang_dict = dict(zip(slang['slang'], slang['formal']))

# del specific key because it is not neccessary for PC GAMING NER scenario 
del slang_dict['main']
del slang_dict['banget']
del slang_dict['uhh']
del slang_dict['takut']
del slang_dict['da']
del slang_dict['uhhh']

# edit specific key in slang_dict 
slang_dict['dahhhh'] = 'sudah'
slang_dict['kalo'] = 'kalau'

In [10]:
list_result = []
for word in data_inferential_token:
    steming_slang = slang_dict.get(word)
    if steming_slang == None:
        list_result.append(word)
    else:
        try:
            if(math.isnan(steming_slang)):
                list_result.append(word)
            else:
                list_result.append(steming_slang)
        except:
            list_result.append(steming_slang)

data_inferential_token = list_result
data_inferential_token


['forza',
 'horizon',
 '',
 '',
 'bisa',
 'om',
 '',
 'bisa',
 ' sekalian',
 'di',
 'install']

### ***3.7 Stemming***

In [11]:
# Create object that use to stemming word in indonesia using Sastrawati
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [12]:
list_skip_steming_word = ['kinemaster','setingan','bekasi','seandainya','seting','rohan','lemot','kesing','diseting']

In [13]:
list_result = []
for word in data_inferential_token:
    stemmed_word = word
    if word not in list_skip_steming_word:
        stemmed_word = stemmer.stem(word)
    list_result.append(stemmed_word)
data_inferential_token = list_result
data_inferential_token

['forza',
 'horizon',
 '',
 '',
 'bisa',
 'om',
 '',
 'bisa',
 'sekali',
 'di',
 'install']

In [14]:
# Steming using manual word
list_kata_dasar = ['setting','packing','offline','pc','seting','memory','software','ssd','halo','render','ongkir','ganti','upgrade','vga','mobo','case','casing','install','keyboard','ddr','processor','hdd','storage']
list_result = []
for token in data_inferential_token:
    word_result = token
    for kata_dasar in list_kata_dasar:
        if token.find(kata_dasar) != -1:
            word_result = kata_dasar
            break
    list_result.append(word_result)

data_inferential_token = list_result 
data_inferential_token

['forza',
 'horizon',
 '',
 '',
 'bisa',
 'om',
 '',
 'bisa',
 'sekali',
 'di',
 'install']

### ***3.8 Stopword***

In [15]:
# initiate object that use to remove stopword using sastrawi
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

In [16]:
def remove_stopwords(tokens):
    text = ' '.join(tokens)  # Convert list of tokens to a single string
    filtered_text = stopword_remover.remove(text)  # Remove stopwords
    return filtered_text.split()  # Convert the filtered string back to a list of tokens

In [17]:
data_inferential_token = remove_stopwords(data_inferential_token)
data_inferential_token

['forza', 'horizon', 'om', 'sekali', 'install']

### 3.9 convert word into categorial nominal

In [18]:
def convert_word_to_nominal_category(list_word):
    '''
    This function is used to change list word to categorial nominal based on word_dict that created when 
    process modelling 

    parameter_description 
    ========================
    list_word = list of word

    example usage 
    ====================
    data_inferential_token = ['untuk', 'recordd', 'mlbb', 'dann', 'pubg', 'kuat', 'kah', 'gan']
    data_inferential_token  = convert_word_to_nominal_category(data_inferential_token)ata_inferential_token= remove_stopword(data_inferential_token)
    data_inferential_token
    '''
    list_result = []
    for word in list_word:
        if word in word2idx:
            index = word2idx[word]
            list_result.append(index)
        else:
            list_result.append(0)
    return list_result

In [19]:
data_inferential_token

['forza', 'horizon', 'om', 'sekali', 'install']

In [20]:
data_inferential_token_id = convert_word_to_nominal_category(data_inferential_token)
list_result = []
for index in data_inferential_token_id:
    if index != 0:
        list_result.append(index)
data_inferential_token_id = list_result
total_word = len(data_inferential_token_id)
data_inferential_token_id

[223, 145, 122]

In [21]:
data_inferential_token_id = np.array([data_inferential_token_id])
data_inferential_token_id

array([[223, 145, 122]])

In [22]:
data_inferential_token_id = pad_sequences(maxlen=40, sequences=data_inferential_token_id, padding="post")
data_inferential_token_id

array([[223, 145, 122,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0]], dtype=int32)

In [23]:
pred = model.predict(data_inferential_token_id)
pred = np.argmax(pred, axis=-1)
pred = pred[0][0:total_word]
pred = np.array([pred])
pred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step


array([[ 6, 58, 22]])

In [24]:
idx2tag = {v: k for k, v in tag2idx.items()}
idx2word = {v: k for k, v in word2idx.items()}

# Decode data_inferential_token_id
decoded_words = np.vectorize(lambda x: idx2word.get(x, 'Unknown'))(data_inferential_token_id)
# Decode y_pred
decoded_tags = np.vectorize(lambda x: idx2tag.get(x, 'None'))(pred)

# Check for 'None' values in decoded_tags and handle them
for row in decoded_tags:
    for i, tag in enumerate(row):
        if tag == 'None':
            # Handle the 'None' value appropriately, e.g., replace with a default tag or remove
            row[i] = 'ENPAD'  # Replace 'None' with 'Unknown'

print("Decoded words:", decoded_words)
print("Decoded tags:", decoded_tags)

Decoded words: [['forza' 'horizon' 'install' 'Unknown' 'Unknown' 'Unknown' 'Unknown'
  'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown'
  'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown'
  'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown'
  'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown'
  'Unknown' 'Unknown' 'Unknown' 'Unknown' 'Unknown']]
Decoded tags: [['B-Game' 'ENPAD' 'B-Request']]


In [25]:
print("{:15}\t{}".format("Word", "Pred"))
print("-" * 30)

for word, tag in zip(decoded_words[0], decoded_tags[0]):
    print("{:15}\t{}".format(word, tag))

Word           	Pred
------------------------------
forza          	B-Game
horizon        	ENPAD
install        	B-Request
