### ***1. Import Libraries***

----

In [4]:
import pandas as pd
# import tensorflow_hub as tf_hub

import math

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

----
----

<vspace>

### ***2.Load Data***

----

In [5]:
# load data from file question.csv
df = pd.read_csv("../dataset/question.csv")
df

Unnamed: 0,question
0,Forza horizon 5 bsa om .bsa skalian di install?
1,gan itu pc only udah sama dalemannya?
2,"gan, bisa maenin game EA FC 24 dan call of dut..."
3,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...
...,...
1552,gan kalo di pake game microsoft flight simulat...
1553,Min minecraft berapa fps tolong ya min mau bel...
1554,Min untuk hardisk-nya kalau diubah ke 1TB apa ...
1555,kalo ada yg mau di ubah bisa min?


#### ***2.1 Handling Data Duplicate***

In [6]:
# Check data duplicated
df.duplicated().sum()

166

In [7]:
# Remove data duplicate
df = df.drop_duplicates(ignore_index=True)

In [8]:
# Check data duplicated
df.duplicated().sum()

0

#### ***2.2 Handling Missing Value***

In [9]:
# Check Missing Value
df.isnull().sum()

question    0
dtype: int64

In [None]:
# Handling missing value by delete it
df.dropna(inplace=True)

----
----

<vspace>

### ***3.Preprocessing Data***

----

#### ***3.1 Replace enter into space***

In [10]:
# Check if question has newline
df[df['question'].str.contains("\n")]

Unnamed: 0,question
5,bang kalo beli gamenya bisa di ilangin?\nsaya ...
17,sore gan \napakah pc nya masih ada \nterima kasih
24,bisa di kirim ke tangsel?\nrequest game maks brp?
26,"Pagi, mo nanya dong gan\n1. Barangnya baru sem..."
84,Buat editing video gmn gan?\nAtau bisa di reko...
...,...
1278,1. ka ini vga nya kita yg pilih diantara 3 itu...
1289,kak untuk spek yang kayak gini ada gak/spek pr...
1300,kak kalo memorynya itu DDR4 \nvga 4gb ddr5 ber...
1302,apakah vga bisa di upgrade?\n\nada slot tambah...


In [11]:
# Replace newline into space
df['question'] = df['question'].str.replace('\n',' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question'] = df['question'].str.replace('\n',' ')


In [12]:
# Check if question has newline
df[df['question'].str.contains("\n")]

Unnamed: 0,question


<vspace>

#### ***3.2 Remove unnecessary characters (Only alphabetic remaining)***

In [13]:
# show top 5 data in df
df.head()

Unnamed: 0,question
0,Forza horizon 5 bsa om .bsa skalian di install?
1,gan itu pc only udah sama dalemannya?
2,"gan, bisa maenin game EA FC 24 dan call of dut..."
3,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...


In [14]:
def clean_special_character(text):
    '''
    This function is used to transofrm text so there are no special character ( alphabetic and numeric only)

    parameter description
    ===========================
    text = question or regular sentence 

    usage example 
    ===================
    data_inferential = "untuk record MLBB dan PUBG kuat kah gan??"
    data_inferential = clean_special_character(data_inferential)
    '''
    result = "";
    for char in text:
        if (char == " " or char.isalpha()) and char != "²":
            result+= char
        else:
            result += " "
    return result
df['question_after_preprocessing'] = df['question'].apply(clean_special_character)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_after_preprocessing'] = df['question'].apply(clean_special_character)


In [15]:
df.head()

Unnamed: 0,question,question_after_preprocessing
0,Forza horizon 5 bsa om .bsa skalian di install?,Forza horizon bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game EA FC dan call of dut...
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,Bisa installin ets dan gta roleplay Sisanya...


<vspace>

#### ***3.3 Change letters to lowercase***

-----

In [16]:
# show top 5 data in df
df.head()

Unnamed: 0,question,question_after_preprocessing
0,Forza horizon 5 bsa om .bsa skalian di install?,Forza horizon bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game EA FC dan call of dut...
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,Bisa installin ets dan gta roleplay Sisanya...


In [17]:
# Change question text to lower case
df['question_after_preprocessing'] = df['question_after_preprocessing'].apply(lambda x: " ".join(x.lower() for x in x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_after_preprocessing'] = df['question_after_preprocessing'].apply(lambda x: " ".join(x.lower() for x in x.split()))


In [18]:
# show top 5 data in df
df.head()

Unnamed: 0,question,question_after_preprocessing
0,Forza horizon 5 bsa om .bsa skalian di install?,forza horizon bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game ea fc dan call of duty mo...
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,bisa installin ets dan gta roleplay sisanya ko...


<vspace>

#### ***3.4 Remove White Spaces***

----

In [19]:
# Check data if there is question with whitespace
df[(df['question_after_preprocessing'].str.get(0) == ' ') | (df['question_after_preprocessing'].str.get(-1) == ' ')]

Unnamed: 0,question,question_after_preprocessing


In [20]:
# Remove white space in df question text 
df['question_after_preprocessing'] = df['question_after_preprocessing'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_after_preprocessing'] = df['question_after_preprocessing'].str.strip()


In [21]:
# Check data if there is question with whitespace
df[(df['question_after_preprocessing'].str.get(0) == ' ') | (df['question_after_preprocessing'].str.get(-1) == ' ')]

Unnamed: 0,question,question_after_preprocessing


#### ***3.5 Create dataframe for tagging/labeling Or Tokenization***

----

In [22]:
# This cell code is used to convert data to tokenization
list_result = []
for index in df.index:
    list_word = df.iloc[index]['question_after_preprocessing'].split(" ")
    for word in list_word:
        list_result.append({
            'sentence':f'Kalimat {index+1}',
            'kata': word,
            'tag':''
        })
df_token = pd.DataFrame(list_result)
df_token

Unnamed: 0,sentence,kata,tag
0,Kalimat 1,forza,
1,Kalimat 1,horizon,
2,Kalimat 1,bsa,
3,Kalimat 1,om,
4,Kalimat 1,bsa,
...,...,...,...
12104,Kalimat 1390,bisa,
12105,Kalimat 1390,min,
12106,Kalimat 1391,ini,
12107,Kalimat 1391,ready,


<vspace>

<vspace>

#### ***3.6 Change Slang Word into Normal***

----

In [23]:
# read file csv for slang dictionary 
slang = pd.read_csv("../dataset/Slang2.csv")
slang_dict = dict(zip(slang['slang'], slang['formal']))

# del specific key because it is not neccessary for PC GAMING NER scenario 
del slang_dict['main']
del slang_dict['banget']
del slang_dict['uhh']
del slang_dict['takut']
del slang_dict['da']
del slang_dict['uhhh']

# edit specific key in slang_dict 
slang_dict['dahhhh'] = 'sudah'
slang_dict['kalo'] = 'kalau'

In [25]:
# THis notebook is used to check if there is slang word in df_token
def check_slang_word(word):
    steming_slang = slang_dict.get(word)
    if steming_slang == None:
        return 
    else:
        try:
            if(math.isnan(steming_slang)):
                return
            else:
                print(f"{word} should be {steming_slang}")
                return steming_slang
        except:
            print(f"{word} should be {steming_slang}")
            return steming_slang
        
df_token['kata'].apply(check_slang_word)

bsa should be bisa
bsa should be bisa
skalian should be  sekalian
udah should be sudah
ea should be iya
kalo should be kalau
kalo should be kalau
gak should be tidak
ori should be orisinal
c should be sih
gk should be tidak
kalo should be kalau
kalo should be kalau
trus should be terus
gk should be tidak
smpe should be sampai
k should be ke
klau should be kalau
w should be saya
gk should be tidak
ga should be tidak
brapa should be berapa
utk should be  untuk
klo should be kalau
d should be di
yg should be yang
tdk should be tidak
kalo should be kalau
nambah should be menambah
brp should be berapa
ori should be orisinal
kalo should be kalau
g should be begitu
gak should be tidak
nambah should be menambah
brp should be berapa
brp should be berapa
mo should be mau
ori should be orisinal
ms should be masih
ori should be orisinal
thn should be tahun
jkt should be jakarta
brp should be berapa
yah should be iya
ap should be apa
brpa should be berapa
boss should be bos
brp should be berapa
gk 

0        None
1        None
2        bisa
3        None
4        bisa
         ... 
12104    None
12105    None
12106    None
12107    None
12108    None
Name: kata, Length: 12109, dtype: object

In [26]:
def replace_slang(word):
    # Replace each token if it matches a slang term
    steming_slang = slang_dict.get(word)
    if steming_slang == None:
        return word
    else:
        try:
            if(math.isnan(steming_slang)):
                return word
            else:
                return steming_slang
        except:
            return steming_slang

df_token['kata'] = df_token['kata'].apply(replace_slang)


In [27]:
# THis notebook is used to check if there is slang word in df_token
def check_slang_word(word):
    steming_slang = slang_dict.get(word)
    if steming_slang == None:
        return 
    else:
        try:
            if(math.isnan(steming_slang)):
                return
            else:
                print(f"{word} should be {steming_slang}")
                return 
        except:
            print(f"{word} should be {steming_slang}")
            return 
        
df_token['kata'].apply(check_slang_word)

0        None
1        None
2        None
3        None
4        None
         ... 
12104    None
12105    None
12106    None
12107    None
12108    None
Name: kata, Length: 12109, dtype: object

<b>

#### ***3.7 Stemming***

----

In [28]:
# Create object that use to stemming word in indonesia using Sastrawati
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [29]:
list_skip_steming_word = ['kinemaster','setingan','bekasi','seandainya','seting','rohan','lemot','kesing','diseting']

In [30]:
# Stemming word using sastrawi
def steming_word_sastrawi(word):
    stemmed_word = word
    if word not in list_skip_steming_word:
        stemmed_word = stemmer.stem(word)
    return stemmed_word

df_token['kata_steming'] = df_token['kata'].apply(steming_word_sastrawi)

In [31]:
# Steming using manual word
list_kata_dasar = ['setting','packing','offline','pc','seting','memory','software','ssd','halo','render','ongkir','ganti','upgrade','vga','mobo','case','casing','install','keyboard','ddr','processor','hdd','storage']
for kata_dasar in list_kata_dasar:
    df_token.loc[df_token['kata'].str.contains(kata_dasar),'kata'] = kata_dasar

In [950]:
# df_token.loc[df_token['kata'].str.contains('storage'),'kata'] 

3616     storagenya
3897        storage
3900        storage
3967        storage
4246        storage
4351        storage
11725       storage
Name: kata, dtype: object

In [32]:
# Check word before and after steming
test = df_token[df_token['kata'] != df_token['kata_steming']][['kata','kata_steming']].drop_duplicates()
test.to_csv('test_steming.csv',index=False)

#### ***3.8 Stopword***

In [33]:
# initiate object that use to remove stopword using sastrawi
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

In [34]:
# Remove stopword in df_token saraswati
def remove_stopword(word):
    stopword_word = stopword_remover.remove(word)
    return stopword_word
    
df_token['kata_steming'] = df_token['kata_steming'].apply(remove_stopword)

In [35]:
# Check stoword in csv
test = df_token[df_token['kata_steming'] == ''][['kata']].drop_duplicates()
test.to_csv('test_stopword.csv',index=False)

In [36]:
df_token = df_token[df_token['kata_steming'] != '']

#### ***3.9 Labeling word tag with existing data***

In [37]:
df_token['kata'] = df_token['kata_steming']
df_token = df_token[['sentence','kata','tag']]
df_token

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_token['kata'] = df_token['kata_steming']


Unnamed: 0,sentence,kata,tag
0,Kalimat 1,forza,
1,Kalimat 1,horizon,
3,Kalimat 1,om,
5,Kalimat 1,sekali,
7,Kalimat 1,install,
...,...,...,...
12101,Kalimat 1390,mau,
12103,Kalimat 1390,ubah,
12105,Kalimat 1390,min,
12107,Kalimat 1391,ready,


In [39]:
# import DatasetWithTagFinal csv
df_import = pd.read_csv("../dataset/DatasetWithTagFinal.csv")

In [44]:
def fill_tag(sentence,kata):
    tag = df_import[(df_import['sentence'] == sentence) & (df_import['kata'] == kata)][['tag']].values
    try:
        return tag[0][0]
    except:
        return ''

df_token['tag'] = df_token.apply(lambda row_data: fill_tag(row_data['sentence'],row_data['kata']),axis=1)

In [46]:
# Check if there are missing tag
df_token[(df_token['tag'] == '') | (df_token['tag'].isna())]

Unnamed: 0,sentence,kata,tag
36,Kalimat 5,installin,
555,Kalimat 68,ongkirx,
826,Kalimat 105,rendering,
854,Kalimat 109,settingan,
874,Kalimat 111,ongkirnya,
...,...,...,...
11531,Kalimat 1325,offlinenya,
11544,Kalimat 1327,packingnya,
11890,Kalimat 1368,diseting,
11961,Kalimat 1376,settingan,


#### ***3.10 Manual Labeling (If necessary)***

In [47]:
# def add_tag(kata,tag,sentence):
#     if kata == 'getar' :
#     # if kata == 'studio' and sentence == 'Kalimat 190':
#         return 'E-Device'
#     return tag
# df_token['tag'] = df_token.apply(lambda x: add_tag(x['kata'],x['tag'],x['sentence']), axis=1)

In [48]:
# df_token[df_token['tag'] == 'B-Aksesoris']

In [49]:
# df_token.groupby('sentence').count()

In [50]:
# df_token['tag'].unique()

In [51]:
# df_token[df_token['kata'] == 'getar']

In [52]:
# df_token[df_token['tag'].isna()][df_token['sentence'] == 'Kalimat 1237']

In [53]:
# df.iloc[1236]['question']

In [56]:
# df_token[(df_token['tag'] == '') | (df_token['tag'].isna())]

In [57]:
# test = df_token[(df_token['tag'] == '') | (df_token['tag'].isna())][['kata']]
# test['kata2'] = test['kata']
# # test
# test.groupby('kata').count().sort_values(by='kata2',ascending=False)

In [55]:
# df_token.to_csv("../dataset/DatasetWithTagFinal.csv",index=False)

----
----