### ***1.Import Libraries***

----

In [1]:
import pandas as pd
import nltk
import re
# Import Libraries
import string
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
# import tensorflow_hub as tf_hub

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import math

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

----
----

<vspace>

### ***2.Load Data***

----

In [2]:
# load data from file question.csv
df = pd.read_csv("../dataset/question.csv")
df

Unnamed: 0,question
0,Forza horizon 5 bsa om .bsa skalian di install?
1,gan itu pc only udah sama dalemannya?
2,"gan, bisa maenin game EA FC 24 dan call of dut..."
3,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...
...,...
1553,gan kalo di pake game microsoft flight simulat...
1554,Min minecraft berapa fps tolong ya min mau bel...
1555,Min untuk hardisk-nya kalau diubah ke 1TB apa ...
1556,kalo ada yg mau di ubah bisa min?


#### ***2.1 Handling Data Duplicate***

In [3]:
# Check data duplicated
df.duplicated().sum()

166

In [4]:
# Remove data duplicate
df = df.drop_duplicates(ignore_index=True)

In [5]:
df.duplicated().sum()

0

#### ***2.2 Handling Missing Value***

In [6]:
# Check Missing Value
df.isnull().sum()

question    0
dtype: int64

----
----

<vspace>

### ***3.Preprocessing Data***

----

<vspace>

#### ***3.1 Remove unnecessary characters (Only alphabetic and numeric remaining)***

In [7]:
df

Unnamed: 0,question
0,Forza horizon 5 bsa om .bsa skalian di install?
1,gan itu pc only udah sama dalemannya?
2,"gan, bisa maenin game EA FC 24 dan call of dut..."
3,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...
...,...
1387,gan kalo di pake game microsoft flight simulat...
1388,Min minecraft berapa fps tolong ya min mau bel...
1389,Min untuk hardisk-nya kalau diubah ke 1TB apa ...
1390,kalo ada yg mau di ubah bisa min?


In [8]:
def clean_special_character(text):
    result = "";
    for char in text:
        if char == " " or char.isalnum():
            result+= char
    return result
df['question_after_preprocessing'] = df['question'].apply(clean_special_character)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_after_preprocessing'] = df['question'].apply(clean_special_character)


In [9]:
df.head()

Unnamed: 0,question,question_after_preprocessing
0,Forza horizon 5 bsa om .bsa skalian di install?,Forza horizon 5 bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game EA FC 24 dan call of duty...
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,Bisa installin ets2 dan gta roleplay Sisanya k...


<vspace>

#### ***3.2 Change letters to lowercase***

-----

In [10]:
df.head()

Unnamed: 0,question,question_after_preprocessing
0,Forza horizon 5 bsa om .bsa skalian di install?,Forza horizon 5 bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game EA FC 24 dan call of duty...
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,Bisa installin ets2 dan gta roleplay Sisanya k...


In [11]:
df['question_after_preprocessing'] = df['question_after_preprocessing'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_after_preprocessing'] = df['question_after_preprocessing'].apply(lambda x: " ".join(x.lower() for x in x.split()))


Unnamed: 0,question,question_after_preprocessing
0,Forza horizon 5 bsa om .bsa skalian di install?,forza horizon 5 bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game ea fc 24 dan call of duty...
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,bisa installin ets2 dan gta roleplay sisanya k...
...,...,...
1387,gan kalo di pake game microsoft flight simulat...,gan kalo di pake game microsoft flight simulat...
1388,Min minecraft berapa fps tolong ya min mau bel...,min minecraft berapa fps tolong ya min mau bel...
1389,Min untuk hardisk-nya kalau diubah ke 1TB apa ...,min untuk hardisknya kalau diubah ke 1tb apa p...
1390,kalo ada yg mau di ubah bisa min?,kalo ada yg mau di ubah bisa min


<vspace>

#### ***3.3 Remove White Spaces***

----

In [12]:
df['question_after_preprocessing'] = df['question_after_preprocessing'].str.strip()
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_after_preprocessing'] = df['question_after_preprocessing'].str.strip()


Unnamed: 0,question,question_after_preprocessing
0,Forza horizon 5 bsa om .bsa skalian di install?,forza horizon 5 bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game ea fc 24 dan call of duty...
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,bisa installin ets2 dan gta roleplay sisanya k...
...,...,...
1387,gan kalo di pake game microsoft flight simulat...,gan kalo di pake game microsoft flight simulat...
1388,Min minecraft berapa fps tolong ya min mau bel...,min minecraft berapa fps tolong ya min mau bel...
1389,Min untuk hardisk-nya kalau diubah ke 1TB apa ...,min untuk hardisknya kalau diubah ke 1tb apa p...
1390,kalo ada yg mau di ubah bisa min?,kalo ada yg mau di ubah bisa min


<vspace>

#### ***3.4 Tokenisasi***

----

In [13]:
# Define the tokenization function
def word_token_default(doc):
    if isinstance(doc, str):
        return doc.split()
    else:
        return []

# Apply the tokenization function to the 'question' column
df['question_tokens'] = df['question_after_preprocessing'].apply(word_token_default)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_tokens'] = df['question_after_preprocessing'].apply(word_token_default)


In [14]:
df.head()

Unnamed: 0,question,question_after_preprocessing,question_tokens
0,Forza horizon 5 bsa om .bsa skalian di install?,forza horizon 5 bsa om bsa skalian di install,"[forza, horizon, 5, bsa, om, bsa, skalian, di,..."
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya,"[gan, itu, pc, only, udah, sama, dalemannya]"
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game ea fc 24 dan call of duty...,"[gan, bisa, maenin, game, ea, fc, 24, dan, cal..."
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta 5 bisa,"[gan, ini, kalo, buat, pb, atau, gta, 5, bisa]"
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,bisa installin ets2 dan gta roleplay sisanya k...,"[bisa, installin, ets2, dan, gta, roleplay, si..."


#### ***3.5 Remove redundant character in word***

In [15]:
df['question_tokens']

0       [forza, horizon, 5, bsa, om, bsa, skalian, di,...
1            [gan, itu, pc, only, udah, sama, dalemannya]
2       [gan, bisa, maenin, game, ea, fc, 24, dan, cal...
3          [gan, ini, kalo, buat, pb, atau, gta, 5, bisa]
4       [bisa, installin, ets2, dan, gta, roleplay, si...
                              ...                        
1387    [gan, kalo, di, pake, game, microsoft, flight,...
1388    [min, minecraft, berapa, fps, tolong, ya, min,...
1389    [min, untuk, hardisknya, kalau, diubah, ke, 1t...
1390            [kalo, ada, yg, mau, di, ubah, bisa, min]
1391                                    [ini, ready, gan]
Name: question_tokens, Length: 1392, dtype: object

In [16]:
def remove_reduncdant_char_in_word(list_word):
    list_result = []
    for word in list_word:
        temp_word = ""
        for index,char in enumerate(word):
            if index == 0:
                temp_word += char
            elif word[index-1] != char:
                temp_word += char
        list_result.append(temp_word)
    return list_result

df['question_tokens'] = df['question_tokens'].apply(remove_reduncdant_char_in_word)
df['question_tokens']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_tokens'] = df['question_tokens'].apply(remove_reduncdant_char_in_word)


0       [forza, horizon, 5, bsa, om, bsa, skalian, di,...
1             [gan, itu, pc, only, udah, sama, dalemanya]
2       [gan, bisa, maenin, game, ea, fc, 24, dan, cal...
3          [gan, ini, kalo, buat, pb, atau, gta, 5, bisa]
4       [bisa, instalin, ets2, dan, gta, roleplay, sis...
                              ...                        
1387    [gan, kalo, di, pake, game, microsoft, flight,...
1388    [min, minecraft, berapa, fps, tolong, ya, min,...
1389    [min, untuk, hardisknya, kalau, diubah, ke, 1t...
1390            [kalo, ada, yg, mau, di, ubah, bisa, min]
1391                                    [ini, ready, gan]
Name: question_tokens, Length: 1392, dtype: object

<vspace>

----

#### ***3.6 Change Slang Word into Normal***

In [17]:
slang = pd.read_csv("../dataset/Slang2.csv")

In [18]:
slang

Unnamed: 0,slang,formal
0,woww,wow
1,aminn,amin
2,met,selamat
3,netaas,menetas
4,keberpa,keberapa
...,...,...
4255,baguss,baik
4256,utk,untuk
4257,skalian,sekalian
4258,proxessor,processor


In [19]:
df

Unnamed: 0,question,question_after_preprocessing,question_tokens
0,Forza horizon 5 bsa om .bsa skalian di install?,forza horizon 5 bsa om bsa skalian di install,"[forza, horizon, 5, bsa, om, bsa, skalian, di,..."
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya,"[gan, itu, pc, only, udah, sama, dalemanya]"
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game ea fc 24 dan call of duty...,"[gan, bisa, maenin, game, ea, fc, 24, dan, cal..."
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta 5 bisa,"[gan, ini, kalo, buat, pb, atau, gta, 5, bisa]"
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,bisa installin ets2 dan gta roleplay sisanya k...,"[bisa, instalin, ets2, dan, gta, roleplay, sis..."
...,...,...,...
1387,gan kalo di pake game microsoft flight simulat...,gan kalo di pake game microsoft flight simulat...,"[gan, kalo, di, pake, game, microsoft, flight,..."
1388,Min minecraft berapa fps tolong ya min mau bel...,min minecraft berapa fps tolong ya min mau bel...,"[min, minecraft, berapa, fps, tolong, ya, min,..."
1389,Min untuk hardisk-nya kalau diubah ke 1TB apa ...,min untuk hardisknya kalau diubah ke 1tb apa p...,"[min, untuk, hardisknya, kalau, diubah, ke, 1t..."
1390,kalo ada yg mau di ubah bisa min?,kalo ada yg mau di ubah bisa min,"[kalo, ada, yg, mau, di, ubah, bisa, min]"


In [20]:
def replace_slang(tokens):
    # Create a dictionary from the slang DataFrame for faster lookups
    slang_dict = dict(zip(slang['slang'], slang['formal']))
    
    # Replace each token if it matches a slang term
    list_result = [];
    for token in tokens:
        steming_slang = slang_dict.get(token)
        if steming_slang == None:
            list_result.append(token)
        else:
            try:
                if(math.isnan(steming_slang)):
                    list_result.append(token)
                else:
                    list_result.append(steming_slang)
            except:
                list_result.append(steming_slang)
    return list_result
    # return [token if math.isnan(slang_dict.get(token)) else slang_dict.get(token)  for token in tokens]

# Apply the slang replacement function to the 'question_tokens' column
df['question_tokens']  = df['question_tokens'].apply(lambda tokens: replace_slang(tokens))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_tokens']  = df['question_tokens'].apply(lambda tokens: replace_slang(tokens))


In [21]:
df['question_tokens']

0       [forza, horizon, 5, bisa, om, bisa,  sekalian,...
1            [gan, itu, pc, only, sudah, sama, dalemanya]
2       [gan, bisa, maenin, game, iya, fc, 24, dan, ca...
3        [gan, ini,  kalau, buat, pb, atau, gta, 5, bisa]
4       [bisa, instalin, ets2, dan, gta, roleplay, sis...
                              ...                        
1387    [gan,  kalau, di, pakai, game, microsoft, flig...
1388    [min, minecraft, berapa, fps, tolong, ya, min,...
1389    [min, untuk, hardisknya, kalau, diubah, ke, 1t...
1390        [ kalau, ada, yang, mau, di, ubah, bisa, min]
1391                                    [ini, ready, gan]
Name: question_tokens, Length: 1392, dtype: object

----

<b>

#### ***3.7 Stemming***

----

In [22]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()



In [23]:
df

Unnamed: 0,question,question_after_preprocessing,question_tokens
0,Forza horizon 5 bsa om .bsa skalian di install?,forza horizon 5 bsa om bsa skalian di install,"[forza, horizon, 5, bisa, om, bisa, sekalian,..."
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya,"[gan, itu, pc, only, sudah, sama, dalemanya]"
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game ea fc 24 dan call of duty...,"[gan, bisa, maenin, game, iya, fc, 24, dan, ca..."
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta 5 bisa,"[gan, ini, kalau, buat, pb, atau, gta, 5, bisa]"
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,bisa installin ets2 dan gta roleplay sisanya k...,"[bisa, instalin, ets2, dan, gta, roleplay, sis..."
...,...,...,...
1387,gan kalo di pake game microsoft flight simulat...,gan kalo di pake game microsoft flight simulat...,"[gan, kalau, di, pakai, game, microsoft, flig..."
1388,Min minecraft berapa fps tolong ya min mau bel...,min minecraft berapa fps tolong ya min mau bel...,"[min, minecraft, berapa, fps, tolong, ya, min,..."
1389,Min untuk hardisk-nya kalau diubah ke 1TB apa ...,min untuk hardisknya kalau diubah ke 1tb apa p...,"[min, untuk, hardisknya, kalau, diubah, ke, 1t..."
1390,kalo ada yg mau di ubah bisa min?,kalo ada yg mau di ubah bisa min,"[ kalau, ada, yang, mau, di, ubah, bisa, min]"


In [24]:
def steming_word_sastrawi(list_word):
    try:
        list_result = []
        for word in list_word:
            stemmed_word = stemmer.stem(word)
            list_result.append(stemmed_word)
        return list_result
    except:
        print(list_word)
        return list_word

df['question_tokens'] = df['question_tokens'].apply(steming_word_sastrawi)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_tokens'] = df['question_tokens'].apply(steming_word_sastrawi)


#### ***3.8 Stopword***

In [25]:
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

In [27]:
df['question_tokens']

0       [forza, horizon, 5, bisa, om, bisa, sekali, di...
1            [gan, itu, pc, only, sudah, sama, dalemanya]
2       [gan, bisa, maenin, game, iya, fc, 24, dan, ca...
3         [gan, ini, kalau, buat, pb, atau, gta, 5, bisa]
4       [bisa, instalin, ets2, dan, gta, roleplay, sis...
                              ...                        
1387    [gan, kalau, di, pakai, game, microsoft, fligh...
1388    [min, minecraft, berapa, fps, tolong, ya, min,...
1389    [min, untuk, hardisknya, kalau, ubah, ke, 1tb,...
1390         [kalau, ada, yang, mau, di, ubah, bisa, min]
1391                                    [ini, ready, gan]
Name: question_tokens, Length: 1392, dtype: object

In [30]:
def remove_stopword(list_word):
    try:
        list_result = []
        for word in list_word:
            stopword_word = stopword_remover.remove(word)
            if stopword_word != '':
                list_result.append(word)
        return list_result
    except:
        print(list_word)
        return list_word
    
df['question_tokens'] = df['question_tokens'].apply(remove_stopword)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_tokens'] = df['question_tokens'].apply(remove_stopword)


In [31]:
df

Unnamed: 0,question,question_after_preprocessing,question_tokens
0,Forza horizon 5 bsa om .bsa skalian di install?,forza horizon 5 bsa om bsa skalian di install,"[forza, horizon, 5, om, sekali, instal]"
1,gan itu pc only udah sama dalemannya?,gan itu pc only udah sama dalemannya,"[gan, pc, only, sama, dalemanya]"
2,"gan, bisa maenin game EA FC 24 dan call of dut...",gan bisa maenin game ea fc 24 dan call of duty...,"[gan, maenin, game, iya, fc, 24, cal, of, duty..."
3,gan ini kalo buat pb atau gta 5 bisa,gan ini kalo buat pb atau gta 5 bisa,"[gan, kalau, buat, pb, gta, 5]"
4,Bisa installin ets2 dan gta roleplay?? Sisanya...,bisa installin ets2 dan gta roleplay sisanya k...,"[instalin, ets2, gta, roleplay, sisa, kosong]"
...,...,...,...
1387,gan kalo di pake game microsoft flight simulat...,gan kalo di pake game microsoft flight simulat...,"[gan, kalau, pakai, game, microsoft, flight, s..."
1388,Min minecraft berapa fps tolong ya min mau bel...,min minecraft berapa fps tolong ya min mau bel...,"[min, minecraft, berapa, fps, min, mau, beli, ..."
1389,Min untuk hardisk-nya kalau diubah ke 1TB apa ...,min untuk hardisknya kalau diubah ke 1tb apa p...,"[min, hardisknya, kalau, ubah, 1tb, apa, perlu..."
1390,kalo ada yg mau di ubah bisa min?,kalo ada yg mau di ubah bisa min,"[kalau, mau, ubah, min]"
