### ***Import Libraries***

----

In [1]:
import pandas as pd
import nltk
import re
# Import Libraries
import string
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
# import tensorflow_hub as tf_hub

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer




----
----

<vspace>

### ***Load Data***

----

In [2]:
df = pd.read_csv("question1.csv")
df

Unnamed: 0,question
0,Forza horizon 5 bsa om .bsa skalian di install?
1,gan itu pc only udah sama dalemannya?
2,"gan, bisa maenin game EA FC 24 dan call of dut..."
3,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...
...,...
1553,gan kalo di pake game microsoft flight simulat...
1554,Min minecraft berapa fps tolong ya min mau bel...
1555,Min untuk hardisk-nya kalau diubah ke 1TB apa ...
1556,kalo ada yg mau di ubah bisa min?


In [3]:
df.duplicated().sum()

166

In [4]:
df = df.drop_duplicates(ignore_index=True)
df

Unnamed: 0,question
0,Forza horizon 5 bsa om .bsa skalian di install?
1,gan itu pc only udah sama dalemannya?
2,"gan, bisa maenin game EA FC 24 dan call of dut..."
3,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay?? Sisanya...
...,...
1387,gan kalo di pake game microsoft flight simulat...
1388,Min minecraft berapa fps tolong ya min mau bel...
1389,Min untuk hardisk-nya kalau diubah ke 1TB apa ...
1390,kalo ada yg mau di ubah bisa min?


----
----

<vspace>

### ***Preprocessing Data***

----

<vspace>

#### ***Remove unnecessary characters***

----

In [5]:
def clean(txt):
    txt = txt.str.replace("()", "")
    txt = txt.str.replace('(<a).*(>).*()', '')
    txt = txt.str.replace('(&amp)', '')
    txt = txt.str.replace('(&gt)', '')
    txt = txt.str.replace('(&lt)', '')
    txt = txt.str.replace('(\xa0)', ' ')
    txt = txt.str.replace('?','')
    txt = txt.str.replace('.','')
    txt = txt.str.replace('[^\w\s]', '')
    txt = txt.str.replace('"', '')
    txt = txt.str.replace('\d+', '')
    return txt

In [6]:
df['question'] = clean(df['question'])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question'] = clean(df['question'])


Unnamed: 0,question
0,Forza horizon 5 bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game EA FC 24 dan call of dut..."
3,gan ini kalo buat pb atau gta 5 bisa
4,Bisa installin ets2 dan gta roleplay Sisanya k...
...,...
1387,gan kalo di pake game microsoft flight simulat...
1388,Min minecraft berapa fps tolong ya min mau bel...
1389,Min untuk hardisk-nya kalau diubah ke 1TB apa ...
1390,kalo ada yg mau di ubah bisa min


<vspace>

#### ***Change letters to lowercase***

-----

In [7]:
df['question'] = df['question'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question'] = df['question'].apply(lambda x: " ".join(x.lower() for x in x.split()))


Unnamed: 0,question
0,forza horizon 5 bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game ea fc 24 dan call of dut..."
3,gan ini kalo buat pb atau gta 5 bisa
4,bisa installin ets2 dan gta roleplay sisanya k...
...,...
1387,gan kalo di pake game microsoft flight simulat...
1388,min minecraft berapa fps tolong ya min mau bel...
1389,min untuk hardisk-nya kalau diubah ke 1tb apa ...
1390,kalo ada yg mau di ubah bisa min


<vspace>

#### ***Discarding Connecting Words***

----

In [8]:
df['question'] = df['question'].str.replace(r'\b(?:dan|atau|tetapi|kenapa|tapi|yang|kok|hanya|yg|sekali|cenderung|sangat|meskipun|nya|lah|ya|sih|dah|malah|ato|nya|se)\b', '', regex=True)
display(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question'] = df['question'].str.replace(r'\b(?:dan|atau|tetapi|kenapa|tapi|yang|kok|hanya|yg|sekali|cenderung|sangat|meskipun|nya|lah|ya|sih|dah|malah|ato|nya|se)\b', '', regex=True)


Unnamed: 0,question
0,forza horizon 5 bsa om bsa skalian di install
1,gan itu pc only udah sama dalemannya
2,"gan, bisa maenin game ea fc 24 call of duty m..."
3,gan ini kalo buat pb gta 5 bisa
4,bisa installin ets2 gta roleplay sisanya kosong
...,...
1387,gan kalo di pake game microsoft flight simulat...
1388,min minecraft berapa fps tolong min mau beli ...
1389,min untuk hardisk- kalau diubah ke 1tb apa per...
1390,kalo ada mau di ubah bisa min


<vspace>

#### ***Tokenisasi***

----

In [9]:
# Define the tokenization function
def word_token_default(doc):
    if isinstance(doc, str):
        return doc.split()
    else:
        return []

# Apply the tokenization function to the 'question' column
df['question_tokens'] = df['question'].apply(word_token_default)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_tokens'] = df['question'].apply(word_token_default)


In [27]:
df['question_tokens']

0       [forza, horizon, 5, bisa, om, bisa,  sekalian,...
1           [gan, itu, pc, only, sudah, sama, dalemannya]
2       [gan,, bisa, maenin, game, iya, fc, 24, call, ...
3              [gan, ini,  kalau, buat, pb, gta, 5, bisa]
4       [bisa, installin, ets2, gta, roleplay, sisanya...
                              ...                        
1387    [gan,  kalau, di, pakai, game, microsoft, flig...
1388    [min, minecraft, berapa, fps, tolong, min, mau...
1389    [min, untuk, hardisk-, kalau, diubah, ke, 1tb,...
1390              [ kalau, ada, mau, di, ubah, bisa, min]
1391                                    [ini, ready, gan]
Name: question_tokens, Length: 1392, dtype: object

<vspace>

----
----

#### ***Change Slang Word into Normal***

In [10]:
slang = pd.read_csv("Slang2.csv")

In [11]:
slang

Unnamed: 0,slang,formal
0,woww,wow
1,aminn,amin
2,met,selamat
3,netaas,menetas
4,keberpa,keberapa
...,...,...
4255,baguss,baik
4256,utk,untuk
4257,skalian,sekalian
4258,proxessor,processor


In [12]:
def replace_slang(tokens, slang):
    # Create a dictionary from the slang DataFrame for faster lookups
    slang_dict = dict(zip(slang['slang'], slang['formal']))
    
    # Replace each token if it matches a slang term
    return [slang_dict.get(token, token) for token in tokens]

# Apply the slang replacement function to the 'question_tokens' column
df['question_tokens'] = df['question_tokens'].apply(lambda tokens: replace_slang(tokens, slang))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question_tokens'] = df['question_tokens'].apply(lambda tokens: replace_slang(tokens, slang))


In [13]:
df['question_tokens']

0       [forza, horizon, 5, bisa, om, bisa,  sekalian,...
1           [gan, itu, pc, only, sudah, sama, dalemannya]
2       [gan,, bisa, maenin, game, iya, fc, 24, call, ...
3              [gan, ini,  kalau, buat, pb, gta, 5, bisa]
4       [bisa, installin, ets2, gta, roleplay, sisanya...
                              ...                        
1387    [gan,  kalau, di, pakai, game, microsoft, flig...
1388    [min, minecraft, berapa, fps, tolong, min, mau...
1389    [min, untuk, hardisk-, kalau, diubah, ke, 1tb,...
1390              [ kalau, ada, mau, di, ubah, bisa, min]
1391                                    [ini, ready, gan]
Name: question_tokens, Length: 1392, dtype: object

----

<b>

#### ***Filtering with NLTK***

----

#### ***Stemming***

----

In [23]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

stopword_factory = StopWordRemoverFactory()
stopword = stopword_factory.create_stop_word_remover()

factory = StemmerFactory()
stemmer = factory.create_stemmer()

