# Preprocessing

## Import Data

In [None]:
import pandas as pd

# Google Drive ID of the dataset file
file_id = '1b-sWWtURbl_Vy2_gmMMOFFXwVzWrmfaK'
# Direct-download link for that file, built from the ID above
url = 'https://drive.google.com/uc?id=' + file_id

In [None]:
# Read the CSV at `url` into a DataFrame and print its schema summary
# (row count, per-column non-null counts and dtypes).
df = pd.read_csv(url)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388264 entries, 0 to 388263
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    388263 non-null  object
 1   label   388264 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 5.9+ MB


In [None]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
text,1
label,0


In [None]:
# Drop every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
df.duplicated(keep='first').sum()

0

In [None]:
# Display the frame after the null rows were dropped (rendered as a truncated preview)
df

Unnamed: 0,text,label
0,"aku merasa sedikit kotor, sedikit tercemar jik...",0
1,aku merasa gelisah sepanjang waktu seperti ham...,3
2,Saya ingat merasa bahwa saya tidak tahan denga...,3
3,aku merasa kangen dengan popeye dan bayamnya,1
4,"Saya merasa kewalahan, stres, dan sejujurnya s...",4
...,...,...
388259,Setelah selesai mengunjungi museum lokal secar...,2
388260,"Setelah selesai menonton acara komedi ringan, ...",2
388261,"Setelah selesai berolahraga ringan di halaman,...",2
388262,"Tadi pagi saya memasak makan siang sederhana, ...",2


## Data Cleaning

In [None]:
!pip install PySastrawi

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl.metadata (892 bytes)
Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/210.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m174.1/210.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0


In [None]:
import pandas as pd
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Create stopword remover
# `stopword` is the object whose .remove() is called by clean_text() on every text.
# NOTE: the name `factory` is reused later in the notebook for the StemmerFactory.
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# Define cleaning text function
def clean_text(text, stopword_remover=None):
    """Normalise one raw text before tokenisation.

    Steps, in order: remove URLs, remove @mentions, remove #hashtags, replace
    the remaining punctuation with spaces, collapse whitespace, lowercase,
    and finally drop stopwords.

    URLs, mentions and hashtags have to be removed *before* punctuation.
    Once '#', '@', ':', '/' and '.' have been replaced by spaces those
    patterns can no longer match, and the hashtag/mention/URL words would
    leak into the cleaned text.

    Args:
        text: Raw input string.
        stopword_remover: Optional object exposing ``remove(str) -> str``.
            When omitted, the notebook-level ``stopword`` remover is used.

    Returns:
        The cleaned, lowercased string with stopwords removed.
    """
    remover = stopword if stopword_remover is None else stopword_remover

    # Substitute with a space (not '') so neighbouring words are never glued together;
    # the extra spaces are collapsed a few lines below.
    text = re.sub(r'http\S+|www\S+', ' ', text, flags=re.IGNORECASE)  # Remove URLs (any letter case)
    text = re.sub(r'@\w+', ' ', text)                                 # Remove mentions
    text = re.sub(r'#\w+', ' ', text)                                 # Remove hashtags
    text = re.sub(r'[^\w\s]', ' ', text)                              # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()                          # Remove excessive whitespace
    text = text.lower()                                               # Case folding
    return remover.remove(text)                                       # Remove stopwords

In [None]:
# Apply data cleaning
# Runs clean_text on every row of 'text' and stores the result in a new
# 'clean_text' column; the raw 'text' column is left untouched.
df['clean_text'] = df['text'].apply(clean_text)

In [None]:
# Display the frame with the new 'clean_text' column next to the raw text
df

Unnamed: 0,text,label,clean_text
0,"aku merasa sedikit kotor, sedikit tercemar jik...",0,kotor tercemar mengirim enam belas tweet waktu...
1,aku merasa gelisah sepanjang waktu seperti ham...,3,gelisah waktu hamster mencari roda
2,Saya ingat merasa bahwa saya tidak tahan denga...,3,tahan karakter karakter seinfeld alasan repot ...
3,aku merasa kangen dengan popeye dan bayamnya,1,kangen popeye bayamnya
4,"Saya merasa kewalahan, stres, dan sejujurnya s...",4,kewalahan stres sejujurnya kehilangan akal sehat
...,...,...,...
388259,Setelah selesai mengunjungi museum lokal secar...,2,selesai mengunjungi museum lokal virtual mengh...
388260,"Setelah selesai menonton acara komedi ringan, ...",2,selesai menonton acara komedi ringan menghabis...
388261,"Setelah selesai berolahraga ringan di halaman,...",2,selesai berolahraga ringan halaman tenang
388262,"Tadi pagi saya memasak makan siang sederhana, ...",2,pagi memasak makan siang sederhana produktif


## Tokenization

In [None]:
# Define tokenization function
def tokenize_text(text):
    """Split a string into word tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore); whitespace and punctuation only act as separators.
    Returns the tokens in order of appearance, or an empty list.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group() for match in word_pattern.finditer(text)]

In [None]:
# Apply tokenization
# Each cleaned string becomes a list of word tokens in a new 'tokenized' column
df['tokenized'] = df['clean_text'].apply(tokenize_text)

In [None]:
# Display the frame with the new 'tokenized' column
df

Unnamed: 0,text,label,clean_text,tokenized
0,"aku merasa sedikit kotor, sedikit tercemar jik...",0,kotor tercemar mengirim enam belas tweet waktu...,"[kotor, tercemar, mengirim, enam, belas, tweet..."
1,aku merasa gelisah sepanjang waktu seperti ham...,3,gelisah waktu hamster mencari roda,"[gelisah, waktu, hamster, mencari, roda]"
2,Saya ingat merasa bahwa saya tidak tahan denga...,3,tahan karakter karakter seinfeld alasan repot ...,"[tahan, karakter, karakter, seinfeld, alasan, ..."
3,aku merasa kangen dengan popeye dan bayamnya,1,kangen popeye bayamnya,"[kangen, popeye, bayamnya]"
4,"Saya merasa kewalahan, stres, dan sejujurnya s...",4,kewalahan stres sejujurnya kehilangan akal sehat,"[kewalahan, stres, sejujurnya, kehilangan, aka..."
...,...,...,...,...
388259,Setelah selesai mengunjungi museum lokal secar...,2,selesai mengunjungi museum lokal virtual mengh...,"[selesai, mengunjungi, museum, lokal, virtual,..."
388260,"Setelah selesai menonton acara komedi ringan, ...",2,selesai menonton acara komedi ringan menghabis...,"[selesai, menonton, acara, komedi, ringan, men..."
388261,"Setelah selesai berolahraga ringan di halaman,...",2,selesai berolahraga ringan halaman tenang,"[selesai, berolahraga, ringan, halaman, tenang]"
388262,"Tadi pagi saya memasak makan siang sederhana, ...",2,pagi memasak makan siang sederhana produktif,"[pagi, memasak, makan, siang, sederhana, produ..."


## Lemmatization/Stemming

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create stemmer
# NOTE: this rebinds `factory`, which earlier held the StopWordRemoverFactory.
# `stemmer` is the object whose .stem() is called by stemming_text() on every token.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# Define stemming function
def stemming_text(tokens):
    """Return a new list holding the stem of every token, in the same order.

    Each token is passed through the notebook-level ``stemmer``.
    """
    return list(map(stemmer.stem, tokens))

In [None]:
# Apply stemming
# Stems every token list in 'tokenized' into a new 'stemmed' column,
# then displays the resulting frame.
df['stemmed'] = df['tokenized'].apply(stemming_text)
df

Unnamed: 0,text,label,clean_text,tokenized,stemmed
0,"aku merasa sedikit kotor, sedikit tercemar jik...",0,kotor tercemar mengirim enam belas tweet waktu...,"[kotor, tercemar, mengirim, enam, belas, tweet...","[kotor, cemar, kirim, enam, belas, tweet, wakt..."
1,aku merasa gelisah sepanjang waktu seperti ham...,3,gelisah waktu hamster mencari roda,"[gelisah, waktu, hamster, mencari, roda]","[gelisah, waktu, hamster, cari, roda]"
2,Saya ingat merasa bahwa saya tidak tahan denga...,3,tahan karakter karakter seinfeld alasan repot ...,"[tahan, karakter, karakter, seinfeld, alasan, ...","[tahan, karakter, karakter, seinfeld, alas, re..."
3,aku merasa kangen dengan popeye dan bayamnya,1,kangen popeye bayamnya,"[kangen, popeye, bayamnya]","[kangen, popeye, bayam]"
4,"Saya merasa kewalahan, stres, dan sejujurnya s...",4,kewalahan stres sejujurnya kehilangan akal sehat,"[kewalahan, stres, sejujurnya, kehilangan, aka...","[kewalahan, stres, jujur, hilang, akal, sehat]"
...,...,...,...,...,...
388259,Setelah selesai mengunjungi museum lokal secar...,2,selesai mengunjungi museum lokal virtual mengh...,"[selesai, mengunjungi, museum, lokal, virtual,...","[selesai, unjung, museum, lokal, virtual, hibur]"
388260,"Setelah selesai menonton acara komedi ringan, ...",2,selesai menonton acara komedi ringan menghabis...,"[selesai, menonton, acara, komedi, ringan, men...","[selesai, tonton, acara, komedi, ringan, habis..."
388261,"Setelah selesai berolahraga ringan di halaman,...",2,selesai berolahraga ringan halaman tenang,"[selesai, berolahraga, ringan, halaman, tenang]","[selesai, olahraga, ringan, halaman, tenang]"
388262,"Tadi pagi saya memasak makan siang sederhana, ...",2,pagi memasak makan siang sederhana produktif,"[pagi, memasak, makan, siang, sederhana, produ...","[pagi, masak, makan, siang, sederhana, produktif]"


## Padding

In [None]:
import pandas as pd
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create a vocabulary of unique stemmed tokens
vocabulary = set()
for tokens in df['stemmed']:
    vocabulary.update(tokens)

# Create a word-to-index mapping
# Ids start at 1 so that 0 stays free for padding / unknown words.
# The vocabulary is enumerated in sorted order on purpose: a set of strings
# has no stable iteration order across interpreter runs (string hashing is
# randomised per process), so enumerating the raw set would hand every token
# a different id after each kernel restart and make previously encoded
# sequences or trained embeddings unusable.
word_to_index = {token: index for index, token in enumerate(sorted(vocabulary), start=1)}

In [None]:
# Define function to convert tokens to indices
def tokens_to_indices(tokens):
    """Translate a token list into the matching list of integer ids.

    Ids come from the notebook-level ``word_to_index`` mapping; a token that
    is not in the mapping is encoded as 0.
    NOTE(review): the padded output later in the notebook is also filled with 0,
    so unknown tokens and padding look the same; confirm that is intended.
    """
    indices = []
    for token in tokens:
        indices.append(word_to_index.get(token, 0))  # 0 for unknown words
    return indices

# Convert stemmed tokens to indices
# Produces one list of integer ids per row in a new 'indexed' column
df['indexed'] = df['stemmed'].apply(tokens_to_indices)

In [None]:
# Length of the longest index sequence; padding_text uses it as the common padded length
max_len = df['indexed'].map(len).max()

In [None]:
# Define padding function
def padding_text(tokens):
    """Pad (or truncate) one index sequence to ``max_len`` entries.

    Both padding and truncation are applied at the end of the sequence.
    The sequence is wrapped in a one-element batch for ``pad_sequences``
    and the single resulting row is returned.
    """
    batch = pad_sequences(
        [tokens],
        maxlen=max_len,
        padding='post',
        truncating='post',
    )
    return batch[0]

In [None]:
# Apply padding
# Every row of 'indexed' is padded to max_len and stored as an array in a new
# 'padded' column (pad_sequences is invoked once per row), then the frame is displayed.
df['padded'] = df['indexed'].apply(padding_text)
df

Unnamed: 0,text,label,clean_text,tokenized,stemmed,indexed,padded
0,"aku merasa sedikit kotor, sedikit tercemar jik...",0,kotor tercemar mengirim enam belas tweet waktu...,"[kotor, tercemar, mengirim, enam, belas, tweet...","[kotor, cemar, kirim, enam, belas, tweet, wakt...","[34238, 12112, 20109, 45885, 19433, 46451, 253...","[34238, 12112, 20109, 45885, 19433, 46451, 253..."
1,aku merasa gelisah sepanjang waktu seperti ham...,3,gelisah waktu hamster mencari roda,"[gelisah, waktu, hamster, mencari, roda]","[gelisah, waktu, hamster, cari, roda]","[2491, 25349, 34749, 44691, 8407]","[2491, 25349, 34749, 44691, 8407, 0, 0, 0, 0, ..."
2,Saya ingat merasa bahwa saya tidak tahan denga...,3,tahan karakter karakter seinfeld alasan repot ...,"[tahan, karakter, karakter, seinfeld, alasan, ...","[tahan, karakter, karakter, seinfeld, alas, re...","[30892, 40218, 40218, 19054, 2084, 17470, 1747...","[30892, 40218, 40218, 19054, 2084, 17470, 1747..."
3,aku merasa kangen dengan popeye dan bayamnya,1,kangen popeye bayamnya,"[kangen, popeye, bayamnya]","[kangen, popeye, bayam]","[19130, 39227, 9475]","[19130, 39227, 9475, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
4,"Saya merasa kewalahan, stres, dan sejujurnya s...",4,kewalahan stres sejujurnya kehilangan akal sehat,"[kewalahan, stres, sejujurnya, kehilangan, aka...","[kewalahan, stres, jujur, hilang, akal, sehat]","[4457, 19455, 33148, 39229, 34285, 37570]","[4457, 19455, 33148, 39229, 34285, 37570, 0, 0..."
...,...,...,...,...,...,...,...
388259,Setelah selesai mengunjungi museum lokal secar...,2,selesai mengunjungi museum lokal virtual mengh...,"[selesai, mengunjungi, museum, lokal, virtual,...","[selesai, unjung, museum, lokal, virtual, hibur]","[6914, 27757, 23982, 46404, 23707, 1241]","[6914, 27757, 23982, 46404, 23707, 1241, 0, 0,..."
388260,"Setelah selesai menonton acara komedi ringan, ...",2,selesai menonton acara komedi ringan menghabis...,"[selesai, menonton, acara, komedi, ringan, men...","[selesai, tonton, acara, komedi, ringan, habis...","[6914, 16836, 41528, 43318, 3256, 2142, 25349,...","[6914, 16836, 41528, 43318, 3256, 2142, 25349,..."
388261,"Setelah selesai berolahraga ringan di halaman,...",2,selesai berolahraga ringan halaman tenang,"[selesai, berolahraga, ringan, halaman, tenang]","[selesai, olahraga, ringan, halaman, tenang]","[6914, 4168, 3256, 45437, 25425]","[6914, 4168, 3256, 45437, 25425, 0, 0, 0, 0, 0..."
388262,"Tadi pagi saya memasak makan siang sederhana, ...",2,pagi memasak makan siang sederhana produktif,"[pagi, memasak, makan, siang, sederhana, produ...","[pagi, masak, makan, siang, sederhana, produktif]","[4626, 35573, 34829, 15100, 10906, 11767]","[4626, 35573, 34829, 15100, 10906, 11767, 0, 0..."


In [None]:
# Inspect the padded sequence stored under index label 0
df.loc[0, "padded"]

array([34238, 12112, 20109, 45885, 19433, 46451, 25349, 39189,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions identical in both splits, so validation metrics are not
# skewed by an unlucky draw; random_state fixes the shuffle for reproducibility.
# NOTE(review): df['padded'] holds one array per row, so X_train / X_val are
# Series of arrays; presumably they need stacking into a 2-D array before
# being fed to a model. Confirm in the training code.
X_train, X_val, y_train, y_val = train_test_split(
    df['padded'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)