# **Import Library**

In [1]:
import csv
import re
import string
from io import StringIO

import nltk
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from google_play_scraper import Sort, reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    LSTM,
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    Reshape,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Download required NLTK data
nltk.download("stopwords", quiet=True)
nltk.download("punkt_tab", quiet=True)

2025-04-06 16:12:15.186383: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-06 16:12:15.331528: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-06 16:12:16.381953: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-06 16:12:16.833505: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743930737.432948   28744 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743930737.49

True

# **Scrapping Dataset**

In [2]:
def scrape_reviews(app_id, max_reviews, output_file="reviews.csv"):
    """Scrapes reviews from the Google Play Store."""
    try:
        result, _ = reviews(
            app_id, lang="id", country="id", sort=Sort.NEWEST, count=max_reviews
        )
        pd.DataFrame(result).to_csv(output_file, index=False)
        return True
    except Exception as e:
        print(f"Error scraping reviews: {e}")
        return False


scrape_reviews("com.bca.mybca.omni.android", 20000)

True

# **Load Dataset**

In [3]:
reviews_df = pd.read_csv("reviews.csv")
reviews_df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,c8da7d90-cdac-49f2-95cd-d9f3aa6788b7,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,apa sih BCA ini ribet bgt. brpa kli udh ngeres...,1,0,,2025-04-05 13:59:08,"Mohon maaf atas ketidaknyamanan Bapak/Ibu, unt...",2025-04-05 14:14:40,
1,f1fad44b-ecd2-4341-9334-8d3528be5dac,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Sangat bagus. Lebih fleksibel dan rinci. Sama ...,5,0,2.3.1,2025-04-05 13:58:11,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 14:14:41,2.3.1
2,613398d2-3cfd-4d36-9813-dbf6e2bd900c,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,semoga kedepannya ada fitur investasi tabungan...,5,0,2.3.1,2025-04-05 13:21:14,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 14:14:42,2.3.1
3,50713d74-e7ee-4e2e-bd31-e5370644c71e,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"bagus aplikasi nya, detail banget semua makasi...",5,0,2.3.1,2025-04-05 12:57:58,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 14:14:43,2.3.1
4,44499903-6883-4904-9653-888db331483f,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,mantap,5,0,2.3.1,2025-04-05 11:47:57,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 12:25:53,2.3.1


In [4]:
print("Dataset Information:")
reviews_df.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19971 entries, 0 to 19970
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              19971 non-null  object
 1   userName              19971 non-null  object
 2   userImage             19971 non-null  object
 3   content               19971 non-null  object
 4   score                 19971 non-null  int64 
 5   thumbsUpCount         19971 non-null  int64 
 6   reviewCreatedVersion  15289 non-null  object
 7   at                    19971 non-null  object
 8   replyContent          19971 non-null  object
 9   repliedAt             19971 non-null  object
 10  appVersion            15289 non-null  object
dtypes: int64(2), object(9)
memory usage: 1.7+ MB


In [5]:
# Clean the dataset
clean_df = reviews_df.dropna().drop_duplicates()  # Drop null values and duplicates

# Convert 'at' column to datetime
clean_df["at"] = pd.to_datetime(clean_df["at"])

clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15289 entries, 1 to 19970
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              15289 non-null  object        
 1   userName              15289 non-null  object        
 2   userImage             15289 non-null  object        
 3   content               15289 non-null  object        
 4   score                 15289 non-null  int64         
 5   thumbsUpCount         15289 non-null  int64         
 6   reviewCreatedVersion  15289 non-null  object        
 7   at                    15289 non-null  datetime64[ns]
 8   replyContent          15289 non-null  object        
 9   repliedAt             15289 non-null  object        
 10  appVersion            15289 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 1.4+ MB


# **Preprocessing**

In [6]:
# Define stop words
stop_words = set(stopwords.words("indonesian"))
stop_words.update(stopwords.words("english"))
stop_words.update(
    [
        "iya",
        "yaa",
        "gak",
        "nya",
        "na",
        "sih",
        "ku",
        "di",
        "ga",
        "ya",
        "gaa",
        "loh",
        "kah",
        "woi",
        "woii",
        "woy",
    ]
)

# Define slang words dictionary (SLANG_WORDS remains the same)
slang_words = {
    "udah": "sudah",
    "gua": "aku",
    "ntar": "nanti",
    "be like": "seperti",
    "aja": "saja",
    "dah": "sudah",
    "anjir": "anjing",
    "begimana": "bagaimana",
    "online": "daring",
    "offline": "luring",
    "gpp": "tidak apa",
    "the": "sebuah",
    "struggle": "perjuangan",
    "is": "adalah",
    "real": "nyata",
    "okeh": "oke",
    "mak": "ibu",
    "tp": "tapi",
    "xpa": "apa",
    "yg": "yang",
    "faham": "paham",
    "gue": "aku",
    "korona": "corona",
    "labtop": "laptop",
    "sohib": "rekat",
    "gak": "tidak",
    "tak": "tidak",
    "gru": "guru",
    "klo": "kalau",
    "ksi": "kasih",
    "g": "tidak",
    "dgn": "dengan",
    "trs": "terus",
    "lgi": "lagi",
    "ny": "nya",
    "jdi": "jadi",
    "dobell": "ganda",
    "scroll": "gulir",
    "lo": "kamu",
    "nggak": "tidak",
    "byk": "banyak",
    "maksuf": "maksud",
    "responsive": "responsif",
    "good luck": "beruntung",
    "nih": "ini",
    "ad": "ada",
    "karna": "karena",
    "tsb": "tersebut",
    "njing": "anjing",
    "deadline": "batas waktu",
    "group": "grup",
    "parents": "orang tua",
    "ayolohhh": "ayo",
    "mamah": "ibu",
    "twt": "twitter",
    "akhibat": "akibat",
    "tu": "itu",
    "org": "orang",
    "saya": "aku",
    "pinjem": "pinjam",
    "ortu": "orang tua",
    "yookkk": "ayo",
    "cutiepie": "sayang",
    "mami": "ibu",
    "d": "di",
    "ngeblank": "kosong",
    "ibuk": "ibu",
    "lgsg": "langsung",
    "hadeeeh": "aduh",
    "memori": "ingatan",
    "anjim": "anjing",
    "plis": "tolong",
    "bund": "ibu",
    "listri": "listrik",
    "ottoke": "bagaimana",
    "ampe": "sampai",
    "kek": "seperti",
    "ga": "tidak",
    "bener": "benar",
    "amat": "sekali",
    "yak": "ya",
    "nanya": "tanya",
    "jgn": "jangan",
    "yuk": "ayo",
    "tdk": "tidak",
    "waduh": "aduh",
    "kalo": "kalau",
    "gamampu": "tidak mampu",
    "yauda": "yasudah",
    "my": "punyaku",
    "story": "cerita",
    "end": "akhir",
    "ajaa": "saja",
    "budget": "biaya",
    "exactly": "tepat",
    "gaada": "tidak ada",
    "gunanyaa": "guna",
    "onlen": "daring",
    "banget": "sekali",
    "alhamdulillah": "alhamdulilah",
    "bgt": "sekali",
    "gaperlu": "tidak perlu",
    "gaes": "teman",
    "dapet": "dapat",
    "hadu": "aduh",
    "gaess": "teman",
    "susar": "susah",
    "belamra": "belajar",
    "bagaiman": "bagaimana",
    "sm": "sama",
    "kaya": "seperti",
    "privacy": "pribadi",
    "yaudah": "yasudah",
    "orng": "orang",
    "pd": "percaya diri",
    "udh": "sudah",
    "kl": "kalau",
    "ky": "seperti",
    "tim": "grup",
    "I did": "aku",
    "asik": "asyik",
    "mampis": "mati",
    "mampos": "mati",
    "mampus": "mati",
    "yu": "ayo",
    "dlu": "dulu",
    "baubau": "bau",
    "new": "baru",
    "pake": "pakai",
    "naruh": "taruh",
    "job": "pekerjaan",
    "kel": "kelas",
    "on": "nyala",
    "ni": "ini",
    "nk": "ingin",
    "share": "bagi",
    "math": "matematika",
    "sikit": "sedikit",
    "take": "ambil",
    "guys": "teman",
    "gengs": "teman",
    "girls": "gadis",
    "yuks": "ayo",
    "check": "cek",
    "ngabs": "bagaimana",
    "cam": "kamera",
    "buk": "ibu",
    "pengin": "ingin",
    "ngak": "tidak",
    "klau": "kalau",
    "bnget": "sekali",
    "dear": "sayang",
    "bangettt": "sekali",
    "anjinggg": "anjing",
    "muluuu": "selalu",
    "taii": "tahi",
    "bego": "bodoh",
    "abisan": "habis",
    "yukk": "ayo",
    "request": "minta",
    "yng": "yang",
    "chattingan": "obrolan",
    "horror": "seram",
    "horor": "seram",
    "jilbab": "kerudung",
    "tura": "tidur",
    "turu": "tidur",
    "ujug": "akhir",
    "japri": "hubungi",
    "mhsw": "mahasiswa",
    "rek": "teman",
    "piye": "bagaimana",
    "masamu": "menurutmu",
    "pen": "ingin",
    "ngk": "tidak",
    "open": "buka",
    "santuy": "santai",
    "cuman": "hanya",
    "cuma": "hanya",
    "kayanya": "seperti",
    "anda": "kamu",
    "pc": "komputer",
    "fix": "tetap",
    "bejibun": "banyak",
    "hunting": "buru",
    "alesan": "alasan",
    "cekkkk": "cek",
    "emang": "memang",
    "fighting": "berjuang",
    "mahasiswai": "mahasswa",
    "taik": "tahi",
    "doi": "dia",
    "panikgogi": "panik",
    "inj": "ini",
    "onlineshittt": "tahi",
    "apaa": "apa",
    "nihhh": "ini",
    "ama": "sama",
    "selese": "selesai",
    "gegara": "karena",
    "kelarrrr": "selesai",
    "jadika": "jadi",
    "kongkret": "konkret",
    "smpe": "sampai",
    "mls": "malas",
    "boker": "beol",
    "buanyakkkk": "banyak",
    "sampek": "sampai",
    "tapiiiii": "tapi",
    "teross": "terus",
    "weekend": "akhir pekan",
    "utk": "untuk",
    "mamposs": "mati",
    "kao": "kamu",
    "nak": "anak",
    "bnr": "benar",
    "gw": "aku",
    "pliss": "tolong",
    "pak": "bapak",
    "bu": "ibu",
    "kg": "tidak",
    "tetep": "tetap",
    "onlineeeeeeeee": "daring",
    "taudehhh": "tahu",
    "ngantukkkk": "ngantuk",
    "nggk": "tidak",
    "midtest": "uts",
    "bgst": "bangsat",
    "classroom": "kelas",
    "class": "kelas",
    "dedlen": "batas waktu",
    "njim": "anjing",
    "abis": "habis",
    "mid": "uts",
    "human": "manusia",
    "diacc": "terima",
    "acc": "terima",
    "ngelag": "lambat",
    "trus": "terus",
    "bun": "ibu",
    "rekomendasii": "rekomendasi",
    "tetp": "tetap",
    "bpkny": "bapak",
    "ngantuqqq": "ngantuk",
    "gasi": "tidak",
    "momen": "peristiwa",
    "moment": "peristiwa",
    "moments": "peristiwa",
    "mantapp": "mantap",
    "pala": "kepala",
    "ibuknyaa": "ibu",
    "mon": "mohon",
    "maap": "maaf",
    "ye": "ya",
    "lelet": "lambat",
    "aamiin": "amin",
    "notif": "pemberitahuan",
    "njirr": "anjing",
    "kayak": "seperti",
    "linglung": "lupa",
    "wa": "whatsapp",
    "whatsap": "whatsapp",
    "tibatiba": "tiba",
    "tggl": "tanggal",
    "tgl": "tanggal",
    "sem": "semester",
    "poor": "miskin",
    "me": "aku",
    "mabok": "mabuk",
    "dikiiit": "sedikit",
    "diem": "diam",
    "rule": "aturan",
    "kaga": "tidak",
    "paakk": "bapak",
    "ceritaa": "cerita",
    "ceritaaa": "cerita",
    "please": "tolong",
    "lemot": "lambat",
    "reposted": "ulang",
    "moodiyan": "murung",
    "moody": "murung",
    "cewe": "gadis",
    "tranding": "populer",
    "trending": "populer",
    "paansi": "apa",
    "mles": "malas",
    "bet": "sekali",
    "sopinter": "sok pintar",
    "au": "tahu",
    "bangsaadd": "bangsat",
    "beloom ": "belum",
    "ajg": "anjing",
    "ngg": "tidak",
    "sobi": "teman",
    "sobat": "teman",
    "apalagu": "apalagi",
    "tgs": "tugas",
    "allaah": "allah",
    "apain": "apa",
    "bangub": "bangun",
    "lag": "lambat",
    "ambyar": "hancur",
    "trosss": "terus",
    "tross": "terus",
    "anying": "anjing",
    "nanyak": "tanya",
    "kyk": "seperti",
    "skrng": "sekarang",
    "skrg": "sekarang",
    "e learning": "elearning",
    "gae": "buat",
    "pdhl": "padahal",
    "pdahal": "padahal",
    "pdhal": "padahal",
    "mimin": "admin",
    "min": "admin",
    "engga": "tidak",
    "iyaa": "iya",
    "iyaaa": "iya",
    "iyy": "iya",
    "iy": "iya",
    "yes": "iya",
    "y": "iya",
    "dr": "dari",
    "dri": "dari",
    "sumbe": "sumber",
    "swipe": "geser",
    "bngt": "sekali",
    "kir": "kira",
    "dosenya": "dosen",
    "log out": "keluar",
    "biasany": "biasa",
    "ternyta": "nyata",
    "msih": "masih",
    "join": "gabung",
    "bapanya": "bapak",
    "tpi": "tapi",
    "pda": "pada",
    "here": "sini",
    "we": "kita",
    "go": "pergi",
    "again": "lagi",
    "poto": "foto",
    "photo": "foto",
    "ss": "screenshot",
    "dpt": "dapat",
    "idung": "hidung",
    "glowing": "cahaya",
    "wlwpun": "walau",
    "kw": "kawasan",
    "belon": "belum",
    "pakabar": "apa kabar",
    "ntah": "entah",
    "gaberani": "tidak berani",
    "seblm": "sebelum",
    "kezzzel": "kesal",
    "capek": "kesal",
    "cape": "kesal",
    "nnti": "nanti",
    "nti": "nanti",
    "tahajuddan": "tahajud",
    "gwe": "aku",
    "belom": "belum",
    "mw": "ingin",
    "bwt": "untuk",
    "amatt": "sekali",
    "yuuuu": "ayo",
    "yuu": "ayo",
    "bbrp": "beberapa",
    "plus": "tambah",
    "jg": "juga",
    "ituu": "itu",
    "tuuu": "itu",
    "tuh": "itu",
    "anjp": "anjing",
    "anj": "anjing",
    "essensial": "penting",
    "jugak": "juga",
    "sebel": "kesal",
    "unek": "uneg",
    "next": "lanjut",
    "krusial": "penting",
    "tar": "nanti",
    "berenti": "berhenti",
    "takan": "tidak",
    "dlm": "dalam",
    "adl": "adalah",
    "hrs": "harus",
    "handsanitazer": "handsanitizer",
    "mba": "kakak",
    "mas": "kakak",
    "mbak": "kakak",
    "error": "gangguan",
    "eror": "gangguan",
    "gokill": "gokil",
    "gokilll": "gokil",
    "read": "baca",
    "shop": "toko",
    "bangt": "sekali",
    "gaksuka": "tidak suka",
    "drakor": "drama korea",
    "drama korea": "drama korea",
    "toddler": "balita",
    "kasian": "kasihan",
    "why": "kenapa",
    "what": "apa",
    "how": "bagaimana",
    "also": "juga",
    "blur": "buram",
    "elah": "allah",
    "mulu": "selalu",
    "nongol": "muncul",
    "bug": "virus",
    "sekolahhh": "sekolah",
    "sekolahh": "sekolah",
    "leyeh": "santai",
    "thank ": "terimakasih",
    "you": "kamu",
    "mood": "suka",
    "ofline": "luring",
    "knp": "kenapa",
    "power": "kekuatan",
    "mkn": "makan",
    "bs": "bisa",
    "yaallah": "allah",
    "ancur": "hancur",
    "facial": "perawatan muka",
    "nder": "teman",
    "rapog": "raport",
    "sbb": "karena",
    "pgi": "pagi",
    "liat": "lihat",
    "off": "mati",
    "assignment": "tugas",
    "assignments": "tugas",
    "right": "benar",
    "so": "jadi",
    "kal": "kalian",
    "gt": "gitu",
    "duhh": "aduh",
    "logout": "keluar",
    "amattt": "sekali",
    "kaliii": "kali",
    "yaah": "ya",
    "xmau": "tidak",
    "bljar": "belajar",
    "pingin": "ingin",
    "somehow": "bagaimana",
    "miss": "rindu",
    "life": "hidup",
    "almost": "hampir",
    "every": "setiap",
    "day": "hari",
    "meninggoy": "mati",
    "idup": "hidup",
    "interacting": "interaksi",
    "with": "dengan",
    "people": "orang",
    "blas": "sekali",
    "standart": "standar",
    "amburadul": "acak",
    "atuh": "aduh",
    "bat": "sekali",
    "mmbuat": "buat",
    "anjj": "anjing",
    "screenshots": "screenshot",
    "krn": "karena",
    "runut": "runtut",
    "bnyk": "banyak",
    "emak": "ibu",
    "ud": "sudah",
    "samaa": "sama",
    "sedihhh": "sedih",
    "sedihh": "sedih",
    "banyaaak": "banyak",
    "banyaak": "banyak",
    "rilek": "santai",
    "je": "saja",
    "exam": "ujian",
    "dahla": "sudah",
    "room": "ruang",
    "worth it": "guna",
    "storytell": "cerita",
    "ttg": "tentang",
    "bosen": "bosan",
    "bocah": "anak",
    "fis": "fisika",
    "bio": "biologi",
    "ujan": "hujan",
    "gada": "tidak ada",
    "udahlah": "sudah",
    "sambi": "sambil",
    "gimanaaa": "bagaimana",
    "ak": "aku",
    "ayok": "ayo",
    "reply": "balas",
    "too": "juga",
    "pusingg": "pusing",
    "pusinggg": "pusing",
    "duh": "aduh",
    "kls": "kelas",
    "krna": "karena",
    "feel": "merasa",
    "u": "kamu",
    "ade": "adik",
    "lg": "lagi",
    "pngn": "ingin",
    "spt": "seperti",
    "duel": "tanding",
    "circle": "lingkaran",
    "kopid": "covid",
    "cok": "jancuk",
    "semangatt": "semangat",
    "semangattt": "semangat",
    "have": "punya",
    "nice": "baik",
    "first": "pertama",
    "all": "semua",
    "sbaikmya": "baik",
    "anxiety": "gelisah",
    "submit": "kirim",
    "etc": "dsb",
    "dll": "dsb",
    "sekarangg": "sekarang",
    "course": "kursus",
    "courses": "kursus",
    "kesel": "kesal",
    "super": "sangat",
    "mencar": "pisah",
    "ato": "atau",
    "udeh": "sudah",
    "enggak": "tidak",
    "smt": "semester",
    "workshop": "seminar",
    "kdg": "kadang",
    "tpb": "tapi",
    "see": "lihat",
    "mayan": "lumayan",
    "mayann": "lumayan",
    "pdaku": "pada",
    "secape": "kesal",
    "time": "waktu",
    "w": "aku",
    "setaun": "tahun",
    "taun": "tahun",
    "mamposla": "mati",
    "focus": "fokus",
    "twit": "twitter",
    "cepet": "cepat",
    "help": "bantu",
    "emg": "memang",
    "gmn": "bagaimana",
    "lets": "ayo",
    "support": "dukung",
    "each": "sesama",
    "other": "lain",
    "public": "publik",
    "speaking": "bicara",
    "poll": "paling",
    "kea": "seperti",
    "gestur": "sikap",
    "gesture": "sikap",
    "enaaa": "enak",
    "haih": "aduh",
    "jhe": "saja",
    "face": "wajah",
    "to": "ke",
    "convert": "konfersi",
    "clinical": "klinik",
    "years": "tahun",
    "inimah": "ini",
    "abistu": "habis",
    "february": "februari",
    "january": "januari",
    "reti": "sampai",
    "wis": "sudah",
    "wae": "saja",
    "apply": "lamar",
    "project": "proyek",
    "respect": "hormat",
    "kudu": "harus",
    "them": "mereka",
    "set": "setengah",
    "bsk": "besok",
    "hadie": "aduh",
    "iyha": "iya",
    "mama": "ibu",
    "jug": "juga",
    "then": "lalu",
    "fully": "penuh",
    "boss": "bos",
    "hallo": "halo",
    "aje": "saja",
    "bundz": "ibu",
    "apus": "hapus",
    "itung": "hitung",
    "nnton": "nonton",
    "trouble": "masalah",
    "diuruss": "urus",
    "njir": "anjing",
    "takda": "tidak",
    "gtgt": "gitu",
    "practical": "praktikal",
    "boong": "bohong",
    "kepake": "pakai",
    "suke": "suka",
    "sapa": "siapa",
    "semuanyaaaaaa": "semua",
    "student": "siswa",
    "students": "siswa",
    "slalu": "selalu",
    "miiin": "admin",
    "telp": "telpon",
    "aj": "saja",
    "mudeng": "paham",
    "loss": "hilang",
    "ilang": "hilang",
    "emmak": "ibu",
    "jjur": "jujur",
    "bru": "baru",
    "semwa": "semua",
    "gedeg": "kesal",
    "yabalik": "balik",
    "and": "dan",
    "login": "masuk",
    "mo": "ingin",
    "ngapaain": "apa",
    "offcam": "tutup kamera",
    "oncam": "kamera",
    "bomat": "bodoamat",
    "sad": "sedih",
    "mmg": "memang",
    "masuklh": "masuk",
    "bln": "bulan",
    "jatoh": "jatuh",
    "jatohnya": "jatuh",
    "mmf": "maaf",
    "jbjb": "tiba",
    "gtu": "gitu",
    "heyy": "halo",
    "morning": "pagi",
    "dh": "sudah",
    "gi": "lagi",
    "smngt": "semangat",
    "kuis": "quiz",
    "cardi": "cardigan",
    "kampretos": "kampret",
    "wkt": "waktu",
    "bestfriend": "teman",
    "morningg": "pagi",
    "morninggg": "pagi",
    "kak": "kakak",
    "everyone": "semua",
    "met": "selamat",
    "she": "dia",
    "like": "suka",
    "ngadi": "ada",
    "split": "pisah",
    "saking": "sampai",
    "smkin": "semakin",
    "rajiiiin": "rajin",
    "stand by": "siap",
    "pls": "tolong",
    "uda": "sudah",
    "shit": "tahi",
    "shits": "tahi",
    "really": "benar",
    "need": "butuh",
    "someone": "orang",
    "talk": "bicara",
    "bodo": "bodoh",
    "adek": "adik",
    "masi": "masih",
    "school": "sekolah",
    "relate": "hubung",
    "gimanaa": "bagaimana",
    "puyeng": "pusing",
    "achievement": "penghargaan",
    "tai": "tahi",
    "kasi": "kasih",
    "leha": "santai",
    "perfect": "sempurna",
    "attendaance": "hadir",
    "jawapan": "jawaban",
    "jadual": "jadwal",
    "hape": "hp",
    "slow": "lambat",
    "noti": "pemberitahuan",
    "because": "karena",
    "sekaliii": "sekali",
    "msh": "masih",
    "inget": "ingat",
    "bang": "kakak",
    "last": "akhir",
    "hapyyy": "senang",
    "dikick": "usir",
    "intern": "magang",
    "tidurrrrrr": "tidur",
    "ngzoom": "zoom",
    "damn": "sial",
    "mostly": "sering",
    "braw": "kakak",
    "temen": "teman",
    "cont": "lanjut",
    "sregep": "rajin",
    "jugaaa": "juga",
    "blended": "gabung",
    "dg": "dengan",
    "kanca": "teman",
    "akeh": "banyak",
    "seko": "dari",
    "lol": "tertawa",
    "ges": "teman",
    "flat": "datar",
    "fandom": "fan",
    "fangirl": "fan",
    "abang": "kakak",
    "ttp": "tetap",
    "moto": "foto",
    "anjirt": "anjing",
    "fak": "anjing",
    "abs": "habis",
    "wtf": "anjing",
    "out": "keluar",
    "drpd": "daripada",
    "puter": "putar",
    "make": "buat",
    "waktuny": "waktu",
    "selaluu": "selalu",
    "troboos": "trobos",
    "stop": "berhenti",
    "emhnya": "memang",
    "td": "tadi",
    "grgr": "gara",
    "sob": "teman",
    "ngga": "tidak",
    "stupid": "bodoh",
    "nyambi": "sambil",
    "badmood": "murung",
    "demen": "suka",
    "kolor": "boxer",
    "diaa": "dia",
    "diaaa": "dia",
    "bukaan": "bukan",
    "international": "internasional",
    "officeeee": "kantor",
    "free": "gratis",
    "gitcu": "gitu",
    "darii": "dari",
    "uni": "universitas",
    "buna": "ibu",
    "bunanya": "ibu",
    "upload": "unggah",
    "kalii": "kali",
    "positip": "positif",
    "brooooooo": "teman",
    "bruuuhhhh": "teman",
    "webminar": "webinar",
    "skolah": "sekolah",
    "gimanasi": "bagaimana",
    "skap": "skip",
    "cmn": "cuma",
    "gblk": "bodoh",
    "jim": "anjing",
    "eek": "beol",
    "cuk": "jancuk",
    "sgt": "sekali",
    "taekk": "tahi",
    "that": "itu",
    "hate": "benci",
    "college": "kuliah",
    "offlen": "luring",
    "offlennya": "luring",
    "dama": "sama",
    "gapapaa": "tidak apa",
    "tell": "beritahu",
    "us": "kami",
    "will": "akan",
    "nannya": "tanya",
    "anjrit": "anjing",
    "smpt": "sempat",
    "ayen": "aku",
    "univ": "universitas",
    "nile": "nilai",
    "anjlog": "turun",
    "boi": "anak",
    "siah": "bagaimana",
    "napas": "nafas",
    "bernapas": "nafas",
    "robb": "allah",
    "dgrin": "dengar",
    "sek": "masih",
    "dikongkon": "suruh",
    "kongkon": "suruh",
    "najan": "karena",
    "salahno": "salah",
    "dkk": "dsb",
    "mder": "teman",
    "dat": "itu",
    "prnh": "pernah",
    "mhswnya": "mahasiswa",
    "kick": "usir",
    "paniiikkk": "panik",
    "paniikkk": "panik",
    "idupin": "hidup",
    "rank": "peringkat",
    "fav": "suka",
    "masup": "masuk",
    "psti": "pasti",
    "dpn": "depan",
    "kamerlah": "kamera",
    "kamer": "kamera",
    "bangett": "sekali",
    "ngntuk": "ngantuk",
    "yutub": "youtube",
    "reeeeekk": "teman",
    "bangeeeeeeeeddd": "sekali",
    "even": "bahkan",
    "biasae": "biasa",
    "trmsk": "termasuk",
    "subject": "subjek",
    "he": "dia",
    "wants": "ingin",
    "happy": "senang",
    "combination": "kombinasi",
    "dude": "teman",
    "wrong": "salah",
    "crack": "pecah",
    "serek": "serak",
    "or": "atau",
    "jwb": "jawab",
    "scr": "secara",
    "balance": "imbang",
    "left": "keluar",
    "kat": "dekat",
    "dek": "dik",
    "nyokap": "ibu",
    "bokap": "bapak",
    "wit": "twitter",
    "brisik": "berisik",
    "bcs": "karena",
    "dikasi": "kasih",
    "slese": "selesai",
    "mggu": "minggu",
    "menabahh": "tambah",
    "nabah": "tambah",
    "gasin": "gas",
    "gass": "gas",
    "gasss": "gas",
    "hayu": "ayo",
    "abah": "bapak",
    "sklh": "sekolah",
    "boring": "bosan",
    "pagiiiiii": "pagi",
    "hayo": "ayo",
    "home": "rumah",
    "ketiwi": "tawa",
    "ketawa": "tawa",
    "sampe": "sampai",
    "alfa": "alpa",
    "hr": "hari",
    "br": "baru",
    "mincot": "admin",
    "iyhh": "iya",
    "team": "grup",
    "gemeter": "getar",
    "kelar": "selesai",
    "kaak": "kakak",
    "bobooo": "tidur",
    "bobo": "tidur",
    "bobok": "tidur",
    "njirrrr": "anjing",
    "mellow": "lembut",
    "prefer": "suka",
    "naok": "naik",
    "naro": "taruh",
    "by": "dari",
    "ori": "asli",
    "yt": "youtube",
    "jk": "gurau",
    "mtk": "matematika",
    "saiki": "sekarang",
    "blass": "blas",
    "po": "apa",
    "call": "telpon",
    "mhs": "mahasiswa",
    "tauu": "tahu",
    "arek": "anak",
    "ambis": "ambisius",
    "blank": "kosong",
    "sja": "saja",
    "voice": "suara",
    "syg": "sayang",
    "fake": "palsu",
    "mblenger": "terlalu",
    "walo": "walau",
    "walopun": "walau",
    "kekeuh": "kukuh",
    "yaambunnn": "allah",
    "punggu": "punggung",
    "redirect": "alih",
    "nyatet": "catat",
    "bakal": "akan",
    "bakalan": "akan",
    "bkalan": "akan",
    "leave": "keluar",
    "cengini": "siang",
    "dm": "pesan",
    "volunteer": "relawan",
    "kagok": "susah",
    "ngelambe": "bicara",
    "personal": "pribadi",
    "rotate": "putar",
    "takpa": "tidak apa",
    "ssejak": "sejak",
    "coroni": "corona",
    "beres": "selesai",
    "sumpahh": "sumpah",
    "sumpahhh": "sumpah",
    "tuhhh": "itu",
    "harusnyaa": "harus",
    "praktek": "praktik",
    "les": "kursus",
    "bapack": "bapak",
    "drop": "jatuh",
    "pulak": "pula",
    "give": "beri",
    "away": "jauh",
    "aslii": "asli",
    "mksudnya": "maksud",
    "mksud": "maksud",
    "kyknya": "seperti",
    "ja": "saja",
    "duk": "duduk",
    "cakap": "bicara",
    "rejeki": "rezeki",
    "rizki": "rezeki",
    "rizkinya": "rezeki",
    "rejekinya": "rezeki",
    "maybe": "mungkin",
    "brou": "kakak",
    "keep": "simpan",
    "spirit": "semangat",
    "prayer": "doa",
    "believe": "percaya",
    "can": "dapat",
    "skola": "sekolah",
    "gatauuu": "tidak tahu",
    "perna": "pernah",
    "tolol": "bodoh",
    "murid": "siswa",
    "ngaco": "kacau",
    "jela": "saja",
    "book": "buku",
    "ikit": "ikut",
    "pape": "apa",
    "hrsnya": "harus",
    "mlu": "selalu",
    "pertma": "pertama",
    "tdr": "tidur",
    "anjm": "anjing",
    "repeat": "ulang",
    "pgn": "ingin",
    "gtloh": "gitu",
    "ktmu": "temu",
    "sik": "masih",
    "smlm": "malam",
    "looooove": "cinta",
    "in": "di",
    "sembuhhhhh": "sembuh",
    "mcm": "macam",
    "dgr": "dengar",
    "tgk": "lihat",
    "dst": "dsb",
    "cepetaaan": "cepat",
    "university": "universitas",
    "coronces": "corona",
    "apsen": "absen",
    "apalgi": "apa",
    "adenya": "adik",
    "buanyak": "banyak",
    "baique": "baik",
    "as": "sebagai",
    "contact": "kontak",
    "jur": "jurusan",
    "mnyimak": "simak",
    "fifty": "limapuluh",
    "dedel": "susah",
    "sby": "surabaya",
    "mna": "mana",
    "pagii": "pagi",
    "sbg": "sebagai",
    "beginiiiiii": "begini",
    "sekaliiii": "sekali",
    "kawan": "teman",
    "listen": "dengar",
    "up": "atas",
    "pede": "percaya diri",
    "maleeees": "malas",
    "duit": "uang",
    "duid": "uang",
    "money": "uang",
    "triak": "teriak",
    "noooo": "tidak",
    "ajj": "anjing",
    "sister": "kakak",
    "brother": "kakak",
    "nite": "malam",
    "esok": "besok",
    "equals": "sama",
    "lamaaaaa": "lama",
    "lamaaaa": "lama",
    "lamaaa": "lama",
    "lamaa": "lama",
    "yailaaah": "allah",
    "maksain": "paksa",
    "manaa": "mana",
    "full": "penuh",
    "award": "penghargaan",
    "kene": "sini",
    "mention": "sebut",
    "degdegan": "deg",
    "gituu": "gitu",
    "gituuu": "gitu",
    "yainti": "inti",
    "yaintinya": "inti",
    "gatau": "tidak tahu",
    "hype": "terkenal",
    "burem": "buram",
    "muluu": "selalu",
    "alhasil": "hasil",
    "makasii": "terimakasih",
    "makasi": "terimakasih",
    "touch": "sentuh",
    "ileran": "liur",
    "thinking": "pikir",
    "maksd": "maksud",
    "maksdny": "maksud",
    "maksdnya": "maksud",
    "mkst": "maksud",
    "mkstnya": "maksud",
    "ngewe": "mesum",
    "pr": "tugas",
    "justice": "adil",
    "manjun": "manjur",
    "stadd": "ustaz",
    "stay": "tinggal",
    "healthy": "sehat",
    "brrti": "arti",
    "ka": "kakak",
    "kreta": "kereta",
    "berbeza": "beda",
    "mantep": "mantap",
    "friver": "kemudi",
    "bgsd": "bangsat",
    "ngapa": "apa",
    "ngapain": "apa",
    "stres": "stress",
    "back": "kembali",
    "start": "mulai",
    "batre": "baterai",
    "batrenya": "baterai",
    "pengen": "ingin",
    "tuker": "tukar",
    "tukeran": "tukar",
    "msuk": "masuk",
    "socialize": "sosialisasi",
    "don": "tidak",
    "lngsung": "langsung",
    "bngun": "bangun",
    "shopping": "belanja",
    "postpone": "tunda",
    "comment": "komentar",
    "sorry": "maaf",
    "not": "tidak",
    "for": "untuk",
    "kuh": "aku",
    "akuu": "aku",
    "yukkk": "ayo",
    "sendiriiii": "sendiri",
    "nderrr": "teman",
    "sirkel": "lingkaran",
    "boro": "jangan",
    "tkt": "takut",
    "graduate": "lulus",
    "hactic": "sibuk",
    "hectic": "sibuk",
    "malem": "malam",
    "fikiran": "pikir",
    "fikir": "pikir",
    "begokk": "bodoh",
    "btw": "omong",
    "berasaaa": "rasa",
    "guee": "aku",
    "allahuu": "allah",
    "bisnes": "bisnis",
    "sebenarnyaaa": "benar",
    "allahu": "allah",
    "service": "layan",
    "spot": "titik",
    "anjr": "anjing",
    "dont": "jangan",
    "things": "hal",
    "getting": "jadi",
    "worse": "buruk",
    "dunt": "tidak",
    "hv": "punya",
    "friends": "teman",
    "diff": "beda",
    "bawel": "cerewet",
    "mkin": "semakin",
    "jek": "ojek",
    "giveaway": "hadiah",
    "baij": "baik",
    "nga": "tidak",
    "ocehan": "omong",
    "oceh": "omong",
    "lbh": "lebih",
    "pass": "pas",
    "gk": "tidak",
    "terbaikkkkk": "baik",
    "very": "sangat",
    "idea": "ide",
    "pandemik": "pandemi",
    "rapih": "rapi",
    "manage": "atur",
    "mgga": "tidak",
    "pastu": "pasti",
    "signal": "jaringan",
    "benr": "benar",
    "fuck": "persetan",
    "luama": "lama",
    "tegor": "tegur",
    "ditegor": "tegur",
    "anjeng": "anjing",
    "quota": "kuota",
    "dsj": "dsb",
    "unfair": "tidak adil",
    "btul": "betul",
    "ofc": "tentu",
    "kerumahz": "rumah",
    "rumahz": "rumah",
    "enek": "eneg",
    "encourage": "semangat",
    "hadeh": "aduh",
    "automatically": "auto",
    "done": "sudah",
    "thanks": "terimakasih",
    "wish": "harap",
    "luck": "beruntung",
    "mantau": "pantau",
    "donlod": "unduh",
    "download": "unduh",
    "least": "setidaknya",
    "bye": "selamat tinggal",
    "byebye": "selamat tinggal",
    "enau": "enak",
    "god": "tuhan",
    "nimb": "nimbun",
    "mlah": "malah",
    "smst": "semester",
    "weekdays": "pekan",
    "uninstal": "uninstall",
    "gausah": "tidak",
    "cman": "cuma",
    "aing": "aku",
    "manteman": "teman",
    "gapake": "tidak pakai",
    "hijab": "kerudung",
    "anjae": "anjing",
    "anjayani": "anjing",
    "proper": "layak",
    "dec": "desember",
    "brng": "bareng",
    "gaspol": "gas",
    "rapot": "raport",
    "rampyungg": "sudah",
    "maning": "lagi",
    "ngesuk": "besok",
    "esih": "masih",
    "pressure": "tekanan",
    "maloe": "malu",
    "astagaaa": "astaga",
    "lalen": "lupa",
    "hustle": "ramai",
    "kebablasan": "kelewat",
    "bablasa": "kelewat",
    "lebi": "lebih",
    "abiji": "nilai",
    "event": "acara",
    "setidakmungkin": "tidak mungkin",
    "berak": "beol",
    "thought": "pikir",
    "cuddling": "peluk",
    "deadass": "mati",
    "refers": "tuju",
    "satuu": "satu",
    "sakitt": "sakit",
    "semogaaa": "semoga",
    "teruss": "terus",
    "everyday": "setiap hari",
    "gils": "gila",
    "uga": "juga",
    "dengaaaan": "dengan",
    "maraton": "marathon",
    "dinner": "makan malam",
    "client": "klien",
    "useless": "tidak berguna",
    "caper": "cari perhatian",
    "caperr": "cari perhatian",
    "caperrr": "cari perhatian",
    "item": "hitam",
    "belajarpun": "belajar",
    "elahhh": "allah",
    "mapelah": "mapel",
    "only": "hanya",
    "our": "kami",
    "one": "satu",
    "angell": "susah",
    "pleciden": "presiden",
    "through": "lewat",
    "dag": "deg",
    "dig": "deg",
    "dug": "deg",
    "sagne": "mesum",
    "ngekick": "usir",
    "hbis": "habis",
    "maniiis": "manis",
    "details": "detil",
    "wasap": "whatsapp",
    "standby": "tinggal",
    "love": "suka",
    "texting": "obrol",
    "text": "obrol",
    "outfit": "pakaian",
    "kantung": "kantong",
    "mrasa": "merasa",
    "thdp": "hadap",
    "malaya": "malaysia",
    "indo": "indonesia",
    "play": "main",
    "sleep": "tidur",
    "skrang": "sekarang",
    "jirr": "anjing",
    "flop": "gagal",
    "survive": "bertahan",
    "best": "terbaik",
    "feeling": "perasaan",
    "poadcast": "podcast",
    "sometimes": "kadang",
    "test": "ujian",
    "quiz": "kuis",
    "final": "akhir",
    "dewee": "sendiri",
    "crash": "tabrak",
    "yaudahlaa": "sudah",
    "months": "bulan",
    "dahhh": "sudah",
    "jwabin": "jawab",
    "mislkn": "misal",
    "jwbn": "jawab",
    "tkut": "tidak",
    "mirit": "siswa",
    "hqq": "hakiki",
    "aminn": "amin",
    "poop": "beol",
    "sndri": "sendiri",
    "camping": "kemah",
    "doh": "aduh",
    "apps": "aplikasi",
    "app": "aplikasi",
    "sya": "aku",
    "bkan": "bukan",
    "spupu": "sepupu",
    "laqnat": "laknat",
    "eskul": "ekstrakulikuler",
    "ngilang": "hilang",
    "sist": "kakak",
    "luv": "sayang",
    "sabi": "bisa",
    "interact": "interaksi",
    "muridnya": "siswa",
    "apakbar": "apa kabar",
    "opsi": "pilihan",
    "recorder": "rekam",
    "nongkrong": "kumpul",
    "nongki": "kumpul",
    "wonge": "orang",
    "ancen": "memang",
    "iki": "ini",
    "asu": "anjing",
    "udahan": "sudah",
    "aq": "aku",
    "ibun": "ibu",
    "ayah": "bapak",
    "gasanggup": "tidak sanggup",
    "manut": "nurut",
    "rangking": "peringkat",
    "skarang": "sekarang",
    "mindset": "pikir",
    "jugaaaaa": "juga",
    "asuuuuuu": "anjing",
    "bokep": "porno",
    "ngebokep": "porno",
    "taiiii": "tahi",
    "point": "titik",
    "pasrahh": "pasrah",
    "yaudahla": "sudah",
    "kagettt": "kaget",
    "tricky": "rumit",
    "edgy": "gelisah",
    "buosen": "bosan",
    "expectation": "harap",
    "reality": "realitas",
    "attack": "serang",
    "dahal": "padahal",
    "ttep": "tetap",
    "msk": "masuk",
    "silent": "diam",
    "kulaih": "kuliah",
    "ijin": "izin",
    "emng": "memang",
    "smph": "sumpah",
    "excited": "semangat",
    "makasih": "terimakasih",
    "mngkin": "mungkin",
    "apala": "apa",
    "decided": "memutuskan",
    "masturbate": "masturbasi",
    "denger": "dengar",
    "kmrn": "kemarin",
    "kemaren": "kemarin",
    "bdg": "bandung",
    "ikutt": "ikut",
    "oengen": "ingin",
    "kelen": "kalian",
    "fresh": "segar",
    "kuliat": "lihat",
    "isoo": "bisa",
    "abes": "habis",
    "muzikgempak": "musik",
    "pinter": "pintar",
    "jugaa": "juga",
    "males": "malas",
    "hariini": "hari",
    "smgt": "semangat",
    "konsen": "konsentrasi",
    "tehthering": "tethering",
    "sialllllllllll": "sial",
    "tmn": "teman",
    "cowo": "cowok",
    "sumpe": "sumpah",
    "cheating": "curang",
    "cheat": "curang",
    "kakel": "kakak tingkat",
    "sangatttt": "sangat",
    "habisla": "habis",
    "keasikan": "asyik",
    "regis": "registrasi",
    "welcome": "selamat datang",
    "muk": "ingin",
    "masterpiece": "mahakarya",
    "cororong": "corona",
    "gatay": "tidak tahu",
    "entar": "nanti",
    "nongkrrong": "kumpul",
    "eren": "keren",
    "iyegahhh": "tidak",
    "dirumaahh": "rumah",
    "mandipun": "mandi",
    "dha": "sudah",
    "klen": "kalian",
    "anjg": "anjing",
    "beneerr": "benar",
    "bangeettt": "sekali",
    "bpk": "bapak",
    "macem": "macam",
    "adlh": "adalah",
    "penat": "pusing",
    "sekola": "sekolah",
    "tapii": "tapi",
    "kdng": "kadang",
    "kaliam": "kalian",
    "hayuk": "ayo",
    "party": "pesta",
    "embo": "tidak tahu",
    "bolot": "tuli",
    "budeg": "tuli",
    "omg": "tuhan",
    "bundir": "bunuh diri",
    "alpokat": "alpukat",
    "avocado": "alpukat",
    "reformasj": "reformasi",
    "mengajarr": "ajar",
    "ajarr": "ajar",
    "cintaim": "cinta",
    "probably": "mungkin",
    "overthinking": "pikir",
    "betulllll": "benar",
    "doswn": "dosen",
    "seger": "segar",
    "rona": "corona",
    "sepineee": "sepi",
    "polllllll": "sangat",
    "orgorg": "orang",
    "dn": "dan",
    "problem": "masalah",
    "hrus": "harus",
    "schedule": "jadwal",
    "gasik": "tidak",
    "stgh": "setengah",
    "goblokkk": "bodoh",
    "lagiiii": "lagi",
    "alloh": "allah",
    "ln": "lain",
    "gaadak": "tidak ada",
    "jaman": "zaman",
    "etapi": "tapi",
    "bgtt": "sekali",
    "iyaaaaa": "iya",
    "sng": "senang",
    "connection": "koneksi",
    "ikt": "ikut",
    "adapted": "adaptasi",
    "seneng": "senang",
    "weird": "aneh",
    "konsul": "konsultasi",
    "pa": "bapak",
    "maloee": "malu",
    "lagii": "lagi",
    "pakek": "pakai",
    "semangak": "semangat",
    "menurutgw": "nurut",
    "tbh": "jujur",
    "internship": "magang",
    "scholarship": "beasiswa",
    "ilmunyaaa": "ilmu",
    "djancokk": "jancuk",
    "serep": "serap",
    "enjoy": "nikmat",
    "plan": "rencana",
    "hopefully": "harap",
    "ginii": "begini",
    "brpa": "berapa",
    "tarsoktarsok": "nanti",
    "panikk": "panik",
    "bangeeet": "sekali",
    "ambyarrrrr": "hancur",
    "ampunddd": "ampun",
    "maen": "main",
    "wayahe": "waktu",
    "entok": "dapat",
    "kiraiin": "kira",
    "asw": "anjing",
    "monangesss": "nangis",
    "punyaaaaa": "punya",
    "adi": "adik",
    "pgen": "ingin",
    "music": "musik",
    "favorite": "suka",
    "sbgai": "sebagai",
    "gabs": "tidak bisa",
    "jdny": "jadi",
    "sajahh": "saja",
    "bapa": "bapak",
    "without": "tanpa",
    "bilding": "gedung",
    "stuffs": "barang",
    "normalize": "normal",
    "kmi": "kami",
    "afternoon": "sore",
    "gilak": "gila",
    "ndaa": "tidak",
    "mslah": "masalah",
    "drumah": "rumah",
    "tntang": "tentang",
    "ripyuu": "tinjau",
    "sinyal": "jaringan",
    "meneh": "lagi",
    "anaku": "anak",
    "blum": "belum",
}

In [7]:
def preprocess_text(text: str) -> str:
    """Preprocesses the given text."""

    text = re.sub(r"\d+", "", text)
    text = text.replace("\n", " ")
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.strip()
    text = re.sub(r"\s+", " ", text)

    text = text.lower()

    text = " ".join([slang_words.get(word, word) for word in text.split()])

    filtered_tokens = [word for word in word_tokenize(text) if word not in stop_words]

    return " ".join(filtered_tokens)

In [8]:
print("Original DataFrame:")
display(clean_df.head())

Original DataFrame:


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
1,f1fad44b-ecd2-4341-9334-8d3528be5dac,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Sangat bagus. Lebih fleksibel dan rinci. Sama ...,5,0,2.3.1,2025-04-05 13:58:11,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 14:14:41,2.3.1
2,613398d2-3cfd-4d36-9813-dbf6e2bd900c,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,semoga kedepannya ada fitur investasi tabungan...,5,0,2.3.1,2025-04-05 13:21:14,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 14:14:42,2.3.1
3,50713d74-e7ee-4e2e-bd31-e5370644c71e,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"bagus aplikasi nya, detail banget semua makasi...",5,0,2.3.1,2025-04-05 12:57:58,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 14:14:43,2.3.1
4,44499903-6883-4904-9653-888db331483f,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,mantap,5,0,2.3.1,2025-04-05 11:47:57,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 12:25:53,2.3.1
6,98b9ceb2-ec68-4804-b974-e23169134968,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,mantal,5,0,2.3.1,2025-04-05 11:12:42,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 11:35:36,2.3.1


In [9]:
clean_df["processed_text"] = clean_df["content"].apply(preprocess_text)

In [10]:
print("\nDataFrame after Preprocessing:")
display(clean_df.head())


DataFrame after Preprocessing:


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,processed_text
1,f1fad44b-ecd2-4341-9334-8d3528be5dac,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Sangat bagus. Lebih fleksibel dan rinci. Sama ...,5,0,2.3.1,2025-04-05 13:58:11,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 14:14:41,2.3.1,bagus fleksibel rinci payment suport contactle...
2,613398d2-3cfd-4d36-9813-dbf6e2bd900c,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,semoga kedepannya ada fitur investasi tabungan...,5,0,2.3.1,2025-04-05 13:21:14,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 14:14:42,2.3.1,semoga kedepannya fitur investasi tabungan ema...
3,50713d74-e7ee-4e2e-bd31-e5370644c71e,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"bagus aplikasi nya, detail banget semua makasi...",5,0,2.3.1,2025-04-05 12:57:58,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 14:14:43,2.3.1,bagus aplikasi detail terimakasih bca
4,44499903-6883-4904-9653-888db331483f,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,mantap,5,0,2.3.1,2025-04-05 11:47:57,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 12:25:53,2.3.1,mantap
6,98b9ceb2-ec68-4804-b974-e23169134968,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,mantal,5,0,2.3.1,2025-04-05 11:12:42,Terima kasih atas ulasannya. Semoga aplikasi m...,2025-04-05 11:35:36,2.3.1,mantal


# **Labelling**

In [11]:
def load_lexicon(url):
    """Loads lexicon data from a CSV file at the given URL."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        reader = csv.reader(StringIO(response.text), delimiter=",")
        return {row[0]: int(row[1]) for row in reader}
    except requests.exceptions.RequestException as e:
        print(f"Error fetching lexicon data: {e}")
        return {}


# Load positive and negative lexicons
LEXICON_POSITIVE = load_lexicon(
    "https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv"
)
LEXICON_NEGATIVE = load_lexicon(
    "https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv"
)

In [12]:
def sentiment_analysis(text):
    """Performs sentiment analysis on text using lexicons."""
    score = 0
    for token in word_tokenize(text):
        if token in LEXICON_POSITIVE:
            score += 1
        elif token in LEXICON_NEGATIVE:
            score -= 1

    if score >= 1:
        return "positive"
    elif score <= -1:
        return "negative"
    else:
        return "neutral"

In [13]:
clean_df["sentiment"] = clean_df["processed_text"].apply(sentiment_analysis)
print(clean_df["sentiment"].value_counts())

sentiment
positive    7909
negative    3723
neutral     3657
Name: count, dtype: int64


# **Preprocessing dan Tokenisasi Data**

In [14]:
# --- Ambil teks dan label dari DataFrame ---
X = clean_df["processed_text"].values
y = clean_df["sentiment"].values

# --- Tokenisasi teks ---
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# --- Padding: semua input jadi panjang 150 ---
max_length = 150
X_padded = pad_sequences(
    X_sequences, maxlen=max_length, padding="post", truncating="post"
)

# --- Label Encoding (String → Integer) ---
unique_labels = list(set(y))
label_encoder = tf.keras.layers.StringLookup(
    vocabulary=unique_labels, mask_token=None, num_oov_indices=0
)
y_encoded = label_encoder(y).numpy()

# --- Hitung ukuran vocab tokenizer ---
vocabulary_size = min(10000, len(tokenizer.word_index) + 1)


2025-04-06 16:13:05.209748: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


# **Deep Learning Model Training**

## Skema 1 - Skip-gram Pretrained + BiLSTM (70/30 split)

In [15]:
# --- Hyperparameter ---
embedding_dim = 300
window_size = 4

# --- Split 70/30 ---
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_encoded, test_size=0.3, random_state=42
)

# --- Skip-gram pairs ---
skip_grams = []
for text in X_train:
    for i, target in enumerate(text):
        if target == 0:
            continue
        for j in range(max(i - window_size, 0), min(i + window_size + 1, len(text))):
            context = text[j]
            if i != j and context != 0:
                skip_grams.append([target, context])

target_words = np.array([pair[0] for pair in skip_grams])
context_words = np.array([pair[1] for pair in skip_grams])

# --- Train Skip-gram embedding model ---
model_skipgram = Sequential(
    [
        Embedding(input_dim=vocabulary_size, output_dim=embedding_dim),
        Reshape((embedding_dim,)),
        Dense(vocabulary_size, activation="softmax"),
    ]
)
model_skipgram.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
model_skipgram.fit(target_words, context_words, epochs=5, batch_size=1024)

embedding_matrix = model_skipgram.layers[0].get_weights()[0]

# --- Final BiLSTM model with more Dense layers + trainable embedding ---
model1 = Sequential(
    [
        Embedding(
            input_dim=vocabulary_size,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            trainable=True,
        ),  # <== now trainable!
        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(LSTM(32)),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(32, activation="relu"),
        Dense(len(label_encoder.get_vocabulary()), activation="softmax"),
    ]
)

model1.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"]
)

# --- Early stopping ---
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# --- Train the model ---
model1.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop],
)

# --- Evaluate ---
train_acc1 = model1.evaluate(X_train, y_train, verbose=0)[1]
test_acc1 = model1.evaluate(X_test, y_test, verbose=0)[1]
print(
    f"\n✅ [Skema 1] BiLSTM + Skip-gram | Train Acc: {train_acc1:.4f}, Test Acc: {test_acc1:.4f}"
)

Epoch 1/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 184ms/step - loss: 8.0051
Epoch 2/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 207ms/step - loss: 6.5172
Epoch 3/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 196ms/step - loss: 6.3340
Epoch 4/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 218ms/step - loss: 6.1937
Epoch 5/5
[1m416/416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 190ms/step - loss: 6.0659
Epoch 1/20
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 246ms/step - accuracy: 0.5978 - loss: 0.8580 - val_accuracy: 0.7835 - val_loss: 0.5172
Epoch 2/20
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 247ms/step - accuracy: 0.8048 - loss: 0.4891 - val_accuracy: 0.8821 - val_loss: 0.3091
Epoch 3/20
[1m168/168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 232ms/step - accuracy: 0.8805 - loss: 0.3193 - val_accuracy: 0.9041 - val_los

## Skema 2 - BiLSTM + Embedding (80/20 split)

In [16]:
model2 = Sequential(
    [
        Embedding(input_dim=vocabulary_size, output_dim=128),
        Bidirectional(LSTM(64)),
        Dense(len(label_encoder.get_vocabulary()), activation="softmax"),
    ]
)
model2.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)
model2.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# --- Evaluasi ---
train_acc2 = model2.evaluate(X_train, y_train, verbose=0)[1]
test_acc2 = model2.evaluate(X_test, y_test, verbose=0)[1]
print(
    f"\n✅ [Skema 2] BiLSTM + Embedding | Train Acc: {train_acc2:.4f}, Test Acc: {test_acc2:.4f}"
)


Epoch 1/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 111ms/step - accuracy: 0.6437 - loss: 0.7694 - val_accuracy: 0.9004 - val_loss: 0.2817
Epoch 2/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 79ms/step - accuracy: 0.9310 - loss: 0.1962 - val_accuracy: 0.9250 - val_loss: 0.2178
Epoch 3/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 79ms/step - accuracy: 0.9647 - loss: 0.1055 - val_accuracy: 0.9220 - val_loss: 0.2381
Epoch 4/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 84ms/step - accuracy: 0.9809 - loss: 0.0565 - val_accuracy: 0.9141 - val_loss: 0.2658
Epoch 5/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 79ms/step - accuracy: 0.9899 - loss: 0.0389 - val_accuracy: 0.9200 - val_loss: 0.2945
Epoch 6/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 81ms/step - accuracy: 0.9902 - loss: 0.0346 - val_accuracy: 0.9150 - val_loss: 0.3188
Epoch 7/10
[1m

##  Skema 3: Dense + TF-IDF (70/30 split)

In [17]:
# Split raw text (bukan yang padded)
X_raw_train, X_raw_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_raw_train).toarray()
X_test_tfidf = vectorizer.transform(X_raw_test).toarray()

model3 = Sequential(
    [
        Dense(128, activation="relu"),
        Dense(64, activation="relu"),
        Dense(len(label_encoder.get_vocabulary()), activation="softmax"),
    ]
)
model3.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)
model3.fit(
    X_train_tfidf,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_tfidf, y_test),
)

# --- Evaluasi ---
train_acc3 = model3.evaluate(X_train_tfidf, y_train, verbose=0)[1]
test_acc3 = model3.evaluate(X_test_tfidf, y_test, verbose=0)[1]
print(
    f"\n✅ [Skema 3] Dense + TF-IDF | Train Acc: {train_acc3:.4f}, Test Acc: {test_acc3:.4f}"
)


Epoch 1/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.6572 - loss: 0.7511 - val_accuracy: 0.9019 - val_loss: 0.2853
Epoch 2/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9493 - loss: 0.1655 - val_accuracy: 0.9137 - val_loss: 0.2614
Epoch 3/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9828 - loss: 0.0683 - val_accuracy: 0.9182 - val_loss: 0.2692
Epoch 4/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.9939 - loss: 0.0305 - val_accuracy: 0.9215 - val_loss: 0.2889
Epoch 5/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.9966 - loss: 0.0226 - val_accuracy: 0.9187 - val_loss: 0.3070
Epoch 6/10
[1m335/335[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9956 - loss: 0.0210 - val_accuracy: 0.9196 - val_loss: 0.3278
Epoch 7/10
[1m335/335[0m 

# **Inference**

In [18]:
# --- Fungsi Inference ---
def predict_sentiment(text, model, tokenizer, label_encoder, max_length=150):
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(
        sequence, maxlen=max_length, padding="post", truncating="post"
    )
    pred = model.predict(padded)
    pred_label_idx = np.argmax(pred, axis=1)[0]
    label_vocab = label_encoder.get_vocabulary()
    return label_vocab[pred_label_idx]


# --- Penggunaan ---
banking_texts = [
    "CS sangat ramah dan membantu menjelaskan fitur kartu kredit.",
    "Aplikasi jelek, lambat banget.",
    "Biasa saja, bintang 3 dulu",
]

for txt in banking_texts:
    hasil = predict_sentiment(txt, model1, tokenizer, label_encoder)
    print(f"Teks: '{txt}' → Prediksi Sentimen: {hasil}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 765ms/step
Teks: 'CS sangat ramah dan membantu menjelaskan fitur kartu kredit.' → Prediksi Sentimen: positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
Teks: 'Aplikasi jelek, lambat banget.' → Prediksi Sentimen: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Teks: 'Biasa saja, bintang 3 dulu' → Prediksi Sentimen: neutral
