In [None]:
# Install the Sastrawi package, which provides the stemmer imported further down.
# NOTE: the leading "!" is an IPython/Jupyter shell escape; it is not valid in a plain .py script.
!pip install Sastrawi
import nltk
# Download the NLTK 'stopwords' corpus (read later through stopwords.words).
nltk.download('stopwords')
# Download the 'punkt' tokenizer data (presumably required by word_tokenize; confirm against the NLTK version in use).
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Load the raw dataset
file_path = 'IKD_Dataset.csv'
data = pd.read_csv(file_path)

# Build the Sastrawi stemmer through its factory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Indonesian stopword list from NLTK, kept as a set
stop_words = set(stopwords.words('indonesian'))

# Load the normalization table and map each 'slang' entry to its 'formal' entry
normalization_df = pd.read_csv('Kamus_Alay.csv')
normalization_dict = dict(
    zip(normalization_df['slang'], normalization_df['formal'])
)

# Case folding step
def case_folding(text):
    """Return *text* converted to lowercase."""
    lowered = text.lower()
    return lowered

# Emoji removal step.
# The pattern is compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (includes skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002702-\U000027B0"  # dingbats
    # Kept from the original for backward compatibility.
    # NOTE(review): this span is very broad -- besides symbol blocks and the
    # variation selectors it also contains CJK, Hangul and other non-Latin
    # scripts, so text in those scripts is stripped as well.
    "\U000024C2-\U0001F251"
    # Blocks the original pattern missed: alchemical/geometric-extended/
    # supplemental arrows, Supplemental Symbols and Pictographs
    # (U+1F900-U+1F9FF), chess symbols and Symbols and Pictographs Extended-A.
    "\U0001F700-\U0001FAFF"
    "\u200d"  # zero-width joiner left behind by multi-part emoji sequences
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return *text* with emoji and related pictographic symbols removed.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted; all
    other characters, including surrounding whitespace, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', text)

# Data cleaning step
def clean_text(text):
    """Strip emoji, digits and ASCII punctuation from *text* and tidy whitespace.

    The steps run in this order: emoji removal, digit removal, deletion of
    every character in ``string.punctuation``, trimming of leading/trailing
    whitespace, and collapsing of each whitespace run into a single space.
    """
    without_emoji = remove_emoji(text)
    without_digits = re.sub(r'\d+', '', without_emoji)
    # Deleting (not replacing) punctuation: the third maketrans argument lists
    # the characters to drop.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_digits.translate(punctuation_table)
    trimmed = without_punctuation.strip()
    return re.sub(r'\s+', ' ', trimmed)

# Tokenization step
def tokenize(text):
    """Return the tokens that NLTK's ``word_tokenize`` produces for *text*."""
    tokens = word_tokenize(text)
    return tokens

# Stopword removal step
def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    The original token order is preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Normalization step
def normalize(tokens):
    """Replace each token with its entry in ``normalization_dict``, if any.

    Tokens without an entry are passed through unchanged.
    """
    normalized = []
    for token in tokens:
        normalized.append(normalization_dict.get(token, token))
    return normalized

# Stemming step
def stem(tokens):
    """Return a list with ``stemmer.stem`` applied to every token, in order."""
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

# Full preprocessing pipeline
def preprocess(text):
    """Run the complete preprocessing pipeline on a single piece of text.

    Steps, in order: case folding, cleaning, tokenization, normalization and
    stemming. Stopword removal is intentionally not applied; the original
    author disabled it to improve model accuracy.

    Args:
        text: The raw text. Missing values (``None``/NaN, as produced for
            empty CSV cells) yield an empty string instead of raising; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # Empty cells would otherwise crash on ``text.lower()``.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = case_folding(text)
    text = clean_text(text)
    tokens = tokenize(text)
    # NOTE: remove_stopwords(tokens) is deliberately skipped here (see docstring).
    tokens = normalize(tokens)
    tokens = stem(tokens)
    return ' '.join(tokens)




In [None]:
# Run the preprocessing pipeline over the 'content' column
data['processed_content'] = data['content'].apply(preprocess)

# Write the processed text to a new file, exposing it as the 'review_text' column
output_file_path = 'IKD_Dataset_Processed.csv'
(
    data[['processed_content']]
    .rename(columns={'processed_content': 'review_text'})
    .to_csv(output_file_path, index=False)
)