In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def hapus_kolom_tidak_digunakan(file_input, file_output, kolom_yang_digunakan):
    """Write a copy of a semicolon-delimited CSV that keeps only the chosen columns.

    Args:
        file_input: Path of the CSV to read; fields are separated by ';'.
        file_output: Path of the CSV to write. It is written with pandas'
            default settings (comma separator) and without the index column.
        kolom_yang_digunakan: Column label(s) to keep, in the order they
            should appear in the output.

    Returns:
        None. The only effect is the file written to ``file_output``.
    """
    source_frame = pd.read_csv(file_input, sep=';')
    kept_columns = source_frame[kolom_yang_digunakan]
    kept_columns.to_csv(file_output, index=False)

def preprocess_text(text):
    """Normalize one raw tweet into lowercase, ASCII-only, letters-and-spaces text.

    Steps, in order:
      1. Lowercase.
      2. Replace literal (escaped) backslash-t / backslash-n / backslash-u
         sequences with a space and drop any remaining backslashes.
      3. Force ASCII; every non-ASCII character becomes '?', which is then
         removed together with the rest of the punctuation in step 6.
      4. Remove @mentions, #hashtags and URLs.
      5. Remove digits.
      6. Remove ASCII punctuation.
      7. Remove stand-alone single letters.
      8. Collapse runs of whitespace and trim both ends.

    Args:
        text: The raw tweet. ``None`` and float NaN (what pandas uses for an
            empty cell) are treated as an empty tweet; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned string, with words separated by single spaces and no
        leading or trailing whitespace. May be empty.
    """
    # `text != text` is true only for NaN; avoids crashing on missing cells.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    text = text.encode('ascii', 'replace').decode('ascii')
    # Raw strings keep the regex escapes (\w, \S, ...) out of Python's own
    # string-escape handling.
    text = re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Whitespace is normalized last: deleting single letters above leaves
    # gaps behind, so collapsing earlier would let double/edge spaces survive.
    return re.sub(r"\s+", " ", text).strip()

def stopwords_removal(words, stopword_list=None):
    """Return the tokens of ``words`` that are not stopwords, in original order.

    Args:
        words: Iterable of token strings.
        stopword_list: Optional collection of stopwords to filter against
            (a set gives the fastest lookups). When omitted, the module-level
            ``list_stopwords`` is used, so existing one-argument calls such as
            ``Series.apply(stopwords_removal)`` keep working.

    Returns:
        A new list containing only the tokens not found in the stopword
        collection.

    Raises:
        NameError: If ``stopword_list`` is omitted and the global
            ``list_stopwords`` has not been defined yet.
    """
    if stopword_list is None:
        # Fall back to the global built by the driver code.
        stopword_list = list_stopwords
    return [word for word in words if word not in stopword_list]

if __name__ == "__main__":
    # --- 1. Reduce the raw dataset to the label and tweet-text columns -----
    file_input = 'dataset.csv'
    file_output = 'datasetfix.csv'
    kolom_yang_digunakan = ['Sentiment', 'Text Tweet']

    hapus_kolom_tidak_digunakan(file_input, file_output, kolom_yang_digunakan)

    # --- 2. Clean and tokenize each tweet -----------------------------------
    # The reduced file was just written by DataFrame.to_csv, whose default
    # encoding is UTF-8, so it is read back with the same encoding.
    TWEET_DATA = pd.read_csv(file_output, encoding="utf-8")
    # Empty cells are read as NaN (a float); replace them with "" so the
    # string-based cleaning below does not fail on them.
    TWEET_DATA['Text Tweet'] = TWEET_DATA['Text Tweet'].fillna("").apply(preprocess_text)
    TWEET_DATA['tweet_tokens'] = TWEET_DATA['Text Tweet'].apply(word_tokenize)

    # --- 3. Build the stopword set -------------------------------------------
    # NLTK's Indonesian list + hand-picked slang/abbreviations + stopword.txt.
    list_stopwords = stopwords.words('indonesian')
    list_stopwords.extend([
        "yg", "dg", "rt", "dgn", "ny", "d", 'klo', 'kalo', 'amp', 'biar', 'bikin', 'bilang', 
        'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
        'jd', 'jgn', 'sdh', 'aja', 'n', 't', 'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt', '&amp', 'yah'
    ])
    txt_stopword = pd.read_csv("stopword.txt", names=["stopwords"], header=None)
    # Use every row of the file, not only the first one, and split on any
    # whitespace so both "one long line" and "one word per line" layouts work.
    for stopword_line in txt_stopword["stopwords"].dropna().astype(str):
        list_stopwords.extend(stopword_line.split())
    # A set removes duplicates and makes the membership test in
    # stopwords_removal constant-time.
    list_stopwords = set(list_stopwords)

    TWEET_DATA['tweet_tokens_WSW'] = TWEET_DATA['tweet_tokens'].apply(stopwords_removal)

    # --- 4. Stem the remaining tokens ----------------------------------------
    # NOTE(review): nlp_id is not used anywhere in this cell. It is kept
    # because other notebook cells may read it; remove it if nothing does.
    nlp_id = spacy.load("xx_ent_wiki_sm")
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    # Stem each token and join the result back into one space-separated string.
    TWEET_DATA['tweet_tokens_stemmed'] = TWEET_DATA['tweet_tokens_WSW'].apply(
        lambda tokens: ' '.join(stemmer.stem(word) for word in tokens)
    )

    # --- 5. Save label + stemmed text ----------------------------------------
    selected_columns = TWEET_DATA[['Sentiment', 'tweet_tokens_stemmed']]
    selected_columns.to_csv('final_dataset.csv', index=False)


In [2]:
# Show the two-column result (sentiment label and stemmed tweet text) that the
# previous cell built and saved to final_dataset.csv.
print(selected_columns)

    Sentiment                               tweet_tokens_stemmed
0    positive  undang shanijkt hitamputih menang ssk jkt mjkt...
1    positive   selamat buka puasa moga amal ibadah terima allah
2    positive                   trans hitam putih harga norwegia
3    positive                                 selamat hitamputih
4    positive                   asiknya nonton hitam putih trans
..        ...                                                ...
395  negative                  banget kesel debat pake emosi gin
396  negative                  miskin miskin sekolah pungut liar
397  negative                emosi cepat tua nonton emosi bicara
398  negative             tampil kyk preman tau bkin kisruh usak
399  negative                     berbelitbelit muter buang mutu

[400 rows x 2 columns]
