In [11]:
import pandas as pd
import numpy as np
import re
import string
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import spacy
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def hapus_kolom_tidak_digunakan(file_input, file_output, kolom_yang_digunakan):
   
    df = pd.read_csv(file_input, delimiter=';')
    print("Nama Kolom dalam DataFrame:", df.columns)

    try:
        df = df[kolom_yang_digunakan]
        df.to_csv(file_output, index=False)
        print(f"File {file_output} telah berhasil dibuat dengan kolom yang diinginkan.")
    except KeyError as e:
        print(f"Error: {e}. Pastikan nama kolom yang digunakan sesuai dengan nama kolom dalam file CSV.")

def case_folding(text):
    return text.lower()

def remove_tweet_special(text):
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    text = text.encode('ascii', 'replace').decode('ascii')
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    return text.replace("http://", " ").replace("https://", " ")

def remove_number(text):
    return re.sub(r"\d+", "", text)

def remove_punctuation(text):
    return text.translate(str.maketrans("", "", string.punctuation))

def remove_whitespace_LT(text):
    return text.strip()

def remove_whitespace_multiple(text):
    return re.sub('\s+', ' ', text)

def remove_single_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

def word_tokenize_wrapper(text):
    return word_tokenize(text)

def freqDist_wrapper(text):
    return FreqDist(text)

def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

def normalized_term_id(document):
    doc = nlp_id(document)
    return [token.lemma_ if token.lemma_ != '-PRON-' else token.text for token in doc]

def stemmed_wrapper(term):
    return stemmer.stem(term)

def get_stemmed_term(document):
    return [term_dict.get(term, term) for term in document]

if __name__ == "__main__":
    file_input = 'dataset.csv'
    file_output = 'datasetfix.csv'
    kolom_yang_digunakan = ['Sentiment', 'Text Tweet']

    hapus_kolom_tidak_digunakan(file_input, file_output, kolom_yang_digunakan)

    TWEET_DATA = pd.read_csv(file_output, encoding="ISO-8859-1")

    if 'Text Tweet' not in TWEET_DATA.columns:
        raise KeyError("'Text Tweet' column not found in the input file. Please check the column name.")

    TWEET_DATA['Text Tweet'] = TWEET_DATA['Text Tweet'].apply(case_folding)
    TWEET_DATA['Text Tweet'] = TWEET_DATA['Text Tweet'].apply(remove_tweet_special)
    TWEET_DATA['Text Tweet'] = TWEET_DATA['Text Tweet'].apply(remove_number)
    TWEET_DATA['Text Tweet'] = TWEET_DATA['Text Tweet'].apply(remove_punctuation)
    TWEET_DATA['Text Tweet'] = TWEET_DATA['Text Tweet'].apply(remove_whitespace_LT)
    TWEET_DATA['Text Tweet'] = TWEET_DATA['Text Tweet'].apply(remove_whitespace_multiple)
    TWEET_DATA['Text Tweet'] = TWEET_DATA['Text Tweet'].apply(remove_single_char)
    TWEET_DATA['tweet_tokens'] = TWEET_DATA['Text Tweet'].apply(word_tokenize_wrapper)
    TWEET_DATA['tweet_tokens_fdist'] = TWEET_DATA['tweet_tokens'].apply(freqDist_wrapper)

    list_stopwords = stopwords.words('indonesian')
    list_stopwords.extend([
        "yg", "dg", "rt", "dgn", "ny", "d", 'klo', 'kalo', 'amp', 'biar', 'bikin', 'bilang', 
        'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
        'jd', 'jgn', 'sdh', 'aja', 'n', 't', 'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt', '&amp', 'yah'
    ])
    txt_stopword = pd.read_csv("stopword.txt", names=["stopwords"], header=None)
    list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
    list_stopwords = set(list_stopwords)

    TWEET_DATA['tweet_tokens_WSW'] = TWEET_DATA['tweet_tokens'].apply(stopwords_removal)

    nlp_id = spacy.load("xx_ent_wiki_sm")
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    term_dict = {}

    for document in TWEET_DATA['tweet_tokens_WSW']:
        normalized_document_id = normalized_term_id(' '.join(document))
        normalized_document_stemmed = [stemmed_wrapper(term) for term in normalized_document_id]
        for term in normalized_document_stemmed:
            if term not in term_dict:
                term_dict[term] = ' '

    for term in term_dict:
        term_dict[term] = stemmed_wrapper(term)

    TWEET_DATA['tweet_tokens_stemmed'] = TWEET_DATA['tweet_tokens_WSW'].apply(get_stemmed_term)
    TWEET_DATA.to_csv("Text_Preprocessing2.csv")

    text_preprocessing_data = pd.read_csv('Text_Preprocessing2.csv', index_col=0)
    selected_columns = text_preprocessing_data[['Sentiment', 'tweet_tokens_stemmed']]
    selected_columns.to_csv('final_dataset.csv', index=False)

    print(selected_columns)


Nama Kolom dalam DataFrame: Index(['Id', 'Sentiment', 'Acara TV', 'Jumlah Retweet', 'Text Tweet'], dtype='object')
File datasetfix.csv telah berhasil dibuat dengan kolom yang diinginkan.
    Sentiment                               tweet_tokens_stemmed
0    positive  ['undang', 'shanijkt', 'hitamputih', 'pemenang...
1    positive  ['selamat', 'berbuka', 'puasa', 'semoga', 'ama...
2    positive  ['trans', 'hitam', 'putih', 'penghargaan', 'no...
3    positive                          ['selamat', 'hitamputih']
4    positive   ['asiknya', 'nonton', 'hitam', 'putih', 'trans']
..        ...                                                ...
395  negative  ['banget', 'kesel', 'debat', 'pake', 'emosi', ...
396  negative  ['miskin', 'miskin', 'sekolah', 'pungutan', 'l...
397  negative  ['emosi', 'cepat', 'tua', 'nonton', 'emosi', '...
398  negative  ['penampilan', 'kyk', 'preman', 'taunya', 'bki...
399  negative        ['berbelitbelit', 'muter', 'buang', 'mutu']

[400 rows x 2 columns]
