In [1]:
import pandas as pd

data = pd.read_csv('Dataset/Twitter_Emotion_Dataset.csv')
data

Unnamed: 0,label,tweet
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu..."
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi..."
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng..."
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata..."
...,...,...
4396,love,"Tahukah kamu, bahwa saat itu papa memejamkan m..."
4397,fear,Sulitnya menetapkan Calon Wapresnya Jokowi di ...
4398,anger,"5. masa depannya nggak jelas. lha iya, gimana ..."
4399,happy,[USERNAME] dulu beneran ada mahasiswa Teknik U...


In [2]:
#Case Folding
data['tweet_case_folding'] = data['tweet'].str.lower()

In [3]:
#Cleansing
import string
import re #regex Library

def remove_tweet_special(text):
    #Remove kata istilah dari dataset ([username], [url], [sensitive-no]) 
    text = text.replace('[username]'," ").replace('[url]'," ").replace('[sensitive-no]'," ")

    #Remove tab, new Line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    
    #Remove non ASCII (emoticon, chinese word, etc) 
    text = text.encode('ascii', 'replace').decode('ascii')

    #Remove mention, Link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())

    #Remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
data['tweet_cleansing'] = data['tweet_case_folding'].apply(remove_tweet_special)

#Remove number
def remove_number (text):
    return re.sub(r"\d+", " ", text)
data['tweet_cleansing'] = data['tweet_cleansing'].apply(remove_number)

#Remove punctuation
def remove_punctuation(text): 
    return text.translate(str.maketrans(" "," ",string.punctuation))
data['tweet_cleansing'] = data['tweet_cleansing'].apply(remove_punctuation)

#Remove whitespace Loading & trailing 
def remove_whitespace_LT(text):
    return text.strip()
data['tweet_cleansing'] = data['tweet_cleansing'].apply(remove_whitespace_LT)

#Remove multiple whitespace into single whitespace 
def remove_whitespace_multiple (text):
    return re.sub('\s+',' ',text)
data['tweet_cleansing'] = data['tweet_cleansing'].apply(remove_whitespace_multiple)

#Remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", " ", text)
data['tweet_cleansing'] = data['tweet_cleansing'].apply(remove_singl_char)

In [4]:
#Tokenization
from nltk.tokenize import word_tokenize

data['tweet_tokenized'] = data['tweet_cleansing'].apply(word_tokenize)

In [5]:
#Normalization
normalizad_word = pd.read_excel("Dataset/kamus_singkatan2.xlsx")

#Tranform Dataframe to Dictionary

normalizad_word_dict = {}
for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict: 
        normalizad_word_dict[row[0]] = row[1]

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

data['tweet_normalized'] = data['tweet_tokenized'].apply(normalized_term)

In [6]:
#Stemming
#!pip install PySastrawi
#!pip install swifter

#import sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory 
import swifter

#Create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

#stemming
def stemmed_wrapper(term): 
    return stemmer.stem(term)

term_dict = {}
for document in data['tweet_normalized']:
    for term in document: 
        if term not in term_dict:
            term_dict[term] = ' '

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term) 
    #print(term, ":", term_dict[term])  

# apply stemmed term to dataframe 
def get_stemmed_term(document): 
    return [term_dict[term] for term in document]

data['tweet_stemming'] = data['tweet_normalized'].swifter.apply(get_stemmed_term)

Pandas Apply:   0%|          | 0/4401 [00:00<?, ?it/s]

In [7]:
#Stopwords
#import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords

#Stopword indonesia
list_stopwords = stopwords.words('indonesian')

#manualy add stopword ---

# append additional stopword 
list_stopwords.extend(['yg', 'dg', 'dgn', 'rt', 'ny', 'd', 'klo', 'kalo', 'amp', 'biar', 'bikin', 
                       'bilang', 'birthday', 'doang', 'dong', 'cari', 'itu', 'krn', 'karna', 'nya', 
                       'nih', 'sih', 'udh', 'sampai', 'kenapa', 'duluan', 'ada', 'abis', 'ugh', 
                       'pengen', 'si', 'tau', 'tuh', 'utk', 'ya', 'trs', 'sm', 'padahal', 'lagi', 
                       'dpt', 'dapat', 'dapet', 'ken', 'mlu', 'jd', 'sdh', 'aja', 'n', 't', 'pas', 
                       'yang', 'apa', 'banyak', 'buat', 'pls', 'mulu', 'cari', 'nyg', 'hehe', 
                       'pen', 'u', 'pap', 'loh', 'emg', 'buat', 'sdg', 'pada', 'pda', 'allah', 
                       'ydh', 'yaudah','&amp', 'banget', 'yah', 'lha', 'lho'])

#read stopword from txt file
#txt_stopword = pd.read_csv("Dataset/stopwords.txt", names = ["stopwords"], header = None)

# convert stopword string to list & append additional stopword 
#list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# convert list to dictionary 
list_stopwords = set(list_stopwords)

#remove stopword pada list token

def stopwords_removal(words): 
    return [word for word in words if word not in list_stopwords]

data['tweet_stopwords'] = data['tweet_stemming'].apply(stopwords_removal) 


In [8]:
import numpy as np

def fit_token(word):
    word = np.array(word)
    word = ' '.join(word)
    return word
data['clean_data'] = data['tweet_stopwords'].apply(fit_token)

In [9]:
import pandas as pd

dataset = {'tweet': data['tweet'], 
           'case_folding': data['tweet_case_folding'], 
           'cleansing': data['tweet_cleansing'], 
           'tokenization': data['tweet_tokenized'], 
           'normalization': data['tweet_normalized'], 
           'stemming': data['tweet_stemming'], 
           'stopwords': data['tweet_stopwords']}
dataset2 = {'label': data['label'],
            'tweet': data['tweet'],
           'clean_data': data['clean_data']}
df = pd.DataFrame(dataset)
df2 = pd.DataFrame(dataset2)
df.head()
df2.head()

Unnamed: 0,label,tweet,clean_data
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu...",jalan jatibarupolisi gertak gubernur emangny p...
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi...",cewek kayak rasain sibuk jaga rasain sakit hai...
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...,kepingin gudeg mbarek bu hj amad foto google s...
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng...",jalan jatibarubagian wilayah tn abangpengatura...
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata...",sharing alam kemarin jam batalin tiket stasiun...


In [10]:
df.to_excel('Dataset/Hasil Preprocessing Data.xlsx', index=False)
df2.to_excel('Dataset/Clean Dataset.xlsx', index=False)