In [2]:
import pandas as pd 
import numpy as np

TWEET_DATA = pd.read_csv("crawling-data/dataset_sementara.csv")

TWEET_DATA.head()

Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,image_url,in_reply_to_screen_name,lang,location,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username
0,1814196917410357309,Fri Jul 19 07:26:50 +0000 2024,0,@tanyarlfes org cabul mah cabul ga liat ras ajg,1814200206340358196,,tanyarlfes,in,,0,0,0,https://x.com/calypsoore/status/18142002063403...,1540746774364557317,calypsoore
1,1814197629259264509,Fri Jul 19 07:26:50 +0000 2024,0,@fachrianantyo 200k katanya,1814200206105428467,,fachrianantyo,in,"Tangerang, Indonesia",0,0,0,https://x.com/malvinico/status/181420020610542...,90125063,malvinico
2,1814200205937418468,Fri Jul 19 07:26:49 +0000 2024,0,PERP H4RRY,1814200205937418468,,,in,,0,0,0,https://x.com/svtn28/status/1814200205937418468,1411459219954540545,svtn28
3,1813795378116063597,Fri Jul 19 07:26:49 +0000 2024,0,@cocoro_rods @jawafess Bosku orang situ asli d...,1814200205702832341,,cocoro_rods,in,,0,0,0,https://x.com/hellow__me/status/18142002057028...,1307649603303473152,hellow__me
4,1814087866806624355,Fri Jul 19 07:26:49 +0000 2024,0,@masgah_ Pernah pas sd,1814200205614743990,,masgah_,in,,0,0,0,https://x.com/tekananturgor/status/18142002056...,1272476969079201794,tekananturgor


#### Case Folding

In [3]:
# ------ Case Folding --------
# gunakan fungsi Series.str.lower() pada Pandas
TWEET_DATA['full_text'] = TWEET_DATA['full_text'].str.lower()


print('Case Folding Result : \n')
print(TWEET_DATA['full_text'].head(5))

Case Folding Result : 

0      @tanyarlfes org cabul mah cabul ga liat ras ajg
1                          @fachrianantyo 200k katanya
2                                           perp h4rry
3    @cocoro_rods @jawafess bosku orang situ asli d...
4                               @masgah_ pernah pas sd
Name: full_text, dtype: object


#### Tokenizing

In [4]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
TWEET_DATA['full_text'] = TWEET_DATA['full_text'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

TWEET_DATA['full_text'] = TWEET_DATA['full_text'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

TWEET_DATA['full_text'] = TWEET_DATA['full_text'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

TWEET_DATA['full_text'] = TWEET_DATA['full_text'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

TWEET_DATA['full_text'] = TWEET_DATA['full_text'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

TWEET_DATA['full_text'] = TWEET_DATA['full_text'].apply(remove_singl_char)

# NLTK word rokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

TWEET_DATA['tweet_tokens'] = TWEET_DATA['full_text'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(TWEET_DATA['tweet_tokens'].head())

Tokenizing Result : 

0         [org, cabul, mah, cabul, ga, liat, ras, ajg]
1                                            [katanya]
2                                         [perp, hrry]
3    [rods, bosku, orang, situ, asli, dan, bukan, p...
4                                    [pernah, pas, sd]
Name: tweet_tokens, dtype: object


In [5]:
# NLTK calc frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

TWEET_DATA['tweet_tokens_fdist'] = TWEET_DATA['tweet_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n') 
print(TWEET_DATA['tweet_tokens_fdist'].head().apply(lambda x : x.most_common()))

Frequency Tokens : 

0    [(cabul, 2), (org, 1), (mah, 1), (ga, 1), (lia...
1                                       [(katanya, 1)]
2                               [(perp, 1), (hrry, 1)]
3    [(rods, 1), (bosku, 1), (orang, 1), (situ, 1),...
4                     [(pernah, 1), (pas, 1), (sd, 1)]
Name: tweet_tokens_fdist, dtype: object


#### Stopwords removal

In [6]:
from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manualy add stopword  ------------------------------------
# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# ----------------------- add stopword from txt file ------------------------------------
# read txt stopword using pandas
# txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# # convert stopword string to list & append additional stopword
# list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert list to dictionary
list_stopwords = set(list_stopwords)


#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

TWEET_DATA['tweet_tokens_WSW'] = TWEET_DATA['tweet_tokens'].apply(stopwords_removal) 


print(TWEET_DATA['tweet_tokens_WSW'].head())

0             [org, cabul, mah, cabul, liat, ras, ajg]
1                                                   []
2                                         [perp, hrry]
3    [rods, bosku, orang, situ, asli, pendatang, be...
4                                            [pas, sd]
Name: tweet_tokens_WSW, dtype: object


#### Normalization

In [7]:
normalizad_word = pd.read_csv("normalisasi.csv")

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

TWEET_DATA['tweet_normalized'] = TWEET_DATA['tweet_tokens_WSW'].apply(normalized_term)

TWEET_DATA['tweet_normalized'].head(10)

0           [orang, cabul, mah, cabul, liat, ras, ajg]
1                                                   []
2                                         [perp, hrry]
3    [rods, bosku, orang, situ, asli, pendatang, be...
4                                            [pas, sd]
5             [waspada, paham, radikalisme, terorisme]
6                            [nontonnya, dimana, link]
7    [info, akun, like, percakapan, bokep, porno, l...
8                     [kirain, pisang, dibelah, laper]
9                            [ditunggutunggu, coyyyyy]
Name: tweet_normalized, dtype: object

#### Stemmer

In [None]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in TWEET_DATA['tweet_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

TWEET_DATA['tweet_tokens_stemmed'] = TWEET_DATA['tweet_normalized'].swifter.apply(get_stemmed_term)
print(TWEET_DATA['tweet_tokens_stemmed'])

893
------------------------
orang : orang
cabul : cabul
mah : mah
liat : liat
ras : ras
ajg : ajg
perp : perp
hrry : hrry
rods : rods
bosku : bos
situ : situ
asli : asli
pendatang : datang
beraniberani : beraniberani
pas : pas
sd : sd
waspada : waspada
paham : paham
radikalisme : radikalisme
terorisme : terorisme
nontonnya : nontonnya
dimana : mana
link : link
info : info
akun : akun
like : like
percakapan : cakap
bokep : bokep
porno : porno
lokal : lokal
full : full
durasi : durasi
kirain : kirain
pisang : pisang
dibelah : belah
laper : laper
ditunggutunggu : ditunggutunggu
coyyyyy : coyyyyy
hyemin : hyemin
alhamdulillah : alhamdulillah
uktnya : uktnya
tingkat : tingkat
tetapi : tetapi
juta : juta
hosqyvhuabvfpmptzfewsqonmxpmhhuiqcti : hosqyvhuabvfpmptzfewsqonmxpmhhuiqcti
anakku : anak
nangis : nang
pulang : pulang
betah : betah
banget : banget
sekolah : sekolah
agakakao : agakakao
pacet : pacet
pls : pls
chudai : chudai
pporno : pporno
christmas : christmas
na : na
eid : eid
zitakuw

In [None]:
TWEET_DATA.to_csv("Text_Preprocessing.csv")