# ------ Import Library --------

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
# Load the crawl result and preview its first five rows.
df = pd.read_csv('hasil_crawling.csv')
df.head()

Unnamed: 0,date,username,tweet
0,2022-10-24 14:37:10+00:00,ada_pohan,@SaropudinAde @Dennysiregar7 Lha yg maruk BLT ...
1,2022-10-24 14:33:36+00:00,PRDepok,Cara Daftar BLT BBM 2022 Online Lewat HP di Ap...
2,2022-10-24 14:33:36+00:00,pikiran_rakyat,Cara Daftar BLT BBM 2022 Online Lewat HP di Ap...
3,2022-10-24 14:28:44+00:00,antony_xenruang,@NegeriKolam Kembali terbukti para kadrun itu ...
4,2022-10-24 14:24:45+00:00,suaramerdeka,Siap-siap BLT BBM Bakal Cair Lagi di November ...


## ------ Case Folding --------

In [3]:
# Case folding: lowercase every tweet, then preview the first five.
df['tweet'] = df['tweet'].str.lower()
df['tweet'].head()

0    @saropudinade @dennysiregar7 lha yg maruk blt ...
1    cara daftar blt bbm 2022 online lewat hp di ap...
2    cara daftar blt bbm 2022 online lewat hp di ap...
3    @negerikolam kembali terbukti para kadrun itu ...
4    siap-siap blt bbm bakal cair lagi di november ...
Name: tweet, dtype: object

## ----- Clean Text -------

In [4]:
# --------- Text cleaning (runs before tokenizing) --------------
def remove_tweet_special_char(text):
    """Strip Twitter-specific noise from a single tweet.

    Steps, in order:
      1. Replace the literal two-character escapes backslash-t, backslash-n
         and backslash-u with a space, then drop any remaining backslash.
      2. Replace every non-ASCII character with '?' (ASCII round-trip with
         the 'replace' error handler).
      3. Remove @mentions, #hashtags and full URLs, then collapse whitespace.
      4. Remove a dangling "http://" or "https://" left by a truncated link.

    Args:
        text: The tweet as a ``str``.

    Returns:
        The cleaned tweet as a ``str``.
    """
    # remove escaped tab, new line, and unicode markers (literal backslash sequences)
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # remove non ASCII (emoticon, chinese word, etc.)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag.
    # \w+ (instead of [A-Za-z0-9]+) also consumes underscores, so a handle such
    # as "@ada_pohan" is removed completely rather than leaving "_pohan" behind.
    # The pattern is a raw string so the regex escapes are not parsed as
    # (invalid) Python string escapes.
    text = ' '.join(re.sub(r"([@#]\w+)|(\w+://\S+)", " ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
# Run the mention/link/hashtag cleanup on every tweet, overwriting the column.
df['tweet'] = df['tweet'].apply(remove_tweet_special_char)

# strip digits
def remove_number(text):
    """Delete every run of digits from ``text``; all other characters are kept."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Strip digits from every tweet, overwriting the column.
df['tweet'] = df['tweet'].apply(remove_number)

# strip punctuation
def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are removed, not replaced by a space, so "siap-siap" becomes
    "siapsiap".
    """
    # Mapping an ordinal to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Strip punctuation from every tweet, overwriting the column.
df['tweet'] = df['tweet'].apply(remove_punctuation)

# trim leading & trailing whitespace
def remove_whitespace_LT(text):
    """Return ``text`` without whitespace at its left and right ends."""
    return text.lstrip().rstrip()

# Trim leading/trailing whitespace on every tweet, overwriting the column.
df['tweet'] = df['tweet'].apply(remove_whitespace_LT)

# collapse multiple whitespace into a single space
def remove_whitespace_multiple(text):
    """Replace each run of whitespace (spaces, tabs, newlines) with one space.

    Args:
        text: The input ``str``.

    Returns:
        ``text`` with every whitespace run collapsed to a single ' '.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace in every tweet, overwriting the column.
df['tweet'] = df['tweet'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Remove stand-alone single ASCII letters and tidy the gaps they leave.

    Deleting a one-letter word from "blt d bbm" would otherwise leave two
    adjacent spaces (or a leading/trailing space), undoing the whitespace
    normalisation that runs just before this step. The result is therefore
    re-collapsed and trimmed.

    Args:
        text: The input ``str``.

    Returns:
        ``text`` without one-letter words, single-spaced and trimmed.
    """
    without_singles = re.sub(r"\b[a-zA-Z]\b", "", text)
    return re.sub(r"\s+", " ", without_singles).strip()

# Drop one-letter words from every tweet, overwriting the column.
df['tweet'] = df['tweet'].apply(remove_singl_char)

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Split one cleaned tweet into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Store the token list of every cleaned tweet in a new column.
df['tweet_tokens'] = df['tweet'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n')
df.head(5)


Tokenizing Result : 



Unnamed: 0,date,username,tweet,tweet_tokens
0,2022-10-24 14:37:10+00:00,ada_pohan,lha yg maruk blt kelian orang koq,"[lha, yg, maruk, blt, kelian, orang, koq]"
1,2022-10-24 14:33:36+00:00,PRDepok,cara daftar blt bbm online lewat hp di aplikas...,"[cara, daftar, blt, bbm, online, lewat, hp, di..."
2,2022-10-24 14:33:36+00:00,pikiran_rakyat,cara daftar blt bbm online lewat hp di aplikas...,"[cara, daftar, blt, bbm, online, lewat, hp, di..."
3,2022-10-24 14:28:44+00:00,antony_xenruang,kembali terbukti para kadrun itu punya tabiat ...,"[kembali, terbukti, para, kadrun, itu, punya, ..."
4,2022-10-24 14:24:45+00:00,suaramerdeka,siapsiap blt bbm bakal cair lagi di november l...,"[siapsiap, blt, bbm, bakal, cair, lagi, di, no..."


## ------ Kalkulasi Frekuensi Distribusi ---------

In [5]:
# NLTK frequency distribution for one token list
def freqDist_wrapper(text):
    """Build an NLTK ``FreqDist`` from ``text``, which the caller passes as a token list."""
    distribution = FreqDist(text)
    return distribution

# One FreqDist per tweet, kept in its own column.
df['tweet_tokens_fdist'] = df['tweet_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n')
# Show the (token, count) pairs of the first five tweets.
top_counts = df['tweet_tokens_fdist'].head().apply(lambda fdist: fdist.most_common())
print(top_counts)

Frequency Tokens : 

0    [(lha, 1), (yg, 1), (maruk, 1), (blt, 1), (kel...
1    [(cara, 1), (daftar, 1), (blt, 1), (bbm, 1), (...
2    [(cara, 1), (daftar, 1), (blt, 1), (bbm, 1), (...
3    [(kembali, 1), (terbukti, 1), (para, 1), (kadr...
4    [(di, 2), (siapsiap, 1), (blt, 1), (bbm, 1), (...
Name: tweet_tokens_fdist, dtype: object


## ------- StopWords ---------

In [6]:
from nltk.corpus import stopwords

# ----------------------- base list: NLTK stop words -------------------------------
# Indonesian stop words shipped with the NLTK corpus
list_stopwords = stopwords.words('indonesian')


# ---------------------------- manually added stop words ------------------------------------
# Slang, abbreviations and Twitter artefacts appended to the NLTK list.
# NOTE(review): 'rt' is listed twice, and '&amp' cannot match a token once
# punctuation has been stripped in the cleaning cell; both are harmless.
list_stopwords += [
    "yg", "dg", "rt", "dgn", "ny", "d", 'klo',
    'kalo', 'amp', 'biar', 'bikin', 'bilang',
    'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
    'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
    'jd', 'jgn', 'sdh', 'aja', 'n', 't',
    'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
    '&amp', 'yah', 'koq',
]


# remove stop words from a token list
def stopwords_removal(words, stopword_list=None):
    """Return the tokens of ``words`` that are not stop words.

    Args:
        words: Iterable of token strings for one tweet.
        stopword_list: Optional iterable of stop words. When omitted, the
            notebook-level ``list_stopwords`` is used, so existing
            single-argument calls (``Series.apply(stopwords_removal)``)
            behave as before.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    if stopword_list is None:
        stopword_list = list_stopwords
    # A set gives constant-time membership tests; scanning the list for every
    # token costs one pass over the full stop-word list per token.
    stopword_set = set(stopword_list)
    return [word for word in words if word not in stopword_set]

# Token lists with stop words removed ("WSW" column).
df['tweet_tokens_WSW'] = df['tweet_tokens'].apply(stopwords_removal)

# Fixed seed so the preview shows the same rows on every Restart & Run All.
print(df['tweet_tokens_WSW'].sample(5, random_state=42))


133             [blt, bbm, jaga, daya, beli, masyarakat]
642    [takalarbabinsa, koramil, marbo, desa, tope, j...
709       [pembagian, blt, mudah, sasaran, dki, jakarta]
163          [diperhatikan, penerimaan, blt, bbm, jambi]
239    [lampung, apresiasi, kebijakan, pemerintah, bl...
Name: tweet_tokens_WSW, dtype: object


## --------- Stemming ----------

In [7]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# Create one Sastrawi stemmer instance; `stemmer` is the object the stemming
# helper below calls for every term.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem a single term
def stemmed_wrapper(term):
    """Return the stem of ``term`` as produced by the notebook-level ``stemmer``."""
    stem = stemmer.stem(term)
    return stem

# Collect each distinct token once, so the stemmer runs once per unique term
# instead of once per occurrence. ' ' is only a placeholder value.
term_dict = {}
for document in df['tweet_tokens_WSW']:
    for term in document:
        term_dict.setdefault(term, ' ')

print(len(term_dict))
print("------------------------")

# Replace every placeholder with the stem of its key (keys are snapshotted
# first so the dict is not iterated while being updated).
term_dict.update((token, stemmed_wrapper(token)) for token in list(term_dict))

print("------------------------")


# map every token of a document to its cached stem
def get_stemmed_term(document):
    """Look up each term of ``document`` in ``term_dict`` and return the stems in order.

    Raises ``KeyError`` if a term is not present in ``term_dict``.
    """
    stemmed = []
    for term in document:
        stemmed.append(term_dict[term])
    return stemmed

# Stem every token list through the swifter accessor, using the term_dict lookup.
df['tweet_tokens_stemmed'] = df['tweet_tokens_WSW'].swifter.apply(get_stemmed_term)

740
------------------------
------------------------


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

In [11]:
# Persist the stemmed token lists (with the row index) to disk.
df['tweet_tokens_stemmed'].to_csv('data_stemmed.csv', index=True)

## ------- Final Tweet ----------

In [8]:
def final_tweet(text):
    """Join the token list ``text`` back into one space-separated string."""
    separator = ' '
    return separator.join(text)
# Rebuild one cleaned, stemmed string per tweet from its token list.
df['final_tweet'] = df['tweet_tokens_stemmed'].apply(final_tweet)
# Display the frame with all intermediate columns.
df

Unnamed: 0,date,username,tweet,tweet_tokens,tweet_tokens_fdist,tweet_tokens_WSW,tweet_tokens_stemmed,final_tweet
0,2022-10-24 14:37:10+00:00,ada_pohan,lha yg maruk blt kelian orang koq,"[lha, yg, maruk, blt, kelian, orang, koq]","{'lha': 1, 'yg': 1, 'maruk': 1, 'blt': 1, 'kel...","[lha, maruk, blt, kelian, orang]","[lha, maruk, blt, kelian, orang]",lha maruk blt kelian orang
1,2022-10-24 14:33:36+00:00,PRDepok,cara daftar blt bbm online lewat hp di aplikas...,"[cara, daftar, blt, bbm, online, lewat, hp, di...","{'cara': 1, 'daftar': 1, 'blt': 1, 'bbm': 1, '...","[daftar, blt, bbm, online, hp, aplikasi, cek, ...","[daftar, blt, bbm, online, hp, aplikasi, cek, ...",daftar blt bbm online hp aplikasi cek bansos c...
2,2022-10-24 14:33:36+00:00,pikiran_rakyat,cara daftar blt bbm online lewat hp di aplikas...,"[cara, daftar, blt, bbm, online, lewat, hp, di...","{'cara': 1, 'daftar': 1, 'blt': 1, 'bbm': 1, '...","[daftar, blt, bbm, online, hp, aplikasi, cek, ...","[daftar, blt, bbm, online, hp, aplikasi, cek, ...",daftar blt bbm online hp aplikasi cek bansos c...
3,2022-10-24 14:28:44+00:00,antony_xenruang,kembali terbukti para kadrun itu punya tabiat ...,"[kembali, terbukti, para, kadrun, itu, punya, ...","{'kembali': 1, 'terbukti': 1, 'para': 1, 'kadr...","[terbukti, kadrun, tabiat, karakter, pecundang...","[bukti, kadrun, tabiat, karakter, cundang, sen...",bukti kadrun tabiat karakter cundang senang re...
4,2022-10-24 14:24:45+00:00,suaramerdeka,siapsiap blt bbm bakal cair lagi di november l...,"[siapsiap, blt, bbm, bakal, cair, lagi, di, no...","{'siapsiap': 1, 'blt': 1, 'bbm': 1, 'bakal': 1...","[siapsiap, blt, bbm, cair, november, lihat, na...","[siapsiap, blt, bbm, cair, november, lihat, na...",siapsiap blt bbm cair november lihat nama teri...
...,...,...,...,...,...,...,...,...
995,2022-10-24 07:13:11+00:00,randy_saputra78,pemerintah berupaya sejahterakan rakyat dengan...,"[pemerintah, berupaya, sejahterakan, rakyat, d...","{'pemerintah': 1, 'berupaya': 1, 'sejahterakan...","[pemerintah, berupaya, sejahterakan, rakyat, b...","[perintah, upaya, sejahtera, rakyat, blt, bbm]",perintah upaya sejahtera rakyat blt bbm
996,2022-10-24 07:12:50+00:00,randy_saputra78,blt bbm sejahterakan rakyat terima kasih presi...,"[blt, bbm, sejahterakan, rakyat, terima, kasih...","{'blt': 1, 'bbm': 1, 'sejahterakan': 1, 'rakya...","[blt, bbm, sejahterakan, rakyat, terima, kasih...","[blt, bbm, sejahtera, rakyat, terima, kasih, p...",blt bbm sejahtera rakyat terima kasih presiden...
997,2022-10-24 07:12:47+00:00,clartDonn034,papua barat bijak manfaatkan blt bbm blt bbm u...,"[papua, barat, bijak, manfaatkan, blt, bbm, bl...","{'papua': 1, 'barat': 1, 'bijak': 1, 'manfaatk...","[papua, barat, bijak, manfaatkan, blt, bbm, bl...","[papua, barat, bijak, manfaat, blt, bbm, blt, ...",papua barat bijak manfaat blt bbm blt bbm blt ...
998,2022-10-24 07:12:40+00:00,Gunawan76595624,blt tepat sasaran,"[blt, tepat, sasaran]","{'blt': 1, 'tepat': 1, 'sasaran': 1}","[blt, sasaran]","[blt, sasar]",blt sasar


In [9]:
# Export final_tweet for sentiment labelling (uncomment the line below to regenerate the file)
# df.final_tweet.to_csv('to_labelling3.csv',index=False)

In [12]:
# Load the labelled file. This rebinds `df`, so the preprocessing frame built
# above is no longer reachable under that name.
df = pd.read_csv('hasil_labelling3.csv')

Setelah proses cleaning selesai, setiap tweet perlu diberi label sentimen menggunakan aplikasi Orange.

In [13]:
# Display the labelled frame (sentiment score plus final_tweet).
df

Unnamed: 0,sentiment,final_tweet
0,0.000000,lha maruk blt kelian orang
1,0.000000,daftar blt bbm online hp aplikasi cek bansos c...
2,0.000000,daftar blt bbm online hp aplikasi cek bansos c...
3,14.285714,bukti kadrun tabiat karakter cundang senang re...
4,11.111111,siapsiap blt bbm cair november lihat nama teri...
...,...,...
994,0.000000,perintah upaya sejahtera rakyat blt bbm
995,25.000000,blt bbm sejahtera rakyat terima kasih presiden...
996,16.666667,papua barat bijak manfaat blt bbm blt bbm blt ...
997,0.000000,blt sasar


In [14]:
def conv(sentiment):
    """Map a numeric sentiment score to a class label by its sign.

    Args:
        sentiment: Numeric score; positive means positive sentiment.

    Returns:
        1 for a score above zero, 0 for exactly zero, -1 otherwise.
    """
    # Compare against 0, not 1: with `>= 1` a positive score between 0 and 1
    # fell through to the final branch and was labelled negative.
    if sentiment > 0:
        return 1
    if sentiment == 0:
        return 0
    # Negative scores. Anything that fails both comparisons above (e.g. NaN)
    # also lands here, as in the original logic.
    return -1
# Store the -1 / 0 / 1 class label derived from each sentiment score.
df['sentiment_norm'] = df['sentiment'].apply(conv)

In [17]:
# Inspect the rows that were labelled negative.
df.loc[df['sentiment_norm'] == -1]

Unnamed: 0,sentiment,final_tweet,sentiment_norm
27,-20.0,blt bbm cegah ancam inflasi,-1
28,-20.0,blt bbm cegah ancam inflasi,-1
29,-20.0,blt bbm cegah ancam inflasi,-1
30,-20.0,blt bbm cegah ancam inflasi,-1
35,-33.333333,blt bom turun,-1
37,-14.285714,bohong menteri profesional ngutang blt copras ...,-1
45,-16.666667,id dumdum negara wajib layan layan,-1
93,-7.142857,tangan dampak inflasi pemkot surabaya gelontor...,-1
120,-18.75,kurang dampak inflasi pemkot kota surabaya ser...,-1
127,-11.111111,pemkot surabaya bagi blt bbm kemudi angkut tek...,-1


In [21]:
# Write the labelled frame, including sentiment_norm, without the row index.
df.to_csv('hasil_labelling.csv',index=False)